Merge remote-tracking branch 'upstream/master' into vectorize_earliest_num

This commit is contained in:
Soumyava Das 2023-08-15 08:39:32 -07:00
commit aa971815a8
325 changed files with 11007 additions and 8050 deletions

View File

@ -4,6 +4,7 @@ updates:
directory: "/"
schedule:
interval: "daily"
open-pull-requests-limit: 20
ignore:
- dependency-name: "com.google.guava:guava"
# pin ZooKeeper dependencies to 3.5.x
@ -18,3 +19,6 @@ updates:
# Even then this will involve significant effort.
# See https://github.com/apache/druid/pull/12258
- dependency-name: "org.apache.calcite"
# jclouds 2.1 needs Guava 18+
- dependency-name: "org.apache.jclouds"
versions: "[2.1,)"

View File

@ -15,7 +15,7 @@
#!/bin/bash
${MVN} ${MAVEN_SKIP} dependency:analyze -DoutputXML=true -DignoreNonCompile=true -DfailOnWarning=true ${HADOOP_PROFILE} ||
${MVN} ${MAVEN_SKIP} dependency:analyze -DoutputXML=true -DignoreNonCompile=true -DfailOnWarning=true ||
{ echo "
The dependency analysis has found a dependency that is either:

View File

@ -20,7 +20,7 @@ set -e
./.github/scripts/setup_generate_license.sh
${MVN} apache-rat:check -Prat --fail-at-end \
-Dorg.slf4j.simpleLogger.log.org.apache.maven.cli.transfer.Slf4jMavenTransferListener=warn \
-Drat.consoleOutput=true ${HADOOP_PROFILE}
-Drat.consoleOutput=true
# Generate dependency reports and checks they are valid.
mkdir -p target
distribution/bin/generate-license-dependency-reports.py . target --clean-maven-artifact-transfer --parallel 2

View File

@ -111,8 +111,6 @@ jobs:
name: security vulnerabilities
strategy:
fail-fast: false
matrix:
HADOOP_PROFILE: [ '', '-Phadoop2' ]
runs-on: ubuntu-latest
steps:
- name: Checkout branch
@ -129,10 +127,8 @@ jobs:
run: mvn clean install dependency:go-offline -P dist -P skip-static-checks,skip-tests -Dmaven.javadoc.skip=true -Dcyclonedx.skip=true -Dweb.console.skip=true
- name: security vulnerabilities check
env:
HADOOP_PROFILE: ${{ matrix.HADOOP_PROFILE }}
run: |
mvn dependency-check:purge dependency-check:check ${HADOOP_PROFILE} || { echo "
mvn dependency-check:purge dependency-check:check || { echo "
The OWASP dependency check has found security vulnerabilities. Please use a newer version
of the dependency that does not have vulnerabilities. To see a report run
`mvn dependency-check:check`

View File

@ -95,13 +95,6 @@ jobs:
run: |
./.github/scripts/analyze_dependencies_script.sh
- name: analyze dependencies for hadoop2
if: ${{ matrix.java == 'jdk8' }}
env:
HADOOP_PROFILE: -Phadoop2
run: |
./.github/scripts/analyze_dependencies_script.sh
- name: animal sniffer checks
if: ${{ matrix.java == 'jdk8' }}
run: ${MVN} animal-sniffer:check --fail-at-end

View File

@ -261,7 +261,7 @@
</plugin>
<plugin>
<artifactId>maven-source-plugin</artifactId>
<version>2.2.1</version>
<version>3.3.0</version>
</plugin>
<plugin>
<artifactId>maven-surefire-plugin</artifactId>

View File

@ -76,8 +76,8 @@
<artifactId>aws-java-sdk-core</artifactId>
</dependency>
<dependency>
<groupId>javax.validation</groupId>
<artifactId>validation-api</artifactId>
<groupId>jakarta.validation</groupId>
<artifactId>jakarta.validation-api</artifactId>
</dependency>
<!-- Runtime -->

View File

@ -22,11 +22,50 @@ package org.apache.druid.common.aws;
import com.amazonaws.AmazonClientException;
import com.amazonaws.AmazonServiceException;
import com.amazonaws.retry.RetryUtils;
import com.amazonaws.services.s3.model.DeleteObjectsRequest;
import com.amazonaws.services.s3.model.MultiObjectDeleteException;
import com.google.common.collect.ImmutableSet;
import java.io.IOException;
import java.util.Set;
public class AWSClientUtil
{
/**
* This list of error codes comes from {@link RetryUtils} and
* <a href="https://docs.aws.amazon.com/AmazonS3/latest/API/ErrorResponses.html">...</a>. At the moment, the AWS SDK
* does not expose a good way of retrying
* {@link com.amazonaws.services.s3.AmazonS3#deleteObjects(DeleteObjectsRequest)} requests. This request is used in
* org.apache.druid.storage.s3.S3DataSegmentKiller to delete a batch of segments from deep storage.
*/
private static final Set<String> RECOVERABLE_ERROR_CODES = ImmutableSet.of(
"503 SlowDown",
"AuthFailure",
"BandwidthLimitExceeded",
"EC2ThrottledException",
"IDPCommunicationError",
"InternalError",
"InvalidSignatureException",
"PriorRequestNotComplete",
"ProvisionedThroughputExceededException",
"RequestExpired",
"RequestInTheFuture",
"RequestLimitExceeded",
"RequestThrottled",
"RequestThrottledException",
"RequestTimeTooSkewed",
"RequestTimeout",
"RequestTimeoutException",
"ServiceUnavailable",
"SignatureDoesNotMatch",
"SlowDown",
"ThrottledException",
"ThrottlingException",
"TooManyRequestsException",
"TransactionInProgressException",
"Throttling"
);
/**
* Checks whether an exception can be retried or not. Implementation is copied
* from {@link com.amazonaws.retry.PredefinedRetryPolicies.SDKDefaultRetryCondition} except deprecated methods
@ -54,6 +93,19 @@ public class AWSClientUtil
return true;
}
return RetryUtils.isClockSkewError(exception);
if (RetryUtils.isClockSkewError(exception)) {
return true;
}
if (exception instanceof MultiObjectDeleteException) {
MultiObjectDeleteException multiObjectDeleteException = (MultiObjectDeleteException) exception;
for (MultiObjectDeleteException.DeleteError error : multiObjectDeleteException.getErrors()) {
if (RECOVERABLE_ERROR_CODES.contains(error.getCode())) {
return true;
}
}
}
return false;
}
}
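For context, a minimal caller-side sketch of how this check might be used is shown below. It is illustrative only: the retry budget, the backoff, and the `AmazonS3` client wiring are assumptions rather than part of this change; only `AWSClientUtil.isClientExceptionRecoverable` comes from the class above.

```java
package org.apache.druid.common.aws;

import com.amazonaws.AmazonClientException;
import com.amazonaws.services.s3.AmazonS3;
import com.amazonaws.services.s3.model.DeleteObjectsRequest;

// Hypothetical example: retry a batch delete only when the failure carries
// one of the recoverable error codes recognized by AWSClientUtil.
public class BatchDeleteExample
{
  public static void deleteWithRetry(AmazonS3 s3, DeleteObjectsRequest request) throws InterruptedException
  {
    final int maxTries = 3; // assumed retry budget, not a Druid default
    for (int attempt = 1; ; attempt++) {
      try {
        s3.deleteObjects(request); // may throw MultiObjectDeleteException on partial failure
        return;
      }
      catch (AmazonClientException e) {
        if (attempt >= maxTries || !AWSClientUtil.isClientExceptionRecoverable(e)) {
          throw e;
        }
        Thread.sleep(1000L * attempt); // simple linear backoff, for illustration only
      }
    }
  }
}
```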

View File

@ -21,6 +21,8 @@ package org.apache.druid.common.aws;
import com.amazonaws.AmazonClientException;
import com.amazonaws.AmazonServiceException;
import com.amazonaws.services.s3.model.MultiObjectDeleteException;
import com.google.common.collect.ImmutableList;
import org.junit.Assert;
import org.junit.Test;
@ -82,6 +84,20 @@ public class AWSClientUtilTest
Assert.assertTrue(AWSClientUtil.isClientExceptionRecoverable(ex));
}
@Test
public void testRecoverableException_MultiObjectDeleteException()
{
MultiObjectDeleteException.DeleteError retryableError = new MultiObjectDeleteException.DeleteError();
retryableError.setCode("RequestLimitExceeded");
MultiObjectDeleteException.DeleteError nonRetryableError = new MultiObjectDeleteException.DeleteError();
nonRetryableError.setCode("nonRetryableError");
MultiObjectDeleteException ex = new MultiObjectDeleteException(
ImmutableList.of(retryableError, nonRetryableError),
ImmutableList.of()
);
Assert.assertTrue(AWSClientUtil.isClientExceptionRecoverable(ex));
}
@Test
public void testNonRecoverableException_RuntimeException()
{

View File

@ -115,191 +115,6 @@
</build>
<profiles>
<profile>
<id>dist-hadoop2</id>
<activation>
<activeByDefault>false</activeByDefault>
<property>
<name>tar</name>
</property>
</activation>
<build>
<plugins>
<plugin>
<groupId>org.codehaus.mojo</groupId>
<artifactId>exec-maven-plugin</artifactId>
<executions>
<execution>
<id>generate-readme</id>
<phase>initialize</phase>
<goals>
<goal>exec</goal>
</goals>
<configuration>
<executable>${project.basedir}/bin/build-textfile-readme.sh</executable>
<arguments>
<argument>${project.basedir}/../</argument>
<argument>${project.parent.version}</argument>
</arguments>
</configuration>
</execution>
<execution>
<id>generate-binary-license</id>
<phase>initialize</phase>
<goals>
<goal>exec</goal>
</goals>
<configuration>
<executable>${project.basedir}/bin/generate-binary-license.py</executable>
<arguments>
<argument>${project.parent.basedir}/licenses/APACHE2</argument>
<argument>${project.parent.basedir}/licenses.yaml</argument>
<argument>${project.parent.basedir}/LICENSE.BINARY</argument>
</arguments>
</configuration>
</execution>
<execution>
<id>generate-binary-notice</id>
<phase>initialize</phase>
<goals>
<goal>exec</goal>
</goals>
<configuration>
<executable>${project.basedir}/bin/generate-binary-notice.py</executable>
<arguments>
<argument>${project.parent.basedir}/NOTICE</argument>
<argument>${project.parent.basedir}/licenses.yaml</argument>
<argument>${project.parent.basedir}/NOTICE.BINARY</argument>
</arguments>
</configuration>
</execution>
<execution>
<id>pull-deps</id>
<phase>package</phase>
<goals>
<goal>exec</goal>
</goals>
<configuration>
<executable>${project.parent.basedir}/examples/bin/run-java</executable>
<arguments>
<argument>-classpath</argument>
<classpath />
<argument>-Ddruid.extensions.loadList=[]</argument>
<argument>-Ddruid.extensions.directory=${project.build.directory}/extensions
</argument>
<argument>
-Ddruid.extensions.hadoopDependenciesDir=${project.build.directory}/hadoop-dependencies
</argument>
<argument>-Dhadoop2.enabled=true</argument>
<argument>org.apache.druid.cli.Main</argument>
<argument>tools</argument>
<argument>pull-deps</argument>
<argument>--clean</argument>
<argument>--defaultVersion</argument>
<argument>${project.parent.version}</argument>
<argument>-l</argument>
<argument>${settings.localRepository}</argument>
<argument>-h</argument>
<argument>org.apache.hadoop:hadoop-client:${hadoop.compile.version}</argument>
<argument>-c</argument>
<argument>org.apache.druid.extensions:druid-avro-extensions</argument>
<argument>-c</argument>
<argument>org.apache.druid.extensions:druid-azure-extensions</argument>
<argument>-c</argument>
<argument>org.apache.druid.extensions:druid-bloom-filter</argument>
<argument>-c</argument>
<argument>org.apache.druid.extensions:druid-datasketches</argument>
<argument>-c</argument>
<argument>org.apache.druid.extensions:druid-hdfs-storage</argument>
<argument>-c</argument>
<argument>org.apache.druid.extensions:druid-histogram</argument>
<argument>-c</argument>
<argument>org.apache.druid.extensions:druid-kafka-extraction-namespace</argument>
<argument>-c</argument>
<argument>org.apache.druid.extensions:druid-kafka-indexing-service</argument>
<argument>-c</argument>
<argument>org.apache.druid.extensions:druid-kinesis-indexing-service</argument>
<argument>-c</argument>
<argument>org.apache.druid.extensions:druid-lookups-cached-global</argument>
<argument>-c</argument>
<argument>org.apache.druid.extensions:druid-lookups-cached-single</argument>
<argument>-c</argument>
<argument>org.apache.druid.extensions:druid-multi-stage-query</argument>
<argument>-c</argument>
<argument>org.apache.druid.extensions:druid-protobuf-extensions</argument>
<argument>-c</argument>
<argument>org.apache.druid.extensions:mysql-metadata-storage</argument>
<argument>-c</argument>
<argument>org.apache.druid.extensions:druid-orc-extensions</argument>
<argument>-c</argument>
<argument>org.apache.druid.extensions:druid-parquet-extensions</argument>
<argument>-c</argument>
<argument>org.apache.druid.extensions:postgresql-metadata-storage</argument>
<argument>-c</argument>
<argument>org.apache.druid.extensions:druid-kerberos</argument>
<argument>-c</argument>
<argument>org.apache.druid.extensions:druid-s3-extensions</argument>
<argument>-c</argument>
<argument>org.apache.druid.extensions:druid-aws-rds-extensions</argument>
<argument>-c</argument>
<argument>org.apache.druid.extensions:druid-ec2-extensions</argument>
<argument>-c</argument>
<argument>org.apache.druid.extensions:druid-google-extensions</argument>
<argument>-c</argument>
<argument>org.apache.druid.extensions:druid-stats</argument>
<argument>-c</argument>
<argument>org.apache.druid.extensions:simple-client-sslcontext</argument>
<argument>-c</argument>
<argument>org.apache.druid.extensions:druid-basic-security</argument>
<argument>-c</argument>
<argument>org.apache.druid.extensions:druid-pac4j</argument>
<argument>-c</argument>
<argument>org.apache.druid.extensions:druid-ranger-security</argument>
<argument>-c</argument>
<argument>org.apache.druid.extensions:druid-kubernetes-extensions</argument>
<argument>-c</argument>
<argument>org.apache.druid.extensions:druid-catalog</argument>
<argument>${druid.distribution.pulldeps.opts}</argument>
</arguments>
</configuration>
</execution>
</executions>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-assembly-plugin</artifactId>
<executions>
<execution>
<id>distro-assembly</id>
<phase>package</phase>
<goals>
<goal>single</goal>
</goals>
<configuration>
<finalName>apache-druid-${project.parent.version}</finalName>
<tarLongFileMode>posix</tarLongFileMode>
<descriptors>
<descriptor>src/assembly/assembly.xml</descriptor>
</descriptors>
</configuration>
</execution>
</executions>
</plugin>
<plugin>
<groupId>org.codehaus.mojo</groupId>
<artifactId>license-maven-plugin</artifactId>
<executions>
<execution>
<id>download-licenses</id>
<goals>
<goal>download-licenses</goal>
</goals>
</execution>
</executions>
</plugin>
</plugins>
</build>
</profile>
<profile>
<id>dist</id>
<activation>

View File

@ -27,23 +27,44 @@ sidebar_label: SQL JDBC driver
> This document describes the SQL language.
You can make [Druid SQL](../querying/sql.md) queries using the [Avatica JDBC driver](https://calcite.apache.org/avatica/downloads/). We recommend using Avatica JDBC driver version 1.17.0 or later. Note that as of the time of this writing, Avatica 1.17.0, the latest version, does not support passing connection string parameters from the URL to Druid, so you must pass them using a `Properties` object. Once you've downloaded the Avatica client jar, add it to your classpath and use the connect string `jdbc:avatica:remote:url=http://BROKER:8082/druid/v2/sql/avatica/`.
You can make [Druid SQL](../querying/sql.md) queries using the [Avatica JDBC driver](https://calcite.apache.org/avatica/downloads/).
We recommend using Avatica JDBC driver version 1.22.0 or later.
Once you've downloaded the Avatica client jar, add it to your classpath.
When using the JDBC connector for the [examples](#examples) or in general, it's helpful to understand the parts of the connect string stored in the `url` variable:
Example connection string:
- `jdbc:avatica:remote:url=` is prepended to the hostname and port.
- The hostname and port number for your Druid deployment depends on whether you want to connect to the Router or a specific Broker. For more information, see [Connection stickiness](#connection-stickiness). In the case of the quickstart deployment, the hostname and port are `http://localhost:8888`, which connects to the Router running on your local machine.
- The SQL endpoint in Druid for the Avatica driver is `/druid/v2/sql/avatica/`.
```
jdbc:avatica:remote:url=http://localhost:8888/druid/v2/sql/avatica/;transparent_reconnect=true
```
Example code:
Or, to use the protobuf protocol instead of JSON:
```
jdbc:avatica:remote:url=http://localhost:8888/druid/v2/sql/avatica-protobuf/;transparent_reconnect=true;serialization=protobuf
```
The `url` is the `/druid/v2/sql/avatica/` endpoint on the Router, which routes JDBC connections to a consistent Broker.
For more information, see [Connection stickiness](#connection-stickiness).
Set `transparent_reconnect` to `true` so your connection is not interrupted if the pool of Brokers changes membership,
or if a Broker is restarted.
Set `serialization` to `protobuf` if using the protobuf endpoint.
Note that as of the time of this writing, Avatica 1.23.0, the latest version, does not support passing
[connection context parameters](../querying/sql-query-context.md) from the JDBC connection string to Druid. These context parameters
must be passed using a `Properties` object instead. Refer to the Java code below for an example.
Example Java code:
```java
// Connect to /druid/v2/sql/avatica/ on your Broker.
String url = "jdbc:avatica:remote:url=http://localhost:8082/druid/v2/sql/avatica/";
String url = "jdbc:avatica:remote:url=http://localhost:8888/druid/v2/sql/avatica/;transparent_reconnect=true";
// Set any connection context parameters you need here
// Or leave empty for default behavior.
// Set any connection context parameters you need here.
// Any property from https://druid.apache.org/docs/latest/querying/sql-query-context.html can go here.
Properties connectionProperties = new Properties();
connectionProperties.setProperty("sqlTimeZone", "Etc/UTC");
try (Connection connection = DriverManager.getConnection(url, connectionProperties)) {
try (
@ -62,7 +83,7 @@ For a runnable example that includes a query that you might run, see [Examples](
It is also possible to use a protocol buffers JDBC connection with Druid, which offers reduced bloat and potential performance
improvements for larger result sets. To use it, apply the following connection URL instead; everything else remains the same:
```
String url = "jdbc:avatica:remote:url=http://localhost:8082/druid/v2/sql/avatica-protobuf/;serialization=protobuf";
String url = "jdbc:avatica:remote:url=http://localhost:8888/druid/v2/sql/avatica-protobuf/;transparent_reconnect=true;serialization=protobuf";
```
> The protobuf endpoint is also known to work with the official [Golang Avatica driver](https://github.com/apache/calcite-avatica-go)
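Putting the pieces above together (using the plain JSON endpoint), a complete, minimal program might look like the sketch below. It assumes the quickstart setup used throughout these examples: a Router at `localhost:8888`, the `wikipedia` datasource, and the Avatica client jar on the classpath.

```java
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.ResultSet;
import java.sql.Statement;
import java.util.Properties;

public class DruidJdbcExample
{
  public static void main(String[] args) throws Exception
  {
    // Connect to /druid/v2/sql/avatica/ on the Router; transparent_reconnect keeps the
    // connection usable if the Broker behind the Router is restarted.
    final String url =
        "jdbc:avatica:remote:url=http://localhost:8888/druid/v2/sql/avatica/;transparent_reconnect=true";

    // Connection context parameters are passed through a Properties object.
    final Properties connectionProperties = new Properties();
    connectionProperties.setProperty("sqlTimeZone", "Etc/UTC");

    final String query = "SELECT __time, isRobot, countryName, comment FROM wikipedia WHERE countryName='Japan'";

    try (Connection connection = DriverManager.getConnection(url, connectionProperties);
         Statement statement = connection.createStatement();
         ResultSet resultSet = statement.executeQuery(query)) {
      while (resultSet.next()) {
        System.out.println(resultSet.getTimestamp("__time") + " " + resultSet.getString("comment"));
      }
    }
  }
}
```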
@ -130,11 +151,12 @@ public class JdbcListColumns {
{
// Connect to /druid/v2/sql/avatica/ on your Router.
// You can connect to a Broker but must configure connection stickiness if you do.
String url = "jdbc:avatica:remote:url=http://localhost:8888/druid/v2/sql/avatica/";
String url = "jdbc:avatica:remote:url=http://localhost:8888/druid/v2/sql/avatica/;transparent_reconnect=true";
String query = "SELECT COLUMN_NAME,* FROM INFORMATION_SCHEMA.COLUMNS WHERE TABLE_NAME = 'wikipedia' and TABLE_SCHEMA='druid'";
// Set any connection context parameters you need here
// Or leave empty for default behavior.
// Set any connection context parameters you need here.
// Any property from https://druid.apache.org/docs/latest/querying/sql-query-context.html can go here.
Properties connectionProperties = new Properties();
try (Connection connection = DriverManager.getConnection(url, connectionProperties)) {
@ -169,12 +191,13 @@ public class JdbcCountryAndTime {
{
// Connect to /druid/v2/sql/avatica/ on your Router.
// You can connect to a Broker but must configure connection stickiness if you do.
String url = "jdbc:avatica:remote:url=http://localhost:8888/druid/v2/sql/avatica/";
String url = "jdbc:avatica:remote:url=http://localhost:8888/druid/v2/sql/avatica/;transparent_reconnect=true";
//The query you want to run.
String query = "SELECT __time, isRobot, countryName, comment FROM wikipedia WHERE countryName='Japan'";
// Set any connection context parameters you need here
// Or leave empty for default behavior.
// Set any connection context parameters you need here.
// Any property from https://druid.apache.org/docs/latest/querying/sql-query-context.html can go here.
Properties connectionProperties = new Properties();
connectionProperties.setProperty("sqlTimeZone", "America/Los_Angeles");

View File

@ -934,6 +934,8 @@ A sample Coordinator dynamic config JSON object is shown below:
"replicantLifetime": 15,
"replicationThrottleLimit": 10,
"killDataSourceWhitelist": ["wikipedia", "testDatasource"],
"killTaskSlotRatio": 0.10,
"maxKillTaskSlots": 5,
"decommissioningNodes": ["localhost:8182", "localhost:8282"],
"decommissioningMaxPercentOfMaxSegmentsToMove": 70,
"pauseCoordination": false,
@ -944,25 +946,27 @@ A sample Coordinator dynamic config JSON object is shown below:
Issuing a GET request at the same URL will return the spec that is currently in place. A description of the config setup spec is shown below.
|Property|Description|Default|
|--------|-----------|-------|
|`millisToWaitBeforeDeleting`|How long does the Coordinator need to be a leader before it can start marking overshadowed segments as unused in metadata storage.|900000 (15 mins)|
|`mergeBytesLimit`|The maximum total uncompressed size in bytes of segments to merge.|524288000L|
|`mergeSegmentsLimit`|The maximum number of segments that can be in a single [append task](../ingestion/tasks.md).|100|
|`smartSegmentLoading`|Enables ["smart" segment loading mode](#smart-segment-loading) which dynamically computes the optimal values of several properties that maximize Coordinator performance.|true|
|`maxSegmentsToMove`|The maximum number of segments that can be moved at any given time.|100|
|`replicantLifetime`|The maximum number of Coordinator runs for which a segment can wait in the load queue of a Historical before Druid raises an alert.|15|
|`replicationThrottleLimit`|The maximum number of segment replicas that can be assigned to a historical tier in a single Coordinator run. This property prevents historicals from becoming overwhelmed when loading extra replicas of segments that are already available in the cluster.|500|
|`balancerComputeThreads`|Thread pool size for computing moving cost of segments during segment balancing. Consider increasing this if you have a lot of segments and moving segments begins to stall.|1|
|`killDataSourceWhitelist`|List of specific data sources for which kill tasks are sent if property `druid.coordinator.kill.on` is true. This can be a list of comma-separated data source names or a JSON array.|none|
|`killPendingSegmentsSkipList`|List of data sources for which pendingSegments are _NOT_ cleaned up if property `druid.coordinator.kill.pendingSegments.on` is true. This can be a list of comma-separated data sources or a JSON array.|none|
|`maxSegmentsInNodeLoadingQueue`|The maximum number of segments allowed in the load queue of any given server. Use this parameter to load segments faster if, for example, the cluster contains slow-loading nodes or if there are too many segments to be replicated to a particular node (when faster loading is preferred to better segments distribution). The optimal value depends on the loading speed of segments, acceptable replication time and number of nodes. |500|
|`useRoundRobinSegmentAssignment`|Boolean flag for whether segments should be assigned to historicals in a round robin fashion. When disabled, segment assignment is done using the chosen balancer strategy. When enabled, this can speed up segment assignments leaving balancing to move the segments to their optimal locations (based on the balancer strategy) lazily. |true|
|`decommissioningNodes`| List of historical servers to 'decommission'. Coordinator will not assign new segments to 'decommissioning' servers, and segments will be moved away from them to be placed on non-decommissioning servers at the maximum rate specified by `decommissioningMaxPercentOfMaxSegmentsToMove`.|none|
|`decommissioningMaxPercentOfMaxSegmentsToMove`| Upper limit of segments the Coordinator can move from decommissioning servers to active non-decommissioning servers during a single run. This value is relative to the total maximum number of segments that can be moved at any given time based upon the value of `maxSegmentsToMove`.<br /><br />If `decommissioningMaxPercentOfMaxSegmentsToMove` is 0, the Coordinator does not move segments to decommissioning servers, effectively putting them in a type of "maintenance" mode. In this case, decommissioning servers do not participate in balancing or assignment by load rules. The Coordinator still considers segments on decommissioning servers as candidates to replicate on active servers.<br /><br />Decommissioning can stall if there are no available active servers to move the segments to. You can use the maximum percent of decommissioning segment movements to prioritize balancing or to decrease commissioning time to prevent active servers from being overloaded. The value must be between 0 and 100.|70|
|`pauseCoordination`| Boolean flag for whether or not the coordinator should execute its various duties of coordinating the cluster. Setting this to true essentially pauses all coordination work while allowing the API to remain up. Duties that are paused include all classes that implement the `CoordinatorDuty` Interface. Such duties include: Segment balancing, Segment compaction, Submitting kill tasks for unused segments (if enabled), Logging of used segments in the cluster, Marking of newly unused or overshadowed segments, Matching and execution of load/drop rules for used segments, Unloading segments that are no longer marked as used from Historical servers. An example of when an admin may want to pause coordination would be if they are doing deep storage maintenance on HDFS Name Nodes with downtime and don't want the coordinator to be directing Historical Nodes to hit the Name Node with API requests until maintenance is done and the deep store is declared healthy for use again. |false|
|`replicateAfterLoadTimeout`| Boolean flag for whether or not additional replication is needed for segments that have failed to load due to the expiry of `druid.coordinator.load.timeout`. If this is set to true, the coordinator will attempt to replicate the failed segment on a different historical server. This helps improve the segment availability if there are a few slow historicals in the cluster. However, the slow historical may still load the segment later and the coordinator may issue drop requests if the segment is over-replicated.|false|
|`maxNonPrimaryReplicantsToLoad`|The maximum number of replicas that can be assigned across all tiers in a single Coordinator run. This parameter serves the same purpose as `replicationThrottleLimit` except this limit applies at the cluster-level instead of per tier. The default value does not apply a limit to the number of replicas assigned per coordination cycle. If you want to use a non-default value for this property, you may want to start with `~20%` of the number of segments found on the historical server with the most segments. Use the Druid metric, `coordinator/time` with the filter `duty=org.apache.druid.server.coordinator.duty.RunRules` to see how different values of this property impact your Coordinator execution time.|`Integer.MAX_VALUE` (no limit)|
|Property| Description | Default |
|--------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|--------------------------------|
|`millisToWaitBeforeDeleting`| How long does the Coordinator need to be a leader before it can start marking overshadowed segments as unused in metadata storage. | 900000 (15 mins) |
|`mergeBytesLimit`| The maximum total uncompressed size in bytes of segments to merge. | 524288000L |
|`mergeSegmentsLimit`| The maximum number of segments that can be in a single [append task](../ingestion/tasks.md). | 100 |
|`smartSegmentLoading`| Enables ["smart" segment loading mode](#smart-segment-loading) which dynamically computes the optimal values of several properties that maximize Coordinator performance. | true |
|`maxSegmentsToMove`| The maximum number of segments that can be moved at any given time. | 100 |
|`replicantLifetime`| The maximum number of Coordinator runs for which a segment can wait in the load queue of a Historical before Druid raises an alert. | 15 |
|`replicationThrottleLimit`| The maximum number of segment replicas that can be assigned to a historical tier in a single Coordinator run. This property prevents historicals from becoming overwhelmed when loading extra replicas of segments that are already available in the cluster. | 500 |
|`balancerComputeThreads`| Thread pool size for computing moving cost of segments during segment balancing. Consider increasing this if you have a lot of segments and moving segments begins to stall. | 1 |
|`killDataSourceWhitelist`| List of specific data sources for which kill tasks are sent if property `druid.coordinator.kill.on` is true. This can be a list of comma-separated data source names or a JSON array. | none |
|`killTaskSlotRatio`| Ratio of total available task slots (including autoscaling, if applicable) that will be allowed for kill tasks. This limit only applies to kill tasks that are spawned automatically by the coordinator's auto kill duty, which is enabled when `druid.coordinator.kill.on` is true. | 1 - all task slots can be used |
|`maxKillTaskSlots`| Maximum number of task slots that kill tasks are allowed to use. This limit only applies to kill tasks that are spawned automatically by the coordinator's auto kill duty, which is enabled when `druid.coordinator.kill.on` is true. | 2147483647 - no limit |
|`killPendingSegmentsSkipList`| List of data sources for which pendingSegments are _NOT_ cleaned up if property `druid.coordinator.kill.pendingSegments.on` is true. This can be a list of comma-separated data sources or a JSON array. | none |
|`maxSegmentsInNodeLoadingQueue`| The maximum number of segments allowed in the load queue of any given server. Use this parameter to load segments faster if, for example, the cluster contains slow-loading nodes or if there are too many segments to be replicated to a particular node (when faster loading is preferred to better segments distribution). The optimal value depends on the loading speed of segments, acceptable replication time and number of nodes. | 500 |
|`useRoundRobinSegmentAssignment`| Boolean flag for whether segments should be assigned to historicals in a round robin fashion. When disabled, segment assignment is done using the chosen balancer strategy. When enabled, this can speed up segment assignments leaving balancing to move the segments to their optimal locations (based on the balancer strategy) lazily. | true |
|`decommissioningNodes`| List of historical servers to 'decommission'. Coordinator will not assign new segments to 'decommissioning' servers, and segments will be moved away from them to be placed on non-decommissioning servers at the maximum rate specified by `decommissioningMaxPercentOfMaxSegmentsToMove`. | none |
|`decommissioningMaxPercentOfMaxSegmentsToMove`| Upper limit of segments the Coordinator can move from decommissioning servers to active non-decommissioning servers during a single run. This value is relative to the total maximum number of segments that can be moved at any given time based upon the value of `maxSegmentsToMove`.<br /><br />If `decommissioningMaxPercentOfMaxSegmentsToMove` is 0, the Coordinator does not move segments to decommissioning servers, effectively putting them in a type of "maintenance" mode. In this case, decommissioning servers do not participate in balancing or assignment by load rules. The Coordinator still considers segments on decommissioning servers as candidates to replicate on active servers.<br /><br />Decommissioning can stall if there are no available active servers to move the segments to. You can use the maximum percent of decommissioning segment movements to prioritize balancing or to decrease commissioning time to prevent active servers from being overloaded. The value must be between 0 and 100. | 70 |
|`pauseCoordination`| Boolean flag for whether or not the coordinator should execute its various duties of coordinating the cluster. Setting this to true essentially pauses all coordination work while allowing the API to remain up. Duties that are paused include all classes that implement the `CoordinatorDuty` Interface. Such duties include: Segment balancing, Segment compaction, Submitting kill tasks for unused segments (if enabled), Logging of used segments in the cluster, Marking of newly unused or overshadowed segments, Matching and execution of load/drop rules for used segments, Unloading segments that are no longer marked as used from Historical servers. An example of when an admin may want to pause coordination would be if they are doing deep storage maintenance on HDFS Name Nodes with downtime and don't want the coordinator to be directing Historical Nodes to hit the Name Node with API requests until maintenance is done and the deep store is declared healthy for use again. | false |
|`replicateAfterLoadTimeout`| Boolean flag for whether or not additional replication is needed for segments that have failed to load due to the expiry of `druid.coordinator.load.timeout`. If this is set to true, the coordinator will attempt to replicate the failed segment on a different historical server. This helps improve the segment availability if there are a few slow historicals in the cluster. However, the slow historical may still load the segment later and the coordinator may issue drop requests if the segment is over-replicated. | false |
|`maxNonPrimaryReplicantsToLoad`| The maximum number of replicas that can be assigned across all tiers in a single Coordinator run. This parameter serves the same purpose as `replicationThrottleLimit` except this limit applies at the cluster-level instead of per tier. The default value does not apply a limit to the number of replicas assigned per coordination cycle. If you want to use a non-default value for this property, you may want to start with `~20%` of the number of segments found on the historical server with the most segments. Use the Druid metric, `coordinator/time` with the filter `duty=org.apache.druid.server.coordinator.duty.RunRules` to see how different values of this property impact your Coordinator execution time. | `Integer.MAX_VALUE` (no limit) |
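To make the two kill-task limits above concrete, here is a small illustrative calculation. It assumes the limits combine as the minimum of the two, and it plugs in the values from the sample dynamic config at the top of this section together with a hypothetical cluster capacity of 50 task slots:

```java
// Illustrative only: combining killTaskSlotRatio and maxKillTaskSlots.
int totalTaskSlots = 50;          // assumed total task capacity of the cluster
double killTaskSlotRatio = 0.10;  // from the sample dynamic config above
int maxKillTaskSlots = 5;         // from the sample dynamic config above

int killTaskSlots = Math.min((int) (killTaskSlotRatio * totalTaskSlots), maxKillTaskSlots);
// killTaskSlots == 5: at most 5 of the 50 slots may be used by auto kill tasks.
```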
##### Smart segment loading
@ -1534,7 +1538,7 @@ Additional peon configs include:
|`druid.indexer.task.baseDir`|Base temporary working directory.|`System.getProperty("java.io.tmpdir")`|
|`druid.indexer.task.baseTaskDir`|Base temporary working directory for tasks.|`${druid.indexer.task.baseDir}/persistent/task`|
|`druid.indexer.task.batchProcessingMode`| Batch ingestion tasks have three operating modes to control construction and tracking for intermediary segments: `OPEN_SEGMENTS`, `CLOSED_SEGMENTS`, and `CLOSED_SEGMENT_SINKS`. `OPEN_SEGMENTS` uses the streaming ingestion code path and performs a `mmap` on intermediary segments to build a timeline to make these segments available to realtime queries. Batch ingestion doesn't require intermediary segments, so the default mode, `CLOSED_SEGMENTS`, eliminates `mmap` of intermediary segments. `CLOSED_SEGMENTS` mode still tracks the entire set of segments in heap. The `CLOSED_SEGMENTS_SINKS` mode is the most aggressive configuration and should have the smallest memory footprint. It eliminates in-memory tracking and `mmap` of intermediary segments produced during segment creation. `CLOSED_SEGMENTS_SINKS` mode isn't as well tested as other modes so is currently considered experimental. You can use `OPEN_SEGMENTS` mode if problems occur with the 2 newer modes. |`CLOSED_SEGMENTS`|
|`druid.indexer.task.defaultHadoopCoordinates`|Hadoop version to use with HadoopIndexTasks that do not request a particular version.|org.apache.hadoop:hadoop-client:2.8.5|
|`druid.indexer.task.defaultHadoopCoordinates`|Hadoop version to use with HadoopIndexTasks that do not request a particular version.|`org.apache.hadoop:hadoop-client-api:3.3.6`, `org.apache.hadoop:hadoop-client-runtime:3.3.6`|
|`druid.indexer.task.defaultRowFlushBoundary`|Highest row count before persisting to disk. Used for indexing generating tasks.|75000|
|`druid.indexer.task.directoryLockTimeout`|Wait this long for zombie peons to exit before giving up on their replacements.|PT10M|
|`druid.indexer.task.gracefulShutdownTimeout`|Wait this long on middleManager restart for restorable tasks to gracefully exit.|PT5M|
@ -1605,7 +1609,7 @@ then the value from the configuration below is used:
|`druid.worker.numConcurrentMerges`|Maximum number of segment persist or merge operations that can run concurrently across all tasks.|`druid.worker.capacity` / 2, rounded down|
|`druid.indexer.task.baseDir`|Base temporary working directory.|`System.getProperty("java.io.tmpdir")`|
|`druid.indexer.task.baseTaskDir`|Base temporary working directory for tasks.|`${druid.indexer.task.baseDir}/persistent/tasks`|
|`druid.indexer.task.defaultHadoopCoordinates`|Hadoop version to use with HadoopIndexTasks that do not request a particular version.|org.apache.hadoop:hadoop-client:2.8.5|
|`druid.indexer.task.defaultHadoopCoordinates`|Hadoop version to use with HadoopIndexTasks that do not request a particular version.|`org.apache.hadoop:hadoop-client-api:3.3.6`, `org.apache.hadoop:hadoop-client-runtime:3.3.6`|
|`druid.indexer.task.gracefulShutdownTimeout`|Wait this long on Indexer restart for restorable tasks to gracefully exit.|PT5M|
|`druid.indexer.task.hadoopWorkingPath`|Temporary working directory for Hadoop tasks.|`/tmp/druid-indexing`|
|`druid.indexer.task.restoreTasksOnRestart`|If true, the Indexer will attempt to stop tasks gracefully on shutdown and restore them on restart.|false|

View File

@ -112,7 +112,7 @@ example properties. Please follow the instructions at
[https://github.com/GoogleCloudPlatform/bigdata-interop/blob/master/gcs/INSTALL.md](https://github.com/GoogleCloudPlatform/bigdata-interop/blob/master/gcs/INSTALL.md)
for more details.
For more configuration options, see [GCS core default](https://github.com/GoogleCloudDataproc/hadoop-connectors/blob/v2.0.0/gcs/conf/gcs-core-default.xml)
and [GCS core template](https://github.com/GoogleCloudPlatform/bdutil/blob/master/conf/hadoop2/gcs-core-template.xml).
and [GCS core template](https://github.com/GoogleCloudDataproc/hadoop-connectors/blob/master/gcs/src/test/resources/core-site.xml).
```xml
<property>
@ -147,8 +147,6 @@ and [GCS core template](https://github.com/GoogleCloudPlatform/bdutil/blob/maste
</property>
```
Tested with Druid 0.17.0, Hadoop 2.8.5 and gcs-connector jar 2.0.0-hadoop2.
## Reading data from HDFS or Cloud Storage
### Native batch ingestion

View File

@ -53,6 +53,7 @@ This topic contains configuration reference information for the Apache Kafka sup
|`earlyMessageRejectionPeriod`|ISO8601 Period|Configure tasks to reject messages with timestamps later than this period after the task reached its taskDuration; for example if this is set to `PT1H`, the taskDuration is set to `PT1H` and the supervisor creates a task at *2016-01-01T12:00Z*, messages with timestamps later than *2016-01-01T14:00Z* will be dropped. **Note:** Tasks sometimes run past their task duration, for example, in cases of supervisor failover. Setting earlyMessageRejectionPeriod too low may cause messages to be dropped unexpectedly whenever a task runs past its originally configured task duration.|no (default == none)|
|`autoScalerConfig`|Object|Defines auto scaling behavior for Kafka ingest tasks. See [Tasks Autoscaler Properties](#task-autoscaler-properties).|no (default == null)|
|`idleConfig`|Object|Defines how and when Kafka Supervisor can become idle. See [Idle Supervisor Configuration](#idle-supervisor-configuration) for more details.|no (default == null)|
|`multiTopic`|Boolean|Set this to true if you want to ingest data from multiple Kafka topics using a single supervisor. See [Ingesting from multiple topics](#ingesting-from-multiple-topics) for more details.|no (default == false)|
## Task Autoscaler Properties
@ -136,6 +137,12 @@ The following example demonstrates supervisor spec with `lagBased` autoScaler an
}
}
```
## Ingesting from multiple topics
To ingest from multiple topics, set `multiTopic` in the supervisor IO config to `true`. Multiple topics
can be passed as a regex pattern as the value for `topic` in the IO config. For example, to ingest data from clicks and
impressions, set `topic` to `clicks|impressions` in the IO config. If new topics are added to the cluster that
match the regex, Druid automatically starts ingesting from those new topics. If you enable multi-topic
ingestion for a datasource, downgrading will cause ingestion to fail for that datasource.
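To illustrate the regex semantics only, here is a small sketch that assumes standard Java `Pattern` matching against full topic names; the topic names used are hypothetical:

```java
import java.util.regex.Pattern;

public class TopicPatternExample
{
  public static void main(String[] args)
  {
    // The same pattern you would put in the supervisor's `topic` field.
    Pattern topicPattern = Pattern.compile("clicks|impressions");

    for (String topic : new String[]{"clicks", "impressions", "clicks-backup"}) {
      // Full-string matching is assumed: "clicks-backup" does not match "clicks|impressions".
      System.out.println(topic + " matches: " + topicPattern.matcher(topic).matches());
    }
  }
}
```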
## More on consumerProperties

View File

@ -34,7 +34,7 @@ Review the [Kinesis known issues](#kinesis-known-issues) before deploying the `d
## Supervisor spec
The following table outlines the high-level configuration options for the Kinesis supervisor object.
See [Supervisor API](../../api-reference/supervisor-api.md) for more information.
|Property|Type|Description|Required|
@ -428,14 +428,26 @@ This section describes how to use the [Supervisor API](../../api-reference/super
### AWS authentication
To authenticate with AWS, you must provide your AWS access key and AWS secret key using `runtime.properties`, for example:
Druid uses AWS access and secret keys to authenticate Kinesis API requests. There are a few ways to provide this information to Druid:
```text
1. Using roles or short-term credentials:
Druid looks for credentials set in [environment variables](https://docs.aws.amazon.com/cli/latest/userguide/cli-configure-envvars.html),
via [Web Identity Token](https://docs.aws.amazon.com/IAM/latest/UserGuide/id_roles_providers_oidc.html), in the
default [profile configuration file](https://docs.aws.amazon.com/cli/latest/userguide/cli-configure-files.html), and from the
EC2 instance profile provider (in this order).
2. Using long-term security credentials:
You can directly provide your AWS access key and AWS secret key in the `common.runtime.properties` file as shown in the example below:
```properties
druid.kinesis.accessKey=AKIAWxxxxxxxxxx4NCKS
druid.kinesis.secretKey=Jbytxxxxxxxxxxx2+555
```
Druid uses the AWS access key and AWS secret key to authenticate Kinesis API requests. If not provided, the service looks for credentials set in environment variables, via [Web Identity Token](https://docs.aws.amazon.com/IAM/latest/UserGuide/id_roles_providers_oidc.html), in the default profile configuration file, and from the EC2 instance profile provider (in this order).
> Note: AWS does not recommend providing long-term security credentials in configuration files since it might pose a security risk.
If you use this approach, it takes precedence over all other methods of providing credentials.
To ingest data from Kinesis, ensure that the policy attached to your IAM role contains the necessary permissions.
The required permissions depend on the value of `useListShards`.
@ -482,7 +494,7 @@ The following is an example policy:
},
{
"Effect": "Allow",
"Action": ["kinesis:DescribeStreams"],
"Action": ["kinesis:DescribeStream"],
"Resource": ["*"]
},
{

View File

@ -180,7 +180,7 @@ Once you install the GCS Connector jar in all MiddleManager and Indexer processe
your Google Cloud Storage paths in the inputSpec with the below job properties.
For more configurations, see the [instructions to configure Hadoop](https://github.com/GoogleCloudPlatform/bigdata-interop/blob/master/gcs/INSTALL.md#configure-hadoop),
[GCS core default](https://github.com/GoogleCloudDataproc/hadoop-connectors/blob/v2.0.0/gcs/conf/gcs-core-default.xml)
and [GCS core template](https://github.com/GoogleCloudPlatform/bdutil/blob/master/conf/hadoop2/gcs-core-template.xml).
and [GCS core template](https://github.com/GoogleCloudDataproc/hadoop-connectors/blob/master/gcs/src/test/resources/core-site.xml).
```
"paths" : "gs://billy-bucket/the/data/is/here/data.gz,gs://billy-bucket/the/data/is/here/moredata.gz,gs://billy-bucket/the/data/is/here/evenmoredata.gz"

View File

@ -186,7 +186,7 @@ Treat `__time` as a millisecond timestamp: the number of milliseconds since Jan
### `dimensionsSpec`
The `dimensionsSpec` is located in `dataSchema` → `dimensionsSpec` and is responsible for
configuring [dimensions](./schema-model.md#dimensions). An example `dimensionsSpec` is:
configuring [dimensions](./schema-model.md#dimensions).
You can either manually specify the dimensions or take advantage of schema auto-discovery where you allow Druid to infer all or some of the schema for your data. This means that you don't have to explicitly specify your dimensions and their type.

View File

@ -244,7 +244,7 @@ You should query for the number of ingested rows with:
Druid can infer the schema for your data in one of two ways:
- [Type-aware schema discovery (experimental)](#type-aware-schema-discovery) where Druid infers the schema and type for your data. Type-aware schema discovery is an experimental feature currently available for native batch and streaming ingestion.
- [Type-aware schema discovery](#type-aware-schema-discovery) where Druid infers the schema and type for your data. Type-aware schema discovery is available for native batch and streaming ingestion.
- [String-based schema discovery](#string-based-schema-discovery) where all the discovered columns are typed as either native string or multi-value string columns.
#### Type-aware schema discovery
@ -261,6 +261,8 @@ native boolean types, Druid ingests these values as strings if `druid.expression
the [array functions](../querying/sql-array-functions.md) or [UNNEST](../querying/sql-functions.md#unnest). Nested
columns can be queried with the [JSON functions](../querying/sql-json-functions.md).
We also highly recommend setting `druid.generic.useDefaultValueForNull=false` when using these columns since it also enables out of the box `ARRAY` type filtering. If not set to `false`, setting `sqlUseBoundsAndSelectors` to `false` on the [SQL query context](../querying/sql-query-context.md) can enable `ARRAY` filtering instead.
Mixed type columns are stored in the _least_ restrictive type that can represent all values in the column. For example:
- Mixed numeric columns are `DOUBLE`

View File

@ -203,8 +203,8 @@ If you see the error "Encountered multi-value dimension `x` that cannot be proce
groupByEnableMultiValueUnnesting set to false", then wrap that column in `MV_TO_ARRAY(x) AS x`.
The following [aggregation functions](../querying/sql-aggregations.md) are supported for rollup at ingestion time:
`COUNT` (but switch to `SUM` at query time), `SUM`, `MIN`, `MAX`, `EARLIEST` ([string only](known-issues.md#select-statement)),
`LATEST` ([string only](known-issues.md#select-statement)), `APPROX_COUNT_DISTINCT`, `APPROX_COUNT_DISTINCT_BUILTIN`,
`COUNT` (but switch to `SUM` at query time), `SUM`, `MIN`, `MAX`, `EARLIEST` and `EARLIEST_BY` ([string only](known-issues.md#select-statement)),
`LATEST` and `LATEST_BY` ([string only](known-issues.md#select-statement)), `APPROX_COUNT_DISTINCT`, `APPROX_COUNT_DISTINCT_BUILTIN`,
`APPROX_COUNT_DISTINCT_DS_HLL`, `APPROX_COUNT_DISTINCT_DS_THETA`, and `DS_QUANTILES_SKETCH` (but switch to
`APPROX_QUANTILE_DS` at query time). Do not use `AVG`; instead, use `SUM` and `COUNT` at ingest time and compute the
quotient at query time.

View File

@ -349,20 +349,35 @@ SQL-based ingestion supports using durable storage to store intermediate files t
### Durable storage configurations
The following common service properties control how durable storage behaves:
Durable storage is supported on Amazon S3 storage and Microsoft's Azure storage. There are a few common configurations that control the behavior for both services, as documented below. Apart from the common configurations,
there are a few properties specific to each storage that must be set.
Common properties to configure the behavior of durable storage:
|Parameter |Default | Description |
|-------------------|----------------------------------------|----------------------|
|`druid.msq.intermediate.storage.enable` | true | Required. Whether to enable durable storage for the cluster. For more information about enabling durable storage, see [Durable storage](../operations/durable-storage.md).|
|`druid.msq.intermediate.storage.type` | `s3` for Amazon S3 | Required. The type of storage to use. `s3` is the only supported storage type. |
|`druid.msq.intermediate.storage.bucket` | n/a | The S3 bucket to store intermediate files. |
|`druid.msq.intermediate.storage.prefix` | n/a | S3 prefix to store intermediate stage results. Provide a unique value for the prefix. Don't share the same prefix between clusters. If the location includes other files or directories, then they will get cleaned up as well. |
|`druid.msq.intermediate.storage.tempDir`| n/a | Required. Directory path on the local disk to temporarily store intermediate stage results. |
|`druid.msq.intermediate.storage.enable` | false | Whether to enable durable storage for the cluster. Set it to true to enable durable storage. For more information about enabling durable storage, see [Durable storage](../operations/durable-storage.md).|
|`druid.msq.intermediate.storage.type` | n/a | Required. The type of storage to use. Set it to `s3` for S3 and `azure` for Azure. |
|`druid.msq.intermediate.storage.tempDir`| n/a | Required. Directory path on the local disk to store temporary files required while uploading and downloading the data. |
|`druid.msq.intermediate.storage.maxRetry` | 10 | Optional. Defines the max number times to attempt S3 API calls to avoid failures due to transient errors. |
|`druid.msq.intermediate.storage.chunkSize` | 100MiB | Optional. Defines the size of each chunk to temporarily store in `druid.msq.intermediate.storage.tempDir`. The chunk size must be between 5 MiB and 5 GiB. A large chunk size reduces the API calls made to the durable storage, however it requires more disk space to store the temporary chunks. Druid uses a default of 100MiB if the value is not provided.|
The following properties need to be set in addition to the common properties to enable durable storage on S3:
In addition to the common service properties, there are certain properties that you configure on the Overlord specifically to clean up intermediate files:
|Parameter |Default | Description |
|-------------------|----------------------------------------|----------------------|
|`druid.msq.intermediate.storage.bucket` | n/a | Required. The S3 bucket where the files are uploaded to and downloaded from. |
|`druid.msq.intermediate.storage.prefix` | n/a | Required. Path prepended to all the paths uploaded to the bucket to namespace the connector's files. Provide a unique value for the prefix and do not share the same prefix between different clusters. If the location includes other files or directories, then they might get cleaned up as well. |
The following properties must be set in addition to the common properties to enable durable storage on Azure:
|Parameter |Default | Description |
|-------------------|----------------------------------------|----------------------|
|`druid.msq.intermediate.storage.container` | n/a | Required. The Azure container where the files are uploaded to and downloaded from. |
|`druid.msq.intermediate.storage.prefix` | n/a | Required. Path prepended to all the paths uploaded to the container to namespace the connector's files. Provide a unique value for the prefix and do not share the same prefix between different clusters. If the location includes other files or directories, then they might get cleaned up as well. |
Durable storage creates files on the remote storage, which are cleaned up once the job no longer requires them. However, if tasks exit abruptly due to failures, these files might not get cleaned up.
Therefore, there are certain properties that you can configure on the Overlord specifically to clean up intermediate files for tasks that have completed and no longer require these files:
|Parameter |Default | Description |
|-------------------|----------------------------------------|----------------------|

View File

@ -154,7 +154,7 @@ If SQL is enabled, the Broker will emit the following metrics for SQL.
## Ingestion metrics
## General native ingestion metrics
### General native ingestion metrics
|Metric|Description|Dimensions|Normal value|
|------|-----------|----------|------------|
@ -203,6 +203,14 @@ These metrics apply to the [Kinesis indexing service](../development/extensions-
|`ingest/kinesis/avgLag/time`|Average lag time in milliseconds between the current message sequence number consumed by the Kinesis indexing tasks and latest sequence number in Kinesis across all shards. Minimum emission period for this metric is a minute.|`dataSource`, `stream`, `tags`|Greater than 0, up to max Kinesis retention period in milliseconds. |
|`ingest/kinesis/partitionLag/time`|Partition-wise lag time in milliseconds between the current message sequence number consumed by the Kinesis indexing tasks and latest sequence number in Kinesis. Minimum emission period for this metric is a minute.|`dataSource`, `stream`, `partition`, `tags`|Greater than 0, up to max Kinesis retention period in milliseconds. |
### Compaction metrics
[Compaction tasks](../data-management/compaction.md) emit the following metrics.
|Metric|Description|Dimensions|Normal value|
|------|-----------|----------|------------|
|`compact/segmentAnalyzer/fetchAndProcessMillis`|Time taken to fetch and process segments to infer the schema for the compaction task to run.|`dataSource`, `taskId`, `taskType`, `groupId`, `tags`| Varies. A high value indicates that the compaction task would speed up if you explicitly set the data schema. |
### Other ingestion metrics
Streaming ingestion tasks and certain types of
@ -232,7 +240,7 @@ batch ingestion emit the following metrics. These metrics are deltas for each em
|`ingest/notices/time`|Milliseconds taken to process a notice by the supervisor.|`dataSource`, `tags`| < 1s |
|`ingest/pause/time`|Milliseconds spent by a task in a paused state without ingesting.|`dataSource`, `taskId`, `tags`| < 10 seconds|
|`ingest/handoff/time`|Total number of milliseconds taken to handoff a set of segments.|`dataSource`, `taskId`, `taskType`, `groupId`, `tags`|Depends on the coordinator cycle time.|
If the JVM does not support CPU time measurement for the current thread, `ingest/merge/cpu` and `ingest/persists/cpu` will be 0.
## Indexing service
@ -312,6 +320,9 @@ These metrics are for the Druid Coordinator and are reset each time the Coordina
|`compact/task/count`|Number of tasks issued in the auto compaction run.| |Varies|
|`compactTask/maxSlot/count`|Maximum number of task slots available for auto compaction tasks in the auto compaction run.| |Varies|
|`compactTask/availableSlot/count`|Number of available task slots that can be used for auto compaction tasks in the auto compaction run. This is the max number of task slots minus any currently running compaction tasks.| |Varies|
|`killTask/availableSlot/count`| Number of available task slots that can be used for auto kill tasks in the auto kill run. This is the max number of task slots minus any currently running auto kill tasks. | |Varies|
|`killTask/maxSlot/count`| Maximum number of task slots available for auto kill tasks in the auto kill run. | |Varies|
|`kill/task/count`| Number of tasks issued in the auto kill run. | |Varies|
|`segment/waitCompact/bytes`|Total bytes of this datasource waiting to be compacted by the auto compaction (only consider intervals/segments that are eligible for auto compaction).|`dataSource`|Varies|
|`segment/waitCompact/count`|Total number of segments of this datasource waiting to be compacted by the auto compaction (only consider intervals/segments that are eligible for auto compaction).|`dataSource`|Varies|
|`interval/waitCompact/count`|Total number of intervals of this datasource waiting to be compacted by the auto compaction (only consider intervals/segments that are eligible for auto compaction).|`dataSource`|Varies|

View File

@ -89,7 +89,7 @@ classloader.
2. Batch ingestion uses jars from `hadoop-dependencies/` to submit Map/Reduce jobs (location customizable via the
`druid.extensions.hadoopDependenciesDir` runtime property; see [Configuration](../configuration/index.md#extensions)).
`hadoop-client:2.8.5` is the default version of the Hadoop client bundled with Druid for both purposes. This works with
The default version of the Hadoop client bundled with Druid is `3.3.6`. This works with
many Hadoop distributions (the version does not necessarily need to match), but if you run into issues, you can instead
have Druid load libraries that exactly match your distribution. To do this, either copy the jars from your Hadoop
cluster, or use the `pull-deps` tool to download the jars from a Maven repository.

View File

@ -39,8 +39,14 @@ The following sections list the available aggregate functions. Unless otherwise
`count` computes the count of Druid rows that match the filters.
| Property | Description | Required |
| --- | --- | --- |
| `type` | Must be "count". | Yes |
| `name` | Output name of the aggregator | Yes |
Example:
```json
{ "type" : "count", "name" : <output_name> }
{ "type" : "count", "name" : "count" }
```
The `count` aggregator counts the number of Druid rows, which does not always reflect the number of raw events ingested.
@ -50,94 +56,121 @@ query time.
### Sum aggregators
| Property | Description | Required |
| --- | --- | --- |
| `type` | Must be "longSum", "doubleSum", or "floatSum". | Yes |
| `name` | Output name for the summed value. | Yes |
| `fieldName` | Name of the input column to sum over. | No. You must specify `fieldName` or `expression`. |
| `expression` | You can specify an inline [expression](./math-expr.md) as an alternative to `fieldName`. | No. You must specify `fieldName` or `expression`. |
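As the table above notes, you can aggregate over an inline expression instead of a raw column. A minimal sketch, with illustrative output and column names:

```json
{ "type" : "longSum", "name" : "sumOfProduct", "expression" : "x * y" }
```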
#### `longSum` aggregator
Computes the sum of values as a 64-bit, signed integer.
Example:
```json
{ "type" : "longSum", "name" : <output_name>, "fieldName" : <metric_name> }
{ "type" : "longSum", "name" : "sumLong", "fieldName" : "aLong" }
```
The `longSum` aggregator takes the following properties:
* `name`: Output name for the summed value
* `fieldName`: Name of the metric column to sum over
#### `doubleSum` aggregator
Computes and stores the sum of values as a 64-bit floating point value. Similar to `longSum`.
Example:
```json
{ "type" : "doubleSum", "name" : <output_name>, "fieldName" : <metric_name> }
{ "type" : "doubleSum", "name" : "sumDouble", "fieldName" : "aDouble" }
```
#### `floatSum` aggregator
Computes and stores the sum of values as a 32-bit floating point value. Similar to `longSum` and `doubleSum`.
Example:
```json
{ "type" : "floatSum", "name" : <output_name>, "fieldName" : <metric_name> }
{ "type" : "floatSum", "name" : "sumFloat", "fieldName" : "aFloat" }
```
### Min and max aggregators
| Property | Description | Required |
| --- | --- | --- |
| `type` | Must be "doubleMin", "doubleMax", "floatMin", "floatMax", "longMin", or "longMax". | Yes |
| `name` | Output name for the min or max value. | Yes |
| `fieldName` | Name of the input column to compute the minimum or maximum value over. | No. You must specify `fieldName` or `expression`. |
| `expression` | You can specify an inline [expression](./math-expr.md) as an alternative to `fieldName`. | No. You must specify `fieldName` or `expression`. |
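As with the sum aggregators, you can aggregate over an inline expression instead of a raw column. A hedged sketch with illustrative names:

```json
{ "type" : "doubleMax", "name" : "maxAbsDouble", "expression" : "abs(aDouble)" }
```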
#### `doubleMin` aggregator
`doubleMin` computes the minimum of all metric values and Double.POSITIVE_INFINITY.
`doubleMin` computes the minimum of all input values and null if `druid.generic.useDefaultValueForNull` is false or Double.POSITIVE_INFINITY if true.
Example:
```json
{ "type" : "doubleMin", "name" : <output_name>, "fieldName" : <metric_name> }
{ "type" : "doubleMin", "name" : "maxDouble", "fieldName" : "aDouble" }
```
#### `doubleMax` aggregator
`doubleMax` computes the maximum of all metric values and Double.NEGATIVE_INFINITY.
`doubleMax` computes the maximum of all input values and null if `druid.generic.useDefaultValueForNull` is false or Double.NEGATIVE_INFINITY if true.
Example:
```json
{ "type" : "doubleMax", "name" : <output_name>, "fieldName" : <metric_name> }
{ "type" : "doubleMax", "name" : "minDouble", "fieldName" : "aDouble" }
```
#### `floatMin` aggregator
`floatMin` computes the minimum of all metric values and Float.POSITIVE_INFINITY.
`floatMin` computes the minimum of all input values and null if `druid.generic.useDefaultValueForNull` is false or Float.POSITIVE_INFINITY if true.
Example:
```json
{ "type" : "floatMin", "name" : <output_name>, "fieldName" : <metric_name> }
{ "type" : "floatMin", "name" : "minFloat", "fieldName" : "aFloat" }
```
#### `floatMax` aggregator
`floatMax` computes the maximum of all metric values and Float.NEGATIVE_INFINITY.
`floatMax` computes the maximum of all input values and null if `druid.generic.useDefaultValueForNull` is false or Float.NEGATIVE_INFINITY if true.
Example:
```json
{ "type" : "floatMax", "name" : <output_name>, "fieldName" : <metric_name> }
{ "type" : "floatMax", "name" : "maxFloat", "fieldName" : "aFloat" }
```
#### `longMin` aggregator
`longMin` computes the minimum of all metric values and Long.MAX_VALUE.
`longMin` computes the minimum of all input values and null if `druid.generic.useDefaultValueForNull` is false or Long.MAX_VALUE if true.
Example:
```json
{ "type" : "longMin", "name" : <output_name>, "fieldName" : <metric_name> }
{ "type" : "longMin", "name" : "minLong", "fieldName" : "aLong" }
```
#### `longMax` aggregator
`longMax` computes the maximum of all metric values and Long.MIN_VALUE.
`longMax` computes the maximum of all input values and null if `druid.generic.useDefaultValueForNull` is false or Long.MIN_VALUE if true.
Example:
```json
{ "type" : "longMax", "name" : <output_name>, "fieldName" : <metric_name> }
{ "type" : "longMax", "name" : "maxLong", "fieldName" : "aLong" }
```
### `doubleMean` aggregator
Computes and returns the arithmetic mean of a column's values as a 64-bit floating point value. `doubleMean` is a query time aggregator only. It is not available for indexing.
Computes and returns the arithmetic mean of a column's values as a 64-bit floating point value.
To accomplish mean aggregation on ingestion, refer to the [Quantiles aggregator](../development/extensions-core/datasketches-quantiles.md#aggregator) from the DataSketches extension.
| Property | Description | Required |
| --- | --- | --- |
| `type` | Must be "doubleMean". | Yes |
| `name` | Output name for the mean value. | Yes |
| `fieldName` | Name of the input column to compute the arithmetic mean value over. | Yes |
Example:
```json
{ "type" : "doubleMean", "name" : <output_name>, "fieldName" : <metric_name> }
{ "type" : "doubleMean", "name" : "aMean", "fieldName" : "aDouble" }
```
`doubleMean` is a query time aggregator only. It is not available for indexing. To accomplish mean aggregation on ingestion, refer to the [Quantiles aggregator](../development/extensions-core/datasketches-quantiles.md#aggregator) from the DataSketches extension.
### First and last aggregators
The first and last aggregators determine the metric values that respectively correspond to the earliest and latest values of a time column.
@ -147,111 +180,131 @@ The string-typed aggregators, `stringFirst` and `stringLast`, are supported for
Queries with first or last aggregators on a segment created with rollup return the rolled up value, not the first or last value from the raw ingested data.
#### `doubleFirst` aggregator
#### Numeric first and last aggregators
`doubleFirst` computes the metric value with the minimum value for time column or 0 in default mode, or `null` in SQL-compatible mode if no row exists.
| Property | Description | Required |
| --- | --- | --- |
| `type` | Must be "doubleFirst", "doubleLast", "floatFirst", "floatLast", "longFirst", "longLast". | Yes |
| `name` | Output name for the first or last value. | Yes |
| `fieldName` | Name of the input column to compute the first or last value over. | Yes |
| `timeColumn` | Name of the input column to use for time values. Must be a LONG typed column. | No. Defaults to `__time`. |
##### `doubleFirst` aggregator
`doubleFirst` computes the input value with the minimum value for time column or 0 in default mode, or `null` in SQL-compatible mode if no row exists.
Example:
```json
{
"type" : "doubleFirst",
"name" : <output_name>,
"fieldName" : <metric_name>,
"timeColumn" : <time_column_name> # (optional, defaults to __time)
"name" : "firstDouble",
"fieldName" : "aDouble"
}
```
#### `doubleLast` aggregator
##### `doubleLast` aggregator
`doubleLast` computes the metric value with the maximum value for time column or 0 in default mode, or `null` in SQL-compatible mode if no row exists.
`doubleLast` computes the input value with the maximum value for time column or 0 in default mode, or `null` in SQL-compatible mode if no row exists.
Example:
```json
{
"type" : "doubleLast",
"name" : <output_name>,
"fieldName" : <metric_name>,
"timeColumn" : <time_column_name> # (optional, defaults to __time)
"name" : "lastDouble",
"fieldName" : "aDouble",
"timeColumn" : "longTime"
}
```
#### `floatFirst` aggregator
##### `floatFirst` aggregator
`floatFirst` computes the metric value with the minimum value for time column or 0 in default mode, or `null` in SQL-compatible mode if no row exists.
`floatFirst` computes the input value with the minimum value for time column or 0 in default mode, or `null` in SQL-compatible mode if no row exists.
Example:
```json
{
"type" : "floatFirst",
"name" : <output_name>,
"fieldName" : <metric_name>,
"timeColumn" : <time_column_name> # (optional, defaults to __time)
"name" : "firstFloat",
"fieldName" : "aFloat"
}
```
#### `floatLast` aggregator
##### `floatLast` aggregator
`floatLast` computes the metric value with the maximum value for time column or 0 in default mode, or `null` in SQL-compatible mode if no row exists.
Example:
```json
{
"type" : "floatLast",
"name" : <output_name>,
"fieldName" : <metric_name>,
"timeColumn" : <time_column_name> # (optional, defaults to __time)
"name" : "lastFloat",
"fieldName" : "aFloat"
}
```
#### `longFirst` aggregator
##### `longFirst` aggregator
`longFirst` computes the metric value with the minimum value for time column or 0 in default mode, or `null` in SQL-compatible mode if no row exists.
Example:
```json
{
"type" : "longFirst",
"name" : <output_name>,
"fieldName" : <metric_name>,
"timeColumn" : <time_column_name> # (optional, defaults to __time)
"name" : "firstLong",
"fieldName" : "aLong"
}
```
#### `longLast` aggregator
##### `longLast` aggregator
`longLast` computes the metric value with the maximum value for time column or 0 in default mode, or `null` in SQL-compatible mode if no row exists.
Example:
```json
{
"type" : "longLast",
"name" : <output_name>,
"fieldName" : <metric_name>,
"timeColumn" : <time_column_name> # (optional, defaults to __time)
"name" : "lastLong",
"fieldName" : "aLong",
"timeColumn" : "longTime"
}
```
#### String first and last aggregators
| Property | Description | Required |
| --- | --- | --- |
| `type` | Must be "stringFirst", "stringLast". | Yes |
| `name` | Output name for the first or last value. | Yes |
| `fieldName` | Name of the input column to compute the first or last value over. | Yes |
| `timeColumn` | Name of the input column to use for time values. Must be a LONG typed column. | No. Defaults to `__time`. |
| `maxStringBytes` | Maximum size of string values to accumulate when computing the first or last value per group. Values longer than this will be truncated. | No. Defaults to 1024. |
#### `stringFirst` aggregator
`stringFirst` computes the metric value with the minimum value for time column or `null` if no row exists.
Example:
```json
{
"type" : "stringFirst",
"name" : <output_name>,
"fieldName" : <metric_name>,
"maxStringBytes" : <integer> # (optional, defaults to 1024)
"timeColumn" : <time_column_name> # (optional, defaults to __time)
"name" : "firstString",
"fieldName" : "aString",
"maxStringBytes" : 2048,
"timeColumn" : "longTime"
}
```
#### `stringLast` aggregator
`stringLast` computes the metric value with the maximum value for time column or `null` if no row exists.
Example:
```json
{
"type" : "stringLast",
"name" : <output_name>,
"fieldName" : <metric_name>,
"maxStringBytes" : <integer> # (optional, defaults to 1024)
"timeColumn" : <time_column_name> # (optional, defaults to __time)
"name" : "lastString",
"fieldName" : "aString"
}
```
@ -261,88 +314,73 @@ Queries with first or last aggregators on a segment created with rollup return t
Returns any value, including null. This aggregator can simplify and optimize performance by returning the first encountered value (including null).
#### `doubleAny` aggregator
#### Numeric any aggregators
| Property | Description | Required |
| --- | --- | --- |
| `type` | Must be "doubleAny", "floatAny", or "longAny". | Yes |
| `name` | Output name for the value. | Yes |
| `fieldName` | Name of the input column to compute the value over. | Yes |
##### `doubleAny` aggregator
`doubleAny` returns any double metric value.
Example:
```json
{
"type" : "doubleAny",
"name" : <output_name>,
"fieldName" : <metric_name>
"name" : "anyDouble",
"fieldName" : "aDouble"
}
```
#### `floatAny` aggregator
##### `floatAny` aggregator
`floatAny` returns any float metric value.
Example:
```json
{
"type" : "floatAny",
"name" : <output_name>,
"fieldName" : <metric_name>
"name" : "anyFloat",
"fieldName" : "aFloat"
}
```
#### `longAny` aggregator
##### `longAny` aggregator
`longAny` returns any long metric value.
Example:
```json
{
"type" : "longAny",
"name" : <output_name>,
"fieldName" : <metric_name>,
"name" : "anyLong",
"fieldName" : "aLong"
}
```
#### `stringAny` aggregator
`stringAny` returns any string metric value.
`stringAny` returns any string value present in the input.
| Property | Description | Required |
| --- | --- | --- |
| `type` | Must be "stringAny". | Yes |
| `name` | Output name for the value. | Yes |
| `fieldName` | Name of the input column to compute the value over. | Yes |
| `maxStringBytes` | Maximum size of string values to accumulate when computing the value per group. Values longer than this will be truncated. | No. Defaults to 1024. |
Example:
```json
{
"type" : "stringAny",
"name" : <output_name>,
"fieldName" : <metric_name>,
"maxStringBytes" : <integer> # (optional, defaults to 1024),
"name" : "anyString",
"fieldName" : "aString",
"maxStringBytes" : 2048
}
```
### JavaScript aggregator
Computes an arbitrary JavaScript function over a set of columns (both metrics and dimensions are allowed). Your
JavaScript functions are expected to return floating-point values.
```json
{ "type": "javascript",
"name": "<output_name>",
"fieldNames" : [ <column1>, <column2>, ... ],
"fnAggregate" : "function(current, column1, column2, ...) {
<updates partial aggregate (current) based on the current row values>
return <updated partial aggregate>
}",
"fnCombine" : "function(partialA, partialB) { return <combined partial results>; }",
"fnReset" : "function() { return <initial value>; }"
}
```
**Example**
```json
{
"type": "javascript",
"name": "sum(log(x)*y) + 10",
"fieldNames": ["x", "y"],
"fnAggregate" : "function(current, a, b) { return current + (Math.log(a) * b); }",
"fnCombine" : "function(partialA, partialB) { return partialA + partialB; }",
"fnReset" : "function() { return 10; }"
}
```
> JavaScript-based functionality is disabled by default. Please refer to the Druid [JavaScript programming guide](../development/javascript.md) for guidelines about using Druid's JavaScript functionality, including instructions on how to enable it.
<a name="approx"></a>
## Approximate aggregations
@ -422,6 +460,117 @@ It is not possible to determine a priori how well this aggregator will behave fo
For these reasons, we have deprecated this aggregator and recommend using the DataSketches Quantiles aggregator instead for new and existing use cases, although we will continue to support Approximate Histogram for backwards compatibility.
## Expression aggregations
### Expression aggregator
Aggregator applicable only at query time. Aggregates results using [Druid expression](./math-expr.md) functions to facilitate building custom aggregation functions.
| Property | Description | Required |
| --- | --- | --- |
| `type` | Must be "expression". | Yes |
| `name` | The aggregator output name. | Yes |
| `fields` | The list of aggregator input columns. | Yes |
| `accumulatorIdentifier` | The variable which identifies the accumulator value in the `fold` and `combine` expressions. | No. Default `__acc`.|
| `fold` | The expression to accumulate values from `fields`. The result of the expression is stored in `accumulatorIdentifier` and available to the next computation. | Yes |
| `combine` | The expression to combine the results of various `fold` expressions of each segment when merging results. The input is available to the expression as a variable identified by the `name`. | No. Default to `fold` expression if the expression has a single input in `fields`.|
| `compare` | The comparator expression which can only refer to two input variables, `o1` and `o2`, where `o1` and `o2` are the output of `fold` or `combine` expressions, and must adhere to the Java comparator contract. If not set, the aggregator will try to fall back to an output type appropriate comparator. | No |
| `finalize` | The finalize expression which can only refer to a single input variable, `o`. This expression is used to perform any final transformation of the output of the `fold` or `combine` expressions. If not set, then the value is not transformed. | No |
| `initialValue` | The initial value of the accumulator for the `fold` (and `combine`, if `initialCombineValue` is null) expression. | Yes |
| `initialCombineValue` | The initial value of the accumulator for the `combine` expression. | No. Default `initialValue`. |
| `isNullUnlessAggregated` | Indicates that the default output value should be `null` if the aggregator does not process any rows. If true, the value is `null`; if false, the result of running the expressions with initial values is used instead. | No. Defaults to the value of `druid.generic.useDefaultValueForNull`. |
| `shouldAggregateNullInputs` | Indicates if the `fold` expression should operate on any `null` input values. | No. Defaults to `true`. |
| `shouldCombineAggregateNullInputs` | Indicates if the `combine` expression should operate on any `null` input values. | No. Defaults to the value of `shouldAggregateNullInputs`. |
| `maxSizeBytes` | Maximum size in bytes that variably sized aggregator output types such as strings and arrays are allowed to grow to before the aggregation fails. | No. Default is 8192 bytes. |
#### Example: a "count" aggregator
The initial value is `0`. `fold` adds `1` for each row processed.
```json
{
"type": "expression",
"name": "expression_count",
"fields": [],
"initialValue": "0",
"fold": "__acc + 1",
"combine": "__acc + expression_count"
}
```
#### Example: a "sum" aggregator
The initial value is `0`. `fold` adds the numeric value `column_a` for each row processed.
```json
{
"type": "expression",
"name": "expression_sum",
"fields": ["column_a"],
"initialValue": "0",
"fold": "__acc + column_a"
}
```
#### Example: a "distinct array element" aggregator, sorted by array_length
The initial value is an empty array. `fold` adds the elements of `column_a` to the accumulator using set semantics, `combine` merges the sets, and `compare` orders the values by `array_length`.
```json
{
"type": "expression",
"name": "expression_array_agg_distinct",
"fields": ["column_a"],
"initialValue": "[]",
"fold": "array_set_add(__acc, column_a)",
"combine": "array_set_add_all(__acc, expression_array_agg_distinct)",
"compare": "if(array_length(o1) > array_length(o2), 1, if (array_length(o1) == array_length(o2), 0, -1))"
}
```
#### Example: an "approximate count" aggregator using the built-in hyper-unique
Similar to the cardinality aggregator, the initial value is an empty hyper-unique sketch; `fold` adds the value of `column_a` to the sketch, `combine` merges the sketches, and `finalize` gets the estimated count from the accumulated sketch.
```json
{
"type": "expression",
"name": "expression_cardinality",
"fields": ["column_a"],
"initialValue": "hyper_unique()",
"fold": "hyper_unique_add(column_a, __acc)",
"combine": "hyper_unique_add(expression_cardinality, __acc)",
"finalize": "hyper_unique_estimate(o)"
}
```
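As one further illustrative sketch (not taken from the examples above), the variably sized output controls can be combined to build a simple string concatenation aggregator; the column name and size limit here are hypothetical:

```json
{
  "type": "expression",
  "name": "expression_concat",
  "fields": ["column_a"],
  "initialValue": "''",
  "fold": "concat(__acc, column_a)",
  "shouldAggregateNullInputs": false,
  "maxSizeBytes": 65536
}
```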
### JavaScript aggregator
Computes an arbitrary JavaScript function over a set of columns (both metrics and dimensions are allowed). Your
JavaScript functions are expected to return floating-point values.
| Property | Description | Required |
| --- | --- | --- |
| `type` | Must be "javascript". | Yes |
| `name` | The aggregator output name. | Yes |
| `fieldNames` | The list of aggregator input columns. | Yes |
| `fnAggregate` | JavaScript function that updates partial aggregate based on the current row values, and returns the updated partial aggregate. | Yes |
| `fnCombine` | JavaScript function to combine partial aggregates and return the combined result. | Yes |
| `fnReset` | JavaScript function that returns the 'initial' value. | Yes |
#### Example
```json
{
"type": "javascript",
"name": "sum(log(x)*y) + 10",
"fieldNames": ["x", "y"],
"fnAggregate" : "function(current, a, b) { return current + (Math.log(a) * b); }",
"fnCombine" : "function(partialA, partialB) { return partialA + partialB; }",
"fnReset" : "function() { return 10; }"
}
```
> JavaScript functionality is disabled by default. Refer to the Druid [JavaScript programming guide](../development/javascript.md) for guidelines about using Druid's JavaScript functionality, including instructions on how to enable it.
## Miscellaneous aggregations
### Filtered aggregator
@ -430,17 +579,30 @@ A filtered aggregator wraps any given aggregator, but only aggregates the values
This makes it possible to compute the results of a filtered and an unfiltered aggregation simultaneously, without having to issue multiple queries, and use both results as part of post-aggregations.
*Note:* If only the filtered results are required, consider putting the filter on the query itself, which will be much faster since it does not require scanning all the data.
If only the filtered results are required, consider putting the filter on the query itself. This will be much faster since it does not require scanning all the data.
| Property | Description | Required |
| --- | --- | --- |
| `type` | Must be "filtered". | Yes |
| `name` | The aggregator output name. | No |
| `aggregator` | Inline aggregator specification. | Yes |
| `filter` | Inline [filter](./filters.md) specification. | Yes |
Example:
```json
{
"type" : "filtered",
"filter" : {
"type": "filtered",
"name": "filteredSumLong",
"filter": {
"type" : "selector",
"dimension" : <dimension>,
"value" : <dimension value>
"dimension" : "someColumn",
"value" : "abcdef"
},
"aggregator" : <aggregation>
"aggregator": {
"type": "longSum",
"name": "sumLong",
"fieldName": "aLong"
}
}
```
@ -450,7 +612,20 @@ A grouping aggregator can only be used as part of GroupBy queries which have a s
each output row that lets you infer whether a particular dimension is included in the sub-grouping used for that row. You can pass
a *non-empty* list of dimensions to this aggregator which *must* be a subset of dimensions that you are grouping on.
For example, if the aggregator has `["dim1", "dim2"]` as input dimensions and `[["dim1", "dim2"], ["dim1"], ["dim2"], []]` as subtotals, the
| Property | Description | Required |
| --- | --- | --- |
| `type` | Must be "grouping". | Yes |
| `name` | The aggregator output name. | Yes |
| `groupings` | The list of columns to use in the grouping set. | Yes |
For example, the following aggregator has `["dim1", "dim2"]` as input dimensions:
```json
{ "type" : "grouping", "name" : "someGrouping", "groupings" : ["dim1", "dim2"] }
```
and used in a grouping query with `[["dim1", "dim2"], ["dim1"], ["dim2"], []]` as subtotals, the
possible output of the aggregator is:
| subtotal used in query | Output | (bits representation) |
@ -463,6 +638,3 @@ possible output of the aggregator is:
As the example illustrates, you can think of the output number as an unsigned _n_ bit number where _n_ is the number of dimensions passed to the aggregator.
Druid sets the bit at position X for the number to 0 if the sub-grouping includes a dimension at position X in the aggregator input. Otherwise, Druid sets this bit to 1.
```json
{ "type" : "grouping", "name" : <output_name>, "groupings" : [<dimension>] }
```

View File

@ -35,199 +35,187 @@ Apache Druid supports the following types of filters.
## Selector filter
The simplest filter is a selector filter. The selector filter will match a specific dimension with a specific value. Selector filters can be used as the base filters for more complex Boolean expressions of filters.
The simplest filter is a selector filter. The selector filter matches a specific dimension with a specific value. Selector filters can be used as the base filters for more complex Boolean expressions of filters.
The grammar for a SELECTOR filter is as follows:
| Property | Description | Required |
| -------- | ----------- | -------- |
| `type` | Must be "selector".| Yes |
| `dimension` | Input column or virtual column name to filter. | Yes |
| `value` | String value to match. | No. If not specified the filter matches NULL values. |
| `extractionFn` | [Extraction function](./dimensionspecs.md#extraction-functions) to apply to `dimension` prior to value matching. See [filtering with extraction functions](#filtering-with-extraction-functions) for details. | No |
The selector filter can only match against `STRING` (single and multi-valued), `LONG`, `FLOAT`, `DOUBLE` types. Use the newer null and equality filters to match against `ARRAY` or `COMPLEX` types.
When the selector filter matches against numeric inputs, the string `value` will be best-effort coerced into a numeric value.
### Example: equivalent of `WHERE someColumn = 'hello'`
``` json
"filter": { "type": "selector", "dimension": <dimension_string>, "value": <dimension_value_string> }
{ "type": "selector", "dimension": "someColumn", "value": "hello" }
```
This is the equivalent of `WHERE <dimension_string> = '<dimension_value_string>'` or `WHERE <dimension_string> IS NULL`
(if the `value` is `null`).
The selector filter supports the use of extraction functions, see [Filtering with Extraction Functions](#filtering-with-extraction-functions) for details.
### Example: equivalent of `WHERE someColumn IS NULL`
``` json
{ "type": "selector", "dimension": "someColumn", "value": null }
```
## Equality filter
The equality filter is a replacement for the selector filter with the ability to match against any type of column. The equality filter is designed to have more SQL-compatible behavior than the selector filter and so cannot match null values. To match null values, use the null filter.
Druid's SQL planner uses the equality filter by default instead of the selector filter whenever `druid.generic.useDefaultValueForNull=false`, or if `sqlUseBoundAndSelectors` is set to false on the [SQL query context](./sql-query-context.md).
| Property | Description | Required |
| -------- | ----------- | -------- |
| `type` | Must be "equality".| Yes |
| `column` | Input column or virtual column name to filter. | Yes |
| `matchValueType` | String specifying the type of value to match. For example `STRING`, `LONG`, `DOUBLE`, `FLOAT`, `ARRAY<STRING>`, `ARRAY<LONG>`, or any other Druid type. The `matchValueType` determines how Druid interprets the `matchValue` to assist in converting to the type of the matched `column`. | Yes |
| `matchValue` | Value to match, must not be null. | Yes |
### Example: equivalent of `WHERE someColumn = 'hello'`
```json
{ "type": "equality", "column": "someColumn", "matchValueType": "STRING", "matchValue": "hello" }
```
### Example: equivalent of `WHERE someNumericColumn = 1.23`
```json
{ "type": "equality", "column": "someNumericColumn", "matchValueType": "DOUBLE", "matchValue": 1.23 }
```
### Example: equivalent of `WHERE someArrayColumn = ARRAY[1, 2, 3]`
```json
{ "type": "equality", "column": "someArrayColumn", "matchValueType": "ARRAY<LONG>", "matchValue": [1, 2, 3] }
```
## Null filter
The null filter is a partial replacement for the selector filter. It is dedicated to matching NULL values.
Druid's SQL planner uses the null filter by default instead of the selector filter whenever `druid.generic.useDefaultValueForNull=false`, or if `sqlUseBoundAndSelectors` is set to false on the [SQL query context](./sql-query-context.md).
| Property | Description | Required |
| -------- | ----------- | -------- |
| `type` | Must be "null".| Yes |
| `column` | Input column or virtual column name to filter. | Yes |
### Example: equivalent of `WHERE someColumn IS NULL`
```json
{ "type": "null", "column": "someColumn" }
```
## Column comparison filter
The column comparison filter is similar to the selector filter, but instead compares dimensions to each other. For example:
The column comparison filter is similar to the selector filter, but compares dimensions to each other. For example:
``` json
"filter": { "type": "columnComparison", "dimensions": [<dimension_a>, <dimension_b>] }
```
This is the equivalent of `WHERE <dimension_a> = <dimension_b>`.
| Property | Description | Required |
| -------- | ----------- | -------- |
| `type` | Must be "selector".| Yes |
| `dimensions` | List of [`DimensionSpec`](./dimensionspecs.md) to compare. | Yes |
`dimensions` is a list of [DimensionSpecs](./dimensionspecs.md), making it possible to apply an extraction function if needed.
## Regular expression filter
Note that the column comparison filter converts all values to strings prior to comparison. This allows differently-typed input columns to match without a cast operation.
The regular expression filter is similar to the selector filter, but using regular expressions. It matches the specified dimension with the given pattern. The pattern can be any standard [Java regular expression](http://docs.oracle.com/javase/6/docs/api/java/util/regex/Pattern.html).
### Example: equivalent of `WHERE someColumn = someLongColumn`
``` json
"filter": { "type": "regex", "dimension": <dimension_string>, "pattern": <pattern_string> }
{
"type": "columnComparison",
"dimensions": [
"someColumn",
{
"type" : "default",
"dimension" : someLongColumn,
"outputType": "LONG"
}
]
}
```
The regex filter supports the use of extraction functions, see [Filtering with Extraction Functions](#filtering-with-extraction-functions) for details.
## Logical expression filters
### AND
The grammar for an AND filter is as follows:
| Property | Description | Required |
| -------- | ----------- | -------- |
| `type` | Must be "and".| Yes |
| `fields` | List of filter JSON objects, such as any other filter defined on this page or provided by extensions. | Yes |
#### Example: equivalent of `WHERE someColumn = 'a' AND otherColumn = 1234 AND anotherColumn IS NULL`
``` json
"filter": { "type": "and", "fields": [<filter>, <filter>, ...] }
{
"type": "and",
"fields": [
{ "type": "equality", "column": "someColumn", "matchValue": "a", "matchValueType": "STRING" },
{ "type": "equality", "column": "otherColumn", "matchValue": 1234, "matchValueType": "LONG" },
{ "type": "null", "column": "anotherColumn" }
]
}
```
The filters in fields can be any other filter defined on this page.
### OR
The grammar for an OR filter is as follows:
| Property | Description | Required |
| -------- | ----------- | -------- |
| `type` | Must be "or".| Yes |
| `fields` | List of filter JSON objects, such as any other filter defined on this page or provided by extensions. | Yes |
#### Example: equivalent of `WHERE someColumn = 'a' OR otherColumn = 1234 OR anotherColumn IS NULL`
``` json
"filter": { "type": "or", "fields": [<filter>, <filter>, ...] }
{
"type": "or",
"fields": [
{ "type": "equality", "column": "someColumn", "matchValue": "a", "matchValueType": "STRING" },
{ "type": "equality", "column": "otherColumn", "matchValue": 1234, "matchValueType": "LONG" },
{ "type": "null", "column": "anotherColumn" }
]
}
```
The filters in fields can be any other filter defined on this page.
### NOT
The grammar for a NOT filter is as follows:
| Property | Description | Required |
| -------- | ----------- | -------- |
| `type` | Must be "not".| Yes |
| `field` | Filter JSON objects, such as any other filter defined on this page or provided by extensions. | Yes |
#### Example: equivalent of `WHERE someColumn IS NOT NULL`
```json
"filter": { "type": "not", "field": <filter> }
{ "type": "not", "field": { "type": "null", "column": "someColumn" }}
```
The filter specified at field can be any other filter defined on this page.
## JavaScript filter
The JavaScript filter matches a dimension against the specified JavaScript function predicate. The filter matches values for which the function returns true.
The function takes a single argument, the dimension value, and returns either true or false.
```json
{
"type" : "javascript",
"dimension" : <dimension_string>,
"function" : "function(value) { <...> }"
}
```
**Example**
The following matches any dimension values for the dimension `name` between `'bar'` and `'foo'`
```json
{
"type" : "javascript",
"dimension" : "name",
"function" : "function(x) { return(x >= 'bar' && x <= 'foo') }"
}
```
The JavaScript filter supports the use of extraction functions, see [Filtering with Extraction Functions](#filtering-with-extraction-functions) for details.
> JavaScript-based functionality is disabled by default. Please refer to the Druid [JavaScript programming guide](../development/javascript.md) for guidelines about using Druid's JavaScript functionality, including instructions on how to enable it.
## Extraction filter
> The extraction filter is now deprecated. The selector filter with an extraction function specified
> provides identical functionality and should be used instead.
Extraction filter matches a dimension using some specific [Extraction function](./dimensionspecs.md#extraction-functions).
The following filter matches the values for which the extraction function has transformation entry `input_key=output_value` where
`output_value` is equal to the filter `value` and `input_key` is present as dimension.
**Example**
The following matches dimension values in `[product_1, product_3, product_5]` for the column `product`
```json
{
"filter": {
"type": "extraction",
"dimension": "product",
"value": "bar_1",
"extractionFn": {
"type": "lookup",
"lookup": {
"type": "map",
"map": {
"product_1": "bar_1",
"product_5": "bar_1",
"product_3": "bar_1"
}
}
}
}
}
```
## Search filter
Search filters can be used to filter on partial string matches.
```json
{
"filter": {
"type": "search",
"dimension": "product",
"query": {
"type": "insensitive_contains",
"value": "foo"
}
}
}
```
|property|description|required?|
|--------|-----------|---------|
|type|This String should always be "search".|yes|
|dimension|The dimension to perform the search over.|yes|
|query|A JSON object for the type of search. See [search query spec](#search-query-spec) for more information.|yes|
|extractionFn|[Extraction function](#filtering-with-extraction-functions) to apply to the dimension|no|
The search filter supports the use of extraction functions, see [Filtering with Extraction Functions](#filtering-with-extraction-functions) for details.
### Search query spec
#### Contains
|property|description|required?|
|--------|-----------|---------|
|type|This String should always be "contains".|yes|
|value|A String value to run the search over.|yes|
|caseSensitive|Whether two string should be compared as case sensitive or not|no (default == false)|
#### Insensitive Contains
|property|description|required?|
|--------|-----------|---------|
|type|This String should always be "insensitive_contains".|yes|
|value|A String value to run the search over.|yes|
Note that an "insensitive_contains" search is equivalent to a "contains" search with "caseSensitive": false (or not
provided).
#### Fragment
|property|description|required?|
|--------|-----------|---------|
|type|This String should always be "fragment".|yes|
|values|A JSON array of String values to run the search over.|yes|
|caseSensitive|Whether strings should be compared as case sensitive or not. Default: false(insensitive)|no|
## In filter
The in filter can match input rows against a set of values, where a match occurs if the value is contained in the set.
The in filter can be used to express the following SQL query:
| Property | Description | Required |
| -------- | ----------- | -------- |
| `type` | Must be "in".| Yes |
| `dimension` | Input column or virtual column name to filter. | Yes |
| `values` | List of string values to match. | Yes |
| `extractionFn` | [Extraction function](./dimensionspecs.md#extraction-functions) to apply to `dimension` prior to value matching. See [filtering with extraction functions](#filtering-with-extraction-functions) for details. | No |
```sql
SELECT COUNT(*) AS 'Count' FROM `table` WHERE `outlaw` IN ('Good', 'Bad', 'Ugly')
```
The grammar for a "in" filter is as follows:
If an empty `values` array is passed to the "in" filter, it will simply return an empty result.
If the `values` array contains `null`, the "in" filter matches null values. This differs from the SQL IN filter, which
does not match NULL values.
### Example: equivalent of `WHERE outlaw IN ('Good', 'Bad', 'Ugly')`
```json
{
@ -237,40 +225,6 @@ The grammar for a "in" filter is as follows:
}
```
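As a sketch of the null-matching behavior described above, an "in" filter whose `values` array includes `null` (dimension name reused from the example):

```json
{
  "type": "in",
  "dimension": "outlaw",
  "values": ["Good", "Bad", null]
}
```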
The "in" filter supports the use of extraction functions, see [Filtering with Extraction Functions](#filtering-with-extraction-functions) for details.
If an empty `values` array is passed to the "in" filter, it will simply return an empty result.
If the `dimension` is a multi-valued dimension, the "in" filter will return true if one of the dimension values is
in the `values` array.
If the `values` array contains `null`, the "in" filter matches null values. This differs from the SQL IN filter, which
does not match NULL values.
## Like filter
Like filters can be used for basic wildcard searches. They are equivalent to the SQL LIKE operator. Special characters
supported are "%" (matches any number of characters) and "\_" (matches any one character).
|property|type|description|required?|
|--------|-----------|---------|---------|
|type|String|This should always be "like".|yes|
|dimension|String|The dimension to filter on|yes|
|pattern|String|LIKE pattern, such as "foo%" or "___bar".|yes|
|escape|String|An escape character that can be used to escape special characters.|no|
|extractionFn|[Extraction function](#filtering-with-extraction-functions)| Extraction function to apply to the dimension|no|
Like filters support the use of extraction functions, see [Filtering with Extraction Functions](#filtering-with-extraction-functions) for details.
This Like filter expresses the condition `last_name LIKE "D%"` (i.e. last_name starts with "D").
```json
{
"type": "like",
"dimension": "last_name",
"pattern": "D%"
}
```
## Bound filter
@ -278,20 +232,24 @@ Bound filters can be used to filter on ranges of dimension values. It can be use
greater than, less than, greater than or equal to, less than or equal to, and "between" (if both "lower" and
"upper" are set).
|property|type|description|required?|
|--------|-----------|---------|---------|
|type|String|This should always be "bound".|yes|
|dimension|String|The dimension to filter on|yes|
|lower|String|The lower bound for the filter|no|
|upper|String|The upper bound for the filter|no|
|lowerStrict|Boolean|Perform strict comparison on the lower bound (">" instead of ">=")|no, default: false|
|upperStrict|Boolean|Perform strict comparison on the upper bound ("<" instead of "<=")|no, default: false|
|ordering|String|Specifies the sorting order to use when comparing values against the bound. Can be one of the following values: "lexicographic", "alphanumeric", "numeric", "strlen", "version". See [Sorting Orders](./sorting-orders.md) for more details.|no, default: "lexicographic"|
|extractionFn|[Extraction function](#filtering-with-extraction-functions)| Extraction function to apply to the dimension|no|
| Property | Description | Required |
| -------- | ----------- | -------- |
| `type` | Must be "bound". | Yes |
| `dimension` | Input column or virtual column name to filter. | Yes |
| `lower` | The lower bound string match value for the filter. | No |
| `upper`| The upper bound string match value for the filter. | No |
| `lowerStrict` | Boolean indicating whether to perform strict comparison on the `lower` bound (">" instead of ">="). | No, default: `false` |
| `upperStrict` | Boolean indicating whether to perform strict comparison on the upper bound ("<" instead of "<="). | No, default: `false`|
| `ordering` | String that specifies the sorting order to use when comparing values against the bound. Can be one of the following values: `"lexicographic"`, `"alphanumeric"`, `"numeric"`, `"strlen"`, `"version"`. See [Sorting Orders](./sorting-orders.md) for more details. | No, default: `"lexicographic"`|
| `extractionFn` | [Extraction function](./dimensionspecs.md#extraction-functions) to apply to `dimension` prior to value matching. See [filtering with extraction functions](#filtering-with-extraction-functions) for details. | No |
Bound filters support the use of extraction functions, see [Filtering with Extraction Functions](#filtering-with-extraction-functions) for details.
When the bound filter matches against numeric inputs, the string `lower` and `upper` bound values are best-effort coerced into a numeric value when using the `"numeric"` mode of ordering.
The following bound filter expresses the condition `21 <= age <= 31`:
The bound filter can only match against `STRING` (single and multi-valued), `LONG`, `FLOAT`, `DOUBLE` types. Use the newer range filter to match against `ARRAY` or `COMPLEX` types.
Note that the bound filter matches null values if you don't specify a lower bound. Use the range filter if you need SQL-compatible behavior.
### Example: equivalent to `WHERE 21 <= age <= 31`
```json
{
@ -303,7 +261,7 @@ The following bound filter expresses the condition `21 <= age <= 31`:
}
```
This filter expresses the condition `foo <= name <= hoo`, using the default lexicographic sorting order.
### Example: equivalent to `WHERE 'foo' <= name <= 'hoo'`, using the default lexicographic sorting order
```json
{
@ -314,7 +272,7 @@ This filter expresses the condition `foo <= name <= hoo`, using the default lexi
}
```
Using strict bounds, this filter expresses the condition `21 < age < 31`
### Example: equivalent to `WHERE 21 < age < 31`
```json
{
@ -328,7 +286,7 @@ Using strict bounds, this filter expresses the condition `21 < age < 31`
}
```
The user can also specify a one-sided bound by omitting "upper" or "lower". This filter expresses `age < 31`.
### Example: equivalent to `WHERE age < 31`
```json
{
@ -340,7 +298,7 @@ The user can also specify a one-sided bound by omitting "upper" or "lower". This
}
```
Likewise, this filter expresses `age >= 18`
### Example: equivalent to `WHERE age >= 18`
```json
{
@ -352,18 +310,154 @@ Likewise, this filter expresses `age >= 18`
```
## Range filter
The range filter is a replacement for the bound filter. It compares against any type of column and is designed to have more SQL-compliant behavior than the bound filter. It won't match null values, even if you don't specify a lower bound.
Druid's SQL planner uses the range filter by default instead of the bound filter whenever `druid.generic.useDefaultValueForNull=false`, or if `sqlUseBoundAndSelectors` is set to false on the [SQL query context](./sql-query-context.md).
| Property | Description | Required |
| -------- | ----------- | -------- |
| `type` | Must be "range".| Yes |
| `column` | Input column or virtual column name to filter. | Yes |
| `matchValueType` | String specifying the type of bounds to match. For example `STRING`, `LONG`, `DOUBLE`, `FLOAT`, `ARRAY<STRING>`, `ARRAY<LONG>`, or any other Druid type. The `matchValueType` determines how Druid interprets the `matchValue` to assist in converting to the type of the matched `column` and also defines the type of comparison used when matching values. | Yes |
| `lower` | Lower bound value to match. | No. At least one of `lower` or `upper` must not be null. |
| `upper` | Upper bound value to match. | No. At least one of `lower` or `upper` must not be null. |
| `lowerOpen` | Boolean indicating if lower bound is open in the interval of values defined by the range (">" instead of ">="). | No |
| `upperOpen` | Boolean indicating if upper bound is open on the interval of values defined by range ("<" instead of "<="). | No |
### Example: equivalent to `WHERE 21 <= age <= 31`
```json
{
"type": "range",
"column": "age",
"matchValueType": "LONG",
"lower": 21,
"upper": 31
}
```
### Example: equivalent to `WHERE 'foo' <= name <= 'hoo'`, using STRING comparison
```json
{
"type": "range",
"column": "name",
"matchValueType": "STRING",
"lower": "foo",
"upper": "hoo"
}
```
### Example: equivalent to `WHERE 21 < age < 31`
```json
{
"type": "range",
"column": "age",
"matchValueType": "LONG",
"lower": "21",
"lowerOpen": true,
"upper": "31" ,
"upperOpen": true
}
```
### Example: equivalent to `WHERE age < 31`
```json
{
"type": "range",
"column": "age",
"matchValueType": "LONG",
"upper": "31" ,
"upperOpen": true
}
```
### Example: equivalent to `WHERE age >= 18`
```json
{
"type": "range",
"column": "age",
"matchValueType": "LONG",
"lower": 18
}
```
### Example: equivalent to `WHERE ARRAY['a','b','c'] < arrayColumn < ARRAY['d','e','f']`, using ARRAY comparison
```json
{
"type": "range",
"column": "name",
"matchValueType": "ARRAY<STRING>",
"lower": ["a","b","c"],
"lowerOpen": true,
"upper": ["d","e","f"],
"upperOpen": true
}
```
## Like filter
Like filters can be used for basic wildcard searches. They are equivalent to the SQL LIKE operator. Special characters
supported are "%" (matches any number of characters) and "\_" (matches any one character).
| Property | Description | Required |
| -------- | ----------- | -------- |
| `type` | Must be "like".| Yes |
| `dimension` | Input column or virtual column name to filter. | Yes |
| `pattern` | String LIKE pattern, such as "foo%" or "___bar".| Yes |
| `escape`| A string escape character that can be used to escape special characters. | No |
| `extractionFn` | [Extraction function](./dimensionspecs.md#extraction-functions) to apply to `dimension` prior to value matching. See [filtering with extraction functions](#filtering-with-extraction-functions) for details. | No |
Like filters support the use of extraction functions, see [Filtering with Extraction Functions](#filtering-with-extraction-functions) for details.
### Example: equivalent of `WHERE last_name LIKE "D%"` (last_name starts with "D")
```json
{
"type": "like",
"dimension": "last_name",
"pattern": "D%"
}
```
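The `escape` property has no example above; the following hedged sketch matches values containing a literal percent sign (column name hypothetical, backslash doubled for JSON):

```json
{
  "type": "like",
  "dimension": "discount_note",
  "pattern": "%100\\%%",
  "escape": "\\"
}
```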
## Regular expression filter
The regular expression filter is similar to the selector filter, but using regular expressions. It matches the specified dimension with the given pattern.
| Property | Description | Required |
| -------- | ----------- | -------- |
| `type` | Must be "regex".| Yes |
| `dimension` | Input column or virtual column name to filter. | Yes |
| `pattern` | String pattern to match - any standard [Java regular expression](http://docs.oracle.com/javase/6/docs/api/java/util/regex/Pattern.html). | Yes |
| `extractionFn` | [Extraction function](./dimensionspecs.md#extraction-functions) to apply to `dimension` prior to value matching. See [filtering with extraction functions](#filtering-with-extraction-functions) for details. | No |
Note that it is often more efficient to use a like filter instead of a regex filter for simple prefix matching.
### Example: matches values that start with "50."
``` json
{ "type": "regex", "dimension": "someColumn", "pattern": ^50.* }
```
## Interval filter
The Interval filter enables range filtering on columns that contain long millisecond values, with the boundaries specified as ISO 8601 time intervals. It is suitable for the `__time` column, long metric columns, and dimensions with values that can be parsed as long milliseconds.
This filter converts the ISO 8601 intervals to long millisecond start/end ranges and translates to an OR of Bound filters on those millisecond ranges, with numeric comparison. The Bound filters will have left-closed and right-open matching (i.e., start <= time < end).
|property|type|description|required?|
|--------|-----------|---------|---------|
|type|String|This should always be "interval".|yes|
|dimension|String|The dimension to filter on|yes|
|intervals|Array|A JSON array containing ISO-8601 interval strings. This defines the time ranges to filter on.|yes|
|extractionFn|[Extraction function](#filtering-with-extraction-functions)| Extraction function to apply to the dimension|no|
| Property | Description | Required |
| -------- | ----------- | -------- |
| `type` | Must be "interval". | Yes |
| `dimension` | Input column or virtual column name to filter. | Yes |
| `intervals` | A JSON array containing ISO-8601 interval strings that defines the time ranges to filter on. | Yes |
| `extractionFn` | [Extraction function](./dimensionspecs.md#extraction-functions) to apply to `dimension` prior to value matching. See [filtering with extraction functions](#filtering-with-extraction-functions) for details. | No |
The interval filter supports the use of extraction functions, see [Filtering with Extraction Functions](#filtering-with-extraction-functions) for details.
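A minimal sketch of an interval filter on the `__time` column (the interval values here are illustrative):

```json
{
  "type": "interval",
  "dimension": "__time",
  "intervals": [
    "2014-10-01T00:00:00.000Z/2014-10-07T00:00:00.000Z",
    "2014-11-15T00:00:00.000Z/2014-11-16T00:00:00.000Z"
  ]
}
```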
@ -410,6 +504,157 @@ The filter above is equivalent to the following OR of Bound filters:
}
```
## True filter
A filter which matches all values. You can use it to temporarily disable other filters without removing them.
```json
{ "type" : "true" }
```
## False filter
A filter that matches no values. You can use it to force a query to match no values.
```json
{"type": "false" }
```
## Search filter
You can use search filters to filter on partial string matches.
```json
{
"filter": {
"type": "search",
"dimension": "product",
"query": {
"type": "insensitive_contains",
"value": "foo"
}
}
}
```
| Property | Description | Required |
| -------- | ----------- | -------- |
| `type` | Must be "search". | Yes |
| `dimension` | Input column or virtual column name to filter. | Yes |
| `query`| A JSON object for the type of search. See [search query spec](#search-query-spec) for more information. | Yes |
| `extractionFn` | [Extraction function](./dimensionspecs.md#extraction-functions) to apply to `dimension` prior to value matching. See [filtering with extraction functions](#filtering-with-extraction-functions) for details. | No |
### Search query spec
#### Contains
| Property | Description | Required |
| -------- | ----------- | -------- |
| `type` | Must be "contains". | Yes |
| `value` | A String value to search. | Yes |
| `caseSensitive` | Whether the string comparison is case-sensitive or not. | No, default is false (insensitive) |
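A hedged sketch of a case-sensitive `contains` search, reusing the `product` dimension from the example above:

```json
{
  "type": "search",
  "dimension": "product",
  "query": {
    "type": "contains",
    "value": "Foo",
    "caseSensitive": true
  }
}
```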
#### Insensitive contains
| Property | Description | Required |
| -------- | ----------- | -------- |
| `type` | Must be "insensitive_contains". | Yes |
| `value` | A String value to search. | Yes |
Note that an "insensitive_contains" search is equivalent to a "contains" search with "caseSensitive": false (or not
provided).
#### Fragment
| Property | Description | Required |
| -------- | ----------- | -------- |
| `type` | Must be "fragment". | Yes |
| `values` | A JSON array of string values to search. | Yes |
| `caseSensitive` | Whether the string comparison is case-sensitive or not. | No, default is false (insensitive) |
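And a sketch of a `fragment` search over two values (values illustrative):

```json
{
  "type": "search",
  "dimension": "product",
  "query": {
    "type": "fragment",
    "values": ["foo", "bar"]
  }
}
```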
## Expression filter
The expression filter allows for the implementation of arbitrary conditions, leveraging the Druid expression system. This filter allows for complete flexibility, but it might be less performant than a combination of the other filters on this page because it can't always use the same optimizations available to other filters.
| Property | Description | Required |
| -------- | ----------- | -------- |
| `type` | Must be "expression" | Yes |
| `expression` | Expression string to evaluate into true or false. See the [Druid expression system](math-expr.md) for more details. | Yes |
### Example: expression based matching
```json
{
"type" : "expression" ,
"expression" : "((product_type == 42) && (!is_deleted))"
}
```
## JavaScript filter
The JavaScript filter matches a dimension against the specified JavaScript function predicate. The filter matches values for which the function returns true.
| Property | Description | Required |
| -------- | ----------- | -------- |
| `type` | Must be "javascript" | Yes |
| `dimension` | Input column or virtual column name to filter. | Yes |
| `function` | JavaScript function which accepts the dimension value as a single argument, and returns either true or false. | Yes |
| `extractionFn` | [Extraction function](./dimensionspecs.md#extraction-functions) to apply to `dimension` prior to value matching. See [filtering with extraction functions](#filtering-with-extraction-functions) for details. | No |
### Example: matching any dimension values for the dimension `name` between `'bar'` and `'foo'`
```json
{
"type" : "javascript",
"dimension" : "name",
"function" : "function(x) { return(x >= 'bar' && x <= 'foo') }"
}
```
> JavaScript-based functionality is disabled by default. Refer to the Druid [JavaScript programming guide](../development/javascript.md) for guidelines about using Druid's JavaScript functionality, including instructions on how to enable it.
## Extraction filter
> The extraction filter is now deprecated. Use the selector filter with an extraction function instead.
Extraction filter matches a dimension using a specific [extraction function](./dimensionspecs.md#extraction-functions).
The following filter matches the values for which the extraction function has a transformation entry `input_key=output_value` where
`output_value` is equal to the filter `value` and `input_key` is present as a dimension.
| Property | Description | Required |
| -------- | ----------- | -------- |
| `type` | Must be "extraction" | Yes |
| `dimension` | Input column or virtual column name to filter. | Yes |
| `value` | String value to match. | No. If not specified the filter will match NULL values. |
| `extractionFn` | [Extraction function](./dimensionspecs.md#extraction-functions) to apply to `dimension` prior to value matching. See [filtering with extraction functions](#filtering-with-extraction-functions) for details. | No |
### Example: matching dimension values in `[product_1, product_3, product_5]` for the column `product`
```json
{
"filter": {
"type": "extraction",
"dimension": "product",
"value": "bar_1",
"extractionFn": {
"type": "lookup",
"lookup": {
"type": "map",
"map": {
"product_1": "bar_1",
"product_5": "bar_1",
"product_3": "bar_1"
}
}
}
}
}
```
## Filtering with extraction functions
All filters except the "spatial" filter support extraction functions.
@ -420,9 +665,7 @@ If specified, the extraction function will be used to transform input values bef
The example below shows a selector filter combined with an extraction function. This filter will transform input values
according to the values defined in the lookup map; transformed values will then be matched with the string "bar_1".
**Example**
The following matches dimension values in `[product_1, product_3, product_5]` for the column `product`
### Example: matches dimension values in `[product_1, product_3, product_5]` for the column `product`
```json
{
@ -449,29 +692,97 @@ The following matches dimension values in `[product_1, product_3, product_5]` fo
Druid supports filtering on timestamp, string, long, and float columns.
Note that only string columns have bitmap indexes. Therefore, queries that filter on other column types will need to
Note that only string columns, and columns produced with the ['auto' ingestion spec](../ingestion/ingestion-spec.md#dimension-objects) that is also used by [type-aware schema discovery](../ingestion/schema-design.md#type-aware-schema-discovery), have bitmap indexes. Queries that filter on other column types must
scan those columns.
### Filtering on multi-value string columns
All filters return true if any one of the dimension values satisfies the filter.
#### Example: multi-value match behavior
Given a multi-value STRING row with values `['a', 'b', 'c']`, a filter such as
```json
{ "type": "equality", "column": "someMultiValueColumn", "matchValueType": "STRING", "matchValue": "b" }
```
will successfully match the entire row. This can sometimes produce unintuitive behavior when coupled with the implicit UNNEST functionality of Druid [GroupBy](./groupbyquery.md) and [TopN](./topnquery.md) queries.
Additionally, contradictory filters may be defined, and are perfectly legal in native queries, even though they cannot be expressed in SQL.
#### Example: SQL "contradiction"
This query is impossible to express as-is in SQL, since it is a contradiction that the SQL planner would optimize to false and match nothing.
Given a multi-value STRING row with values `['a', 'b', 'c']`, and filter such as
```json
{
"type": "and",
"fields": [
{
"type": "equality",
"column": "someMultiValueColumn",
"matchValueType": "STRING",
"matchValue": "a"
},
{
"type": "equality",
"column": "someMultiValueColumn",
"matchValueType": "STRING",
"matchValue": "b"
}
]
}
```
will successfully match the entire row, but not match a row with value `['a', 'c']`.
To express this filter in SQL, use [SQL multi-value string functions](./sql-multivalue-string-functions.md) such as `MV_CONTAINS`, which can be optimized by the planner to the same native filters.
### Filtering on numeric columns
When filtering on numeric columns, you can write filters as if they were strings. In most cases, your filter will be
Some filters, such as the equality and range filters, accept numeric match values directly, since they include a secondary `matchValueType` parameter.
When filtering on numeric columns using string-based filters such as the selector, in, and bound filters, you can write filter match values as if they were strings. In most cases, your filter will be
converted into a numeric predicate and will be applied to the numeric column values directly. In some cases (such as
the "regex" filter) the numeric column values will be converted to strings during the scan.
For example, filtering on a specific value, `myFloatColumn = 10.1`:
#### Example: filtering on a specific value, `myFloatColumn = 10.1`
```json
"filter": {
{
"type": "equality",
"dimension": "myFloatColumn",
"matchValueType": "FLOAT",
"value": 10.1
}
```
or with a selector filter:
```json
{
"type": "selector",
"dimension": "myFloatColumn",
"value": "10.1"
}
```
Filtering on a range of values, `10 <= myFloatColumn < 20`:
#### Example: filtering on a range of values, `10.1 <= myFloatColumn < 20.9`
```json
"filter": {
{
"type": "range",
"column": "myFloatColumn",
"matchvalueType": "FLOAT",
"lower": 10.1,
"lowerOpen": false,
"upper": 20.9,
"upperOpen": true
}
```
or with a bound filter:
```json
{
"type": "bound",
"dimension": "myFloatColumn",
"ordering": "numeric",
@ -488,22 +799,33 @@ Query filters can also be applied to the timestamp column. The timestamp column
to the timestamp column, use the string `__time` as the dimension name. Like numeric dimensions, timestamp filters
should be specified as if the timestamp values were strings.
If the user wishes to interpret the timestamp with a specific format, timezone, or locale, the [Time Format Extraction Function](./dimensionspecs.md#time-format-extraction-function) is useful.
If you want to interpret the timestamp with a specific format, timezone, or locale, the [Time Format Extraction Function](./dimensionspecs.md#time-format-extraction-function) is useful.
For example, filtering on a long timestamp value:
#### Example: filtering on a long timestamp value
```json
"filter": {
{
"type": "equality",
"dimension": "__time",
"matchValueType": "LONG",
"value": 124457387532
}
```
or with a selector filter:
```json
{
"type": "selector",
"dimension": "__time",
"value": "124457387532"
}
```
Filtering on day of week:
#### Example: filtering on day of week using an extraction function
```json
"filter": {
{
"type": "selector",
"dimension": "__time",
"value": "Friday",
@ -516,7 +838,7 @@ Filtering on day of week:
}
```
Filtering on a set of ISO 8601 intervals:
#### Example: filtering on a set of ISO 8601 intervals
```json
{
@ -529,25 +851,3 @@ Filtering on a set of ISO 8601 intervals:
}
```
### True filter
The true filter is a filter which matches all values. It can be used to temporarily disable other filters without removing them from the query.
```json
{ "type" : "true" }
```
### Expression filter
The expression filter allows for the implementation of arbitrary conditions, leveraging the Druid expression system.
This filter allows for more flexibility, but it might be less performant than a combination of the other filters on this page due to the fact that not all filter optimizations are in place yet.
```json
{
"type" : "expression" ,
"expression" : "((product_type == 42) && (!is_deleted))"
}
```
See the [Druid expression system](math-expr.md) for more details.

View File

@ -97,8 +97,8 @@ In the aggregation functions supported by Druid, only `COUNT`, `ARRAY_AGG`, and
|`ARRAY_AGG(DISTINCT expr, [size])`|Collects all distinct values of `expr` into an ARRAY, including null values, with `size` in bytes limit on aggregation size (default of 1024 bytes) per aggregate. If the aggregated array grows larger than the maximum size in bytes, the query will fail. Use of `ORDER BY` within the `ARRAY_AGG` expression is not currently supported, and the ordering of results will be based on the default for the element type.|`null`|
|`ARRAY_CONCAT_AGG(expr, [size])`|Concatenates all array `expr` into a single ARRAY, with `size` in bytes limit on aggregation size (default of 1024 bytes). Input `expr` _must_ be an array. Null `expr` will be ignored, but any null values within an `expr` _will_ be included in the resulting array. If the aggregated array grows larger than the maximum size in bytes, the query will fail. Use of `ORDER BY` within the `ARRAY_CONCAT_AGG` expression is not currently supported, and the ordering of results within the output array may vary depending on processing order.|`null`|
|`ARRAY_CONCAT_AGG(DISTINCT expr, [size])`|Concatenates all distinct values of all array `expr` into a single ARRAY, with `size` in bytes limit on aggregation size (default of 1024 bytes) per aggregate. Input `expr` _must_ be an array. Null `expr` will be ignored, but any null values within an `expr` _will_ be included in the resulting array. If the aggregated array grows larger than the maximum size in bytes, the query will fail. Use of `ORDER BY` within the `ARRAY_CONCAT_AGG` expression is not currently supported, and the ordering of results will be based on the default for the element type.|`null`|
|`STRING_AGG(expr, separator, [size])`|Collects all values of `expr` into a single STRING, ignoring null values. Each value is joined by the `separator` which must be a literal STRING. An optional `size` in bytes can be supplied to limit aggregation size (default of 1024 bytes). If the aggregated string grows larger than the maximum size in bytes, the query will fail. Use of `ORDER BY` within the `STRING_AGG` expression is not currently supported, and the ordering of results within the output string may vary depending on processing order.|`null` if `druid.generic.useDefaultValueForNull=false`, otherwise `''`|
|`STRING_AGG(DISTINCT expr, separator, [size])`|Collects all distinct values of `expr` into a single STRING, ignoring null values. Each value is joined by the `separator` which must be a literal STRING. An optional `size` in bytes can be supplied to limit aggregation size (default of 1024 bytes). If the aggregated string grows larger than the maximum size in bytes, the query will fail. Use of `ORDER BY` within the `STRING_AGG` expression is not currently supported, and the ordering of results will be based on the default `STRING` ordering.|`null` if `druid.generic.useDefaultValueForNull=false`, otherwise `''`|
|`STRING_AGG([DISTINCT] expr, [separator, [size]])`|Collects all values (or all distinct values) of `expr` into a single STRING, ignoring null values. Each value is joined by an optional `separator`, which must be a literal STRING. If the `separator` is not provided, strings are concatenated without a separator.<br /><br />An optional `size` in bytes can be supplied to limit aggregation size (default of 1024 bytes). If the aggregated string grows larger than the maximum size in bytes, the query will fail. Use of `ORDER BY` within the `STRING_AGG` expression is not currently supported, and the ordering of results within the output string may vary depending on processing order.|`null` if `druid.generic.useDefaultValueForNull=false`, otherwise `''`|
|`LISTAGG([DISTINCT] expr, [separator, [size]])`|Synonym for `STRING_AGG`.|`null` if `druid.generic.useDefaultValueForNull=false`, otherwise `''`|
|`BIT_AND(expr)`|Performs a bitwise AND operation on all input values.|`null` if `druid.generic.useDefaultValueForNull=false`, otherwise `0`|
|`BIT_OR(expr)`|Performs a bitwise OR operation on all input values.|`null` if `druid.generic.useDefaultValueForNull=false`, otherwise `0`|
|`BIT_XOR(expr)`|Performs a bitwise XOR operation on all input values.|`null` if `druid.generic.useDefaultValueForNull=false`, otherwise `0`|
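As a hedged illustration of how a few of these aggregations read in practice, the sketch below runs `ARRAY_AGG` and `STRING_AGG` against the `clicks` datasource created in the tutorial notebooks elsewhere in this patch, assuming a Router reachable at `localhost:8888` (both the address and the datasource are assumptions of this example):
```python
import requests

# Assumes the "clicks" datasource with "user_id" and "keyword" columns,
# as loaded by the data generator tutorial notebook in this patch.
sql = """
SELECT
  "user_id",
  ARRAY_AGG(DISTINCT "keyword", 2048) AS keyword_array,
  STRING_AGG(DISTINCT "keyword", ',') AS keyword_csv
FROM "clicks"
GROUP BY 1
LIMIT 10
"""

response = requests.post(
    "http://localhost:8888/druid/v2/sql",
    json={"query": sql},
)
print(response.json())
```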

View File

@ -44,6 +44,7 @@ Configure Druid SQL query planning using the parameters in the table below.
|`enableTimeBoundaryPlanning`|If true, SQL queries will get converted to TimeBoundary queries wherever possible. TimeBoundary queries are very efficient for min-max calculation on `__time` column in a datasource |`druid.query.default.context.enableTimeBoundaryPlanning` on the Broker (default: false)|
|`useNativeQueryExplain`|If true, `EXPLAIN PLAN FOR` will return the explain plan as a JSON representation of equivalent native query(s), else it will return the original version of explain plan generated by Calcite.<br /><br />This property is provided for backwards compatibility. It is not recommended to use this parameter unless you were depending on the older behavior.|`druid.sql.planner.useNativeQueryExplain` on the Broker (default: true)|
|`sqlFinalizeOuterSketches`|If false (default behavior in Druid 25.0.0 and later), `DS_HLL`, `DS_THETA`, and `DS_QUANTILES_SKETCH` return sketches in query results, as documented. If true (default behavior in Druid 24.0.1 and earlier), sketches from these functions are finalized when they appear in query results.<br /><br />This property is provided for backwards compatibility with behavior in Druid 24.0.1 and earlier. It is not recommended to use this parameter unless you were depending on the older behavior. Instead, use a function that does not return a sketch, such as `APPROX_COUNT_DISTINCT_DS_HLL`, `APPROX_COUNT_DISTINCT_DS_THETA`, `APPROX_QUANTILE_DS`, `DS_THETA_ESTIMATE`, or `DS_GET_QUANTILE`.|`druid.query.default.context.sqlFinalizeOuterSketches` on the Broker (default: false)|
|`sqlUseBoundAndSelectors`|If false (default behavior if `druid.generic.useDefaultValueForNull=false` in Druid 27.0.0 and later), the SQL planner will use [equality](./filters.md#equality-filter), [null](./filters.md#null-filter), and [range](./filters.md#range-filter) filters instead of [selector](./filters.md#selector-filter) and [bound](./filters.md#bound-filter) filters. This value must be set to `false` for correct filtering behavior on `ARRAY` typed values. | Defaults to the same value as `druid.generic.useDefaultValueForNull` |
## Setting the query context
The query context parameters can be specified as a "context" object in the [JSON API](../api-reference/sql-api.md) or as a [JDBC connection properties object](../api-reference/sql-jdbc.md).
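For example, here is a minimal sketch of setting one of the planning parameters above for a single query through the SQL API's `context` object (the Router address and the `wikipedia` datasource from the tutorials are assumptions of this example):
```python
import requests

# Illustrative only: pass a planning parameter in the "context" object of a
# single SQL API request instead of changing Broker-level defaults.
payload = {
    "query": 'SELECT MIN(__time) AS earliest, MAX(__time) AS latest FROM "wikipedia"',
    "context": {
        "enableTimeBoundaryPlanning": True,
    },
}

response = requests.post("http://localhost:8888/druid/v2/sql", json=payload)
print(response.json())
```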

View File

@ -38,18 +38,18 @@ Once the Docker install is complete, please proceed to the next steps in the tut
## Build the Hadoop docker image
For this tutorial, we've provided a Dockerfile for a Hadoop 2.8.5 cluster, which we'll use to run the batch indexing task.
For this tutorial, we've provided a Dockerfile for a Hadoop 3.3.6 cluster, which we'll use to run the batch indexing task.
This Dockerfile and related files are located at `quickstart/tutorial/hadoop/docker`.
From the apache-druid-{{DRUIDVERSION}} package root, run the following commands to build a Docker image named "druid-hadoop-demo" with version tag "2.8.5":
From the apache-druid-{{DRUIDVERSION}} package root, run the following commands to build a Docker image named "druid-hadoop-demo" with version tag "3.3.6":
```bash
cd quickstart/tutorial/hadoop/docker
docker build -t druid-hadoop-demo:2.8.5 .
docker build -t druid-hadoop-demo:3.3.6 .
```
This will start building the Hadoop image. Once the image build is done, you should see the message `Successfully tagged druid-hadoop-demo:2.8.5` printed to the console.
This will start building the Hadoop image. Once the image build is done, you should see the message `Successfully tagged druid-hadoop-demo:3.3.6` printed to the console.
## Set up the Hadoop docker cluster
@ -77,7 +77,7 @@ On the host machine, add the following entry to `/etc/hosts`:
Once the `/tmp/shared` folder has been created and the `etc/hosts` entry has been added, run the following command to start the Hadoop container.
```bash
docker run -it -h druid-hadoop-demo --name druid-hadoop-demo -p 2049:2049 -p 2122:2122 -p 8020:8020 -p 8021:8021 -p 8030:8030 -p 8031:8031 -p 8032:8032 -p 8033:8033 -p 8040:8040 -p 8042:8042 -p 8088:8088 -p 8443:8443 -p 9000:9000 -p 10020:10020 -p 19888:19888 -p 34455:34455 -p 49707:49707 -p 50010:50010 -p 50020:50020 -p 50030:50030 -p 50060:50060 -p 50070:50070 -p 50075:50075 -p 50090:50090 -p 51111:51111 -v /tmp/shared:/shared druid-hadoop-demo:2.8.5 /etc/bootstrap.sh -bash
docker run -it -h druid-hadoop-demo --name druid-hadoop-demo -p 2049:2049 -p 2122:2122 -p 8020:8020 -p 8021:8021 -p 8030:8030 -p 8031:8031 -p 8032:8032 -p 8033:8033 -p 8040:8040 -p 8042:8042 -p 8088:8088 -p 8443:8443 -p 9000:9000 -p 10020:10020 -p 19888:19888 -p 34455:34455 -p 49707:49707 -p 50010:50010 -p 50020:50020 -p 50030:50030 -p 50060:50060 -p 50070:50070 -p 50075:50075 -p 50090:50090 -p 51111:51111 -v /tmp/shared:/shared druid-hadoop-demo:3.3.6 /etc/bootstrap.sh -bash
```
Once the container is started, your terminal will attach to a bash shell running inside the container:
@ -125,6 +125,7 @@ cd /usr/local/hadoop/bin
./hdfs dfs -mkdir /druid
./hdfs dfs -mkdir /druid/segments
./hdfs dfs -mkdir /quickstart
./hdfs dfs -mkdir /user
./hdfs dfs -chmod 777 /druid
./hdfs dfs -chmod 777 /druid/segments
./hdfs dfs -chmod 777 /quickstart
@ -205,10 +206,10 @@ We've included a sample of Wikipedia edits from September 12, 2015 to get you st
To load this data into Druid, you can submit an *ingestion task* pointing to the file. We've included
a task that loads the `wikiticker-2015-09-12-sampled.json.gz` file included in the archive.
Let's submit the `wikipedia-index-hadoop.json` task:
Let's submit the `wikipedia-index-hadoop3.json` task:
```bash
bin/post-index-task --file quickstart/tutorial/wikipedia-index-hadoop.json --url http://localhost:8081
bin/post-index-task --file quickstart/tutorial/wikipedia-index-hadoop3.json --url http://localhost:8081
```
## Querying your data

View File

@ -37,7 +37,8 @@ RUN pip install requests \
pip install seaborn \
pip install bokeh \
pip install kafka-python \
pip install sortedcontainers
pip install sortedcontainers \
pip install tqdm
# Install druidapi client from apache/druid
# Local install requires sudo privileges
@ -46,12 +47,6 @@ ADD druidapi /home/jovyan/druidapi
WORKDIR /home/jovyan/druidapi
RUN pip install .
# WIP -- install DruidDataDriver as a package
# Import data generator and configuration file
# Change permissions to allow import (requires sudo privileges)
# The Jupyter notebooks themselves are mounted into the image's /home/jovyan/notebooks
# path when running this image.
RUN mkdir -p /home/jovyan/notebooks
@ -59,8 +54,3 @@ RUN mkdir -p /home/jovyan/notebooks
WORKDIR /home/jovyan/notebooks
USER jovyan
# Add location of the data generator to PYTHONPATH
ENV PYTHONPATH "${PYTHONPATH}:/home/jovyan/notebooks/02-ingestion"

View File

@ -27,6 +27,7 @@ volumes:
coordinator_var: {}
router_var: {}
druid_shared: {}
datagen_data: {}
services:
@ -175,3 +176,12 @@ services:
- "${JUPYTER_PORT:-8889}:8888"
volumes:
- ../notebooks:/home/jovyan/notebooks
datagen:
image: imply/datagen:latest
container_name: datagen
profiles: ["jupyter", "kafka-jupyter", "druid-jupyter", "all-services"]
ports:
- "${DATAGEN_PORT:-9999}:9999"
volumes:
- datagen_data:/files

View File

@ -27,6 +27,7 @@ volumes:
coordinator_var: {}
router_var: {}
druid_shared: {}
datagen_data: {}
services:
@ -173,3 +174,12 @@ services:
- "${JUPYTER_PORT:-8889}:8888"
volumes:
- ../notebooks:/home/jovyan/notebooks
datagen:
image: imply/datagen:latest
container_name: datagen
profiles: ["jupyter", "kafka-jupyter", "druid-jupyter", "all-services"]
ports:
- "${DATAGEN_PORT:-9999}:9999"
volumes:
- datagen_data:/files

View File

@ -39,8 +39,8 @@ druid_metadata_storage_connector_password=FoolishPassword
druid_coordinator_balancer_strategy=cachingCost
druid_indexer_runner_javaOptsArray=["-server", "-Xmx1g", "-Xms1g", "-XX:MaxDirectMemorySize=3g", "-Duser.timezone=UTC", "-Dfile.encoding=UTF-8", "-Djava.util.logging.manager=org.apache.logging.log4j.jul.LogManager"]
druid_indexer_fork_property_druid_processing_buffer_sizeBytes=256MiB
druid_indexer_runner_javaOptsArray=["-server", "-Xmx256m", "-Xms256m", "-XX:MaxDirectMemorySize=324m", "-Duser.timezone=UTC", "-Dfile.encoding=UTF-8", "-Djava.util.logging.manager=org.apache.logging.log4j.jul.LogManager"]
druid_indexer_fork_property_druid_processing_buffer_sizeBytes=64MiB

View File

@ -14,6 +14,7 @@
# limitations under the License.
from druidapi import consts
import time
class DisplayClient:
'''
@ -144,3 +145,36 @@ class DisplayClient:
def tables(self, schema=consts.DRUID_SCHEMA):
self._druid.sql._tables_query(schema).show(display=self)
def run_task(self, query):
'''
Run an MSQ task while displaying progress in the cell output.
:param query: INSERT/REPLACE statement to run
:return: None
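Example (as used in the tutorial notebooks in this patch, where `sql` holds an
INSERT or REPLACE statement): druid.display.run_task(sql)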
'''
from tqdm import tqdm
task = self._druid.sql.task(query)
with tqdm(total=100.0) as pbar:
previous_progress = 0.0
while True:
reports = task.reports_no_wait()
# check if the sort progress metric is available and display it
if 'multiStageQuery' in reports.keys():
if 'payload' in reports['multiStageQuery'].keys():
if 'counters' in reports['multiStageQuery']['payload'].keys():
if ('0' in reports['multiStageQuery']['payload']['counters'].keys()) and \
('0' in reports['multiStageQuery']['payload']['counters']['0'].keys()):
# guard against a missing 'sortProgress' section to avoid a KeyError
if 'sortProgress' in reports['multiStageQuery']['payload']['counters']['0']['0'].keys() and \
'progressDigest' in reports['multiStageQuery']['payload']['counters']['0']['0']['sortProgress'].keys():
current_progress = reports['multiStageQuery']['payload']['counters']['0']['0']['sortProgress']['progressDigest'] * 100.0
pbar.update(current_progress - previous_progress)  # update requires a relative value
previous_progress = current_progress
# present status if available
if 'status' in reports['multiStageQuery']['payload'].keys():
pbar.set_description(f"Loading data, status:[{reports['multiStageQuery']['payload']['status']['status']}]")
# stop when job is done
if reports['multiStageQuery']['payload']['status']['status'] in ['SUCCESS', 'FAILED']:
break
else:
pbar.set_description('Initializing...')
time.sleep(1)

View File

@ -585,6 +585,9 @@ class QueryTaskResult:
self._reports = self._tasks().task_reports(self._id)
return self._reports
def reports_no_wait(self) -> dict:
return self._tasks().task_reports(self._id, require_ok=False)
@property
def results(self):
if not self._results:
@ -844,7 +847,7 @@ class QueryClient:
'''
return self._function_args_query(table_name).rows
def wait_until_ready(self, table_name):
def wait_until_ready(self, table_name, verify_load_status=True):
'''
Waits for a datasource to be loaded in the cluster, and to become available to SQL.
@ -852,8 +855,12 @@ class QueryClient:
----------
table_name str
The name of a datasource in the 'druid' schema.
verify_load_status
If true, checks whether all published segments are loaded before testing the query.
If false, skips the check on published segments and only runs the test query.
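Example (as used in the tutorial notebooks in this patch):
druid.sql.wait_until_ready('custom_data', verify_load_status=False)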
'''
self.druid_client.datasources.wait_until_ready(table_name)
if verify_load_status:
self.druid_client.datasources.wait_until_ready(table_name)
while True:
try:
self.sql('SELECT 1 FROM "{}" LIMIT 1'.format(table_name));

View File

@ -14,6 +14,7 @@
# limitations under the License.
from druidapi.consts import OVERLORD_BASE
import requests
REQ_TASKS = OVERLORD_BASE + '/tasks'
REQ_POST_TASK = OVERLORD_BASE + '/task'
@ -112,7 +113,7 @@ class TaskClient:
'''
return self.client.get_json(REQ_TASK_STATUS, args=[task_id])
def task_reports(self, task_id) -> dict:
def task_reports(self, task_id, require_ok = True) -> dict:
'''
Retrieves the completion report for a completed task.
@ -129,7 +130,19 @@ class TaskClient:
---------
`GET /druid/indexer/v1/task/{taskId}/reports`
'''
return self.client.get_json(REQ_TASK_REPORTS, args=[task_id])
if require_ok:
return self.client.get_json(REQ_TASK_REPORTS, args=[task_id])
else:
resp = self.client.get(REQ_TASK_REPORTS, args=[task_id], require_ok=require_ok)
if resp.status_code == requests.codes.ok:
try:
result = resp.json()
except Exception as ex:
result = {"message":"Payload could not be converted to json.", "payload":f"{resp.content}", "exception":f"{ex}"}
return result
else:
return {"message":f"Request return code:{resp.status_code}"}
def submit_task(self, payload):
'''

View File

@ -91,7 +91,8 @@
" basics related to the Druid REST API and several endpoints.\n",
"- [Introduction to the Druid Python API](01-druidapi-package-intro.ipynb) walks you through some of the\n",
" basics related to the Druid API using the Python wrapper API.\n",
"- [Learn the basics of Druid SQL](../03-query/00-using-sql-with-druidapi.ipynb) introduces you to the unique aspects of Druid SQL with the primary focus on the SELECT statement. \n",
"- [Learn the basics of Druid SQL](../03-query/00-using-sql-with-druidapi.ipynb) introduces you to the unique aspects of Druid SQL with the primary focus on the SELECT statement.\n",
"- [Learn to use the Data Generator](./02-datagen-intro.ipynb) gets you started with streaming and batch file data generation for testing of any data schema.\n",
"- [Ingest and query data from Apache Kafka](../02-ingestion/01-streaming-from-kafka.ipynb) walks you through ingesting an event stream from Kafka."
]
},

View File

@ -445,7 +445,7 @@
"metadata": {},
"outputs": [],
"source": [
"sql_client.run_task(sql)"
"display.run_task(sql)"
]
},
{
@ -473,7 +473,7 @@
"id": "11d9c95a",
"metadata": {},
"source": [
"`describe_table()` lists the columns in a table."
"`display.table(<table_name>)` lists the columns in a table."
]
},
{

View File

@ -0,0 +1,642 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "9e07b3f5-d919-4179-91a1-0f6b66c42757",
"metadata": {},
"source": [
"# Data Generator Server\n",
"<!--\n",
" ~ Licensed to the Apache Software Foundation (ASF) under one\n",
" ~ or more contributor license agreements. See the NOTICE file\n",
" ~ distributed with this work for additional information\n",
" ~ regarding copyright ownership. The ASF licenses this file\n",
" ~ to you under the Apache License, Version 2.0 (the\n",
" ~ \"License\"); you may not use this file except in compliance\n",
" ~ with the License. You may obtain a copy of the License at\n",
" ~\n",
" ~ http://www.apache.org/licenses/LICENSE-2.0\n",
" ~\n",
" ~ Unless required by applicable law or agreed to in writing,\n",
" ~ software distributed under the License is distributed on an\n",
" ~ \"AS IS\" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY\n",
" ~ KIND, either express or implied. See the License for the\n",
" ~ specific language governing permissions and limitations\n",
" ~ under the License.\n",
" -->\n",
"The default Docker Compose deployment includes a data generation service created from the published Docker image at `imply/datagen:latest`. \n",
"This image is built by the project https://github.com/implydata/druid-datagenerator. \n",
"\n",
"This notebook shows you how to use the data generation service included in the Docker Compose deployment. It explains how to use predefined data generator configurations as well as how to build a custom data generator. You will also learn how to create sample data files for batch ingestion and how to generate live streaming data for streaming ingestion.\n",
"\n",
"## Table of contents\n",
"\n",
"* [Initialization](#Initialization)\n",
"* [List available configurations](#List-available-configurations)\n",
"* [Generate a data file for backfilling history](#Generate-a-data-file-for-backfilling-history)\n",
"* [Batch ingestion of generated files](#Batch-ingestion-of-generated-files)\n",
"* [Generate custom data](#Generate-custom-data)\n",
"* [Stream generated data](#Stream-generated-data)\n",
"* [Ingest data from a stream](#Ingest-data-from-a-stream)\n",
"* [Cleanup](#Cleanup)\n",
"\n",
"\n",
"## Initialization\n",
"\n",
"To interact with the data generation service, use the REST client provided in the [`druidapi` Python package](https://druid.apache.org/docs/latest/tutorials/tutorial-jupyter-index.html#python-api-for-druid)."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f84766c7-c6a5-4496-91a3-abdb8ddd2375",
"metadata": {},
"outputs": [],
"source": [
"import druidapi\n",
"import os\n",
"import time\n",
"\n",
"# Datagen client \n",
"datagen = druidapi.rest.DruidRestClient(\"http://datagen:9999\")\n",
"\n",
"if (os.environ['DRUID_HOST'] == None):\n",
" druid_host=f\"http://router:8888\"\n",
"else:\n",
" druid_host=f\"http://{os.environ['DRUID_HOST']}:8888\"\n",
"\n",
"# Druid client\n",
"druid = druidapi.jupyter_client(druid_host)\n",
"\n",
"\n",
"\n",
"# these imports and constants are used by multiple cells\n",
"from datetime import datetime, timedelta\n",
"import json\n",
"\n",
"headers = {\n",
" 'Content-Type': 'application/json'\n",
"}"
]
},
{
"cell_type": "markdown",
"id": "c54af617-0998-4010-90c3-9b5a38a09a5f",
"metadata": {},
"source": [
"### List available configurations\n",
"Use the `/list` API endpoint to get the data generator's available configuration values with predefined data generator schemas."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1ba6a80a-c49b-4abf-943b-9dad82f2ae13",
"metadata": {},
"outputs": [],
"source": [
"display(datagen.get(f\"/list\", require_ok=False).json())"
]
},
{
"cell_type": "markdown",
"id": "ae88a3b7-60da-405d-bcf4-fb4affcfe973",
"metadata": {},
"source": [
"### Generate a data file for backfilling history\n",
"When generating a file for backfill purposes, you can select the start time and the duration of the simulation.\n",
"\n",
"Configure the data generator request as follows:\n",
"* `name`: an arbitrary name you assign to the job. Refer to the job name to get the job status or to stop the job.\n",
"* `target.type`: \"file\" to generate a data file\n",
"* `target.path`: identifies the name of the file to generate. The data generator ignores any path specified and creates the file in the current working directory.\n",
"* `time_type`,`time`: The data generator simulates the time range you specify with a start timestamp in the `time_type` property and a duration in the `time` property. To specify `time`, use the `h` suffix for hours, `m` for minutes, and `s` for seconds.\n",
"- `concurrency` indicates the maximum number of entities used concurrently to generate events. Each entity is a separate state machine that simulates things like user sessions, IoT devices, or other concurrent sources of event data.\n",
"\n",
"The following example uses the `clickstream.json` predefined configuration to generate data into a file called `clicks.json`. The data generator starts the sample data at one hour prior to the current time and simulates events for a duration of one hour. Since it is simulated, it does this in just a few seconds."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "811ff58f-75af-4092-a08d-5e07a51592ff",
"metadata": {},
"outputs": [],
"source": [
"# Configure the start time to one hour prior to the current time. \n",
"startDateTime = (datetime.now() - timedelta(hours = 1)).strftime('%Y-%m-%dT%H:%M:%S.001')\n",
"print(f\"Starting to generate history at {startDateTime}.\")\n",
"\n",
"# Give the datagen job a name for use in subsequent API calls\n",
"job_name=\"gen_clickstream1\"\n",
"\n",
"# Generate a data file on the datagen server\n",
"datagen_request = {\n",
" \"name\": job_name,\n",
" \"target\": { \"type\": \"file\", \"path\":\"clicks.json\"},\n",
" \"config_file\": \"clickstream/clickstream.json\", \n",
" \"time_type\": startDateTime,\n",
" \"time\": \"1h\",\n",
" \"concurrency\":100\n",
"}\n",
"response = datagen.post(\"/start\", json.dumps(datagen_request), headers=headers, require_ok=False)\n",
"response.json()"
]
},
{
"cell_type": "markdown",
"id": "d407d1d9-3f01-4128-a014-6a5f371c25a5",
"metadata": {},
"source": [
"#### Display jobs\n",
"Use the `/jobs` API endpoint to get the current jobs and job statuses."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3de698c5-bcf4-40c7-b295-728fb54d1f0a",
"metadata": {},
"outputs": [],
"source": [
"display(datagen.get(f\"/jobs\").json())"
]
},
{
"cell_type": "markdown",
"id": "972ebed0-34a1-4ad2-909d-69b8b27c3046",
"metadata": {},
"source": [
"#### Get status of a job\n",
"Use the `/status/JOB_NAME` API endpoint to get the current jobs and their status."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "debce4f8-9c16-476c-9593-21ec984985d2",
"metadata": {},
"outputs": [],
"source": [
"display(datagen.get(f\"/status/{job_name}\", require_ok=False).json())"
]
},
{
"cell_type": "markdown",
"id": "ef818d78-6aa6-4d38-8a43-83416aede96f",
"metadata": {},
"source": [
"#### Stop a job\n",
"Use the `/stop/JOB_NAME` API endpoint to stop a job."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7631b8b8-d3d6-4803-9162-587f440d2ef2",
"metadata": {},
"outputs": [],
"source": [
"display(datagen.post(f\"/stop/{job_name}\", '').json())"
]
},
{
"cell_type": "markdown",
"id": "0a8dc7d3-64e5-41e3-8c28-c5f19c0536f5",
"metadata": {},
"source": [
"#### List files created on datagen server\n",
"Use the `/files` API endpoint to list files available on the server."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "06ee36bd-2d2b-4904-9987-10636cf52aac",
"metadata": {},
"outputs": [],
"source": [
"display(datagen.get(f\"/files\", '').json())"
]
},
{
"cell_type": "markdown",
"id": "83ef9edb-98e2-45b4-88e8-578703faedc1",
"metadata": {},
"source": [
"### Batch ingestion of generated files\n",
"Use a [Druid HTTP input source](https://druid.apache.org/docs/latest/ingestion/native-batch-input-sources.html#http-input-source) in the [EXTERN function](https://druid.apache.org/docs/latest/multi-stage-query/reference.html#extern-function) of a [SQL-based ingestion](https://druid.apache.org/docs/latest/multi-stage-query/index.html) to load generated files.\n",
"You can access files by name from within Druid using the URI `http://datagen:9999/file/FILE_NAME`. Alternatively, if you run Druid outside of Docker but on the same machine, access the file with `http://localhost:9999/file/FILE_NAME`.\n",
"The following example assumes that both Druid and the data generator server are running in Docker Compose."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0d72b015-f8ec-4713-b6f2-fe7a15afff59",
"metadata": {},
"outputs": [],
"source": [
"sql = '''\n",
"REPLACE INTO \"clicks\" OVERWRITE ALL\n",
"WITH \"ext\" AS (SELECT *\n",
"FROM TABLE(\n",
" EXTERN(\n",
" '{\"type\":\"http\",\"uris\":[\"http://datagen:9999/file/clicks.json\"]}',\n",
" '{\"type\":\"json\"}'\n",
" )\n",
") EXTEND (\"time\" VARCHAR, \"user_id\" VARCHAR, \"event_type\" VARCHAR, \"client_ip\" VARCHAR, \"client_device\" VARCHAR, \"client_lang\" VARCHAR, \"client_country\" VARCHAR, \"referrer\" VARCHAR, \"keyword\" VARCHAR, \"product\" VARCHAR))\n",
"SELECT\n",
" TIME_PARSE(\"time\") AS \"__time\",\n",
" \"user_id\",\n",
" \"event_type\",\n",
" \"client_ip\",\n",
" \"client_device\",\n",
" \"client_lang\",\n",
" \"client_country\",\n",
" \"referrer\",\n",
" \"keyword\",\n",
" \"product\"\n",
"FROM \"ext\"\n",
"PARTITIONED BY DAY\n",
"''' \n",
"\n",
"druid.display.run_task(sql)\n",
"print(\"Waiting for segment avaialbility ...\")\n",
"druid.sql.wait_until_ready('clicks')\n",
"print(\"Data is available for query.\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b0997b38-02c2-483e-bd15-439c4bf0097a",
"metadata": {},
"outputs": [],
"source": [
"sql = '''\n",
"SELECT \"event_type\", \"user_id\", count( DISTINCT \"client_ip\") ip_count\n",
"FROM \"clicks\"\n",
"GROUP BY 1,2\n",
"ORDER BY 3 DESC\n",
"LIMIT 10\n",
"'''\n",
"druid.display.sql(sql)"
]
},
{
"cell_type": "markdown",
"id": "66ec013f-28e4-4d5a-94a6-06e0ed537b4e",
"metadata": {},
"source": [
"## Generate custom data\n",
"\n",
"You can find the full set of configuration options for the data generator in the [README](https://github.com/implydata/druid-datagenerator#data-generator-configuration).\n",
"\n",
"This section demonstrates a simple custom configuration as an example. Notice that the emitter defined the schema as a list of dimensions, each dimension specifies how its values are generated: "
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d6451310-b7dd-4b39-a23b-7b735b152d6c",
"metadata": {},
"outputs": [],
"source": [
"gen_config = {\n",
" \"emitters\": [\n",
" {\n",
" \"name\": \"simple_record\",\n",
" \"dimensions\": [\n",
" {\n",
" \"type\": \"string\",\n",
" \"name\": \"random_string_column\",\n",
" \"length_distribution\": {\n",
" \"type\": \"constant\",\n",
" \"value\": 13\n",
" },\n",
" \"cardinality\": 0,\n",
" \"chars\": \"#.abcdefghijklmnopqrstuvwxyz\"\n",
" },\n",
" {\n",
" \"type\": \"int\",\n",
" \"name\": \"distributed_number\",\n",
" \"distribution\": {\n",
" \"type\": \"uniform\",\n",
" \"min\": 0,\n",
" \"max\": 1000\n",
" },\n",
" \"cardinality\": 10,\n",
" \"cardinality_distribution\": {\n",
" \"type\": \"exponential\",\n",
" \"mean\": 5\n",
" }\n",
" }\n",
" ]\n",
" }\n",
" ],\n",
" \"interarrival\": {\n",
" \"type\": \"constant\",\n",
" \"value\": 1\n",
" },\n",
" \"states\": [\n",
" {\n",
" \"name\": \"state_1\",\n",
" \"emitter\": \"simple_record\",\n",
" \"delay\": {\n",
" \"type\": \"constant\",\n",
" \"value\": 1\n",
" },\n",
" \"transitions\": [\n",
" {\n",
" \"next\": \"state_1\",\n",
" \"probability\": 1.0\n",
" }\n",
" ]\n",
" }\n",
" ]\n",
"}\n",
"\n",
"target = { \"type\":\"file\", \"path\":\"sample_data.json\"}"
]
},
{
"cell_type": "markdown",
"id": "89a22645-aea5-4c15-b81a-959b27df731f",
"metadata": {},
"source": [
"This example uses the `config` attribute of the request to configure a new custom data generator instead of using a predefined `config_file`."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e5e5c535-3474-42b4-9772-14279e712f3d",
"metadata": {},
"outputs": [],
"source": [
"# generate 1 hour of simulated time using custom configuration\n",
"datagen_request = {\n",
" \"name\": \"sample_custom\",\n",
" \"target\": target,\n",
" \"config\": gen_config, \n",
" \"time\": \"1h\",\n",
" \"concurrency\":10,\n",
" \"time_type\": \"SIM\"\n",
"}\n",
"response = datagen.post(\"/start\", json.dumps(datagen_request), headers=headers, require_ok=False)\n",
"response.json()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "952386f7-8181-4325-972b-5f30dc12cf21",
"metadata": {},
"outputs": [],
"source": [
"display(datagen.get(f\"/jobs\", require_ok=False).json())"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "470b3a2a-4fd9-45a2-9221-497d906f62a9",
"metadata": {},
"outputs": [],
"source": [
"# display the first 1k characters of the generated data file\n",
"display( datagen.get(f\"/file/sample_data.json\").content[:1024])"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "350faea6-55b0-4386-830c-5160ae495012",
"metadata": {},
"outputs": [],
"source": [
"datagen.post(f\"/stop/sample_custom\",'')"
]
},
{
"cell_type": "markdown",
"id": "77bff054-0f16-4fd5-8ade-2d44b30d0cf2",
"metadata": {},
"source": [
"## Stream generated data\n",
"\n",
"The data generator works exactly the same whether it is writing data to a file or publishing messages into a stream. You only need to change the target configuration.\n",
"\n",
"To use the Kafka container running on Docker Compose, use the host name `kafka:9092`. This tutorial uses the KAFKA_HOST environment variable from Docker Compose to specify the Kafka host. "
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9959b7c3-6223-479d-b0c2-115a1c555090",
"metadata": {},
"outputs": [],
"source": [
"if (os.environ['KAFKA_HOST'] == None):\n",
" kafka_host=f\"kafka:9092\"\n",
"else:\n",
" kafka_host=f\"{os.environ['KAFKA_HOST']}:9092\""
]
},
{
"cell_type": "markdown",
"id": "497abc18-6538-4536-a17f-fe10c4367611",
"metadata": {},
"source": [
"The simplest `target` object for Kafka and, similarly, Confluent is:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "686a74ab-e2dd-458e-9e93-10291064e9db",
"metadata": {},
"outputs": [],
"source": [
"target = {\n",
" \"type\":\"kafka\",\n",
" \"endpoint\": kafka_host,\n",
" \"topic\": \"custom_data\"\n",
"}\n",
"\n",
"# Generate 1 hour of real time using custom configuration, this means that this stream will run for an hour if not stopped\n",
"datagen_request = {\n",
" \"name\": \"sample_custom\",\n",
" \"target\": target,\n",
" \"config\": gen_config, \n",
" \"time\": \"1h\",\n",
" \"concurrency\":10,\n",
" \"time_type\": \"REAL\"\n",
"}\n",
"response = datagen.post(\"/start\", json.dumps(datagen_request), headers=headers, require_ok=False)\n",
"response.json()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ec17d0c7-a3ab-4f37-bbf0-cc02bff44cf1",
"metadata": {},
"outputs": [],
"source": [
"time.sleep(1) # avoid race condition of async job start\n",
"display(datagen.get(f\"/jobs\", require_ok=False).json())"
]
},
{
"cell_type": "markdown",
"id": "84d7b706-9040-4a69-a956-1b1bbb037c32",
"metadata": {},
"source": [
"### Ingest data from a stream \n",
"This example shows how to start a streaming ingestion supervisor in Apache Druid to consume your custom data:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "51912409-e4e7-48d1-b3a5-b269622b4e56",
"metadata": {},
"outputs": [],
"source": [
"ingestion_spec ={\n",
" \"type\": \"kafka\",\n",
" \"spec\": {\n",
" \"ioConfig\": {\n",
" \"type\": \"kafka\",\n",
" \"consumerProperties\": {\n",
" \"bootstrap.servers\": \"kafka:9092\"\n",
" },\n",
" \"topic\": \"custom_data\",\n",
" \"inputFormat\": {\n",
" \"type\": \"json\"\n",
" },\n",
" \"useEarliestOffset\": True\n",
" },\n",
" \"tuningConfig\": {\n",
" \"type\": \"kafka\",\n",
" \"maxRowsInMemory\": 100000,\n",
" \"resetOffsetAutomatically\": False\n",
" },\n",
" \"dataSchema\": {\n",
" \"dataSource\": \"custom_data\",\n",
" \"timestampSpec\": {\n",
" \"column\": \"time\",\n",
" \"format\": \"iso\"\n",
" },\n",
" \"dimensionsSpec\": {\n",
" \"dimensions\": [\n",
" \"random_string_column\",\n",
" {\n",
" \"type\": \"long\",\n",
" \"name\": \"distributed_number\"\n",
" }\n",
" ]\n",
" },\n",
" \"granularitySpec\": {\n",
" \"queryGranularity\": \"none\",\n",
" \"rollup\": False,\n",
" \"segmentGranularity\": \"hour\"\n",
" }\n",
" }\n",
" }\n",
"}\n",
"\n",
"headers = {\n",
" 'Content-Type': 'application/json'\n",
"}\n",
"\n",
"druid.rest.post(\"/druid/indexer/v1/supervisor\", json.dumps(ingestion_spec), headers=headers)"
]
},
{
"cell_type": "markdown",
"id": "dddfb1cc-f863-4bf4-8c5a-b261b0b9c2f0",
"metadata": {},
"source": [
"Query the data on the stream, but first wait for its availability. It takes a bit of time for the streaming tasks to start, but once they are consuming you can see data very close to real time: Run the following cell multiple times to see how the data is changing:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7e1284ed-5c49-4f37-81f7-c3b720473158",
"metadata": {},
"outputs": [],
"source": [
"druid.sql.wait_until_ready('custom_data', verify_load_status=False)\n",
"druid.display.sql('''\n",
"SELECT SUM(distributed_number) sum_randoms, count(*) total_count\n",
"FROM custom_data\n",
"''')"
]
},
{
"cell_type": "markdown",
"id": "4486e430-0776-46ad-8a8b-4f0354f17bfb",
"metadata": {},
"source": [
"### Cleanup\n",
"\n",
"Stop the streaming ingestion and the streaming producer:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "38943a92-dc23-41cf-91a4-1b68d2178033",
"metadata": {},
"outputs": [],
"source": [
"print(f\"Stop streaming generator: [{datagen.post('/stop/sample_custom','',require_ok=False)}]\")\n",
"print(f'Reset offsets for streaming ingestion: [{druid.rest.post(\"/druid/indexer/v1/supervisor/custom_data/reset\",\"\", require_ok=False)}]')\n",
"print(f'Stop streaming ingestion: [{druid.rest.post(\"/druid/indexer/v1/supervisor/custom_data/terminate\",\"\", require_ok=False)}]')"
]
},
{
"cell_type": "markdown",
"id": "0cf53bdc-de7f-425d-84b1-68d0cef420d8",
"metadata": {},
"source": [
"Wait for streaming ingestion to complete and then remove the custom data table:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "87341e7c-f7ab-488c-9913-091f712534cb",
"metadata": {},
"outputs": [],
"source": [
"print(f\"Drop datasource: [{druid.datasources.drop('custom_data')}]\")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.4"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

View File

@ -4,7 +4,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"Ingest and query data from Apache Kafka\n",
"# Ingest and query data from Apache Kafka\n",
"\n",
"<!--\n",
" ~ Licensed to the Apache Software Foundation (ASF) under one\n",
@ -60,9 +60,10 @@
" * Update the `rest_client` variable to point to your Coordinator endpoint. For example, `\"http://localhost:8081\"`.\n",
"* A running Kafka cluster.\n",
" * Update the Kafka bootstrap servers to point to your servers. For example, `bootstrap_servers=[\"localhost:9092\"]`.\n",
"* A running [Data Generator server](https://github.com/implydata/druid-datagenerator) accessible to the cluster.\n",
" * Update the data generator client. For example `datagen = druidapi.rest.DruidRestClient(\"http://localhost:9999\")`.\n",
"* The following Python packages:\n",
" * `druidapi`, a Python client for Apache Druid\n",
" * `DruidDataDriver`, a data generator\n",
" * `kafka`, a Python client for Apache Kafka\n",
" * `pandas`, `matplotlib`, and `seaborn` for data visualization\n"
]
@ -88,36 +89,16 @@
"outputs": [],
"source": [
"import druidapi\n",
"import json\n",
"import os\n",
"import time\n",
"\n",
"# druid_host is the hostname and port for your Druid deployment. \n",
"# In the Docker Compose tutorial environment, this is the Router\n",
"# service running at \"http://router:8888\".\n",
"# If you are not using the Docker Compose environment, edit the `druid_host`.\n",
"\n",
"druid_host = \"http://router:8888\"\n",
"druid_host\n",
"\n",
"druid = druidapi.jupyter_client(druid_host)\n",
"display = druid.display\n",
"sql_client = druid.sql\n",
"\n",
"# Create a rest client for native JSON ingestion for streaming data\n",
"rest_client = druidapi.rest.DruidRestClient(\"http://coordinator:8081\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Create Kafka topic"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"This notebook relies on the Python client for the Apache Kafka. Import the Kafka producer and consumer modules, then create a Kafka client. You use the Kafka producer to create and publish records to a new topic named `social_media`."
"if 'DRUID_HOST' not in os.environ.keys():\n",
" druid_host=f\"http://localhost:8888\"\n",
"else:\n",
" druid_host=f\"http://{os.environ['DRUID_HOST']}:8888\"\n",
" \n",
"print(f\"Opening a connection to {druid_host}.\")\n",
"druid = druidapi.jupyter_client(druid_host)"
]
},
{
@ -126,19 +107,55 @@
"metadata": {},
"outputs": [],
"source": [
"from kafka import KafkaProducer\n",
"from kafka import KafkaConsumer\n",
"# Use kafka_host variable when connecting to kafka \n",
"if 'KAFKA_HOST' not in os.environ.keys():\n",
" kafka_host=f\"http://localhost:9092\"\n",
"else:\n",
" kafka_host=f\"{os.environ['KAFKA_HOST']}:9092\"\n",
"\n",
"# Kafka runs on kafka:9092 in multi-container tutorial application\n",
"producer = KafkaProducer(bootstrap_servers='kafka:9092')\n",
"# this is the kafka topic we will be working with:\n",
"topic_name = \"social_media\""
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import json\n",
"\n",
"# shortcuts for display and sql api's\n",
"display = druid.display\n",
"sql_client = druid.sql\n",
"\n",
"# client for Data Generator API\n",
"datagen = druidapi.rest.DruidRestClient(\"http://datagen:9999\")\n",
"\n",
"# client for Druid API\n",
"rest_client = druid.rest"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Create the `social_media` topic and send a sample event. The `send()` command returns a metadata descriptor for the record."
"## Publish generated data directly to Kafka topic"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"In this section, you use the data generator included as part of the Docker application to generate a stream of messages. The data generator creates and send messages to a Kafka topic named `social_media`. To learn more about the Druid Data Generator, see the [project](https://github.com/implydata/druid-datagenerator) and the [data generation notebook](../01-introduction/02-datagen-intro.ipynb)."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Generate data\n",
"Run the following cells to load sample data into the `social_media` Kafka topic. The data generator sends events until it reaches 50,000 messages."
]
},
{
@ -147,24 +164,25 @@
"metadata": {},
"outputs": [],
"source": [
"event = {\n",
" \"__time\": \"2023-01-03T16:40:21.501\",\n",
" \"username\": \"willow\",\n",
" \"post_title\": \"This title is required\",\n",
" \"views\": 15284,\n",
" \"upvotes\": 124,\n",
" \"comments\": 21,\n",
" \"edited\": \"True\"\n",
"headers = {\n",
" 'Content-Type': 'application/json'\n",
"}\n",
"\n",
"producer.send(topic_name, json.dumps(event).encode('utf-8'))"
"datagen_request = {\n",
" \"name\": \"social_stream\",\n",
" \"target\": { \"type\": \"kafka\", \"endpoint\": kafka_host, \"topic\": topic_name },\n",
" \"config_file\": \"social/social_posts.json\", \n",
" \"total_events\":50000,\n",
" \"concurrency\":100\n",
"}\n",
"datagen.post(\"/start\", json.dumps(datagen_request), headers=headers)\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"To verify that the Kafka topic stored the event, create a consumer client to read records from the Kafka cluster, and get the next (only) message:"
"Check the status of the job with the following cell:"
]
},
{
@ -173,59 +191,9 @@
"metadata": {},
"outputs": [],
"source": [
"consumer = KafkaConsumer(topic_name, bootstrap_servers=['kafka:9092'], auto_offset_reset='earliest',\n",
" enable_auto_commit=True)\n",
"\n",
"print(next(consumer).value.decode('utf-8'))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Load data into Kafka topic"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Instead of manually creating events to send to the Kafka topic, use a data generator to simulate a continuous data stream. This tutorial makes use of Druid Data Driver to simulate a continuous data stream into the `social_media` Kafka topic. To learn more about the Druid Data Driver, see the Druid Summit talk, [Generating Time centric Data for Apache Druid](https://www.youtube.com/watch?v=3zAOeLe3iAo).\n",
"\n",
"In this notebook, you use a background process to continuously load data into the Kafka topic.\n",
"This allows you to keep executing commands in this notebook while data is constantly being streamed into the topic."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Run the following cells to load sample data into the `social_media` Kafka topic:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import multiprocessing as mp\n",
"from datetime import datetime\n",
"import DruidDataDriver"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def run_driver():\n",
" DruidDataDriver.simulate(\"kafka_docker_config.json\", None, None, \"REAL\", datetime.now())\n",
" \n",
"mp.set_start_method('fork')\n",
"ps = mp.Process(target=run_driver)\n",
"ps.start()"
"time.sleep(1) # avoid race between start of the job and its status being available\n",
"response = datagen.get('/status/social_stream')\n",
"response.json()"
]
},
{
@ -258,16 +226,56 @@
"metadata": {},
"outputs": [],
"source": [
"kafka_ingestion_spec = \"{\\\"type\\\": \\\"kafka\\\",\\\"spec\\\": {\\\"ioConfig\\\": {\\\"type\\\": \\\"kafka\\\",\\\"consumerProperties\\\": {\\\"bootstrap.servers\\\": \\\"kafka:9092\\\"},\\\"topic\\\": \\\"social_media\\\",\\\"inputFormat\\\": {\\\"type\\\": \\\"json\\\"},\\\"useEarliestOffset\\\": true},\\\"tuningConfig\\\": {\\\"type\\\": \\\"kafka\\\"},\\\"dataSchema\\\": {\\\"dataSource\\\": \\\"social_media\\\",\\\"timestampSpec\\\": {\\\"column\\\": \\\"__time\\\",\\\"format\\\": \\\"iso\\\"},\\\"dimensionsSpec\\\": {\\\"dimensions\\\": [\\\"username\\\",\\\"post_title\\\",{\\\"type\\\": \\\"long\\\",\\\"name\\\": \\\"views\\\"},{\\\"type\\\": \\\"long\\\",\\\"name\\\": \\\"upvotes\\\"},{\\\"type\\\": \\\"long\\\",\\\"name\\\": \\\"comments\\\"},\\\"edited\\\"]},\\\"granularitySpec\\\": {\\\"queryGranularity\\\": \\\"none\\\",\\\"rollup\\\": false,\\\"segmentGranularity\\\": \\\"hour\\\"}}}}\""
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"print(json.dumps(json.loads(kafka_ingestion_spec), indent=4))"
"kafka_ingestion_spec = {\n",
" \"type\": \"kafka\",\n",
" \"spec\": {\n",
" \"ioConfig\": {\n",
" \"type\": \"kafka\",\n",
" \"consumerProperties\": {\n",
" \"bootstrap.servers\": \"kafka:9092\"\n",
" },\n",
" \"topic\": \"social_media\",\n",
" \"inputFormat\": {\n",
" \"type\": \"json\"\n",
" },\n",
" \"useEarliestOffset\": True\n",
" },\n",
" \"tuningConfig\": {\n",
" \"type\": \"kafka\"\n",
" },\n",
" \"dataSchema\": {\n",
" \"dataSource\": \"social_media\",\n",
" \"timestampSpec\": {\n",
" \"column\": \"time\",\n",
" \"format\": \"iso\"\n",
" },\n",
" \"dimensionsSpec\": {\n",
" \"dimensions\": [\n",
" \"username\",\n",
" \"post_title\",\n",
" {\n",
" \"type\": \"long\",\n",
" \"name\": \"views\"\n",
" },\n",
" {\n",
" \"type\": \"long\",\n",
" \"name\": \"upvotes\"\n",
" },\n",
" {\n",
" \"type\": \"long\",\n",
" \"name\": \"comments\"\n",
" },\n",
" \"edited\"\n",
" ]\n",
" },\n",
" \"granularitySpec\": {\n",
" \"queryGranularity\": \"none\",\n",
" \"rollup\": False,\n",
" \"segmentGranularity\": \"hour\"\n",
" }\n",
" }\n",
" }\n",
"}"
]
},
{
@ -287,14 +295,26 @@
" 'Content-Type': 'application/json'\n",
"}\n",
"\n",
"rest_client.post(\"/druid/indexer/v1/supervisor\", kafka_ingestion_spec, headers=headers)"
"supervisor = rest_client.post(\"/druid/indexer/v1/supervisor\", json.dumps(kafka_ingestion_spec), headers=headers)\n",
"print(supervisor.status_code)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"A `200` response indicates that the request was successful. You can view the running ingestion task and the new datasource in the web console at http://localhost:8888/unified-console.html."
"A `200` response indicates that the request was successful. You can view the running ingestion task and the new datasource in the web console's [ingestion view](http://localhost:8888/unified-console.html#ingestion).\n",
"\n",
"The following cell pauses further execution until the ingestion has started and the datasource is available for querying:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"druid.sql.wait_until_ready('social_media', verify_load_status=False)"
]
},
{
@ -496,8 +516,49 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"This plot shows how some users maintain relatively consistent social media impact between the two query snapshots, whereas other users grow or decline in their influence.\n",
"\n",
"This plot shows how some users maintain relatively consistent social media impact between the two query snapshots, whereas other users grow or decline in their influence."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Cleanup \n",
"The following cells stop the data generation and ingestion jobs and removes the datasource from Druid."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"print(f\"Stop streaming generator: [{datagen.post('/stop/social_stream','',require_ok=False)}]\")\n",
"print(f'Reset offsets for ingestion: [{druid.rest.post(\"/druid/indexer/v1/supervisor/social_media/reset\",\"\", require_ok=False)}]')\n",
"print(f'Stop streaming ingestion: [{druid.rest.post(\"/druid/indexer/v1/supervisor/social_media/terminate\",\"\", require_ok=False)}]')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Once the ingestion process ends and completes any final ingestion steps, remove the datasource with the following cell:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"time.sleep(5) # wait for streaming ingestion tasks to end\n",
"print(f\"Drop datasource: [{druid.datasources.drop('social_media')}]\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Learn more\n",
"\n",
"This tutorial showed you how to create a Kafka topic using a Python client for Kafka, send a simulated stream of data to Kafka using a data generator, and query and visualize results over time. For more information, see the following resources:\n",

View File

@ -1,90 +0,0 @@
{
"target": {
"type": "kafka",
"endpoint": "kafka:9092",
"topic": "social_media"
},
"emitters": [
{
"name": "example_record_1",
"dimensions": [
{
"type": "enum",
"name": "username",
"values": ["willow", "mia", "leon", "milton", "miette", "gus", "jojo", "rocket"],
"cardinality_distribution": {
"type": "uniform",
"min": 0,
"max": 7
}
},
{
"type": "string",
"name": "post_title",
"length_distribution": {"type": "uniform", "min": 1, "max": 140},
"cardinality": 0,
"chars": "abcdefghijklmnopqrstuvwxyz0123456789_ABCDEFGHIJKLMNOPQRSTUVWXYZ!';:,."
},
{
"type": "int",
"name": "views",
"distribution": {
"type": "exponential",
"mean": 10000
},
"cardinality": 0
},
{
"type": "int",
"name": "upvotes",
"distribution": {
"type": "normal",
"mean": 70,
"stddev": 20
},
"cardinality": 0
},
{
"type": "int",
"name": "comments",
"distribution": {
"type": "normal",
"mean": 10,
"stddev": 5
},
"cardinality": 0
},
{
"type": "enum",
"name": "edited",
"values": ["True","False"],
"cardinality_distribution": {
"type": "uniform",
"min": 0,
"max": 1
}
}
]
}
],
"interarrival": {
"type": "constant",
"value": 1
},
"states": [
{
"name": "state_1",
"emitter": "example_record_1",
"delay": {
"type": "constant",
"value": 1
},
"transitions": [
{
"next": "state_1",
"probability": 1.0
}
]
}
]
}

View File

@ -14,7 +14,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# Creates pseudo distributed hadoop 2.8.5 with java 8
# Creates pseudo distributed hadoop 3.3.6 with java 8
FROM centos:7
USER root
@ -56,32 +56,40 @@ ENV PATH $PATH:$JAVA_HOME/bin
# hadoop
ARG APACHE_ARCHIVE_MIRROR_HOST=https://archive.apache.org
RUN curl -s ${APACHE_ARCHIVE_MIRROR_HOST}/dist/hadoop/core/hadoop-2.8.5/hadoop-2.8.5.tar.gz | tar -xz -C /usr/local/
RUN cd /usr/local && ln -s ./hadoop-2.8.5 hadoop
RUN curl -s ${APACHE_ARCHIVE_MIRROR_HOST}/dist/hadoop/core/hadoop-3.3.6/hadoop-3.3.6.tar.gz | tar -xz -C /usr/local/
RUN cd /usr/local && ln -s ./hadoop-3.3.6 hadoop
ENV HADOOP_PREFIX /usr/local/hadoop
ENV HADOOP_HOME /usr/local/hadoop
ENV HADOOP_COMMON_HOME /usr/local/hadoop
ENV HADOOP_HDFS_HOME /usr/local/hadoop
ENV HADOOP_MAPRED_HOME /usr/local/hadoop
ENV HADOOP_YARN_HOME /usr/local/hadoop
ENV HADOOP_CONF_DIR /usr/local/hadoop/etc/hadoop
ENV YARN_CONF_DIR $HADOOP_PREFIX/etc/hadoop
ENV YARN_CONF_DIR $HADOOP_HOME/etc/hadoop
RUN sed -i '/^export JAVA_HOME/ s:.*:export JAVA_HOME=/usr/lib/jvm/zulu8\nexport HADOOP_PREFIX=/usr/local/hadoop\nexport HADOOP_HOME=/usr/local/hadoop\n:' $HADOOP_PREFIX/etc/hadoop/hadoop-env.sh
RUN sed -i '/^export HADOOP_CONF_DIR/ s:.*:export HADOOP_CONF_DIR=/usr/local/hadoop/etc/hadoop/:' $HADOOP_PREFIX/etc/hadoop/hadoop-env.sh
# in hadoop 3 the example file is nearly empty so we can just append stuff
RUN sed -i '$ a export JAVA_HOME=/usr/lib/jvm/zulu8' $HADOOP_HOME/etc/hadoop/hadoop-env.sh
RUN sed -i '$ a export HADOOP_HOME=/usr/local/hadoop' $HADOOP_HOME/etc/hadoop/hadoop-env.sh
RUN sed -i '$ a export HADOOP_CONF_DIR=/usr/local/hadoop/etc/hadoop/' $HADOOP_HOME/etc/hadoop/hadoop-env.sh
RUN sed -i '$ a export HDFS_NAMENODE_USER=root' $HADOOP_HOME/etc/hadoop/hadoop-env.sh
RUN sed -i '$ a export HDFS_DATANODE_USER=root' $HADOOP_HOME/etc/hadoop/hadoop-env.sh
RUN sed -i '$ a export HDFS_SECONDARYNAMENODE_USER=root' $HADOOP_HOME/etc/hadoop/hadoop-env.sh
RUN sed -i '$ a export YARN_RESOURCEMANAGER_USER=root' $HADOOP_HOME/etc/hadoop/hadoop-env.sh
RUN sed -i '$ a export YARN_NODEMANAGER_USER=root' $HADOOP_HOME/etc/hadoop/hadoop-env.sh
RUN mkdir $HADOOP_PREFIX/input
RUN cp $HADOOP_PREFIX/etc/hadoop/*.xml $HADOOP_PREFIX/input
RUN cat $HADOOP_HOME/etc/hadoop/hadoop-env.sh
RUN mkdir $HADOOP_HOME/input
RUN cp $HADOOP_HOME/etc/hadoop/*.xml $HADOOP_HOME/input
# pseudo distributed
ADD core-site.xml.template $HADOOP_PREFIX/etc/hadoop/core-site.xml.template
ADD core-site.xml.template $HADOOP_HOME/etc/hadoop/core-site.xml.template
RUN sed s/HOSTNAME/localhost/ /usr/local/hadoop/etc/hadoop/core-site.xml.template > /usr/local/hadoop/etc/hadoop/core-site.xml
ADD hdfs-site.xml $HADOOP_PREFIX/etc/hadoop/hdfs-site.xml
ADD hdfs-site.xml $HADOOP_HOME/etc/hadoop/hdfs-site.xml
ADD mapred-site.xml $HADOOP_HOME/etc/hadoop/mapred-site.xml
ADD yarn-site.xml $HADOOP_HOME/etc/hadoop/yarn-site.xml
ADD mapred-site.xml $HADOOP_PREFIX/etc/hadoop/mapred-site.xml
ADD yarn-site.xml $HADOOP_PREFIX/etc/hadoop/yarn-site.xml
RUN $HADOOP_PREFIX/bin/hdfs namenode -format
RUN $HADOOP_HOME/bin/hdfs namenode -format
ADD ssh_config /root/.ssh/config
RUN chmod 600 /root/.ssh/config
@ -120,16 +128,16 @@ RUN echo -e \
/usr/local/bin/start_sshd && \
chmod a+x /usr/local/bin/start_sshd
RUN start_sshd && $HADOOP_PREFIX/etc/hadoop/hadoop-env.sh && $HADOOP_PREFIX/sbin/start-dfs.sh && $HADOOP_PREFIX/bin/hdfs dfs -mkdir -p /user/root
RUN start_sshd && $HADOOP_PREFIX/etc/hadoop/hadoop-env.sh && $HADOOP_PREFIX/sbin/start-dfs.sh && $HADOOP_PREFIX/bin/hdfs dfs -put $HADOOP_PREFIX/etc/hadoop/ input
RUN start_sshd && $HADOOP_HOME/etc/hadoop/hadoop-env.sh && $HADOOP_HOME/sbin/start-dfs.sh
RUN start_sshd && $HADOOP_HOME/etc/hadoop/hadoop-env.sh && $HADOOP_HOME/sbin/start-dfs.sh
CMD ["/etc/bootstrap.sh", "-d"]
# Hdfs ports
EXPOSE 50010 50020 50070 50075 50090 8020 9000
EXPOSE 8020 9000 9820 9864 9865 9866 9867 9868 9869 9870 9871 50010 50020 50070 50075 50090
# Mapred ports
EXPOSE 10020 19888
#Yarn ports
EXPOSE 8030 8031 8032 8033 8040 8042 8088
#Other ports
EXPOSE 49707 2122
EXPOSE 2122 49707

examples/quickstart/tutorial/hadoop/docker/bootstrap.sh Executable file → Normal file
View File

@ -15,23 +15,22 @@
# See the License for the specific language governing permissions and
# limitations under the License.
: ${HADOOP_PREFIX:=/usr/local/hadoop}
: ${HADOOP_HOME:=/usr/local/hadoop}
$HADOOP_PREFIX/etc/hadoop/hadoop-env.sh
$HADOOP_HOME/etc/hadoop/hadoop-env.sh
rm /tmp/*.pid
# installing libraries if any - (resource urls added comma separated to the ACP system variable)
cd $HADOOP_PREFIX/share/hadoop/common ; for cp in ${ACP//,/ }; do echo == $cp; curl -LO $cp ; done; cd -
cd $HADOOP_HOME/share/hadoop/common ; for cp in ${ACP//,/ }; do echo == $cp; curl -LO $cp ; done; cd -
# altering the core-site configuration
sed s/HOSTNAME/$HOSTNAME/ /usr/local/hadoop/etc/hadoop/core-site.xml.template > /usr/local/hadoop/etc/hadoop/core-site.xml
start_sshd
$HADOOP_PREFIX/sbin/start-dfs.sh
$HADOOP_PREFIX/sbin/start-yarn.sh
$HADOOP_PREFIX/sbin/mr-jobhistory-daemon.sh start historyserver
$HADOOP_HOME/sbin/start-dfs.sh
$HADOOP_HOME/sbin/start-yarn.sh
$HADOOP_HOME/sbin/mr-jobhistory-daemon.sh start historyserver
if [[ $1 == "-d" ]]; then
while true; do sleep 1000; done

View File

@ -1,143 +0,0 @@
# Based on the SequenceIQ hadoop-docker project hosted at
# https://github.com/sequenceiq/hadoop-docker, and modified at
# the Apache Software Foundation (ASF).
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Creates pseudo distributed hadoop 3.3.1 with java 8
FROM centos:7
USER root
# install dev tools
RUN yum clean all \
&& rpm --rebuilddb \
&& yum install -y curl which tar sudo openssh-server openssh-clients rsync yum-plugin-ovl\
&& yum clean all \
&& yum update -y libselinux \
&& yum update -y nss \
&& yum clean all
# update libselinux. see https://github.com/sequenceiq/hadoop-docker/issues/14
# update nss. see https://unix.stackexchange.com/questions/280548/curl-doesnt-connect-to-https-while-wget-does-nss-error-12286
# passwordless ssh
RUN ssh-keygen -q -N "" -t dsa -f /etc/ssh/ssh_host_dsa_key
RUN ssh-keygen -q -N "" -t rsa -f /etc/ssh/ssh_host_rsa_key
RUN ssh-keygen -q -N "" -t rsa -f /root/.ssh/id_rsa
RUN cp /root/.ssh/id_rsa.pub /root/.ssh/authorized_keys
#
# Pull Zulu OpenJDK binaries from official repository:
#
ARG ZULU_REPO_VER=1.0.0-1
RUN rpm --import http://repos.azulsystems.com/RPM-GPG-KEY-azulsystems && \
curl -sLO https://cdn.azul.com/zulu/bin/zulu-repo-${ZULU_REPO_VER}.noarch.rpm && \
rpm -ivh zulu-repo-${ZULU_REPO_VER}.noarch.rpm && \
yum -q -y update && \
yum -q -y upgrade && \
yum -q -y install zulu8-jdk && \
yum clean all && \
rm -rf /var/cache/yum zulu-repo_${ZULU_REPO_VER}.noarch.rpm
ENV JAVA_HOME=/usr/lib/jvm/zulu8
ENV PATH $PATH:$JAVA_HOME/bin
# hadoop
ARG APACHE_ARCHIVE_MIRROR_HOST=https://archive.apache.org
RUN curl -s ${APACHE_ARCHIVE_MIRROR_HOST}/dist/hadoop/core/hadoop-3.3.1/hadoop-3.3.1.tar.gz | tar -xz -C /usr/local/
RUN cd /usr/local && ln -s ./hadoop-3.3.1 hadoop
ENV HADOOP_HOME /usr/local/hadoop
ENV HADOOP_COMMON_HOME /usr/local/hadoop
ENV HADOOP_HDFS_HOME /usr/local/hadoop
ENV HADOOP_MAPRED_HOME /usr/local/hadoop
ENV HADOOP_YARN_HOME /usr/local/hadoop
ENV HADOOP_CONF_DIR /usr/local/hadoop/etc/hadoop
ENV YARN_CONF_DIR $HADOOP_HOME/etc/hadoop
# in hadoop 3 the example file is nearly empty so we can just append stuff
RUN sed -i '$ a export JAVA_HOME=/usr/lib/jvm/zulu8' $HADOOP_HOME/etc/hadoop/hadoop-env.sh
RUN sed -i '$ a export HADOOP_HOME=/usr/local/hadoop' $HADOOP_HOME/etc/hadoop/hadoop-env.sh
RUN sed -i '$ a export HADOOP_CONF_DIR=/usr/local/hadoop/etc/hadoop/' $HADOOP_HOME/etc/hadoop/hadoop-env.sh
RUN sed -i '$ a export HDFS_NAMENODE_USER=root' $HADOOP_HOME/etc/hadoop/hadoop-env.sh
RUN sed -i '$ a export HDFS_DATANODE_USER=root' $HADOOP_HOME/etc/hadoop/hadoop-env.sh
RUN sed -i '$ a export HDFS_SECONDARYNAMENODE_USER=root' $HADOOP_HOME/etc/hadoop/hadoop-env.sh
RUN sed -i '$ a export YARN_RESOURCEMANAGER_USER=root' $HADOOP_HOME/etc/hadoop/hadoop-env.sh
RUN sed -i '$ a export YARN_NODEMANAGER_USER=root' $HADOOP_HOME/etc/hadoop/hadoop-env.sh
RUN cat $HADOOP_HOME/etc/hadoop/hadoop-env.sh
RUN mkdir $HADOOP_HOME/input
RUN cp $HADOOP_HOME/etc/hadoop/*.xml $HADOOP_HOME/input
# pseudo distributed
ADD core-site.xml.template $HADOOP_HOME/etc/hadoop/core-site.xml.template
RUN sed s/HOSTNAME/localhost/ /usr/local/hadoop/etc/hadoop/core-site.xml.template > /usr/local/hadoop/etc/hadoop/core-site.xml
ADD hdfs-site.xml $HADOOP_HOME/etc/hadoop/hdfs-site.xml
ADD mapred-site.xml $HADOOP_HOME/etc/hadoop/mapred-site.xml
ADD yarn-site.xml $HADOOP_HOME/etc/hadoop/yarn-site.xml
RUN $HADOOP_HOME/bin/hdfs namenode -format
ADD ssh_config /root/.ssh/config
RUN chmod 600 /root/.ssh/config
RUN chown root:root /root/.ssh/config
# # installing supervisord
# RUN yum install -y python-setuptools
# RUN easy_install pip
# RUN curl https://bitbucket.org/pypa/setuptools/raw/bootstrap/ez_setup.py -o - | python
# RUN pip install supervisor
#
# ADD supervisord.conf /etc/supervisord.conf
ADD bootstrap.sh /etc/bootstrap.sh
RUN chown root:root /etc/bootstrap.sh
RUN chmod 700 /etc/bootstrap.sh
ENV BOOTSTRAP /etc/bootstrap.sh
# working around docker.io build error
RUN ls -la /usr/local/hadoop/etc/hadoop/*-env.sh
RUN chmod +x /usr/local/hadoop/etc/hadoop/*-env.sh
RUN ls -la /usr/local/hadoop/etc/hadoop/*-env.sh
# Copy additional .jars to classpath
RUN cp /usr/local/hadoop/share/hadoop/tools/lib/*.jar /usr/local/hadoop/share/hadoop/common/lib/
# fix the 254 error code
RUN sed -i "/^[^#]*UsePAM/ s/.*/#&/" /etc/ssh/sshd_config
RUN echo "UsePAM no" >> /etc/ssh/sshd_config
RUN echo "Port 2122" >> /etc/ssh/sshd_config
# script for plain sshd start
RUN echo -e \
'#!/bin/bash\n/usr/sbin/sshd\ntimeout 10 bash -c "until printf \"\" 2>>/dev/null >>/dev/tcp/127.0.0.1/2122; do sleep 0.5; done"' > \
/usr/local/bin/start_sshd && \
chmod a+x /usr/local/bin/start_sshd
RUN start_sshd && $HADOOP_HOME/etc/hadoop/hadoop-env.sh && $HADOOP_HOME/sbin/start-dfs.sh
RUN start_sshd && $HADOOP_HOME/etc/hadoop/hadoop-env.sh && $HADOOP_HOME/sbin/start-dfs.sh
CMD ["/etc/bootstrap.sh", "-d"]
# Hdfs ports
EXPOSE 8020 9000 9820 9864 9865 9866 9867 9868 9869 9870 9871 50010 50020 50070 50075 50090
# Mapred ports
EXPOSE 10020 19888
#Yarn ports
EXPOSE 8030 8031 8032 8033 8040 8042 8088
#Other ports
EXPOSE 2122 49707

View File

@ -1,41 +0,0 @@
#!/bin/bash
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
: ${HADOOP_HOME:=/usr/local/hadoop}
$HADOOP_HOME/etc/hadoop/hadoop-env.sh
rm /tmp/*.pid
# installing libraries if any - (resource urls added comma separated to the ACP system variable)
cd $HADOOP_HOME/share/hadoop/common ; for cp in ${ACP//,/ }; do echo == $cp; curl -LO $cp ; done; cd -
# altering the core-site configuration
sed s/HOSTNAME/$HOSTNAME/ /usr/local/hadoop/etc/hadoop/core-site.xml.template > /usr/local/hadoop/etc/hadoop/core-site.xml
start_sshd
$HADOOP_HOME/sbin/start-dfs.sh
$HADOOP_HOME/sbin/start-yarn.sh
$HADOOP_HOME/sbin/mr-jobhistory-daemon.sh start historyserver
if [[ $1 == "-d" ]]; then
while true; do sleep 1000; done
fi
if [[ $1 == "-bash" ]]; then
/bin/bash
fi

View File

@ -1,24 +0,0 @@
<!--
~ Licensed to the Apache Software Foundation (ASF) under one
~ or more contributor license agreements. See the NOTICE file
~ distributed with this work for additional information
~ regarding copyright ownership. The ASF licenses this file
~ to you under the Apache License, Version 2.0 (the
~ "License"); you may not use this file except in compliance
~ with the License. You may obtain a copy of the License at
~
~ http://www.apache.org/licenses/LICENSE-2.0
~
~ Unless required by applicable law or agreed to in writing, software
~ distributed under the License is distributed on an "AS IS" BASIS,
~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
~ See the License for the specific language governing permissions and
~ limitations under the License.
-->
<configuration>
<property>
<name>fs.defaultFS</name>
<value>hdfs://HOSTNAME:9000</value>
</property>
</configuration>

View File

@ -1,32 +0,0 @@
<!--
~ Licensed to the Apache Software Foundation (ASF) under one
~ or more contributor license agreements. See the NOTICE file
~ distributed with this work for additional information
~ regarding copyright ownership. The ASF licenses this file
~ to you under the Apache License, Version 2.0 (the
~ "License"); you may not use this file except in compliance
~ with the License. You may obtain a copy of the License at
~
~ http://www.apache.org/licenses/LICENSE-2.0
~
~ Unless required by applicable law or agreed to in writing, software
~ distributed under the License is distributed on an "AS IS" BASIS,
~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
~ See the License for the specific language governing permissions and
~ limitations under the License.
-->
<configuration>
<property>
<name>dfs.replication</name>
<value>1</value>
</property>
<property>
<name>dfs.client.use.datanode.hostname</name>
<value>true</value>
</property>
<property>
<name>dfs.datanode.use.datanode.hostname</name>
<value>true</value>
</property>
</configuration>

View File

@ -1,24 +0,0 @@
<!--
~ Licensed to the Apache Software Foundation (ASF) under one
~ or more contributor license agreements. See the NOTICE file
~ distributed with this work for additional information
~ regarding copyright ownership. The ASF licenses this file
~ to you under the Apache License, Version 2.0 (the
~ "License"); you may not use this file except in compliance
~ with the License. You may obtain a copy of the License at
~
~ http://www.apache.org/licenses/LICENSE-2.0
~
~ Unless required by applicable law or agreed to in writing, software
~ distributed under the License is distributed on an "AS IS" BASIS,
~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
~ See the License for the specific language governing permissions and
~ limitations under the License.
-->
<configuration>
<property>
<name>mapreduce.framework.name</name>
<value>yarn</value>
</property>
</configuration>

View File

@ -1,20 +0,0 @@
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
Host *
UserKnownHostsFile /dev/null
StrictHostKeyChecking no
LogLevel quiet
Port 2122

View File

@ -1,65 +0,0 @@
<!--
~ Licensed to the Apache Software Foundation (ASF) under one
~ or more contributor license agreements. See the NOTICE file
~ distributed with this work for additional information
~ regarding copyright ownership. The ASF licenses this file
~ to you under the Apache License, Version 2.0 (the
~ "License"); you may not use this file except in compliance
~ with the License. You may obtain a copy of the License at
~
~ http://www.apache.org/licenses/LICENSE-2.0
~
~ Unless required by applicable law or agreed to in writing, software
~ distributed under the License is distributed on an "AS IS" BASIS,
~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
~ See the License for the specific language governing permissions and
~ limitations under the License.
-->
<configuration>
<property>
<name>yarn.nodemanager.aux-services</name>
<value>mapreduce_shuffle</value>
</property>
<property>
<name>yarn.application.classpath</name>
<value>/usr/local/hadoop/etc/hadoop, /usr/local/hadoop/share/hadoop/common/*, /usr/local/hadoop/share/hadoop/common/lib/*, /usr/local/hadoop/share/hadoop/hdfs/*, /usr/local/hadoop/share/hadoop/hdfs/lib/*, /usr/local/hadoop/share/hadoop/mapreduce/*, /usr/local/hadoop/share/hadoop/mapreduce/lib/*, /usr/local/hadoop/share/hadoop/yarn/*, /usr/local/hadoop/share/hadoop/yarn/lib/*</value>
</property>
<property>
<description>
Number of seconds after an application finishes before the nodemanager's
DeletionService will delete the application's localized file directory
and log directory.
To diagnose Yarn application problems, set this property's value large
enough (for example, to 600 = 10 minutes) to permit examination of these
directories. After changing the property's value, you must restart the
nodemanager in order for it to have an effect.
The roots of Yarn applications' work directories are configurable with
the yarn.nodemanager.local-dirs property (see below), and the roots
of the Yarn applications' log directories are configurable with the
yarn.nodemanager.log-dirs property (see also below).
</description>
<name>yarn.nodemanager.delete.debug-delay-sec</name>
<value>600</value>
</property>
<property>
<name>yarn.log-aggregation-enable</name>
<value>true</value>
</property>
<property>
<name>yarn.log-aggregation.retain-seconds</name>
<value>900000</value>
</property>
<property>
<name>yarn.nodemanager.vmem-check-enabled</name>
<value>false</value>
</property>
</configuration>

View File

@ -1,79 +0,0 @@
{
"type" : "index_hadoop",
"spec" : {
"dataSchema" : {
"dataSource" : "wikipedia",
"parser" : {
"type" : "hadoopyString",
"parseSpec" : {
"format" : "json",
"dimensionsSpec" : {
"dimensions" : [
"channel",
"cityName",
"comment",
"countryIsoCode",
"countryName",
"isAnonymous",
"isMinor",
"isNew",
"isRobot",
"isUnpatrolled",
"metroCode",
"namespace",
"page",
"regionIsoCode",
"regionName",
"user",
{ "name": "added", "type": "long" },
{ "name": "deleted", "type": "long" },
{ "name": "delta", "type": "long" }
]
},
"timestampSpec" : {
"format" : "auto",
"column" : "time"
}
}
},
"metricsSpec" : [],
"granularitySpec" : {
"type" : "uniform",
"segmentGranularity" : "day",
"queryGranularity" : "none",
"intervals" : ["2015-09-12/2015-09-13"],
"rollup" : false
}
},
"ioConfig" : {
"type" : "hadoop",
"inputSpec" : {
"type" : "static",
"paths" : "/quickstart/wikiticker-2015-09-12-sampled.json.gz"
}
},
"tuningConfig" : {
"type" : "hadoop",
"partitionsSpec" : {
"type" : "hashed",
"targetPartitionSize" : 5000000
},
"forceExtendableShardSpecs" : true,
"jobProperties" : {
"fs.default.name" : "hdfs://druid-hadoop-demo:9000",
"fs.defaultFS" : "hdfs://druid-hadoop-demo:9000",
"dfs.datanode.address" : "druid-hadoop-demo",
"dfs.client.use.datanode.hostname" : "true",
"dfs.datanode.use.datanode.hostname" : "true",
"yarn.resourcemanager.hostname" : "druid-hadoop-demo",
"yarn.nodemanager.vmem-check-enabled" : "false",
"mapreduce.map.java.opts" : "-Duser.timezone=UTC -Dfile.encoding=UTF-8",
"mapreduce.job.user.classpath.first" : "true",
"mapreduce.reduce.java.opts" : "-Duser.timezone=UTC -Dfile.encoding=UTF-8",
"mapreduce.map.memory.mb" : 1024,
"mapreduce.reduce.memory.mb" : 1024
}
}
},
"hadoopDependencyCoordinates": ["org.apache.hadoop:hadoop-client:2.8.5"]
}

View File

@ -91,8 +91,8 @@
<scope>provided</scope>
</dependency>
<dependency>
<groupId>javax.validation</groupId>
<artifactId>validation-api</artifactId>
<groupId>jakarta.validation</groupId>
<artifactId>jakarta.validation-api</artifactId>
<scope>provided</scope>
</dependency>
<dependency>

View File

@ -150,8 +150,8 @@
<scope>provided</scope>
</dependency>
<dependency>
<groupId>javax.validation</groupId>
<artifactId>validation-api</artifactId>
<groupId>jakarta.validation</groupId>
<artifactId>jakarta.validation-api</artifactId>
<scope>provided</scope>
</dependency>
<dependency>

View File

@ -35,10 +35,7 @@
<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<jclouds.version>1.9.1</jclouds.version>
<!-- The version of guice is forced to 3.0 since JClouds 1.9.1 does not
work with guice 4.0-beta -->
<guice.version>3.0</guice.version>
<jclouds.version>2.0.0</jclouds.version>
</properties>
<dependencies>
@ -91,8 +88,8 @@
<artifactId>guava</artifactId>
</dependency>
<dependency>
<groupId>javax.validation</groupId>
<artifactId>validation-api</artifactId>
<groupId>jakarta.validation</groupId>
<artifactId>jakarta.validation-api</artifactId>
<scope>provided</scope>
</dependency>
<!-- jclouds dependencies -->
@ -151,8 +148,8 @@
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.easymock</groupId>
<artifactId>easymock</artifactId>
<groupId>org.mockito</groupId>
<artifactId>mockito-core</artifactId>
<scope>test</scope>
</dependency>
<dependency>

View File

@ -19,8 +19,6 @@
package org.apache.druid.storage.cloudfiles;
import org.easymock.EasyMock;
import org.easymock.EasyMockSupport;
import org.jclouds.io.Payload;
import org.junit.Assert;
import org.junit.Test;
@ -28,30 +26,35 @@ import org.junit.Test;
import java.io.IOException;
import java.io.InputStream;
public class CloudFilesByteSourceTest extends EasyMockSupport
import static org.mockito.Mockito.mock;
import static org.mockito.Mockito.times;
import static org.mockito.Mockito.verify;
import static org.mockito.Mockito.when;
public class CloudFilesByteSourceTest
{
@Test
public void openStreamTest() throws IOException
{
final String path = "path";
CloudFilesObjectApiProxy objectApi = createMock(CloudFilesObjectApiProxy.class);
CloudFilesObject cloudFilesObject = createMock(CloudFilesObject.class);
Payload payload = createMock(Payload.class);
InputStream stream = createMock(InputStream.class);
CloudFilesObjectApiProxy objectApi = mock(CloudFilesObjectApiProxy.class);
CloudFilesObject cloudFilesObject = mock(CloudFilesObject.class);
Payload payload = mock(Payload.class);
InputStream stream = mock(InputStream.class);
EasyMock.expect(objectApi.get(path, 0)).andReturn(cloudFilesObject);
EasyMock.expect(cloudFilesObject.getPayload()).andReturn(payload);
EasyMock.expect(payload.openStream()).andReturn(stream);
when(objectApi.get(path, 0)).thenReturn(cloudFilesObject);
when(cloudFilesObject.getPayload()).thenReturn(payload);
when(payload.openStream()).thenReturn(stream);
payload.close();
replayAll();
CloudFilesByteSource byteSource = new CloudFilesByteSource(objectApi, path);
Assert.assertEquals(stream, byteSource.openStream());
byteSource.closeStream();
verifyAll();
verify(objectApi).get(path, 0);
verify(cloudFilesObject).getPayload();
verify(payload).openStream();
}
@Test()
@ -59,18 +62,17 @@ public class CloudFilesByteSourceTest extends EasyMockSupport
{
final String path = "path";
CloudFilesObjectApiProxy objectApi = createMock(CloudFilesObjectApiProxy.class);
CloudFilesObject cloudFilesObject = createMock(CloudFilesObject.class);
Payload payload = createMock(Payload.class);
InputStream stream = createMock(InputStream.class);
CloudFilesObjectApiProxy objectApi = mock(CloudFilesObjectApiProxy.class);
CloudFilesObject cloudFilesObject = mock(CloudFilesObject.class);
Payload payload = mock(Payload.class);
InputStream stream = mock(InputStream.class);
EasyMock.expect(objectApi.get(path, 0)).andReturn(cloudFilesObject);
EasyMock.expect(cloudFilesObject.getPayload()).andReturn(payload);
EasyMock.expect(payload.openStream()).andThrow(new IOException()).andReturn(stream);
when(objectApi.get(path, 0)).thenReturn(cloudFilesObject);
when(cloudFilesObject.getPayload()).thenReturn(payload);
when(payload.openStream()).thenThrow(new IOException())
.thenReturn(stream);
payload.close();
replayAll();
CloudFilesByteSource byteSource = new CloudFilesByteSource(objectApi, path);
try {
byteSource.openStream();
@ -82,6 +84,8 @@ public class CloudFilesByteSourceTest extends EasyMockSupport
Assert.assertEquals(stream, byteSource.openStream());
byteSource.closeStream();
verifyAll();
verify(objectApi).get(path, 0);
verify(cloudFilesObject).getPayload();
verify(payload, times(2)).openStream();
}
}

View File

@ -24,7 +24,6 @@ import org.apache.druid.jackson.DefaultObjectMapper;
import org.apache.druid.java.util.common.Intervals;
import org.apache.druid.timeline.DataSegment;
import org.apache.druid.timeline.partition.NoneShardSpec;
import org.easymock.EasyMock;
import org.jclouds.openstack.swift.v1.features.ObjectApi;
import org.jclouds.rackspace.cloudfiles.v1.CloudFilesApi;
import org.junit.Assert;
@ -36,6 +35,12 @@ import java.io.File;
import java.util.ArrayList;
import java.util.HashMap;
import static org.mockito.ArgumentMatchers.any;
import static org.mockito.Mockito.atLeastOnce;
import static org.mockito.Mockito.mock;
import static org.mockito.Mockito.verify;
import static org.mockito.Mockito.when;
/**
*/
public class CloudFilesDataSegmentPusherTest
@ -46,16 +51,12 @@ public class CloudFilesDataSegmentPusherTest
@Test
public void testPush() throws Exception
{
ObjectApi objectApi = EasyMock.createStrictMock(ObjectApi.class);
EasyMock.expect(objectApi.put(EasyMock.anyString(), EasyMock.anyObject())).andReturn(null).atLeastOnce();
EasyMock.replay(objectApi);
CloudFilesApi api = EasyMock.createStrictMock(CloudFilesApi.class);
EasyMock.expect(api.getObjectApi(EasyMock.anyString(), EasyMock.anyString()))
.andReturn(objectApi)
.atLeastOnce();
EasyMock.replay(api);
ObjectApi objectApi = mock(ObjectApi.class);
when(objectApi.put(any(), any())).thenReturn(null);
CloudFilesApi api = mock(CloudFilesApi.class);
when(api.getObjectApi(any(), any()))
.thenReturn(objectApi);
CloudFilesDataSegmentPusherConfig config = new CloudFilesDataSegmentPusherConfig();
config.setRegion("region");
@ -87,6 +88,7 @@ public class CloudFilesDataSegmentPusherTest
Assert.assertEquals(segmentToPush.getSize(), segment.getSize());
EasyMock.verify(api);
verify(objectApi, atLeastOnce()).put(any(), any());
verify(api, atLeastOnce()).getObjectApi(any(), any());
}
}

View File

@ -19,8 +19,6 @@
package org.apache.druid.storage.cloudfiles;
import org.easymock.EasyMock;
import org.easymock.EasyMockSupport;
import org.jclouds.io.Payload;
import org.jclouds.openstack.swift.v1.domain.SwiftObject;
import org.jclouds.openstack.swift.v1.features.ObjectApi;
@ -28,7 +26,11 @@ import org.jclouds.rackspace.cloudfiles.v1.CloudFilesApi;
import org.junit.Assert;
import org.junit.Test;
public class CloudFilesObjectApiProxyTest extends EasyMockSupport
import static org.mockito.Mockito.mock;
import static org.mockito.Mockito.verify;
import static org.mockito.Mockito.when;
public class CloudFilesObjectApiProxyTest
{
@Test
public void getTest()
@ -37,16 +39,14 @@ public class CloudFilesObjectApiProxyTest extends EasyMockSupport
final String region = "region";
final String container = "container";
CloudFilesApi cloudFilesApi = createMock(CloudFilesApi.class);
ObjectApi objectApi = createMock(ObjectApi.class);
SwiftObject swiftObject = createMock(SwiftObject.class);
Payload payload = createMock(Payload.class);
CloudFilesApi cloudFilesApi = mock(CloudFilesApi.class);
ObjectApi objectApi = mock(ObjectApi.class);
SwiftObject swiftObject = mock(SwiftObject.class);
Payload payload = mock(Payload.class);
EasyMock.expect(cloudFilesApi.getObjectApi(region, container)).andReturn(objectApi);
EasyMock.expect(objectApi.get(path)).andReturn(swiftObject);
EasyMock.expect(swiftObject.getPayload()).andReturn(payload);
replayAll();
when(cloudFilesApi.getObjectApi(region, container)).thenReturn(objectApi);
when(objectApi.get(path)).thenReturn(swiftObject);
when(swiftObject.getPayload()).thenReturn(payload);
CloudFilesObjectApiProxy cfoApiProxy = new CloudFilesObjectApiProxy(cloudFilesApi, region, container);
CloudFilesObject cloudFilesObject = cfoApiProxy.get(path, 0);
@ -56,6 +56,8 @@ public class CloudFilesObjectApiProxyTest extends EasyMockSupport
Assert.assertEquals(cloudFilesObject.getContainer(), container);
Assert.assertEquals(cloudFilesObject.getPath(), path);
verifyAll();
verify(cloudFilesApi).getObjectApi(region, container);
verify(objectApi).get(path);
verify(swiftObject).getPayload();
}
}

View File

@ -126,7 +126,6 @@
<dependency>
<groupId>joda-time</groupId>
<artifactId>joda-time</artifactId>
<version>2.10.5</version>
<scope>provided</scope>
</dependency>
<dependency>
@ -138,12 +137,10 @@
<dependency>
<groupId>com.fasterxml.jackson.core</groupId>
<artifactId>jackson-core</artifactId>
<version>2.10.2</version>
</dependency>
<dependency>
<groupId>com.fasterxml.jackson.core</groupId>
<artifactId>jackson-annotations</artifactId>
<version>2.10.2</version>
</dependency>
</dependencies>
</project>

View File

@ -87,8 +87,8 @@
<scope>provided</scope>
</dependency>
<dependency>
<groupId>javax.validation</groupId>
<artifactId>validation-api</artifactId>
<groupId>jakarta.validation</groupId>
<artifactId>jakarta.validation-api</artifactId>
<scope>provided</scope>
</dependency>

View File

@ -81,6 +81,11 @@
<artifactId>jackson-core</artifactId>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>com.fasterxml.jackson.datatype</groupId>
<artifactId>jackson-datatype-joda</artifactId>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-api</artifactId>

View File

@ -20,6 +20,7 @@
package org.apache.druid.emitter.kafka;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.datatype.joda.JodaModule;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableMap;
import org.apache.druid.java.util.common.DateTimes;
@ -102,9 +103,11 @@ public class KafkaEmitterTest
requestTopic == null ? totalEventsExcludingRequestLogEvents : totalEvents);
final KafkaProducer<String, String> producer = mock(KafkaProducer.class);
ObjectMapper mapper = new ObjectMapper();
mapper.registerModule(new JodaModule());
final KafkaEmitter kafkaEmitter = new KafkaEmitter(
new KafkaEmitterConfig("", eventsType, "metrics", "alerts", requestTopic, "metadata", "test-cluster", null),
new ObjectMapper()
mapper
)
{
@Override

View File

@ -107,8 +107,8 @@
<version>6.7.2</version>
</dependency>
<dependency>
<groupId>javax.validation</groupId>
<artifactId>validation-api</artifactId>
<groupId>jakarta.validation</groupId>
<artifactId>jakarta.validation-api</artifactId>
<scope>provided</scope>
</dependency>
<dependency>

View File

@ -245,8 +245,10 @@ public class KubernetesPeonLifecycle
podStatus.getPodIP(),
DruidK8sConstants.PORT,
DruidK8sConstants.TLS_PORT,
Boolean.parseBoolean(pod.getMetadata().getAnnotations().getOrDefault(DruidK8sConstants.TLS_ENABLED, "false"))
Boolean.parseBoolean(pod.getMetadata().getAnnotations().getOrDefault(DruidK8sConstants.TLS_ENABLED, "false")),
pod.getMetadata() != null ? pod.getMetadata().getName() : ""
);
log.info("K8s task %s is running at location %s", taskId.getOriginalTaskId(), taskLocation);
}
return taskLocation;

View File

@ -42,6 +42,8 @@ import org.apache.druid.java.util.common.DateTimes;
import org.apache.druid.java.util.common.ISE;
import org.apache.druid.java.util.common.Pair;
import org.apache.druid.java.util.common.concurrent.Execs;
import org.apache.druid.java.util.common.lifecycle.LifecycleStart;
import org.apache.druid.java.util.common.lifecycle.LifecycleStop;
import org.apache.druid.java.util.emitter.EmittingLogger;
import org.apache.druid.java.util.emitter.service.ServiceEmitter;
import org.apache.druid.java.util.emitter.service.ServiceMetricEvent;
@ -325,6 +327,7 @@ public class KubernetesTaskRunner implements TaskLogStreamer, TaskRunner
}
@Override
@LifecycleStart
public void start()
{
cleanupExecutor.scheduleAtFixedRate(
@ -342,6 +345,7 @@ public class KubernetesTaskRunner implements TaskLogStreamer, TaskRunner
@Override
@LifecycleStop
public void stop()
{
log.debug("Stopping KubernetesTaskRunner");

View File

@ -34,6 +34,7 @@ public class DruidK8sConstants
public static final String TASK_JSON_ENV = "TASK_JSON";
public static final String TASK_DIR_ENV = "TASK_DIR";
public static final String TASK_ID_ENV = "TASK_ID";
public static final String LOAD_BROADCAST_SEGMENTS_ENV = "LOAD_BROADCAST_SEGMENTS";
public static final String JAVA_OPTS = "JAVA_OPTS";
public static final String DRUID_HOST_ENV = "druid_host";
public static final String DRUID_HOSTNAME_ENV = "HOSTNAME";

View File

@ -224,7 +224,11 @@ public class PodTemplateTaskAdapter implements TaskAdapter
.withValueFrom(new EnvVarSourceBuilder().withFieldRef(new ObjectFieldSelector(
null,
StringUtils.format("metadata.annotations['%s']", DruidK8sConstants.TASK)
)).build()).build()
)).build()).build(),
new EnvVarBuilder()
.withName(DruidK8sConstants.LOAD_BROADCAST_SEGMENTS_ENV)
.withValue(Boolean.toString(task.supportsQueries()))
.build()
);
}

View File

@ -815,6 +815,7 @@ public class KubernetesPeonLifecycleTest extends EasyMockSupport
Assert.assertEquals("ip", location.getHost());
Assert.assertEquals(8100, location.getPort());
Assert.assertEquals(-1, location.getTlsPort());
Assert.assertEquals(ID, location.getK8sPodName());
verifyAll();
}
@ -850,6 +851,7 @@ public class KubernetesPeonLifecycleTest extends EasyMockSupport
Assert.assertEquals("ip", location.getHost());
Assert.assertEquals(8100, location.getPort());
Assert.assertEquals(-1, location.getTlsPort());
Assert.assertEquals(ID, location.getK8sPodName());
verifyAll();
}
@ -886,6 +888,7 @@ public class KubernetesPeonLifecycleTest extends EasyMockSupport
Assert.assertEquals("ip", location.getHost());
Assert.assertEquals(-1, location.getPort());
Assert.assertEquals(8091, location.getTlsPort());
Assert.assertEquals(ID, location.getK8sPodName());
verifyAll();
}

View File

@ -36,6 +36,7 @@ import org.apache.druid.k8s.overlord.common.Base64Compression;
import org.apache.druid.k8s.overlord.common.DruidK8sConstants;
import org.apache.druid.k8s.overlord.common.K8sTestUtils;
import org.apache.druid.server.DruidNode;
import org.easymock.EasyMock;
import org.junit.Assert;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.BeforeEach;
@ -48,6 +49,7 @@ import java.nio.file.Path;
import java.util.Collections;
import java.util.Map;
import java.util.Properties;
import java.util.stream.Collectors;
public class PodTemplateTaskAdapterTest
{
@ -354,6 +356,42 @@ public class PodTemplateTaskAdapterTest
assertJobSpecsEqual(actual, expected);
}
@Test
public void test_fromTask_taskSupportsQueries() throws IOException
{
Path templatePath = Files.createFile(tempDir.resolve("noop.yaml"));
mapper.writeValue(templatePath.toFile(), podTemplateSpec);
Properties props = new Properties();
props.setProperty("druid.indexer.runner.k8s.podTemplate.base", templatePath.toString());
props.setProperty("druid.indexer.runner.k8s.podTemplate.queryable", templatePath.toString());
PodTemplateTaskAdapter adapter = new PodTemplateTaskAdapter(
taskRunnerConfig,
taskConfig,
node,
mapper,
props
);
Task task = EasyMock.mock(Task.class);
EasyMock.expect(task.supportsQueries()).andReturn(true);
EasyMock.expect(task.getType()).andReturn("queryable").anyTimes();
EasyMock.expect(task.getId()).andReturn("id").anyTimes();
EasyMock.expect(task.getGroupId()).andReturn("groupid").anyTimes();
EasyMock.expect(task.getDataSource()).andReturn("datasource").anyTimes();
EasyMock.replay(task);
Job actual = adapter.fromTask(task);
EasyMock.verify(task);
Assertions.assertEquals("true", actual.getSpec().getTemplate()
.getSpec().getContainers()
.get(0).getEnv().stream()
.filter(env -> env.getName().equals(DruidK8sConstants.LOAD_BROADCAST_SEGMENTS_ENV))
.collect(Collectors.toList()).get(0).getValue());
}
private void assertJobSpecsEqual(Job actual, Job expected) throws IOException

View File

@ -46,5 +46,7 @@ spec:
valueFrom:
fieldRef:
fieldPath: "metadata.annotations['task']"
- name: "LOAD_BROADCAST_SEGMENTS"
value: "false"
image: one
name: primary

View File

@ -46,5 +46,7 @@ spec:
valueFrom:
fieldRef:
fieldPath: "metadata.annotations['task']"
- name: "LOAD_BROADCAST_SEGMENTS"
value: "false"
image: one
name: primary

View File

@ -46,5 +46,7 @@ spec:
valueFrom:
fieldRef:
fieldPath: "metadata.annotations['task']"
- name: "LOAD_BROADCAST_SEGMENTS"
value: "false"
image: one
name: primary

View File

@ -199,7 +199,8 @@ public class MaterializedViewSupervisorSpec implements SupervisorSpec
tuningConfig.isLogParseExceptions(),
tuningConfig.getMaxParseExceptions(),
tuningConfig.isUseYarnRMJobStatusFallback(),
tuningConfig.getAwaitSegmentAvailabilityTimeoutMillis()
tuningConfig.getAwaitSegmentAvailabilityTimeoutMillis(),
HadoopTuningConfig.DEFAULT_DETERMINE_PARTITIONS_SAMPLING_FACTOR
);
// generate granularity

View File

@ -92,9 +92,8 @@
<scope>provided</scope>
</dependency>
<dependency>
<groupId>javax.validation</groupId>
<artifactId>validation-api</artifactId>
<version>1.1.0.Final</version>
<groupId>jakarta.validation</groupId>
<artifactId>jakarta.validation-api</artifactId>
<scope>provided</scope>
</dependency>

View File

@ -175,5 +175,9 @@
"namespace/cache/numEntries" : { "dimensions" : [], "type" : "gauge" },
"namespace/cache/heapSizeInBytes" : { "dimensions" : [], "type" : "gauge" },
"service/heartbeat" : { "dimensions" : ["leader"], "type" : "count" }
"service/heartbeat" : { "dimensions" : ["leader"], "type" : "count" },
"killTask/availableSlot/count" : { "dimensions" : [], "type" : "count" },
"killTask/maxSlot/count" : { "dimensions" : [], "type" : "count" },
"killTask/task/count" : { "dimensions" : [], "type" : "count" }
}

View File

@ -133,34 +133,6 @@
</dependency>
</dependencies>
<profiles>
<profile>
<id>hadoop2</id>
<activation>
<property>
<name>hadoop2.enabled</name>
<value>true</value>
</property>
</activation>
<dependencies>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-common</artifactId>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-client</artifactId>
<version>${hadoop.compile.version}</version>
<scope>provided</scope>
<exclusions>
<exclusion>
<groupId>org.apache.avro</groupId>
<artifactId>avro</artifactId>
</exclusion>
</exclusions>
</dependency>
</dependencies>
</profile>
<profile>
<id>hadoop3</id>
<activation>

View File

@ -268,51 +268,6 @@
</dependency>
</dependencies>
<profiles>
<profile>
<id>hadoop2</id>
<activation>
<property>
<name>hadoop2.enabled</name>
<value>true</value>
</property>
</activation>
<dependencies>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-client</artifactId>
<version>${hadoop.compile.version}</version>
<scope>provided</scope>
<exclusions>
<exclusion>
<groupId>org.apache.avro</groupId>
<artifactId>avro</artifactId>
</exclusion>
</exclusions>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-mapreduce-client-core</artifactId>
<scope>provided</scope>
<exclusions>
<exclusion>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-api</artifactId>
</exclusion>
</exclusions>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-common</artifactId>
<scope>provided</scope>
<exclusions>
<exclusion>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-api</artifactId>
</exclusion>
</exclusions>
</dependency>
</dependencies>
</profile>
<profile>
<id>hadoop3</id>
<activation>

View File

@ -92,7 +92,7 @@
<dependency>
<groupId>com.google.inject.extensions</groupId>
<artifactId>guice-assistedinject</artifactId>
<version>${guice.version}</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>com.fasterxml.jackson.core</groupId>
@ -115,8 +115,8 @@
<scope>provided</scope>
</dependency>
<dependency>
<groupId>javax.validation</groupId>
<artifactId>validation-api</artifactId>
<groupId>jakarta.validation</groupId>
<artifactId>jakarta.validation-api</artifactId>
<scope>provided</scope>
</dependency>
<dependency>
@ -152,6 +152,17 @@
<artifactId>equalsverifier</artifactId>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.mockito</groupId>
<artifactId>mockito-core</artifactId>
<version>${mockito.version}</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.mockito</groupId>
<artifactId>mockito-inline</artifactId>
<scope>test</scope>
</dependency>
</dependencies>
<build>

View File

@ -139,7 +139,7 @@ public class AzureInputSource extends CloudObjectInputSource
public long getObjectSize(CloudObjectLocation location)
{
try {
final CloudBlob blobWithAttributes = storage.getBlobReferenceWithAttributes(
final CloudBlob blobWithAttributes = storage.getBlockBlobReferenceWithAttributes(
location.getBucket(),
location.getPath()
);

View File

@ -60,7 +60,7 @@ public class AzureByteSource extends ByteSource
public InputStream openStream(long offset) throws IOException
{
try {
return azureStorage.getBlobInputStream(offset, containerName, blobPath);
return azureStorage.getBlockBlobInputStream(offset, containerName, blobPath);
}
catch (StorageException | URISyntaxException e) {
if (AzureUtils.AZURE_RETRY.apply(e)) {

View File

@ -183,7 +183,7 @@ public class AzureDataSegmentPusher implements DataSegmentPusher
)
throws StorageException, IOException, URISyntaxException
{
azureStorage.uploadBlob(compressedSegmentData, segmentConfig.getContainer(), azurePath);
azureStorage.uploadBlockBlob(compressedSegmentData, segmentConfig.getContainer(), azurePath);
final DataSegment outSegment = segment
.withSize(size)

View File

@ -23,19 +23,26 @@ import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Supplier;
import com.microsoft.azure.storage.ResultContinuation;
import com.microsoft.azure.storage.ResultSegment;
import com.microsoft.azure.storage.RetryExponentialRetry;
import com.microsoft.azure.storage.StorageException;
import com.microsoft.azure.storage.blob.BlobDeleteBatchOperation;
import com.microsoft.azure.storage.blob.BlobListingDetails;
import com.microsoft.azure.storage.blob.BlobRequestOptions;
import com.microsoft.azure.storage.blob.CloudBlob;
import com.microsoft.azure.storage.blob.CloudBlobClient;
import com.microsoft.azure.storage.blob.CloudBlobContainer;
import com.microsoft.azure.storage.blob.CloudBlockBlob;
import com.microsoft.azure.storage.blob.DeleteSnapshotsOption;
import com.microsoft.azure.storage.blob.ListBlobItem;
import org.apache.druid.java.util.common.RE;
import org.apache.druid.java.util.common.logger.Logger;
import javax.annotation.Nullable;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.net.URISyntaxException;
import java.util.ArrayList;
import java.util.EnumSet;
@ -48,6 +55,9 @@ public class AzureStorage
{
private static final boolean USE_FLAT_BLOB_LISTING = true;
// Default value from Azure library
private static final int DELTA_BACKOFF_MS = 30_000;
private static final Logger log = new Logger(AzureStorage.class);
/**
@ -70,14 +80,28 @@ public class AzureStorage
public List<String> emptyCloudBlobDirectory(final String containerName, final String virtualDirPath)
throws StorageException, URISyntaxException
{
return emptyCloudBlobDirectory(containerName, virtualDirPath, null);
}
public List<String> emptyCloudBlobDirectory(final String containerName, final String virtualDirPath, final Integer maxAttempts)
throws StorageException, URISyntaxException
{
List<String> deletedFiles = new ArrayList<>();
CloudBlobContainer container = getOrCreateCloudBlobContainer(containerName);
for (ListBlobItem blobItem : container.listBlobs(virtualDirPath, true, null, null, null)) {
Iterable<ListBlobItem> blobItems = container.listBlobs(
virtualDirPath,
USE_FLAT_BLOB_LISTING,
null,
getRequestOptionsWithRetry(maxAttempts),
null
);
for (ListBlobItem blobItem : blobItems) {
CloudBlob cloudBlob = (CloudBlob) blobItem;
log.info("Removing file[%s] from Azure.", cloudBlob.getName());
if (cloudBlob.deleteIfExists()) {
log.debug("Removing file[%s] from Azure.", cloudBlob.getName());
if (cloudBlob.deleteIfExists(DeleteSnapshotsOption.NONE, null, getRequestOptionsWithRetry(maxAttempts), null)) {
deletedFiles.add(cloudBlob.getName());
}
}
@ -89,7 +113,7 @@ public class AzureStorage
return deletedFiles;
}
public void uploadBlob(final File file, final String containerName, final String blobPath)
public void uploadBlockBlob(final File file, final String containerName, final String blobPath)
throws IOException, StorageException, URISyntaxException
{
CloudBlobContainer container = getOrCreateCloudBlobContainer(containerName);
@ -98,7 +122,29 @@ public class AzureStorage
}
}
public CloudBlob getBlobReferenceWithAttributes(final String containerName, final String blobPath)
public OutputStream getBlockBlobOutputStream(
final String containerName,
final String blobPath,
@Nullable final Integer streamWriteSizeBytes,
Integer maxAttempts
) throws URISyntaxException, StorageException
{
CloudBlobContainer container = getOrCreateCloudBlobContainer(containerName);
CloudBlockBlob blockBlobReference = container.getBlockBlobReference(blobPath);
if (blockBlobReference.exists()) {
throw new RE("Reference already exists");
}
if (streamWriteSizeBytes != null) {
blockBlobReference.setStreamWriteSizeInBytes(streamWriteSizeBytes);
}
return blockBlobReference.openOutputStream(null, getRequestOptionsWithRetry(maxAttempts), null);
}
public CloudBlob getBlockBlobReferenceWithAttributes(final String containerName, final String blobPath)
throws URISyntaxException, StorageException
{
final CloudBlockBlob blobReference = getOrCreateCloudBlobContainer(containerName).getBlockBlobReference(blobPath);
@ -106,28 +152,97 @@ public class AzureStorage
return blobReference;
}
public long getBlobLength(final String containerName, final String blobPath)
public long getBlockBlobLength(final String containerName, final String blobPath)
throws URISyntaxException, StorageException
{
return getBlobReferenceWithAttributes(containerName, blobPath).getProperties().getLength();
return getBlockBlobReferenceWithAttributes(containerName, blobPath).getProperties().getLength();
}
public InputStream getBlobInputStream(final String containerName, final String blobPath)
public InputStream getBlockBlobInputStream(final String containerName, final String blobPath)
throws URISyntaxException, StorageException
{
return getBlobInputStream(0L, containerName, blobPath);
return getBlockBlobInputStream(0L, containerName, blobPath);
}
public InputStream getBlobInputStream(long offset, final String containerName, final String blobPath)
public InputStream getBlockBlobInputStream(long offset, final String containerName, final String blobPath)
throws URISyntaxException, StorageException
{
return getBlockBlobInputStream(offset, null, containerName, blobPath);
}
public InputStream getBlockBlobInputStream(long offset, Long length, final String containerName, final String blobPath)
throws URISyntaxException, StorageException
{
return getBlockBlobInputStream(offset, length, containerName, blobPath, null);
}
public InputStream getBlockBlobInputStream(long offset, Long length, final String containerName, final String blobPath, Integer maxAttempts)
throws URISyntaxException, StorageException
{
CloudBlobContainer container = getOrCreateCloudBlobContainer(containerName);
return container.getBlockBlobReference(blobPath).openInputStream(offset, null, null, null, null);
return container.getBlockBlobReference(blobPath)
.openInputStream(offset, length, null, getRequestOptionsWithRetry(maxAttempts), null);
}
public boolean getBlobExists(String container, String blobPath) throws URISyntaxException, StorageException
public void batchDeleteFiles(String containerName, Iterable<String> paths, Integer maxAttempts)
throws URISyntaxException, StorageException
{
return getOrCreateCloudBlobContainer(container).getBlockBlobReference(blobPath).exists();
CloudBlobContainer cloudBlobContainer = getOrCreateCloudBlobContainer(containerName);
BlobDeleteBatchOperation blobDeleteBatchOperation = new BlobDeleteBatchOperation();
for (String path : paths) {
CloudBlob blobReference = cloudBlobContainer.getBlockBlobReference(path);
blobDeleteBatchOperation.addSubOperation(blobReference);
}
cloudBlobClient.get().executeBatch(blobDeleteBatchOperation, getRequestOptionsWithRetry(maxAttempts), null);
}
public List<String> listDir(final String containerName, final String virtualDirPath)
throws URISyntaxException, StorageException
{
return listDir(containerName, virtualDirPath, null);
}
public List<String> listDir(final String containerName, final String virtualDirPath, final Integer maxAttempts)
throws StorageException, URISyntaxException
{
List<String> files = new ArrayList<>();
CloudBlobContainer container = getOrCreateCloudBlobContainer(containerName);
for (ListBlobItem blobItem :
container.listBlobs(virtualDirPath, USE_FLAT_BLOB_LISTING, null, getRequestOptionsWithRetry(maxAttempts), null)) {
CloudBlob cloudBlob = (CloudBlob) blobItem;
files.add(cloudBlob.getName());
}
return files;
}
public boolean getBlockBlobExists(String container, String blobPath) throws URISyntaxException, StorageException
{
return getBlockBlobExists(container, blobPath, null);
}
public boolean getBlockBlobExists(String container, String blobPath, Integer maxAttempts)
throws URISyntaxException, StorageException
{
return getOrCreateCloudBlobContainer(container).getBlockBlobReference(blobPath)
.exists(null, getRequestOptionsWithRetry(maxAttempts), null);
}
/**
* If maxAttempts is provided, this method returns request options with a retry policy built in.
* The retry policy uses exponential backoff, with the number of attempts capped at the provided maxAttempts.
*/
@Nullable
private BlobRequestOptions getRequestOptionsWithRetry(Integer maxAttempts)
{
if (maxAttempts == null) {
return null;
}
BlobRequestOptions requestOptions = new BlobRequestOptions();
requestOptions.setRetryPolicyFactory(new RetryExponentialRetry(DELTA_BACKOFF_MS, maxAttempts));
return requestOptions;
}
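// A minimal usage sketch, assuming an already-constructed AzureStorage instance: callers opt into
// the exponential-backoff retry policy above by passing a non-null maxAttempts to the public read
// methods. The container and blob names below are hypothetical.
private static void exampleReadWithRetries(AzureStorage azureStorage)
    throws URISyntaxException, StorageException, IOException
{
  final int maxAttempts = 3;
  if (azureStorage.getBlockBlobExists("some-container", "some/blob", maxAttempts)) {
    try (InputStream blobStream =
             azureStorage.getBlockBlobInputStream(0L, null, "some-container", "some/blob", maxAttempts)) {
      // Each underlying Azure request is retried with exponential backoff, up to maxAttempts attempts.
    }
  }
}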
@VisibleForTesting

View File

@ -51,7 +51,7 @@ import java.util.List;
public class AzureStorageDruidModule implements DruidModule
{
static final String SCHEME = "azure";
public static final String SCHEME = "azure";
public static final String
STORAGE_CONNECTION_STRING_WITH_KEY = "DefaultEndpointsProtocol=%s;AccountName=%s;AccountKey=%s";
public static final String

View File

@ -95,7 +95,7 @@ public class AzureTaskLogs implements TaskLogs
try {
AzureUtils.retryAzureOperation(
() -> {
azureStorage.uploadBlob(logFile, config.getContainer(), taskKey);
azureStorage.uploadBlockBlob(logFile, config.getContainer(), taskKey);
return null;
},
config.getMaxTries()
@ -129,12 +129,12 @@ public class AzureTaskLogs implements TaskLogs
{
final String container = config.getContainer();
try {
if (!azureStorage.getBlobExists(container, taskKey)) {
if (!azureStorage.getBlockBlobExists(container, taskKey)) {
return Optional.absent();
}
try {
final long start;
final long length = azureStorage.getBlobLength(container, taskKey);
final long length = azureStorage.getBlockBlobLength(container, taskKey);
if (offset > 0 && offset < length) {
start = offset;
@ -144,7 +144,7 @@ public class AzureTaskLogs implements TaskLogs
start = 0;
}
InputStream stream = azureStorage.getBlobInputStream(container, taskKey);
InputStream stream = azureStorage.getBlockBlobInputStream(container, taskKey);
stream.skip(start);
return Optional.of(stream);

View File

@ -0,0 +1,99 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.druid.storage.azure.output;
import java.util.Objects;
/**
* Represents a chunk of an Azure blob
*/
public class AzureInputRange
{
/**
* Starting location in the blob stream
*/
private final long start;
/**
* Size of the blob stream that this object represents
*/
private final long size;
/**
* Container where the blob resides
*/
private final String container;
/**
* Absolute path of the blob
*/
private final String path;
public AzureInputRange(long start, long size, String container, String path)
{
this.start = start;
this.size = size;
this.container = container;
this.path = path;
}
public long getStart()
{
return start;
}
public long getSize()
{
return size;
}
public String getContainer()
{
return container;
}
public String getPath()
{
return path;
}
@Override
public boolean equals(Object o)
{
if (this == o) {
return true;
}
if (o == null || getClass() != o.getClass()) {
return false;
}
AzureInputRange that = (AzureInputRange) o;
return start == that.start
&& size == that.size
&& Objects.equals(container, that.container)
&& Objects.equals(path, that.path);
}
@Override
public int hashCode()
{
return Objects.hash(start, size, container, path);
}
}
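A minimal sketch of how a blob could be carved into such ranges, assuming a fixed chunk size; the class name, container, and path are hypothetical and only illustrate the (start, size, container, path) contract documented above.
import java.util.ArrayList;
import java.util.List;
import org.apache.druid.storage.azure.output.AzureInputRange;
class AzureInputRangeSplitSketch
{
  // Splits a blob of blobLength bytes into consecutive ranges of at most chunkSize bytes each.
  static List<AzureInputRange> split(String container, String path, long blobLength, long chunkSize)
  {
    final List<AzureInputRange> ranges = new ArrayList<>();
    for (long start = 0; start < blobLength; start += chunkSize) {
      ranges.add(new AzureInputRange(start, Math.min(chunkSize, blobLength - start), container, path));
    }
    return ranges;
  }
}
For a 10 MiB blob and 4 MiB chunks this yields ranges of 4 MiB, 4 MiB, and 2 MiB.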

View File

@ -0,0 +1,171 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.druid.storage.azure.output;
import com.fasterxml.jackson.annotation.JsonProperty;
import org.apache.druid.error.DruidException;
import org.apache.druid.error.InvalidInput;
import org.apache.druid.java.util.common.FileUtils;
import org.apache.druid.java.util.common.HumanReadableBytes;
import org.apache.druid.java.util.common.RetryUtils;
import javax.annotation.Nullable;
import java.io.File;
import java.io.IOException;
import java.util.Objects;
/**
* Configuration of the Azure storage connector
*/
public class AzureOutputConfig
{
@JsonProperty
private final String container;
@JsonProperty
private final String prefix;
@JsonProperty
private final File tempDir;
@JsonProperty
private final HumanReadableBytes chunkSize;
private static final HumanReadableBytes DEFAULT_CHUNK_SIZE = new HumanReadableBytes("4MiB");
// The minimum limit is self-imposed so that chunks stay appropriately sized and we don't spend
// a lot of time downloading tiny pieces of the blobs
private static final long AZURE_MIN_CHUNK_SIZE_BYTES = new HumanReadableBytes("256KiB").getBytes();
// The maximum limit is imposed by Azure on the size of a single block blob
private static final long AZURE_MAX_CHUNK_SIZE_BYTES = new HumanReadableBytes("4000MiB").getBytes();
@JsonProperty
private final int maxRetry;
public AzureOutputConfig(
@JsonProperty(value = "container", required = true) String container,
@JsonProperty(value = "prefix", required = true) String prefix,
@JsonProperty(value = "tempDir", required = true) File tempDir,
@JsonProperty(value = "chunkSize") @Nullable HumanReadableBytes chunkSize,
@JsonProperty(value = "maxRetry") @Nullable Integer maxRetry
)
{
this.container = container;
this.prefix = prefix;
this.tempDir = tempDir;
this.chunkSize = chunkSize != null ? chunkSize : DEFAULT_CHUNK_SIZE;
this.maxRetry = maxRetry != null ? maxRetry : RetryUtils.DEFAULT_MAX_TRIES;
validateFields();
}
public String getContainer()
{
return container;
}
public String getPrefix()
{
return prefix;
}
public File getTempDir()
{
return tempDir;
}
public HumanReadableBytes getChunkSize()
{
return chunkSize;
}
public int getMaxRetry()
{
return maxRetry;
}
private void validateFields()
{
if (chunkSize.getBytes() < AZURE_MIN_CHUNK_SIZE_BYTES || chunkSize.getBytes() > AZURE_MAX_CHUNK_SIZE_BYTES) {
throw InvalidInput.exception(
"'chunkSize' [%d] bytes to the AzureConfig should be between [%d] bytes and [%d] bytes",
chunkSize.getBytes(),
AZURE_MIN_CHUNK_SIZE_BYTES,
AZURE_MAX_CHUNK_SIZE_BYTES
);
}
try {
FileUtils.mkdirp(tempDir);
}
catch (IOException e) {
throw DruidException.forPersona(DruidException.Persona.ADMIN)
.ofCategory(DruidException.Category.RUNTIME_FAILURE)
.build(e, "Unable to create temporary directory [%s]", tempDir.getAbsolutePath());
}
if (!tempDir.canRead() || !tempDir.canWrite()) {
throw DruidException.forPersona(DruidException.Persona.ADMIN)
.ofCategory(DruidException.Category.RUNTIME_FAILURE)
.build(
"Cannot read or write on the 'tempDir' [%s]. "
+ "Please provide a different path to store the intermediate contents of AzureStorageConnector",
tempDir.getAbsolutePath()
);
}
}
@Override
public boolean equals(Object o)
{
if (this == o) {
return true;
}
if (o == null || getClass() != o.getClass()) {
return false;
}
AzureOutputConfig that = (AzureOutputConfig) o;
return maxRetry == that.maxRetry
&& Objects.equals(container, that.container)
&& Objects.equals(prefix, that.prefix)
&& Objects.equals(tempDir, that.tempDir)
&& Objects.equals(chunkSize, that.chunkSize);
}
@Override
public int hashCode()
{
return Objects.hash(container, prefix, tempDir, chunkSize, maxRetry);
}
@Override
public String toString()
{
return "AzureOutputConfig{" +
"container='" + container + '\'' +
", prefix='" + prefix + '\'' +
", tempDir=" + tempDir +
", chunkSize=" + chunkSize +
", maxRetry=" + maxRetry +
'}';
}
}
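A minimal, hypothetical illustration of the constructor above (all values are invented; chunkSize and maxRetry fall back to their defaults when null):
import java.io.File;
import org.apache.druid.java.util.common.HumanReadableBytes;
import org.apache.druid.storage.azure.output.AzureOutputConfig;
class AzureOutputConfigSketch
{
  static AzureOutputConfig example()
  {
    // 8 MiB chunks fall inside the [256 KiB, 4000 MiB] window enforced by validateFields().
    return new AzureOutputConfig(
        "druid-deep-storage",          // container (hypothetical)
        "druid/output",                // prefix (hypothetical)
        new File("/tmp/azure-out"),    // tempDir; must be creatable, readable, and writable
        new HumanReadableBytes("8MiB"),
        3                              // maxRetry
    );
  }
}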

View File

@ -0,0 +1,219 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.druid.storage.azure.output;
import com.google.common.base.Joiner;
import com.google.common.collect.Iterables;
import com.microsoft.azure.storage.StorageException;
import org.apache.druid.data.input.impl.prefetch.ObjectOpenFunction;
import org.apache.druid.storage.azure.AzureStorage;
import org.apache.druid.storage.azure.AzureUtils;
import org.apache.druid.storage.remote.ChunkingStorageConnector;
import org.apache.druid.storage.remote.ChunkingStorageConnectorParameters;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.net.URISyntaxException;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
/**
* Implementation of the storage connector that facilitates reading from and writing to Azure Blob Storage.
* It extends {@link ChunkingStorageConnector} so that downloads are performed in chunks. A brief usage
* sketch follows the class below.
*/
public class AzureStorageConnector extends ChunkingStorageConnector<AzureInputRange>
{
private static final String DELIM = "/";
private static final Joiner JOINER = Joiner.on(DELIM).skipNulls();
private final AzureOutputConfig config;
private final AzureStorage azureStorage;
public AzureStorageConnector(
final AzureOutputConfig config,
final AzureStorage azureStorage
)
{
this.config = config;
this.azureStorage = azureStorage;
}
@Override
public ChunkingStorageConnectorParameters<AzureInputRange> buildInputParams(String path) throws IOException
{
try {
return buildInputParams(path, 0, azureStorage.getBlockBlobLength(config.getContainer(), objectPath(path)));
}
catch (URISyntaxException | StorageException e) {
throw new IOException(e);
}
}
@Override
public ChunkingStorageConnectorParameters<AzureInputRange> buildInputParams(String path, long from, long size)
{
ChunkingStorageConnectorParameters.Builder<AzureInputRange> parameters = new ChunkingStorageConnectorParameters.Builder<>();
parameters.tempDirSupplier(config::getTempDir);
parameters.maxRetry(config.getMaxRetry());
parameters.cloudStoragePath(objectPath(path));
parameters.retryCondition(AzureUtils.AZURE_RETRY);
parameters.start(from);
parameters.end(from + size);
parameters.objectSupplier((start, end) -> new AzureInputRange(
start,
end - start,
config.getContainer(),
objectPath(path)
));
parameters.objectOpenFunction(
new ObjectOpenFunction<AzureInputRange>()
{
@Override
public InputStream open(AzureInputRange inputRange) throws IOException
{
try {
return azureStorage.getBlockBlobInputStream(
inputRange.getStart(),
inputRange.getSize(),
inputRange.getContainer(),
inputRange.getPath(),
config.getMaxRetry()
);
}
catch (URISyntaxException | StorageException e) {
throw new IOException(e);
}
}
@Override
public InputStream open(AzureInputRange inputRange, long offset) throws IOException
{
AzureInputRange newInputRange = new AzureInputRange(
inputRange.getStart() + offset,
Math.max(inputRange.getSize() - offset, 0),
inputRange.getContainer(),
inputRange.getPath()
);
return open(newInputRange);
}
}
);
return parameters.build();
}
@Override
public boolean pathExists(String path) throws IOException
{
try {
return azureStorage.getBlockBlobExists(config.getContainer(), objectPath(path), config.getMaxRetry());
}
catch (URISyntaxException | StorageException e) {
throw new IOException(e);
}
}
@Override
public OutputStream write(String path) throws IOException
{
try {
return azureStorage.getBlockBlobOutputStream(
config.getContainer(),
objectPath(path),
config.getChunkSize().getBytesInInt(),
config.getMaxRetry()
);
}
catch (URISyntaxException | StorageException e) {
throw new IOException(e);
}
}
@Override
public void deleteFile(String path) throws IOException
{
try {
azureStorage.batchDeleteFiles(
config.getContainer(),
Collections.singletonList(objectPath(path)),
config.getMaxRetry()
);
}
catch (URISyntaxException | StorageException e) {
throw new IOException(e);
}
}
@Override
public void deleteFiles(Iterable<String> paths) throws IOException
{
try {
azureStorage.batchDeleteFiles(
config.getContainer(),
Iterables.transform(paths, this::objectPath),
config.getMaxRetry()
);
}
catch (StorageException | URISyntaxException e) {
throw new IOException(e);
}
}
@Override
public void deleteRecursively(String path) throws IOException
{
try {
azureStorage.emptyCloudBlobDirectory(config.getContainer(), objectPath(path), config.getMaxRetry());
}
catch (StorageException | URISyntaxException e) {
throw new IOException(e);
}
}
@Override
public Iterator<String> listDir(String dirName) throws IOException
{
final String prefixBasePath = objectPath(dirName);
List<String> paths;
try {
paths = azureStorage.listDir(config.getContainer(), prefixBasePath, config.getMaxRetry());
}
catch (StorageException | URISyntaxException e) {
throw new IOException(e);
}
// Strip the prefix base path so that callers see paths relative to dirName.
return paths.stream().map(path -> {
String[] parts = path.split(prefixBasePath, 2);
if (parts.length > 1) {
return parts[1];
} else {
return "";
}
}).iterator();
}
private String objectPath(String path)
{
return JOINER.join(config.getPrefix(), path);
}
}
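
As referenced in the class Javadoc, here is a hedged usage sketch of the connector's write/read surface. The path, the helper method name, and the wiring of config and azureStorage are illustrative only; IOUtils is the Apache Commons helper already used by the tests in this diff.

// Sketch only. Assumes org.apache.druid.storage.StorageConnector, org.apache.commons.io.IOUtils,
// java.io.OutputStream, and java.nio.charset.StandardCharsets are importable, as elsewhere in this patch.
static String roundTrip(AzureOutputConfig config, AzureStorage azureStorage) throws IOException
{
  StorageConnector connector = new AzureStorageConnector(config, azureStorage);
  // write(path) opens a block-blob output stream under <prefix>/<path>, chunked by config.getChunkSize()
  try (OutputStream out = connector.write("results/part-0.csv")) {
    out.write("id,value\n".getBytes(StandardCharsets.UTF_8));
  }
  // read(path) comes from ChunkingStorageConnector and downloads the blob back in chunks;
  // readRange(path, from, size) would fetch just a slice
  try (InputStream in = connector.read("results/part-0.csv")) {
    return IOUtils.toString(in, StandardCharsets.UTF_8);
  }
}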


@ -0,0 +1,46 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.druid.storage.azure.output;
import com.fasterxml.jackson.databind.Module;
import com.fasterxml.jackson.databind.module.SimpleModule;
import com.google.inject.Binder;
import org.apache.druid.initialization.DruidModule;
import java.util.Collections;
import java.util.List;
public class AzureStorageConnectorModule implements DruidModule
{
@Override
public List<? extends Module> getJacksonModules()
{
return Collections.singletonList(
new SimpleModule(AzureStorageConnectorModule.class.getSimpleName())
.registerSubtypes(AzureStorageConnectorProvider.class)
);
}
@Override
public void configure(Binder binder)
{
}
}


@ -0,0 +1,59 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.druid.storage.azure.output;
import com.fasterxml.jackson.annotation.JacksonInject;
import com.fasterxml.jackson.annotation.JsonCreator;
import com.fasterxml.jackson.annotation.JsonProperty;
import com.fasterxml.jackson.annotation.JsonTypeName;
import org.apache.druid.java.util.common.HumanReadableBytes;
import org.apache.druid.storage.StorageConnector;
import org.apache.druid.storage.StorageConnectorProvider;
import org.apache.druid.storage.azure.AzureStorage;
import org.apache.druid.storage.azure.AzureStorageDruidModule;
import javax.annotation.Nullable;
import java.io.File;
@JsonTypeName(AzureStorageDruidModule.SCHEME)
public class AzureStorageConnectorProvider extends AzureOutputConfig implements StorageConnectorProvider
{
@JacksonInject
AzureStorage azureStorage;
@JsonCreator
public AzureStorageConnectorProvider(
@JsonProperty(value = "container", required = true) String container,
@JsonProperty(value = "prefix", required = true) String prefix,
@JsonProperty(value = "tempDir", required = true) File tempDir,
@JsonProperty(value = "chunkSize") @Nullable HumanReadableBytes chunkSize,
@JsonProperty(value = "maxRetry") @Nullable Integer maxRetry
)
{
super(container, prefix, tempDir, chunkSize, maxRetry);
}
@Override
public StorageConnector get()
{
return new AzureStorageConnector(this, azureStorage);
}
}
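
A sketch of how this provider is expected to be selected from properties, mirroring AzureStorageConnectorProviderTest later in this diff; the "custom" namespace and the literal values are illustrative only.

// Sketch only; mirrors the property keys used by AzureStorageConnectorProviderTest.
Properties properties = new Properties();
properties.setProperty("custom.type", "azure");          // matches @JsonTypeName(AzureStorageDruidModule.SCHEME)
properties.setProperty("custom.container", "container");
properties.setProperty("custom.prefix", "prefix");
properties.setProperty("custom.tempDir", "/tmp");
// chunkSize and maxRetry are optional and fall back to the AzureOutputConfig defaults.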


@ -13,4 +13,5 @@
# See the License for the specific language governing permissions and
# limitations under the License.
org.apache.druid.storage.azure.output.AzureStorageConnectorModule
org.apache.druid.storage.azure.AzureStorageDruidModule


@ -41,7 +41,7 @@ public class AzureByteSourceTest extends EasyMockSupport
AzureStorage azureStorage = createMock(AzureStorage.class);
InputStream stream = createMock(InputStream.class);
EasyMock.expect(azureStorage.getBlobInputStream(NO_OFFSET, containerName, blobPath)).andReturn(stream);
EasyMock.expect(azureStorage.getBlockBlobInputStream(NO_OFFSET, containerName, blobPath)).andReturn(stream);
replayAll();
@ -60,7 +60,7 @@ public class AzureByteSourceTest extends EasyMockSupport
AzureStorage azureStorage = createMock(AzureStorage.class);
InputStream stream = createMock(InputStream.class);
EasyMock.expect(azureStorage.getBlobInputStream(OFFSET, containerName, blobPath)).andReturn(stream);
EasyMock.expect(azureStorage.getBlockBlobInputStream(OFFSET, containerName, blobPath)).andReturn(stream);
replayAll();
@ -78,7 +78,7 @@ public class AzureByteSourceTest extends EasyMockSupport
final String blobPath = "/path/to/file";
AzureStorage azureStorage = createMock(AzureStorage.class);
EasyMock.expect(azureStorage.getBlobInputStream(NO_OFFSET, containerName, blobPath)).andThrow(
EasyMock.expect(azureStorage.getBlockBlobInputStream(NO_OFFSET, containerName, blobPath)).andThrow(
new StorageException(
"",
"",


@ -62,7 +62,7 @@ public class AzureDataSegmentPullerTest extends EasyMockSupport
final InputStream zipStream = new FileInputStream(pulledFile);
EasyMock.expect(byteSourceFactory.create(CONTAINER_NAME, BLOB_PATH)).andReturn(new AzureByteSource(azureStorage, CONTAINER_NAME, BLOB_PATH));
EasyMock.expect(azureStorage.getBlobInputStream(0L, CONTAINER_NAME, BLOB_PATH)).andReturn(zipStream);
EasyMock.expect(azureStorage.getBlockBlobInputStream(0L, CONTAINER_NAME, BLOB_PATH)).andReturn(zipStream);
replayAll();
@ -94,7 +94,7 @@ public class AzureDataSegmentPullerTest extends EasyMockSupport
final InputStream zipStream = new FileInputStream(pulledFile);
EasyMock.expect(byteSourceFactory.create(CONTAINER_NAME, BLOB_PATH)).andReturn(new AzureByteSource(azureStorage, CONTAINER_NAME, BLOB_PATH));
EasyMock.expect(azureStorage.getBlobInputStream(0L, CONTAINER_NAME, BLOB_PATH)).andReturn(zipStream);
EasyMock.expect(azureStorage.getBlockBlobInputStream(0L, CONTAINER_NAME, BLOB_PATH)).andReturn(zipStream);
replayAll();
@ -123,7 +123,7 @@ public class AzureDataSegmentPullerTest extends EasyMockSupport
final File outDir = FileUtils.createTempDir();
try {
EasyMock.expect(byteSourceFactory.create(CONTAINER_NAME, BLOB_PATH)).andReturn(new AzureByteSource(azureStorage, CONTAINER_NAME, BLOB_PATH));
EasyMock.expect(azureStorage.getBlobInputStream(0L, CONTAINER_NAME, BLOB_PATH)).andThrow(
EasyMock.expect(azureStorage.getBlockBlobInputStream(0L, CONTAINER_NAME, BLOB_PATH)).andThrow(
new URISyntaxException(
"error",
"error",
@ -155,7 +155,7 @@ public class AzureDataSegmentPullerTest extends EasyMockSupport
final File outDir = FileUtils.createTempDir();
try {
EasyMock.expect(byteSourceFactory.create(CONTAINER_NAME, BLOB_PATH)).andReturn(new AzureByteSource(azureStorage, CONTAINER_NAME, BLOB_PATH));
EasyMock.expect(azureStorage.getBlobInputStream(0L, CONTAINER_NAME, BLOB_PATH)).andThrow(
EasyMock.expect(azureStorage.getBlockBlobInputStream(0L, CONTAINER_NAME, BLOB_PATH)).andThrow(
new StorageException(null, null, 0, null, null)
).atLeastOnce();


@ -115,7 +115,7 @@ public class AzureDataSegmentPusherTest extends EasyMockSupport
Files.write(DATA, tmp);
String azurePath = pusher.getAzurePath(SEGMENT_TO_PUSH, useUniquePath);
azureStorage.uploadBlob(EasyMock.anyObject(File.class), EasyMock.eq(CONTAINER_NAME), EasyMock.eq(azurePath));
azureStorage.uploadBlockBlob(EasyMock.anyObject(File.class), EasyMock.eq(CONTAINER_NAME), EasyMock.eq(azurePath));
EasyMock.expectLastCall();
replayAll();
@ -145,7 +145,7 @@ public class AzureDataSegmentPusherTest extends EasyMockSupport
Files.write(DATA, tmp);
String azurePath = pusher.getAzurePath(SEGMENT_TO_PUSH, useUniquePath);
azureStorage.uploadBlob(
azureStorage.uploadBlockBlob(
EasyMock.anyObject(File.class),
EasyMock.eq(CONTAINER_NAME),
EasyMock.eq(PREFIX + "/" + azurePath)
@ -178,7 +178,7 @@ public class AzureDataSegmentPusherTest extends EasyMockSupport
Files.write(DATA, tmp);
String azurePath = pusher.getAzurePath(SEGMENT_TO_PUSH, useUniquePath);
azureStorage.uploadBlob(
azureStorage.uploadBlockBlob(
EasyMock.anyObject(File.class),
EasyMock.eq(CONTAINER_NAME),
EasyMock.matches(UNIQUE_MATCHER_NO_PREFIX)
@ -211,7 +211,7 @@ public class AzureDataSegmentPusherTest extends EasyMockSupport
Files.write(DATA, tmp);
String azurePath = pusher.getAzurePath(SEGMENT_TO_PUSH, useUniquePath);
azureStorage.uploadBlob(
azureStorage.uploadBlockBlob(
EasyMock.anyObject(File.class),
EasyMock.eq(CONTAINER_NAME),
EasyMock.matches(UNIQUE_MATCHER_PREFIX)
@ -245,7 +245,7 @@ public class AzureDataSegmentPusherTest extends EasyMockSupport
final long size = DATA.length;
String azurePath = pusher.getAzurePath(SEGMENT_TO_PUSH, useUniquePath);
azureStorage.uploadBlob(EasyMock.anyObject(File.class), EasyMock.eq(CONTAINER_NAME), EasyMock.eq(azurePath));
azureStorage.uploadBlockBlob(EasyMock.anyObject(File.class), EasyMock.eq(CONTAINER_NAME), EasyMock.eq(azurePath));
EasyMock.expectLastCall().andThrow(new URISyntaxException("", ""));
replayAll();
@ -284,7 +284,7 @@ public class AzureDataSegmentPusherTest extends EasyMockSupport
final File compressedSegmentData = new File("index.zip");
final String azurePath = pusher.getAzurePath(DATA_SEGMENT, false);
azureStorage.uploadBlob(compressedSegmentData, CONTAINER_NAME, azurePath);
azureStorage.uploadBlockBlob(compressedSegmentData, CONTAINER_NAME, azurePath);
EasyMock.expectLastCall();
replayAll();


@ -0,0 +1,71 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.druid.storage.azure;
import com.google.common.collect.ImmutableList;
import com.microsoft.azure.storage.StorageException;
import com.microsoft.azure.storage.blob.CloudBlobClient;
import com.microsoft.azure.storage.blob.CloudBlobContainer;
import com.microsoft.azure.storage.blob.CloudBlockBlob;
import com.microsoft.azure.storage.blob.ListBlobItem;
import org.junit.Assert;
import org.junit.Before;
import org.junit.Test;
import org.mockito.ArgumentMatchers;
import org.mockito.Mockito;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.List;
public class AzureStorageTest
{
AzureStorage azureStorage;
CloudBlobClient cloudBlobClient = Mockito.mock(CloudBlobClient.class);
CloudBlobContainer cloudBlobContainer = Mockito.mock(CloudBlobContainer.class);
@Before
public void setup() throws URISyntaxException, StorageException
{
Mockito.doReturn(cloudBlobContainer).when(cloudBlobClient).getContainerReference(ArgumentMatchers.anyString());
azureStorage = new AzureStorage(() -> cloudBlobClient);
}
@Test
public void testListDir() throws URISyntaxException, StorageException
{
List<ListBlobItem> listBlobItems = ImmutableList.of(
new CloudBlockBlob(new URI("azure://dummy.com/container/blobName"))
);
Mockito.doReturn(listBlobItems).when(cloudBlobContainer).listBlobs(
ArgumentMatchers.anyString(),
ArgumentMatchers.anyBoolean(),
ArgumentMatchers.any(),
ArgumentMatchers.any(),
ArgumentMatchers.any()
);
Assert.assertEquals(ImmutableList.of("blobName"), azureStorage.listDir("test", ""));
}
}


@ -97,7 +97,7 @@ public class AzureTaskLogsTest extends EasyMockSupport
try {
final File logFile = new File(tmpDir, "log");
azureStorage.uploadBlob(logFile, CONTAINER, PREFIX + "/" + TASK_ID + "/log");
azureStorage.uploadBlockBlob(logFile, CONTAINER, PREFIX + "/" + TASK_ID + "/log");
EasyMock.expectLastCall();
replayAll();
@ -119,7 +119,7 @@ public class AzureTaskLogsTest extends EasyMockSupport
try {
final File logFile = new File(tmpDir, "log");
azureStorage.uploadBlob(logFile, CONTAINER, PREFIX + "/" + TASK_ID + "/log");
azureStorage.uploadBlockBlob(logFile, CONTAINER, PREFIX + "/" + TASK_ID + "/log");
EasyMock.expectLastCall().andThrow(new IOException());
replayAll();
@ -141,7 +141,7 @@ public class AzureTaskLogsTest extends EasyMockSupport
try {
final File logFile = new File(tmpDir, "log");
azureStorage.uploadBlob(logFile, CONTAINER, PREFIX + "/" + TASK_ID + "/report.json");
azureStorage.uploadBlockBlob(logFile, CONTAINER, PREFIX + "/" + TASK_ID + "/report.json");
EasyMock.expectLastCall();
replayAll();
@ -163,7 +163,7 @@ public class AzureTaskLogsTest extends EasyMockSupport
try {
final File logFile = new File(tmpDir, "status.json");
azureStorage.uploadBlob(logFile, CONTAINER, PREFIX + "/" + TASK_ID + "/status.json");
azureStorage.uploadBlockBlob(logFile, CONTAINER, PREFIX + "/" + TASK_ID + "/status.json");
EasyMock.expectLastCall();
replayAll();
@ -185,7 +185,7 @@ public class AzureTaskLogsTest extends EasyMockSupport
try {
final File logFile = new File(tmpDir, "log");
azureStorage.uploadBlob(logFile, CONTAINER, PREFIX + "/" + TASK_ID + "/report.json");
azureStorage.uploadBlockBlob(logFile, CONTAINER, PREFIX + "/" + TASK_ID + "/report.json");
EasyMock.expectLastCall().andThrow(new IOException());
replayAll();
@ -205,9 +205,9 @@ public class AzureTaskLogsTest extends EasyMockSupport
final String testLog = "hello this is a log";
final String blobPath = PREFIX + "/" + TASK_ID + "/log";
EasyMock.expect(azureStorage.getBlobExists(CONTAINER, blobPath)).andReturn(true);
EasyMock.expect(azureStorage.getBlobLength(CONTAINER, blobPath)).andReturn((long) testLog.length());
EasyMock.expect(azureStorage.getBlobInputStream(CONTAINER, blobPath)).andReturn(
EasyMock.expect(azureStorage.getBlockBlobExists(CONTAINER, blobPath)).andReturn(true);
EasyMock.expect(azureStorage.getBlockBlobLength(CONTAINER, blobPath)).andReturn((long) testLog.length());
EasyMock.expect(azureStorage.getBlockBlobInputStream(CONTAINER, blobPath)).andReturn(
new ByteArrayInputStream(testLog.getBytes(StandardCharsets.UTF_8)));
@ -228,9 +228,9 @@ public class AzureTaskLogsTest extends EasyMockSupport
final String testLog = "hello this is a log";
final String blobPath = PREFIX + "/" + TASK_ID + "/log";
EasyMock.expect(azureStorage.getBlobExists(CONTAINER, blobPath)).andReturn(true);
EasyMock.expect(azureStorage.getBlobLength(CONTAINER, blobPath)).andReturn((long) testLog.length());
EasyMock.expect(azureStorage.getBlobInputStream(CONTAINER, blobPath)).andReturn(
EasyMock.expect(azureStorage.getBlockBlobExists(CONTAINER, blobPath)).andReturn(true);
EasyMock.expect(azureStorage.getBlockBlobLength(CONTAINER, blobPath)).andReturn((long) testLog.length());
EasyMock.expect(azureStorage.getBlockBlobInputStream(CONTAINER, blobPath)).andReturn(
new ByteArrayInputStream(testLog.getBytes(StandardCharsets.UTF_8)));
@ -251,9 +251,9 @@ public class AzureTaskLogsTest extends EasyMockSupport
final String testLog = "hello this is a log";
final String blobPath = PREFIX + "/" + TASK_ID + "/log";
EasyMock.expect(azureStorage.getBlobExists(CONTAINER, blobPath)).andReturn(true);
EasyMock.expect(azureStorage.getBlobLength(CONTAINER, blobPath)).andReturn((long) testLog.length());
EasyMock.expect(azureStorage.getBlobInputStream(CONTAINER, blobPath)).andReturn(
EasyMock.expect(azureStorage.getBlockBlobExists(CONTAINER, blobPath)).andReturn(true);
EasyMock.expect(azureStorage.getBlockBlobLength(CONTAINER, blobPath)).andReturn((long) testLog.length());
EasyMock.expect(azureStorage.getBlockBlobInputStream(CONTAINER, blobPath)).andReturn(
new ByteArrayInputStream(StringUtils.toUtf8(testLog)));
@ -274,9 +274,9 @@ public class AzureTaskLogsTest extends EasyMockSupport
final String testLog = "hello this is a log";
final String blobPath = PREFIX + "/" + TASK_ID + "/report.json";
EasyMock.expect(azureStorage.getBlobExists(CONTAINER, blobPath)).andReturn(true);
EasyMock.expect(azureStorage.getBlobLength(CONTAINER, blobPath)).andReturn((long) testLog.length());
EasyMock.expect(azureStorage.getBlobInputStream(CONTAINER, blobPath)).andReturn(
EasyMock.expect(azureStorage.getBlockBlobExists(CONTAINER, blobPath)).andReturn(true);
EasyMock.expect(azureStorage.getBlockBlobLength(CONTAINER, blobPath)).andReturn((long) testLog.length());
EasyMock.expect(azureStorage.getBlockBlobInputStream(CONTAINER, blobPath)).andReturn(
new ByteArrayInputStream(testLog.getBytes(StandardCharsets.UTF_8)));
@ -297,7 +297,7 @@ public class AzureTaskLogsTest extends EasyMockSupport
final String testLog = "hello this is a log";
final String blobPath = PREFIX + "/" + TASK_ID_NOT_FOUND + "/report.json";
EasyMock.expect(azureStorage.getBlobExists(CONTAINER, blobPath)).andReturn(false);
EasyMock.expect(azureStorage.getBlockBlobExists(CONTAINER, blobPath)).andReturn(false);
replayAll();
@ -315,9 +315,9 @@ public class AzureTaskLogsTest extends EasyMockSupport
final String testLog = "hello this is a log";
final String blobPath = PREFIX + "/" + TASK_ID + "/report.json";
EasyMock.expect(azureStorage.getBlobExists(CONTAINER, blobPath)).andReturn(true);
EasyMock.expect(azureStorage.getBlobLength(CONTAINER, blobPath)).andReturn((long) testLog.length());
EasyMock.expect(azureStorage.getBlobInputStream(CONTAINER, blobPath)).andThrow(
EasyMock.expect(azureStorage.getBlockBlobExists(CONTAINER, blobPath)).andReturn(true);
EasyMock.expect(azureStorage.getBlockBlobLength(CONTAINER, blobPath)).andReturn((long) testLog.length());
EasyMock.expect(azureStorage.getBlockBlobInputStream(CONTAINER, blobPath)).andThrow(
new URISyntaxException("", ""));
@ -336,7 +336,7 @@ public class AzureTaskLogsTest extends EasyMockSupport
final String testLog = "hello this is a log";
final String blobPath = PREFIX + "/" + TASK_ID + "/report.json";
EasyMock.expect(azureStorage.getBlobExists(CONTAINER, blobPath)).andThrow(new URISyntaxException("", ""));
EasyMock.expect(azureStorage.getBlockBlobExists(CONTAINER, blobPath)).andThrow(new URISyntaxException("", ""));
replayAll();
@ -351,9 +351,9 @@ public class AzureTaskLogsTest extends EasyMockSupport
final String taskStatus = "{}";
final String blobPath = PREFIX + "/" + TASK_ID + "/status.json";
EasyMock.expect(azureStorage.getBlobExists(CONTAINER, blobPath)).andReturn(true);
EasyMock.expect(azureStorage.getBlobLength(CONTAINER, blobPath)).andReturn((long) taskStatus.length());
EasyMock.expect(azureStorage.getBlobInputStream(CONTAINER, blobPath)).andReturn(
EasyMock.expect(azureStorage.getBlockBlobExists(CONTAINER, blobPath)).andReturn(true);
EasyMock.expect(azureStorage.getBlockBlobLength(CONTAINER, blobPath)).andReturn((long) taskStatus.length());
EasyMock.expect(azureStorage.getBlockBlobInputStream(CONTAINER, blobPath)).andReturn(
new ByteArrayInputStream(taskStatus.getBytes(StandardCharsets.UTF_8)));
@ -372,7 +372,7 @@ public class AzureTaskLogsTest extends EasyMockSupport
public void test_streamTaskStatus_blobDoesNotExist_returnsAbsent() throws Exception
{
final String blobPath = PREFIX + "/" + TASK_ID_NOT_FOUND + "/status.json";
EasyMock.expect(azureStorage.getBlobExists(CONTAINER, blobPath)).andReturn(false);
EasyMock.expect(azureStorage.getBlockBlobExists(CONTAINER, blobPath)).andReturn(false);
replayAll();
@ -390,9 +390,9 @@ public class AzureTaskLogsTest extends EasyMockSupport
final String taskStatus = "{}";
final String blobPath = PREFIX + "/" + TASK_ID + "/status.json";
EasyMock.expect(azureStorage.getBlobExists(CONTAINER, blobPath)).andReturn(true);
EasyMock.expect(azureStorage.getBlobLength(CONTAINER, blobPath)).andReturn((long) taskStatus.length());
EasyMock.expect(azureStorage.getBlobInputStream(CONTAINER, blobPath)).andThrow(
EasyMock.expect(azureStorage.getBlockBlobExists(CONTAINER, blobPath)).andReturn(true);
EasyMock.expect(azureStorage.getBlockBlobLength(CONTAINER, blobPath)).andReturn((long) taskStatus.length());
EasyMock.expect(azureStorage.getBlockBlobInputStream(CONTAINER, blobPath)).andThrow(
new URISyntaxException("", ""));
@ -409,7 +409,7 @@ public class AzureTaskLogsTest extends EasyMockSupport
public void test_streamTaskStatus_exceptionWhenCheckingBlobExistence_throwsException() throws Exception
{
final String blobPath = PREFIX + "/" + TASK_ID + "/status.json";
EasyMock.expect(azureStorage.getBlobExists(CONTAINER, blobPath)).andThrow(new URISyntaxException("", ""));
EasyMock.expect(azureStorage.getBlockBlobExists(CONTAINER, blobPath)).andThrow(new URISyntaxException("", ""));
replayAll();


@ -0,0 +1,34 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.druid.storage.azure.output;
import nl.jqno.equalsverifier.EqualsVerifier;
import org.junit.Test;
public class AzureInputRangeTest
{
@Test
public void testEquals()
{
EqualsVerifier.forClass(AzureInputRange.class)
.usingGetClass()
.verify();
}
}


@ -0,0 +1,85 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.druid.storage.azure.output;
import org.apache.druid.error.DruidException;
import org.apache.druid.java.util.common.FileUtils;
import org.apache.druid.java.util.common.HumanReadableBytes;
import org.apache.druid.java.util.common.ISE;
import org.junit.Assert;
import org.junit.Rule;
import org.junit.Test;
import org.junit.rules.TemporaryFolder;
import java.io.File;
import java.io.IOException;
public class AzureOutputConfigTest
{
@Rule
public final TemporaryFolder temporaryFolder = new TemporaryFolder();
private static final String CONTAINER = "container";
private static final String PREFIX = "prefix";
private static final int MAX_RETRY_COUNT = 0;
@Test
public void testTooLargeChunkSize()
{
HumanReadableBytes chunkSize = new HumanReadableBytes("4001MiB");
Assert.assertThrows(
DruidException.class,
() -> new AzureOutputConfig(CONTAINER, PREFIX, temporaryFolder.newFolder(), chunkSize, MAX_RETRY_COUNT)
);
}
@Test
public void testTempDirectoryNotWritable() throws IOException
{
File tempDir = temporaryFolder.newFolder();
if (!tempDir.setWritable(false)) {
throw new ISE("Unable to change the permission of temp folder for %s", this.getClass().getName());
}
//noinspection ResultOfObjectAllocationIgnored
Assert.assertThrows(
DruidException.class,
() -> new AzureOutputConfig(CONTAINER, PREFIX, tempDir, null, MAX_RETRY_COUNT)
);
}
@Test
public void testTempDirectoryNotPresentButWritable() throws IOException
{
File tempDir = new File(temporaryFolder.newFolder() + "/notPresent1/notPresent2/notPresent3");
//noinspection ResultOfObjectAllocationIgnored
new AzureOutputConfig(CONTAINER, PREFIX, tempDir, null, MAX_RETRY_COUNT);
}
@Test
public void testTempDirectoryPresent() throws IOException
{
File tempDir = new File(temporaryFolder.newFolder() + "/notPresent1/notPresent2/notPresent3");
FileUtils.mkdirp(tempDir);
//noinspection ResultOfObjectAllocationIgnored
new AzureOutputConfig(CONTAINER, PREFIX, tempDir, null, MAX_RETRY_COUNT);
}
}


@ -0,0 +1,140 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.druid.storage.azure.output;
import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.exc.MismatchedInputException;
import com.fasterxml.jackson.databind.exc.ValueInstantiationException;
import org.apache.druid.java.util.common.HumanReadableBytes;
import org.apache.druid.java.util.common.StringUtils;
import org.junit.Assert;
import org.junit.Test;
import java.io.File;
import java.io.IOException;
public class AzureOutputSerdeTest
{
private static final ObjectMapper MAPPER = new ObjectMapper();
@Test
public void sanity() throws IOException
{
String json = jsonStringReadyForAssert("{\n"
+ " \"container\": \"TEST\",\n"
+ " \"prefix\": \"abc\",\n"
+ " \"tempDir\": \"/tmp\",\n"
+ " \"chunkSize\":104857600,\n"
+ " \"maxRetry\": 2\n"
+ "}\n");
AzureOutputConfig azureOutputConfig = new AzureOutputConfig(
"TEST",
"abc",
new File("/tmp"),
HumanReadableBytes.valueOf(HumanReadableBytes.parse("100Mib")),
2
);
Assert.assertEquals(
json,
MAPPER.writeValueAsString(azureOutputConfig)
);
Assert.assertEquals(azureOutputConfig, MAPPER.readValue(json, AzureOutputConfig.class));
}
@Test
public void noPrefix()
{
String json = jsonStringReadyForAssert("{\n"
+ " \"container\": \"TEST\",\n"
+ " \"tempDir\": \"/tmp\",\n"
+ " \"chunkSize\":104857600,\n"
+ " \"maxRetry\": 2\n"
+ "}\n");
Assert.assertThrows(MismatchedInputException.class, () -> MAPPER.readValue(json, AzureOutputConfig.class));
}
@Test
public void noContainer()
{
String json = jsonStringReadyForAssert("{\n"
+ " \"prefix\": \"abc\",\n"
+ " \"tempDir\": \"/tmp\",\n"
+ " \"chunkSize\":104857600,\n"
+ " \"maxRetry\": 2\n"
+ "}\n");
Assert.assertThrows(MismatchedInputException.class, () -> MAPPER.readValue(json, AzureOutputConfig.class));
}
@Test
public void noTempDir()
{
String json = jsonStringReadyForAssert("{\n"
+ " \"prefix\": \"abc\",\n"
+ " \"container\": \"TEST\",\n"
+ " \"chunkSize\":104857600,\n"
+ " \"maxRetry\": 2\n"
+ "}\n");
Assert.assertThrows(MismatchedInputException.class, () -> MAPPER.readValue(json, AzureOutputConfig.class));
}
@Test
public void leastArguments() throws JsonProcessingException
{
String json = jsonStringReadyForAssert("{\n"
+ " \"tempDir\": \"/tmp\",\n"
+ " \"prefix\": \"abc\",\n"
+ " \"container\": \"TEST\"\n"
+ "}\n");
AzureOutputConfig azureOutputConfig = new AzureOutputConfig(
"TEST",
"abc",
new File("/tmp"),
null,
null
);
Assert.assertEquals(azureOutputConfig, MAPPER.readValue(json, AzureOutputConfig.class));
}
@Test
public void testChunkValidation()
{
String json = jsonStringReadyForAssert("{\n"
+ " \"prefix\": \"abc\",\n"
+ " \"container\": \"TEST\",\n"
+ " \"tempDir\": \"/tmp\",\n"
+ " \"chunkSize\":104,\n"
+ " \"maxRetry\": 2\n"
+ "}\n");
Assert.assertThrows(ValueInstantiationException.class, () -> MAPPER.readValue(json, AzureOutputConfig.class));
}
private static String jsonStringReadyForAssert(String input)
{
return StringUtils.removeChar(StringUtils.removeChar(input, '\n'), ' ');
}
}


@ -0,0 +1,148 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.druid.storage.azure.output;
import com.fasterxml.jackson.databind.InjectableValues;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.inject.Injector;
import com.google.inject.Key;
import com.google.inject.ProvisionException;
import com.google.inject.name.Names;
import org.apache.druid.guice.JsonConfigProvider;
import org.apache.druid.guice.LazySingleton;
import org.apache.druid.guice.StartupInjectorBuilder;
import org.apache.druid.storage.StorageConnector;
import org.apache.druid.storage.StorageConnectorModule;
import org.apache.druid.storage.StorageConnectorProvider;
import org.apache.druid.storage.azure.AzureStorage;
import org.apache.druid.storage.azure.AzureStorageDruidModule;
import org.easymock.EasyMock;
import org.junit.Assert;
import org.junit.Test;
import java.io.File;
import java.util.Properties;
public class AzureStorageConnectorProviderTest
{
private static final String CUSTOM_NAMESPACE = "custom";
@Test
public void createAzureStorageFactoryWithRequiredProperties()
{
final Properties properties = new Properties();
properties.setProperty(CUSTOM_NAMESPACE + ".type", "azure");
properties.setProperty(CUSTOM_NAMESPACE + ".container", "container");
properties.setProperty(CUSTOM_NAMESPACE + ".prefix", "prefix");
properties.setProperty(CUSTOM_NAMESPACE + ".tempDir", "/tmp");
StorageConnectorProvider azureStorageConnectorProvider = getStorageConnectorProvider(properties);
Assert.assertTrue(azureStorageConnectorProvider instanceof AzureStorageConnectorProvider);
Assert.assertTrue(azureStorageConnectorProvider.get() instanceof AzureStorageConnector);
Assert.assertEquals("container", ((AzureStorageConnectorProvider) azureStorageConnectorProvider).getContainer());
Assert.assertEquals("prefix", ((AzureStorageConnectorProvider) azureStorageConnectorProvider).getPrefix());
Assert.assertEquals(new File("/tmp"), ((AzureStorageConnectorProvider) azureStorageConnectorProvider).getTempDir());
}
@Test
public void createAzureStorageFactoryWithMissingPrefix()
{
final Properties properties = new Properties();
properties.setProperty(CUSTOM_NAMESPACE + ".type", "azure");
properties.setProperty(CUSTOM_NAMESPACE + ".container", "container");
properties.setProperty(CUSTOM_NAMESPACE + ".tempDir", "/tmp");
Assert.assertThrows(
"Missing required creator property 'prefix'",
ProvisionException.class,
() -> getStorageConnectorProvider(properties)
);
}
@Test
public void createAzureStorageFactoryWithMissingContainer()
{
final Properties properties = new Properties();
properties.setProperty(CUSTOM_NAMESPACE + ".type", "azure");
properties.setProperty(CUSTOM_NAMESPACE + ".prefix", "prefix");
properties.setProperty(CUSTOM_NAMESPACE + ".tempDir", "/tmp");
Assert.assertThrows(
"Missing required creator property 'container'",
ProvisionException.class,
() -> getStorageConnectorProvider(properties)
);
}
@Test
public void createAzureStorageFactoryWithMissingTempDir()
{
final Properties properties = new Properties();
properties.setProperty(CUSTOM_NAMESPACE + ".type", "azure");
properties.setProperty(CUSTOM_NAMESPACE + ".container", "container");
properties.setProperty(CUSTOM_NAMESPACE + ".prefix", "prefix");
Assert.assertThrows(
"Missing required creator property 'tempDir'",
ProvisionException.class,
() -> getStorageConnectorProvider(properties)
);
}
private StorageConnectorProvider getStorageConnectorProvider(Properties properties)
{
StartupInjectorBuilder startupInjectorBuilder = new StartupInjectorBuilder().add(
new AzureStorageDruidModule(),
new StorageConnectorModule(),
new AzureStorageConnectorModule(),
binder -> {
JsonConfigProvider.bind(
binder,
CUSTOM_NAMESPACE,
StorageConnectorProvider.class,
Names.named(CUSTOM_NAMESPACE)
);
binder.bind(Key.get(StorageConnector.class, Names.named(CUSTOM_NAMESPACE)))
.toProvider(Key.get(StorageConnectorProvider.class, Names.named(CUSTOM_NAMESPACE)))
.in(LazySingleton.class);
}
).withProperties(properties);
Injector injector = startupInjectorBuilder.build();
injector.getInstance(ObjectMapper.class).registerModules(new AzureStorageConnectorModule().getJacksonModules());
injector.getInstance(ObjectMapper.class).setInjectableValues(
new InjectableValues.Std()
.addValue(
AzureStorage.class,
EasyMock.mock(AzureStorage.class)
));
return injector.getInstance(Key.get(
StorageConnectorProvider.class,
Names.named(CUSTOM_NAMESPACE)
));
}
}


@ -0,0 +1,202 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.druid.storage.azure.output;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.Lists;
import com.microsoft.azure.storage.StorageException;
import org.apache.commons.io.IOUtils;
import org.apache.druid.storage.StorageConnector;
import org.apache.druid.storage.azure.AzureStorage;
import org.easymock.Capture;
import org.easymock.EasyMock;
import org.junit.Assert;
import org.junit.Before;
import org.junit.Rule;
import org.junit.Test;
import org.junit.rules.TemporaryFolder;
import java.io.IOException;
import java.io.InputStream;
import java.net.URISyntaxException;
import java.nio.charset.StandardCharsets;
import java.util.Collections;
import java.util.List;
public class AzureStorageConnectorTest
{
private static final String CONTAINER = "CONTAINER";
private static final String PREFIX = "P/R/E/F/I/X";
public static final String TEST_FILE = "test.csv";
@Rule
public TemporaryFolder temporaryFolder = new TemporaryFolder();
private StorageConnector storageConnector;
private final AzureStorage azureStorage = EasyMock.createMock(AzureStorage.class);
@Before
public void setup() throws IOException
{
storageConnector = new AzureStorageConnector(
new AzureOutputConfig(CONTAINER, PREFIX, temporaryFolder.newFolder(), null, null),
azureStorage
);
}
@Test
public void testPathExistsSuccess() throws URISyntaxException, StorageException, IOException
{
final Capture<String> bucket = Capture.newInstance();
final Capture<String> path = Capture.newInstance();
EasyMock.reset(azureStorage);
EasyMock.expect(azureStorage.getBlockBlobExists(EasyMock.capture(bucket), EasyMock.capture(path), EasyMock.anyInt()))
.andReturn(true);
EasyMock.replay(azureStorage);
Assert.assertTrue(storageConnector.pathExists(TEST_FILE));
Assert.assertEquals(CONTAINER, bucket.getValue());
Assert.assertEquals(PREFIX + "/" + TEST_FILE, path.getValue());
EasyMock.verify(azureStorage);
}
@Test
public void testPathExistsNotFound() throws URISyntaxException, StorageException, IOException
{
final Capture<String> bucket = Capture.newInstance();
final Capture<String> path = Capture.newInstance();
EasyMock.reset(azureStorage);
EasyMock.expect(azureStorage.getBlockBlobExists(EasyMock.capture(bucket), EasyMock.capture(path), EasyMock.anyInt()))
.andReturn(false);
EasyMock.replay(azureStorage);
Assert.assertFalse(storageConnector.pathExists(TEST_FILE));
Assert.assertEquals(CONTAINER, bucket.getValue());
Assert.assertEquals(PREFIX + "/" + TEST_FILE, path.getValue());
EasyMock.verify(azureStorage);
}
@Test
public void testRead() throws URISyntaxException, StorageException, IOException
{
EasyMock.reset(azureStorage);
String data = "test";
EasyMock.expect(azureStorage.getBlockBlobLength(EasyMock.anyString(), EasyMock.anyString()))
.andReturn(4L);
EasyMock.expect(
azureStorage.getBlockBlobInputStream(
EasyMock.anyLong(),
EasyMock.anyLong(),
EasyMock.anyString(),
EasyMock.anyString(),
EasyMock.anyInt()
)
).andReturn(IOUtils.toInputStream(data, StandardCharsets.UTF_8));
EasyMock.replay(azureStorage);
InputStream is = storageConnector.read(TEST_FILE);
byte[] dataBytes = new byte[data.length()];
Assert.assertEquals(data.length(), is.read(dataBytes));
Assert.assertEquals(-1, is.read());
Assert.assertEquals(data, new String(dataBytes, StandardCharsets.UTF_8));
EasyMock.reset(azureStorage);
}
@Test
public void testReadRange() throws URISyntaxException, StorageException, IOException
{
String data = "test";
for (int start = 0; start < data.length(); ++start) {
for (long length = 1; length <= data.length() - start; ++length) {
String dataQueried = data.substring(start, start + ((Long) length).intValue());
EasyMock.reset(azureStorage);
EasyMock.expect(azureStorage.getBlockBlobInputStream(
EasyMock.anyLong(),
EasyMock.anyLong(),
EasyMock.anyString(),
EasyMock.anyString(),
EasyMock.anyInt()
))
.andReturn(IOUtils.toInputStream(dataQueried, StandardCharsets.UTF_8));
EasyMock.replay(azureStorage);
InputStream is = storageConnector.readRange(TEST_FILE, start, length);
byte[] dataBytes = new byte[((Long) length).intValue()];
Assert.assertEquals(length, is.read(dataBytes));
Assert.assertEquals(-1, is.read());
Assert.assertEquals(dataQueried, new String(dataBytes, StandardCharsets.UTF_8));
EasyMock.reset(azureStorage);
}
}
}
@Test
public void testDeleteSinglePath() throws URISyntaxException, StorageException, IOException
{
EasyMock.reset(azureStorage);
Capture<String> containerCapture = EasyMock.newCapture();
Capture<Iterable<String>> pathsCapture = EasyMock.newCapture();
azureStorage.batchDeleteFiles(
EasyMock.capture(containerCapture),
EasyMock.capture(pathsCapture),
EasyMock.anyInt()
);
EasyMock.replay(azureStorage);
storageConnector.deleteFile(TEST_FILE);
Assert.assertEquals(CONTAINER, containerCapture.getValue());
Assert.assertEquals(Collections.singletonList(PREFIX + "/" + TEST_FILE), pathsCapture.getValue());
EasyMock.reset(azureStorage);
}
@Test
public void testDeleteMultiplePaths() throws URISyntaxException, StorageException, IOException
{
EasyMock.reset(azureStorage);
Capture<String> containerCapture = EasyMock.newCapture();
Capture<Iterable<String>> pathsCapture = EasyMock.newCapture();
azureStorage.batchDeleteFiles(EasyMock.capture(containerCapture), EasyMock.capture(pathsCapture), EasyMock.anyInt());
EasyMock.replay(azureStorage);
storageConnector.deleteFiles(ImmutableList.of(TEST_FILE + "_1.part", TEST_FILE + "_2.part"));
Assert.assertEquals(CONTAINER, containerCapture.getValue());
Assert.assertEquals(
ImmutableList.of(
PREFIX + "/" + TEST_FILE + "_1.part",
PREFIX + "/" + TEST_FILE + "_2.part"
),
Lists.newArrayList(pathsCapture.getValue())
);
EasyMock.reset(azureStorage);
}
@Test
public void testListDir() throws URISyntaxException, StorageException, IOException
{
EasyMock.reset(azureStorage);
EasyMock.expect(azureStorage.listDir(EasyMock.anyString(), EasyMock.anyString(), EasyMock.anyInt()))
.andReturn(ImmutableList.of(PREFIX + "/x/y/z/" + TEST_FILE, PREFIX + "/p/q/r/" + TEST_FILE));
EasyMock.replay(azureStorage);
List<String> ret = Lists.newArrayList(storageConnector.listDir(""));
Assert.assertEquals(ImmutableList.of("x/y/z/" + TEST_FILE, "p/q/r/" + TEST_FILE), ret);
EasyMock.reset(azureStorage);
}
}


@ -57,13 +57,13 @@ public class DoublesSketchApproxQuantileSqlAggregator implements SqlAggregator
private static final String NAME = "APPROX_QUANTILE_DS";
private static final SqlAggFunction FUNCTION_INSTANCE =
OperatorConversions.aggregatorBuilder(NAME)
.operandNames("column", "probability", "k")
.operandTypes(SqlTypeFamily.ANY, SqlTypeFamily.NUMERIC, SqlTypeFamily.EXACT_NUMERIC)
.returnTypeNonNull(SqlTypeName.DOUBLE)
.requiredOperandCount(2)
.literalOperands(1, 2)
.functionCategory(SqlFunctionCategory.NUMERIC)
.build();
.operandNames("column", "probability", "k")
.operandTypes(SqlTypeFamily.ANY, SqlTypeFamily.NUMERIC, SqlTypeFamily.EXACT_NUMERIC)
.returnTypeNonNull(SqlTypeName.DOUBLE)
.requiredOperandCount(2)
.literalOperands(1, 2)
.functionCategory(SqlFunctionCategory.NUMERIC)
.build();
@Override
public SqlAggFunction calciteFunction()


@ -40,6 +40,7 @@ import org.apache.calcite.util.Static;
import org.apache.druid.java.util.common.StringUtils;
import org.apache.druid.query.aggregation.PostAggregator;
import org.apache.druid.segment.column.RowSignature;
import org.apache.druid.sql.calcite.expression.BasicOperandTypeChecker;
import org.apache.druid.sql.calcite.expression.DruidExpression;
import org.apache.druid.sql.calcite.expression.OperatorConversions;
import org.apache.druid.sql.calcite.expression.PostAggregatorVisitor;
@ -143,7 +144,7 @@ public abstract class DoublesSketchListArgBaseOperatorConversion implements SqlO
// Verify that 'operand' is a literal number.
if (!SqlUtil.isLiteral(operand)) {
return OperatorConversions.throwOrReturn(
return BasicOperandTypeChecker.throwOrReturn(
throwOnFailure,
callBinding,
cb -> cb.getValidator()
@ -155,7 +156,7 @@ public abstract class DoublesSketchListArgBaseOperatorConversion implements SqlO
}
if (!SqlTypeFamily.NUMERIC.contains(operandType)) {
return OperatorConversions.throwOrReturn(
return BasicOperandTypeChecker.throwOrReturn(
throwOnFailure,
callBinding,
SqlCallBinding::newValidationSignatureError

Some files were not shown because too many files have changed in this diff.