mirror of https://github.com/apache/druid.git
add 'prefixes' support to google input source (#8930)
* add prefixes support to google input source, making it symmetrical-ish with s3 * docs * more better, and tests * unused * formatting * javadoc * dependencies * oops * review comments * better javadoc
This commit is contained in:
parent
1cff73f3e0
commit
5ecdf94d83
|
@ -0,0 +1,181 @@
|
||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one
|
||||||
|
* or more contributor license agreements. See the NOTICE file
|
||||||
|
* distributed with this work for additional information
|
||||||
|
* regarding copyright ownership. The ASF licenses this file
|
||||||
|
* to you under the Apache License, Version 2.0 (the
|
||||||
|
* "License"); you may not use this file except in compliance
|
||||||
|
* with the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing,
|
||||||
|
* software distributed under the License is distributed on an
|
||||||
|
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||||
|
* KIND, either express or implied. See the License for the
|
||||||
|
* specific language governing permissions and limitations
|
||||||
|
* under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.apache.druid.data.input.impl;
|
||||||
|
|
||||||
|
import com.fasterxml.jackson.annotation.JsonProperty;
|
||||||
|
import com.google.common.primitives.Ints;
|
||||||
|
import org.apache.druid.data.input.AbstractInputSource;
|
||||||
|
import org.apache.druid.data.input.InputEntity;
|
||||||
|
import org.apache.druid.data.input.InputFormat;
|
||||||
|
import org.apache.druid.data.input.InputRowSchema;
|
||||||
|
import org.apache.druid.data.input.InputSourceReader;
|
||||||
|
import org.apache.druid.data.input.InputSplit;
|
||||||
|
import org.apache.druid.data.input.SplitHintSpec;
|
||||||
|
import org.apache.druid.utils.CollectionUtils;
|
||||||
|
|
||||||
|
import javax.annotation.Nullable;
|
||||||
|
import java.io.File;
|
||||||
|
import java.net.URI;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Objects;
|
||||||
|
import java.util.stream.Stream;
|
||||||
|
|
||||||
|
public abstract class CloudObjectInputSource<T extends InputEntity> extends AbstractInputSource
|
||||||
|
implements SplittableInputSource<CloudObjectLocation>
|
||||||
|
{
|
||||||
|
private final List<URI> uris;
|
||||||
|
private final List<URI> prefixes;
|
||||||
|
private final List<CloudObjectLocation> objects;
|
||||||
|
|
||||||
|
public CloudObjectInputSource(
|
||||||
|
String scheme,
|
||||||
|
@Nullable List<URI> uris,
|
||||||
|
@Nullable List<URI> prefixes,
|
||||||
|
@Nullable List<CloudObjectLocation> objects
|
||||||
|
)
|
||||||
|
{
|
||||||
|
this.uris = uris;
|
||||||
|
this.prefixes = prefixes;
|
||||||
|
this.objects = objects;
|
||||||
|
|
||||||
|
if (!CollectionUtils.isNullOrEmpty(objects)) {
|
||||||
|
throwIfIllegalArgs(!CollectionUtils.isNullOrEmpty(uris) || !CollectionUtils.isNullOrEmpty(prefixes));
|
||||||
|
} else if (!CollectionUtils.isNullOrEmpty(uris)) {
|
||||||
|
throwIfIllegalArgs(!CollectionUtils.isNullOrEmpty(prefixes));
|
||||||
|
uris.forEach(uri -> CloudObjectLocation.validateUriScheme(scheme, uri));
|
||||||
|
} else if (!CollectionUtils.isNullOrEmpty(prefixes)) {
|
||||||
|
prefixes.forEach(uri -> CloudObjectLocation.validateUriScheme(scheme, uri));
|
||||||
|
} else {
|
||||||
|
throwIfIllegalArgs(true);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@JsonProperty
|
||||||
|
public List<URI> getUris()
|
||||||
|
{
|
||||||
|
return uris;
|
||||||
|
}
|
||||||
|
|
||||||
|
@JsonProperty
|
||||||
|
public List<URI> getPrefixes()
|
||||||
|
{
|
||||||
|
return prefixes;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Nullable
|
||||||
|
@JsonProperty
|
||||||
|
public List<CloudObjectLocation> getObjects()
|
||||||
|
{
|
||||||
|
return objects;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Create the correct {@link InputEntity} for this input source given a split on a {@link CloudObjectLocation}. This
|
||||||
|
* is called internally by {@link #formattableReader} and operates on the output of {@link #createSplits}.
|
||||||
|
*/
|
||||||
|
protected abstract T createEntity(InputSplit<CloudObjectLocation> split);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Create a stream of {@link CloudObjectLocation} splits by listing objects that appear under {@link #prefixes} using
|
||||||
|
* this input sources backend API. This is called internally by {@link #createSplits} and {@link #estimateNumSplits},
|
||||||
|
* only if {@link #prefixes} is set, otherwise the splits are created directly from {@link #uris} or {@link #objects}.
|
||||||
|
* Calling if {@link #prefixes} is not set is likely to either lead to an empty iterator or null pointer exception.
|
||||||
|
*/
|
||||||
|
protected abstract Stream<InputSplit<CloudObjectLocation>> getPrefixesSplitStream();
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Stream<InputSplit<CloudObjectLocation>> createSplits(
|
||||||
|
InputFormat inputFormat,
|
||||||
|
@Nullable SplitHintSpec splitHintSpec
|
||||||
|
)
|
||||||
|
{
|
||||||
|
if (!CollectionUtils.isNullOrEmpty(objects)) {
|
||||||
|
return objects.stream().map(InputSplit::new);
|
||||||
|
}
|
||||||
|
if (!CollectionUtils.isNullOrEmpty(uris)) {
|
||||||
|
return uris.stream().map(CloudObjectLocation::new).map(InputSplit::new);
|
||||||
|
}
|
||||||
|
|
||||||
|
return getPrefixesSplitStream();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public int estimateNumSplits(InputFormat inputFormat, @Nullable SplitHintSpec splitHintSpec)
|
||||||
|
{
|
||||||
|
if (!CollectionUtils.isNullOrEmpty(objects)) {
|
||||||
|
return objects.size();
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!CollectionUtils.isNullOrEmpty(uris)) {
|
||||||
|
return uris.size();
|
||||||
|
}
|
||||||
|
|
||||||
|
return Ints.checkedCast(getPrefixesSplitStream().count());
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public boolean needsFormat()
|
||||||
|
{
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected InputSourceReader formattableReader(
|
||||||
|
InputRowSchema inputRowSchema,
|
||||||
|
InputFormat inputFormat,
|
||||||
|
@Nullable File temporaryDirectory
|
||||||
|
)
|
||||||
|
{
|
||||||
|
return new InputEntityIteratingReader(
|
||||||
|
inputRowSchema,
|
||||||
|
inputFormat,
|
||||||
|
createSplits(inputFormat, null).map(this::createEntity),
|
||||||
|
temporaryDirectory
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public boolean equals(Object o)
|
||||||
|
{
|
||||||
|
if (this == o) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
if (o == null || getClass() != o.getClass()) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
CloudObjectInputSource that = (CloudObjectInputSource) o;
|
||||||
|
return Objects.equals(uris, that.uris) &&
|
||||||
|
Objects.equals(prefixes, that.prefixes) &&
|
||||||
|
Objects.equals(objects, that.objects);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public int hashCode()
|
||||||
|
{
|
||||||
|
return Objects.hash(uris, prefixes, objects);
|
||||||
|
}
|
||||||
|
|
||||||
|
private void throwIfIllegalArgs(boolean clause) throws IllegalArgumentException
|
||||||
|
{
|
||||||
|
if (clause) {
|
||||||
|
throw new IllegalArgumentException("exactly one of either uris or prefixes or objects must be specified");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
|
@ -22,6 +22,7 @@ package org.apache.druid.data.input.impl;
|
||||||
import com.fasterxml.jackson.annotation.JsonCreator;
|
import com.fasterxml.jackson.annotation.JsonCreator;
|
||||||
import com.fasterxml.jackson.annotation.JsonProperty;
|
import com.fasterxml.jackson.annotation.JsonProperty;
|
||||||
import com.google.common.base.Preconditions;
|
import com.google.common.base.Preconditions;
|
||||||
|
import org.apache.druid.java.util.common.IAE;
|
||||||
import org.apache.druid.java.util.common.StringUtils;
|
import org.apache.druid.java.util.common.StringUtils;
|
||||||
|
|
||||||
import java.net.URI;
|
import java.net.URI;
|
||||||
|
@ -45,6 +46,14 @@ import java.util.Objects;
|
||||||
*/
|
*/
|
||||||
public class CloudObjectLocation
|
public class CloudObjectLocation
|
||||||
{
|
{
|
||||||
|
public static URI validateUriScheme(String scheme, URI uri)
|
||||||
|
{
|
||||||
|
if (!scheme.equalsIgnoreCase(uri.getScheme())) {
|
||||||
|
throw new IAE("Invalid URI scheme [%s] must be [%s]", uri.toString(), scheme);
|
||||||
|
}
|
||||||
|
return uri;
|
||||||
|
}
|
||||||
|
|
||||||
private final String bucket;
|
private final String bucket;
|
||||||
private final String path;
|
private final String path;
|
||||||
|
|
||||||
|
@ -125,5 +134,4 @@ public class CloudObjectLocation
|
||||||
{
|
{
|
||||||
return Objects.hash(bucket, path);
|
return Objects.hash(bucket, path);
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -60,12 +60,57 @@ This extension also provides an input source for Druid native batch ingestion to
|
||||||
...
|
...
|
||||||
```
|
```
|
||||||
|
|
||||||
|
```json
|
||||||
|
...
|
||||||
|
"ioConfig": {
|
||||||
|
"type": "index_parallel",
|
||||||
|
"inputSource": {
|
||||||
|
"type": "google",
|
||||||
|
"prefixes": ["gs://foo/bar", "gs://bar/foo"]
|
||||||
|
},
|
||||||
|
"inputFormat": {
|
||||||
|
"type": "json"
|
||||||
|
},
|
||||||
|
...
|
||||||
|
},
|
||||||
|
...
|
||||||
|
```
|
||||||
|
|
||||||
|
|
||||||
|
```json
|
||||||
|
...
|
||||||
|
"ioConfig": {
|
||||||
|
"type": "index_parallel",
|
||||||
|
"inputSource": {
|
||||||
|
"type": "google",
|
||||||
|
"objects": [
|
||||||
|
{ "bucket": "foo", "path": "bar/file1.json"},
|
||||||
|
{ "bucket": "bar", "path": "foo/file2.json"}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"inputFormat": {
|
||||||
|
"type": "json"
|
||||||
|
},
|
||||||
|
...
|
||||||
|
},
|
||||||
|
...
|
||||||
|
```
|
||||||
|
|
||||||
|property|description|default|required?|
|
|property|description|default|required?|
|
||||||
|--------|-----------|-------|---------|
|
|--------|-----------|-------|---------|
|
||||||
|type|This should be `google`.|N/A|yes|
|
|type|This should be `google`.|N/A|yes|
|
||||||
|uris|JSON array of URIs where Google Cloud Storage files to be ingested are located.|N/A|yes|
|
|uris|JSON array of URIs where Google Cloud Storage objects to be ingested are located.|N/A|`uris` or `prefixes` or `objects` must be set|
|
||||||
|
|prefixes|JSON array of URI prefixes for the locations of Google Cloud Storage objects to be ingested.|N/A|`uris` or `prefixes` or `objects` must be set|
|
||||||
|
|objects|JSON array of Google Cloud Storage objects to be ingested.|N/A|`uris` or `prefixes` or `objects` must be set|
|
||||||
|
|
||||||
|
|
||||||
|
Google Cloud Storage object:
|
||||||
|
|
||||||
|
|property|description|default|required?|
|
||||||
|
|--------|-----------|-------|---------|
|
||||||
|
|bucket|Name of the Google Cloud Storage bucket|N/A|yes|
|
||||||
|
|path|The path where data is located.|N/A|yes|
|
||||||
|
|
||||||
## Firehose
|
## Firehose
|
||||||
|
|
||||||
<a name="firehose"></a>
|
<a name="firehose"></a>
|
||||||
|
|
|
@ -124,6 +124,13 @@
|
||||||
<scope>provided</scope>
|
<scope>provided</scope>
|
||||||
</dependency>
|
</dependency>
|
||||||
<!-- Tests -->
|
<!-- Tests -->
|
||||||
|
<dependency>
|
||||||
|
<groupId>org.apache.druid</groupId>
|
||||||
|
<artifactId>druid-core</artifactId>
|
||||||
|
<version>${project.parent.version}</version>
|
||||||
|
<scope>test</scope>
|
||||||
|
<type>test-jar</type>
|
||||||
|
</dependency>
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>org.apache.druid</groupId>
|
<groupId>org.apache.druid</groupId>
|
||||||
<artifactId>druid-processing</artifactId>
|
<artifactId>druid-processing</artifactId>
|
||||||
|
@ -158,5 +165,11 @@
|
||||||
<artifactId>jackson-module-guice</artifactId>
|
<artifactId>jackson-module-guice</artifactId>
|
||||||
<scope>test</scope>
|
<scope>test</scope>
|
||||||
</dependency>
|
</dependency>
|
||||||
|
<dependency>
|
||||||
|
<groupId>joda-time</groupId>
|
||||||
|
<artifactId>joda-time</artifactId>
|
||||||
|
<scope>test</scope>
|
||||||
|
</dependency>
|
||||||
|
|
||||||
</dependencies>
|
</dependencies>
|
||||||
</project>
|
</project>
|
||||||
|
|
|
@ -21,6 +21,7 @@ package org.apache.druid.data.input.google;
|
||||||
|
|
||||||
import com.google.common.base.Predicate;
|
import com.google.common.base.Predicate;
|
||||||
import org.apache.druid.data.input.RetryingInputEntity;
|
import org.apache.druid.data.input.RetryingInputEntity;
|
||||||
|
import org.apache.druid.data.input.impl.CloudObjectLocation;
|
||||||
import org.apache.druid.java.util.common.StringUtils;
|
import org.apache.druid.java.util.common.StringUtils;
|
||||||
import org.apache.druid.storage.google.GoogleByteSource;
|
import org.apache.druid.storage.google.GoogleByteSource;
|
||||||
import org.apache.druid.storage.google.GoogleStorage;
|
import org.apache.druid.storage.google.GoogleStorage;
|
||||||
|
@ -33,36 +34,33 @@ import java.net.URI;
|
||||||
|
|
||||||
public class GoogleCloudStorageEntity extends RetryingInputEntity
|
public class GoogleCloudStorageEntity extends RetryingInputEntity
|
||||||
{
|
{
|
||||||
private final GoogleStorage storage;
|
private final CloudObjectLocation location;
|
||||||
private final URI uri;
|
private final GoogleByteSource byteSource;
|
||||||
|
|
||||||
GoogleCloudStorageEntity(GoogleStorage storage, URI uri)
|
GoogleCloudStorageEntity(GoogleStorage storage, CloudObjectLocation location)
|
||||||
{
|
{
|
||||||
this.storage = storage;
|
this.location = location;
|
||||||
this.uri = uri;
|
this.byteSource = new GoogleByteSource(storage, location.getBucket(), location.getPath());
|
||||||
}
|
}
|
||||||
|
|
||||||
@Nullable
|
@Nullable
|
||||||
@Override
|
@Override
|
||||||
public URI getUri()
|
public URI getUri()
|
||||||
{
|
{
|
||||||
return uri;
|
return location.toUri(GoogleCloudStorageInputSource.SCHEME);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
protected InputStream readFrom(long offset) throws IOException
|
protected InputStream readFrom(long offset) throws IOException
|
||||||
{
|
{
|
||||||
// Get data of the given object and open an input stream
|
// Get data of the given object and open an input stream
|
||||||
final String bucket = uri.getAuthority();
|
|
||||||
final String key = StringUtils.maybeRemoveLeadingSlash(uri.getPath());
|
|
||||||
final GoogleByteSource byteSource = new GoogleByteSource(storage, bucket, key);
|
|
||||||
return byteSource.openStream(offset);
|
return byteSource.openStream(offset);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
protected String getPath()
|
protected String getPath()
|
||||||
{
|
{
|
||||||
return StringUtils.maybeRemoveLeadingSlash(uri.getPath());
|
return StringUtils.maybeRemoveLeadingSlash(byteSource.getPath());
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
|
|
@ -22,104 +22,78 @@ package org.apache.druid.data.input.google;
|
||||||
import com.fasterxml.jackson.annotation.JacksonInject;
|
import com.fasterxml.jackson.annotation.JacksonInject;
|
||||||
import com.fasterxml.jackson.annotation.JsonCreator;
|
import com.fasterxml.jackson.annotation.JsonCreator;
|
||||||
import com.fasterxml.jackson.annotation.JsonProperty;
|
import com.fasterxml.jackson.annotation.JsonProperty;
|
||||||
|
import com.google.api.services.storage.model.StorageObject;
|
||||||
import com.google.common.collect.ImmutableList;
|
import com.google.common.collect.ImmutableList;
|
||||||
import org.apache.druid.data.input.AbstractInputSource;
|
|
||||||
import org.apache.druid.data.input.InputFormat;
|
|
||||||
import org.apache.druid.data.input.InputRowSchema;
|
|
||||||
import org.apache.druid.data.input.InputSourceReader;
|
|
||||||
import org.apache.druid.data.input.InputSplit;
|
import org.apache.druid.data.input.InputSplit;
|
||||||
import org.apache.druid.data.input.SplitHintSpec;
|
import org.apache.druid.data.input.impl.CloudObjectInputSource;
|
||||||
import org.apache.druid.data.input.impl.InputEntityIteratingReader;
|
import org.apache.druid.data.input.impl.CloudObjectLocation;
|
||||||
import org.apache.druid.data.input.impl.SplittableInputSource;
|
import org.apache.druid.data.input.impl.SplittableInputSource;
|
||||||
|
import org.apache.druid.java.util.common.RetryUtils;
|
||||||
import org.apache.druid.storage.google.GoogleStorage;
|
import org.apache.druid.storage.google.GoogleStorage;
|
||||||
|
import org.apache.druid.storage.google.GoogleUtils;
|
||||||
|
|
||||||
import javax.annotation.Nullable;
|
import javax.annotation.Nullable;
|
||||||
import java.io.File;
|
|
||||||
import java.net.URI;
|
import java.net.URI;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Objects;
|
|
||||||
import java.util.stream.Stream;
|
import java.util.stream.Stream;
|
||||||
|
import java.util.stream.StreamSupport;
|
||||||
|
|
||||||
public class GoogleCloudStorageInputSource extends AbstractInputSource implements SplittableInputSource<URI>
|
public class GoogleCloudStorageInputSource extends CloudObjectInputSource<GoogleCloudStorageEntity>
|
||||||
{
|
{
|
||||||
|
static final String SCHEME = "gs";
|
||||||
|
|
||||||
private final GoogleStorage storage;
|
private final GoogleStorage storage;
|
||||||
private final List<URI> uris;
|
|
||||||
|
|
||||||
@JsonCreator
|
@JsonCreator
|
||||||
public GoogleCloudStorageInputSource(
|
public GoogleCloudStorageInputSource(
|
||||||
@JacksonInject GoogleStorage storage,
|
@JacksonInject GoogleStorage storage,
|
||||||
@JsonProperty("uris") List<URI> uris
|
@JsonProperty("uris") @Nullable List<URI> uris,
|
||||||
|
@JsonProperty("prefixes") @Nullable List<URI> prefixes,
|
||||||
|
@JsonProperty("objects") @Nullable List<CloudObjectLocation> objects
|
||||||
)
|
)
|
||||||
{
|
{
|
||||||
|
super(SCHEME, uris, prefixes, objects);
|
||||||
this.storage = storage;
|
this.storage = storage;
|
||||||
this.uris = uris;
|
|
||||||
}
|
|
||||||
|
|
||||||
@JsonProperty("uris")
|
|
||||||
public List<URI> getUris()
|
|
||||||
{
|
|
||||||
return uris;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public Stream<InputSplit<URI>> createSplits(InputFormat inputFormat, @Nullable SplitHintSpec splitHintSpec)
|
|
||||||
{
|
|
||||||
return uris.stream().map(InputSplit::new);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public int estimateNumSplits(InputFormat inputFormat, @Nullable SplitHintSpec splitHintSpec)
|
protected GoogleCloudStorageEntity createEntity(InputSplit<CloudObjectLocation> split)
|
||||||
{
|
{
|
||||||
return uris.size();
|
return new GoogleCloudStorageEntity(storage, split.get());
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public SplittableInputSource<URI> withSplit(InputSplit<URI> split)
|
protected Stream<InputSplit<CloudObjectLocation>> getPrefixesSplitStream()
|
||||||
{
|
{
|
||||||
return new GoogleCloudStorageInputSource(storage, ImmutableList.of(split.get()));
|
return StreamSupport.stream(storageObjectIterable().spliterator(), false)
|
||||||
|
.map(this::byteSourceFromStorageObject)
|
||||||
|
.map(InputSplit::new);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public boolean needsFormat()
|
public SplittableInputSource<CloudObjectLocation> withSplit(InputSplit<CloudObjectLocation> split)
|
||||||
{
|
{
|
||||||
return true;
|
return new GoogleCloudStorageInputSource(storage, null, null, ImmutableList.of(split.get()));
|
||||||
|
}
|
||||||
|
|
||||||
|
private CloudObjectLocation byteSourceFromStorageObject(final StorageObject storageObject)
|
||||||
|
{
|
||||||
|
return new CloudObjectLocation(storageObject.getBucket(), storageObject.getName());
|
||||||
|
}
|
||||||
|
|
||||||
|
private Iterable<StorageObject> storageObjectIterable()
|
||||||
|
{
|
||||||
|
return () ->
|
||||||
|
GoogleUtils.lazyFetchingStorageObjectsIterator(storage, getPrefixes().iterator(), RetryUtils.DEFAULT_MAX_TRIES);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
protected InputSourceReader formattableReader(
|
public String toString()
|
||||||
InputRowSchema inputRowSchema,
|
|
||||||
InputFormat inputFormat,
|
|
||||||
@Nullable File temporaryDirectory
|
|
||||||
)
|
|
||||||
{
|
{
|
||||||
return new InputEntityIteratingReader(
|
return "GoogleCloudStorageInputSource{" +
|
||||||
inputRowSchema,
|
"uris=" + getUris() +
|
||||||
inputFormat,
|
", prefixes=" + getPrefixes() +
|
||||||
createSplits(inputFormat, null).map(split -> new GoogleCloudStorageEntity(
|
", objects=" + getObjects() +
|
||||||
storage,
|
'}';
|
||||||
split.get()
|
|
||||||
)),
|
|
||||||
temporaryDirectory
|
|
||||||
);
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public boolean equals(Object o)
|
|
||||||
{
|
|
||||||
if (this == o) {
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
if (o == null || getClass() != o.getClass()) {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
GoogleCloudStorageInputSource that = (GoogleCloudStorageInputSource) o;
|
|
||||||
return Objects.equals(uris, that.uris);
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public int hashCode()
|
|
||||||
{
|
|
||||||
return Objects.hash(uris);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -23,6 +23,7 @@ import com.google.common.io.ByteSource;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.io.InputStream;
|
import java.io.InputStream;
|
||||||
|
import java.util.Objects;
|
||||||
|
|
||||||
public class GoogleByteSource extends ByteSource
|
public class GoogleByteSource extends ByteSource
|
||||||
{
|
{
|
||||||
|
@ -37,6 +38,16 @@ public class GoogleByteSource extends ByteSource
|
||||||
this.path = path;
|
this.path = path;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public String getBucket()
|
||||||
|
{
|
||||||
|
return bucket;
|
||||||
|
}
|
||||||
|
|
||||||
|
public String getPath()
|
||||||
|
{
|
||||||
|
return path;
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public InputStream openStream() throws IOException
|
public InputStream openStream() throws IOException
|
||||||
{
|
{
|
||||||
|
@ -47,4 +58,24 @@ public class GoogleByteSource extends ByteSource
|
||||||
{
|
{
|
||||||
return storage.get(bucket, path, start);
|
return storage.get(bucket, path, start);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public boolean equals(Object o)
|
||||||
|
{
|
||||||
|
if (this == o) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
if (o == null || getClass() != o.getClass()) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
GoogleByteSource that = (GoogleByteSource) o;
|
||||||
|
return Objects.equals(bucket, that.bucket) &&
|
||||||
|
Objects.equals(path, that.path);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public int hashCode()
|
||||||
|
{
|
||||||
|
return Objects.hash(bucket, path);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -20,12 +20,23 @@
|
||||||
package org.apache.druid.storage.google;
|
package org.apache.druid.storage.google;
|
||||||
|
|
||||||
import com.google.api.client.http.HttpResponseException;
|
import com.google.api.client.http.HttpResponseException;
|
||||||
|
import com.google.api.services.storage.Storage;
|
||||||
|
import com.google.api.services.storage.model.Objects;
|
||||||
|
import com.google.api.services.storage.model.StorageObject;
|
||||||
import com.google.common.base.Predicate;
|
import com.google.common.base.Predicate;
|
||||||
|
import org.apache.druid.java.util.common.ISE;
|
||||||
|
import org.apache.druid.java.util.common.RetryUtils;
|
||||||
|
import org.apache.druid.java.util.common.StringUtils;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
|
import java.net.URI;
|
||||||
|
import java.util.Iterator;
|
||||||
|
import java.util.NoSuchElementException;
|
||||||
|
|
||||||
public class GoogleUtils
|
public class GoogleUtils
|
||||||
{
|
{
|
||||||
|
public static final Predicate<Throwable> GOOGLE_RETRY = GoogleUtils::isRetryable;
|
||||||
|
|
||||||
public static boolean isRetryable(Throwable t)
|
public static boolean isRetryable(Throwable t)
|
||||||
{
|
{
|
||||||
if (t instanceof HttpResponseException) {
|
if (t instanceof HttpResponseException) {
|
||||||
|
@ -35,5 +46,103 @@ public class GoogleUtils
|
||||||
return t instanceof IOException;
|
return t instanceof IOException;
|
||||||
}
|
}
|
||||||
|
|
||||||
public static final Predicate<Throwable> GOOGLE_RETRY = e -> isRetryable(e);
|
private static <T> T retryGoogleCloudStorageOperation(RetryUtils.Task<T> f) throws Exception
|
||||||
|
{
|
||||||
|
return RetryUtils.retry(f, GOOGLE_RETRY, RetryUtils.DEFAULT_MAX_TRIES);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public static Iterator<StorageObject> lazyFetchingStorageObjectsIterator(
|
||||||
|
final GoogleStorage storage,
|
||||||
|
final Iterator<URI> uris,
|
||||||
|
final long maxListingLength
|
||||||
|
)
|
||||||
|
{
|
||||||
|
return new Iterator<StorageObject>()
|
||||||
|
{
|
||||||
|
private Storage.Objects.List listRequest;
|
||||||
|
private Objects results;
|
||||||
|
private URI currentUri;
|
||||||
|
private String currentBucket;
|
||||||
|
private String currentPrefix;
|
||||||
|
private String nextPageToken;
|
||||||
|
private Iterator<StorageObject> storageObjectsIterator;
|
||||||
|
|
||||||
|
{
|
||||||
|
nextPageToken = null;
|
||||||
|
prepareNextRequest();
|
||||||
|
fetchNextBatch();
|
||||||
|
}
|
||||||
|
|
||||||
|
private void prepareNextRequest()
|
||||||
|
{
|
||||||
|
try {
|
||||||
|
currentUri = uris.next();
|
||||||
|
currentBucket = currentUri.getAuthority();
|
||||||
|
currentPrefix = StringUtils.maybeRemoveLeadingSlash(currentUri.getPath());
|
||||||
|
nextPageToken = null;
|
||||||
|
listRequest = storage.list(currentBucket)
|
||||||
|
.setPrefix(currentPrefix)
|
||||||
|
.setMaxResults(maxListingLength);
|
||||||
|
|
||||||
|
}
|
||||||
|
catch (IOException io) {
|
||||||
|
throw new RuntimeException(io);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private void fetchNextBatch()
|
||||||
|
{
|
||||||
|
try {
|
||||||
|
listRequest.setPageToken(nextPageToken);
|
||||||
|
results = GoogleUtils.retryGoogleCloudStorageOperation(() -> listRequest.execute());
|
||||||
|
storageObjectsIterator = results.getItems().iterator();
|
||||||
|
nextPageToken = results.getNextPageToken();
|
||||||
|
}
|
||||||
|
catch (Exception ex) {
|
||||||
|
throw new RuntimeException(ex);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public boolean hasNext()
|
||||||
|
{
|
||||||
|
return storageObjectsIterator.hasNext() || nextPageToken != null || uris.hasNext();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public StorageObject next()
|
||||||
|
{
|
||||||
|
if (!hasNext()) {
|
||||||
|
throw new NoSuchElementException();
|
||||||
|
}
|
||||||
|
|
||||||
|
while (storageObjectsIterator.hasNext()) {
|
||||||
|
final StorageObject next = storageObjectsIterator.next();
|
||||||
|
// list with prefix can return directories, but they should always end with `/`, ignore them
|
||||||
|
if (!next.getName().endsWith("/")) {
|
||||||
|
return next;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (nextPageToken != null) {
|
||||||
|
fetchNextBatch();
|
||||||
|
} else if (uris.hasNext()) {
|
||||||
|
prepareNextRequest();
|
||||||
|
fetchNextBatch();
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!storageObjectsIterator.hasNext()) {
|
||||||
|
throw new ISE(
|
||||||
|
"Failed to further iterate on bucket[%s] and prefix[%s]. The last page token was [%s]",
|
||||||
|
currentBucket,
|
||||||
|
currentPrefix,
|
||||||
|
nextPageToken
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
return next();
|
||||||
|
}
|
||||||
|
};
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -23,66 +23,271 @@ import com.fasterxml.jackson.databind.Module;
|
||||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||||
import com.fasterxml.jackson.databind.module.SimpleModule;
|
import com.fasterxml.jackson.databind.module.SimpleModule;
|
||||||
import com.fasterxml.jackson.module.guice.ObjectMapperModule;
|
import com.fasterxml.jackson.module.guice.ObjectMapperModule;
|
||||||
|
import com.google.api.services.storage.Storage;
|
||||||
|
import com.google.api.services.storage.model.Objects;
|
||||||
|
import com.google.api.services.storage.model.StorageObject;
|
||||||
import com.google.common.collect.ImmutableList;
|
import com.google.common.collect.ImmutableList;
|
||||||
import com.google.inject.Binder;
|
import com.google.inject.Binder;
|
||||||
import com.google.inject.Guice;
|
import com.google.inject.Guice;
|
||||||
import com.google.inject.Injector;
|
import com.google.inject.Injector;
|
||||||
import com.google.inject.Provides;
|
import com.google.inject.Provides;
|
||||||
|
import org.apache.druid.data.input.InputRow;
|
||||||
|
import org.apache.druid.data.input.InputRowSchema;
|
||||||
|
import org.apache.druid.data.input.InputSourceReader;
|
||||||
import org.apache.druid.data.input.InputSplit;
|
import org.apache.druid.data.input.InputSplit;
|
||||||
|
import org.apache.druid.data.input.impl.CloudObjectLocation;
|
||||||
|
import org.apache.druid.data.input.impl.CsvInputFormat;
|
||||||
|
import org.apache.druid.data.input.impl.DimensionsSpec;
|
||||||
import org.apache.druid.data.input.impl.JsonInputFormat;
|
import org.apache.druid.data.input.impl.JsonInputFormat;
|
||||||
|
import org.apache.druid.data.input.impl.TimestampSpec;
|
||||||
import org.apache.druid.initialization.DruidModule;
|
import org.apache.druid.initialization.DruidModule;
|
||||||
import org.apache.druid.jackson.DefaultObjectMapper;
|
import org.apache.druid.jackson.DefaultObjectMapper;
|
||||||
|
import org.apache.druid.java.util.common.DateTimes;
|
||||||
|
import org.apache.druid.java.util.common.StringUtils;
|
||||||
|
import org.apache.druid.java.util.common.parsers.CloseableIterator;
|
||||||
import org.apache.druid.java.util.common.parsers.JSONPathSpec;
|
import org.apache.druid.java.util.common.parsers.JSONPathSpec;
|
||||||
import org.apache.druid.storage.google.GoogleStorage;
|
import org.apache.druid.storage.google.GoogleStorage;
|
||||||
|
import org.apache.druid.testing.InitializedNullHandlingTest;
|
||||||
|
import org.apache.druid.utils.CompressionUtils;
|
||||||
|
import org.easymock.EasyMock;
|
||||||
|
import org.joda.time.DateTime;
|
||||||
import org.junit.Assert;
|
import org.junit.Assert;
|
||||||
import org.junit.Test;
|
import org.junit.Test;
|
||||||
|
|
||||||
|
import java.io.ByteArrayInputStream;
|
||||||
|
import java.io.ByteArrayOutputStream;
|
||||||
|
import java.io.IOException;
|
||||||
import java.net.URI;
|
import java.net.URI;
|
||||||
|
import java.util.ArrayList;
|
||||||
import java.util.Arrays;
|
import java.util.Arrays;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.stream.Collectors;
|
import java.util.stream.Collectors;
|
||||||
import java.util.stream.Stream;
|
import java.util.stream.Stream;
|
||||||
|
|
||||||
public class GoogleCloudStorageInputSourceTest
|
public class GoogleCloudStorageInputSourceTest extends InitializedNullHandlingTest
|
||||||
{
|
{
|
||||||
private static final GoogleStorage STORAGE = new GoogleStorage(null);
|
private static final GoogleStorage STORAGE = EasyMock.createMock(GoogleStorage.class);
|
||||||
|
|
||||||
|
private static final List<URI> EXPECTED_URIS = Arrays.asList(
|
||||||
|
URI.create("gs://foo/bar/file.csv"),
|
||||||
|
URI.create("gs://bar/foo/file2.csv")
|
||||||
|
);
|
||||||
|
|
||||||
|
private static final List<URI> EXPECTED_COMPRESSED_URIS = Arrays.asList(
|
||||||
|
URI.create("gs://foo/bar/file.csv.gz"),
|
||||||
|
URI.create("gs://bar/foo/file2.csv.gz")
|
||||||
|
);
|
||||||
|
|
||||||
|
private static final List<CloudObjectLocation> EXPECTED_OBJECTS =
|
||||||
|
EXPECTED_URIS.stream().map(CloudObjectLocation::new).collect(Collectors.toList());
|
||||||
|
|
||||||
|
private static final List<URI> PREFIXES = Arrays.asList(
|
||||||
|
URI.create("gs://foo/bar"),
|
||||||
|
URI.create("gs://bar/foo")
|
||||||
|
);
|
||||||
|
|
||||||
|
private static final List<CloudObjectLocation> EXPECTED_LOCATION =
|
||||||
|
ImmutableList.of(new CloudObjectLocation("foo", "bar/file.csv"));
|
||||||
|
|
||||||
|
private static final DateTime NOW = DateTimes.nowUtc();
|
||||||
|
private static final byte[] CONTENT =
|
||||||
|
StringUtils.toUtf8(StringUtils.format("%d,hello,world", NOW.getMillis()));
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testSerde() throws Exception
|
public void testSerde() throws Exception
|
||||||
{
|
{
|
||||||
final ObjectMapper mapper = createGoogleObjectMapper();
|
final ObjectMapper mapper = createGoogleObjectMapper();
|
||||||
|
final GoogleCloudStorageInputSource withUris =
|
||||||
final List<URI> uris = Arrays.asList(
|
new GoogleCloudStorageInputSource(STORAGE, EXPECTED_URIS, ImmutableList.of(), null);
|
||||||
new URI("gs://foo/bar/file.gz"),
|
|
||||||
new URI("gs://bar/foo/file2.gz")
|
|
||||||
);
|
|
||||||
|
|
||||||
final List<URI> prefixes = Arrays.asList(
|
|
||||||
new URI("gs://foo/bar"),
|
|
||||||
new URI("gs://bar/foo")
|
|
||||||
);
|
|
||||||
|
|
||||||
final GoogleCloudStorageInputSource withUris = new GoogleCloudStorageInputSource(STORAGE, uris);
|
|
||||||
final GoogleCloudStorageInputSource serdeWithUris =
|
final GoogleCloudStorageInputSource serdeWithUris =
|
||||||
mapper.readValue(mapper.writeValueAsString(withUris), GoogleCloudStorageInputSource.class);
|
mapper.readValue(mapper.writeValueAsString(withUris), GoogleCloudStorageInputSource.class);
|
||||||
Assert.assertEquals(withUris, serdeWithUris);
|
Assert.assertEquals(withUris, serdeWithUris);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testSerdePrefixes() throws Exception
|
||||||
|
{
|
||||||
|
final ObjectMapper mapper = createGoogleObjectMapper();
|
||||||
|
final GoogleCloudStorageInputSource withPrefixes =
|
||||||
|
new GoogleCloudStorageInputSource(STORAGE, ImmutableList.of(), PREFIXES, null);
|
||||||
|
final GoogleCloudStorageInputSource serdeWithPrefixes =
|
||||||
|
mapper.readValue(mapper.writeValueAsString(withPrefixes), GoogleCloudStorageInputSource.class);
|
||||||
|
Assert.assertEquals(withPrefixes, serdeWithPrefixes);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testSerdeObjects() throws Exception
|
||||||
|
{
|
||||||
|
final ObjectMapper mapper = createGoogleObjectMapper();
|
||||||
|
final GoogleCloudStorageInputSource withObjects =
|
||||||
|
new GoogleCloudStorageInputSource(
|
||||||
|
STORAGE,
|
||||||
|
null,
|
||||||
|
null,
|
||||||
|
ImmutableList.of(new CloudObjectLocation("foo", "bar/file.gz"))
|
||||||
|
);
|
||||||
|
final GoogleCloudStorageInputSource serdeWithObjects =
|
||||||
|
mapper.readValue(mapper.writeValueAsString(withObjects), GoogleCloudStorageInputSource.class);
|
||||||
|
Assert.assertEquals(withObjects, serdeWithObjects);
|
||||||
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testWithUrisSplit()
|
public void testWithUrisSplit()
|
||||||
{
|
{
|
||||||
final List<URI> uris = Arrays.asList(
|
|
||||||
URI.create("gs://foo/bar/file.gz"),
|
|
||||||
URI.create("gs://bar/foo/file2.gz")
|
|
||||||
);
|
|
||||||
|
|
||||||
GoogleCloudStorageInputSource inputSource = new GoogleCloudStorageInputSource(STORAGE, uris);
|
GoogleCloudStorageInputSource inputSource =
|
||||||
|
new GoogleCloudStorageInputSource(STORAGE, EXPECTED_URIS, ImmutableList.of(), null);
|
||||||
|
|
||||||
Stream<InputSplit<URI>> splits = inputSource.createSplits(
|
Stream<InputSplit<CloudObjectLocation>> splits = inputSource.createSplits(
|
||||||
new JsonInputFormat(JSONPathSpec.DEFAULT, null),
|
new JsonInputFormat(JSONPathSpec.DEFAULT, null),
|
||||||
null
|
null
|
||||||
);
|
);
|
||||||
Assert.assertEquals(uris, splits.map(InputSplit::get).collect(Collectors.toList()));
|
Assert.assertEquals(EXPECTED_OBJECTS, splits.map(InputSplit::get).collect(Collectors.toList()));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testWithPrefixesSplit() throws IOException
|
||||||
|
{
|
||||||
|
EasyMock.reset(STORAGE);
|
||||||
|
addExpectedPrefixObjects(PREFIXES.get(0), ImmutableList.of(EXPECTED_URIS.get(0)));
|
||||||
|
addExpectedPrefixObjects(PREFIXES.get(1), ImmutableList.of(EXPECTED_URIS.get(1)));
|
||||||
|
EasyMock.replay(STORAGE);
|
||||||
|
|
||||||
|
GoogleCloudStorageInputSource inputSource =
|
||||||
|
new GoogleCloudStorageInputSource(STORAGE, null, PREFIXES, null);
|
||||||
|
|
||||||
|
Stream<InputSplit<CloudObjectLocation>> splits = inputSource.createSplits(
|
||||||
|
new JsonInputFormat(JSONPathSpec.DEFAULT, null),
|
||||||
|
null
|
||||||
|
);
|
||||||
|
|
||||||
|
Assert.assertEquals(EXPECTED_OBJECTS, splits.map(InputSplit::get).collect(Collectors.toList()));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testReader() throws IOException
|
||||||
|
{
|
||||||
|
EasyMock.reset(STORAGE);
|
||||||
|
addExpectedPrefixObjects(PREFIXES.get(0), ImmutableList.of(EXPECTED_URIS.get(0)));
|
||||||
|
addExpectedGetObjectMock(EXPECTED_URIS.get(0));
|
||||||
|
addExpectedPrefixObjects(PREFIXES.get(1), ImmutableList.of(EXPECTED_URIS.get(1)));
|
||||||
|
addExpectedGetObjectMock(EXPECTED_URIS.get(1));
|
||||||
|
EasyMock.replay(STORAGE);
|
||||||
|
|
||||||
|
GoogleCloudStorageInputSource inputSource = new GoogleCloudStorageInputSource(
|
||||||
|
STORAGE,
|
||||||
|
null,
|
||||||
|
PREFIXES,
|
||||||
|
null
|
||||||
|
);
|
||||||
|
|
||||||
|
InputRowSchema someSchema = new InputRowSchema(
|
||||||
|
new TimestampSpec("time", "auto", null),
|
||||||
|
new DimensionsSpec(DimensionsSpec.getDefaultSchemas(ImmutableList.of("dim1", "dim2"))),
|
||||||
|
ImmutableList.of("count")
|
||||||
|
);
|
||||||
|
|
||||||
|
InputSourceReader reader = inputSource.reader(
|
||||||
|
someSchema,
|
||||||
|
new CsvInputFormat(ImmutableList.of("time", "dim1", "dim2"), "|", false, null, 0),
|
||||||
|
null
|
||||||
|
);
|
||||||
|
|
||||||
|
CloseableIterator<InputRow> iterator = reader.read();
|
||||||
|
|
||||||
|
while (iterator.hasNext()) {
|
||||||
|
InputRow nextRow = iterator.next();
|
||||||
|
Assert.assertEquals(NOW, nextRow.getTimestamp());
|
||||||
|
Assert.assertEquals("hello", nextRow.getDimension("dim1").get(0));
|
||||||
|
Assert.assertEquals("world", nextRow.getDimension("dim2").get(0));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testCompressedReader() throws IOException
|
||||||
|
{
|
||||||
|
EasyMock.reset(STORAGE);
|
||||||
|
addExpectedPrefixObjects(PREFIXES.get(0), ImmutableList.of(EXPECTED_COMPRESSED_URIS.get(0)));
|
||||||
|
addExpectedGetCompressedObjectMock(EXPECTED_COMPRESSED_URIS.get(0));
|
||||||
|
addExpectedPrefixObjects(PREFIXES.get(1), ImmutableList.of(EXPECTED_COMPRESSED_URIS.get(1)));
|
||||||
|
addExpectedGetCompressedObjectMock(EXPECTED_COMPRESSED_URIS.get(1));
|
||||||
|
EasyMock.replay(STORAGE);
|
||||||
|
|
||||||
|
GoogleCloudStorageInputSource inputSource = new GoogleCloudStorageInputSource(
|
||||||
|
STORAGE,
|
||||||
|
null,
|
||||||
|
PREFIXES,
|
||||||
|
null
|
||||||
|
);
|
||||||
|
|
||||||
|
InputRowSchema someSchema = new InputRowSchema(
|
||||||
|
new TimestampSpec("time", "auto", null),
|
||||||
|
new DimensionsSpec(DimensionsSpec.getDefaultSchemas(ImmutableList.of("dim1", "dim2"))),
|
||||||
|
ImmutableList.of("count")
|
||||||
|
);
|
||||||
|
|
||||||
|
InputSourceReader reader = inputSource.reader(
|
||||||
|
someSchema,
|
||||||
|
new CsvInputFormat(ImmutableList.of("time", "dim1", "dim2"), "|", false, null, 0),
|
||||||
|
null
|
||||||
|
);
|
||||||
|
|
||||||
|
CloseableIterator<InputRow> iterator = reader.read();
|
||||||
|
|
||||||
|
while (iterator.hasNext()) {
|
||||||
|
InputRow nextRow = iterator.next();
|
||||||
|
Assert.assertEquals(NOW, nextRow.getTimestamp());
|
||||||
|
Assert.assertEquals("hello", nextRow.getDimension("dim1").get(0));
|
||||||
|
Assert.assertEquals("world", nextRow.getDimension("dim2").get(0));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private static void addExpectedPrefixObjects(URI prefix, List<URI> uris) throws IOException
|
||||||
|
{
|
||||||
|
final String bucket = prefix.getAuthority();
|
||||||
|
|
||||||
|
Storage.Objects.List listRequest = EasyMock.createMock(Storage.Objects.List.class);
|
||||||
|
EasyMock.expect(STORAGE.list(EasyMock.eq(bucket))).andReturn(listRequest).once();
|
||||||
|
EasyMock.expect(listRequest.setPageToken(EasyMock.anyString())).andReturn(listRequest).once();
|
||||||
|
EasyMock.expect(listRequest.setMaxResults(EasyMock.anyLong())).andReturn(listRequest).once();
|
||||||
|
EasyMock.expect(listRequest.setPrefix(EasyMock.eq(StringUtils.maybeRemoveLeadingSlash(prefix.getPath()))))
|
||||||
|
.andReturn(listRequest)
|
||||||
|
.once();
|
||||||
|
|
||||||
|
List<StorageObject> mockObjects = new ArrayList<>();
|
||||||
|
for (URI uri : uris) {
|
||||||
|
StorageObject s = new StorageObject();
|
||||||
|
s.setBucket(bucket);
|
||||||
|
s.setName(uri.getPath());
|
||||||
|
mockObjects.add(s);
|
||||||
|
}
|
||||||
|
Objects response = new Objects();
|
||||||
|
response.setItems(mockObjects);
|
||||||
|
EasyMock.expect(listRequest.execute()).andReturn(response).once();
|
||||||
|
EasyMock.expect(response.getItems()).andReturn(mockObjects).once();
|
||||||
|
|
||||||
|
EasyMock.replay(listRequest);
|
||||||
|
}
|
||||||
|
|
||||||
|
private static void addExpectedGetObjectMock(URI uri) throws IOException
|
||||||
|
{
|
||||||
|
CloudObjectLocation location = new CloudObjectLocation(uri);
|
||||||
|
|
||||||
|
EasyMock.expect(
|
||||||
|
STORAGE.get(EasyMock.eq(location.getBucket()), EasyMock.eq(location.getPath()), EasyMock.eq(0L))
|
||||||
|
).andReturn(new ByteArrayInputStream(CONTENT)).once();
|
||||||
|
}
|
||||||
|
|
||||||
|
private static void addExpectedGetCompressedObjectMock(URI uri) throws IOException
|
||||||
|
{
|
||||||
|
CloudObjectLocation location = new CloudObjectLocation(uri);
|
||||||
|
|
||||||
|
ByteArrayOutputStream gzipped = new ByteArrayOutputStream();
|
||||||
|
CompressionUtils.gzip(new ByteArrayInputStream(CONTENT), gzipped);
|
||||||
|
|
||||||
|
EasyMock.expect(
|
||||||
|
STORAGE.get(EasyMock.eq(location.getBucket()), EasyMock.eq(location.getPath()), EasyMock.eq(0L))
|
||||||
|
).andReturn(new ByteArrayInputStream(gzipped.toByteArray())).once();
|
||||||
}
|
}
|
||||||
|
|
||||||
public static ObjectMapper createGoogleObjectMapper()
|
public static ObjectMapper createGoogleObjectMapper()
|
||||||
|
|
|
@ -113,11 +113,6 @@
|
||||||
<artifactId>aws-java-sdk-s3</artifactId>
|
<artifactId>aws-java-sdk-s3</artifactId>
|
||||||
<scope>provided</scope>
|
<scope>provided</scope>
|
||||||
</dependency>
|
</dependency>
|
||||||
<dependency>
|
|
||||||
<groupId>joda-time</groupId>
|
|
||||||
<artifactId>joda-time</artifactId>
|
|
||||||
<scope>provided</scope>
|
|
||||||
</dependency>
|
|
||||||
<!-- Tests -->
|
<!-- Tests -->
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>org.apache.druid</groupId>
|
<groupId>org.apache.druid</groupId>
|
||||||
|
@ -155,6 +150,11 @@
|
||||||
<artifactId>easymock</artifactId>
|
<artifactId>easymock</artifactId>
|
||||||
<scope>test</scope>
|
<scope>test</scope>
|
||||||
</dependency>
|
</dependency>
|
||||||
|
<dependency>
|
||||||
|
<groupId>joda-time</groupId>
|
||||||
|
<artifactId>joda-time</artifactId>
|
||||||
|
<scope>test</scope>
|
||||||
|
</dependency>
|
||||||
</dependencies>
|
</dependencies>
|
||||||
|
|
||||||
</project>
|
</project>
|
||||||
|
|
|
@ -26,6 +26,7 @@ import com.google.common.base.Predicate;
|
||||||
import org.apache.druid.data.input.RetryingInputEntity;
|
import org.apache.druid.data.input.RetryingInputEntity;
|
||||||
import org.apache.druid.data.input.impl.CloudObjectLocation;
|
import org.apache.druid.data.input.impl.CloudObjectLocation;
|
||||||
import org.apache.druid.java.util.common.ISE;
|
import org.apache.druid.java.util.common.ISE;
|
||||||
|
import org.apache.druid.storage.s3.S3StorageDruidModule;
|
||||||
import org.apache.druid.storage.s3.S3Utils;
|
import org.apache.druid.storage.s3.S3Utils;
|
||||||
import org.apache.druid.storage.s3.ServerSideEncryptingAmazonS3;
|
import org.apache.druid.storage.s3.ServerSideEncryptingAmazonS3;
|
||||||
|
|
||||||
|
@ -47,7 +48,7 @@ public class S3Entity extends RetryingInputEntity
|
||||||
@Override
|
@Override
|
||||||
public URI getUri()
|
public URI getUri()
|
||||||
{
|
{
|
||||||
return null;
|
return object.toUri(S3StorageDruidModule.SCHEME);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
|
|
@ -25,35 +25,25 @@ import com.fasterxml.jackson.annotation.JsonCreator;
|
||||||
import com.fasterxml.jackson.annotation.JsonProperty;
|
import com.fasterxml.jackson.annotation.JsonProperty;
|
||||||
import com.google.common.base.Preconditions;
|
import com.google.common.base.Preconditions;
|
||||||
import com.google.common.collect.ImmutableList;
|
import com.google.common.collect.ImmutableList;
|
||||||
import org.apache.druid.data.input.AbstractInputSource;
|
|
||||||
import org.apache.druid.data.input.InputFormat;
|
|
||||||
import org.apache.druid.data.input.InputRowSchema;
|
|
||||||
import org.apache.druid.data.input.InputSourceReader;
|
|
||||||
import org.apache.druid.data.input.InputSplit;
|
import org.apache.druid.data.input.InputSplit;
|
||||||
import org.apache.druid.data.input.SplitHintSpec;
|
import org.apache.druid.data.input.impl.CloudObjectInputSource;
|
||||||
import org.apache.druid.data.input.impl.CloudObjectLocation;
|
import org.apache.druid.data.input.impl.CloudObjectLocation;
|
||||||
import org.apache.druid.data.input.impl.InputEntityIteratingReader;
|
|
||||||
import org.apache.druid.data.input.impl.SplittableInputSource;
|
import org.apache.druid.data.input.impl.SplittableInputSource;
|
||||||
|
import org.apache.druid.storage.s3.S3StorageDruidModule;
|
||||||
import org.apache.druid.storage.s3.S3Utils;
|
import org.apache.druid.storage.s3.S3Utils;
|
||||||
import org.apache.druid.storage.s3.ServerSideEncryptingAmazonS3;
|
import org.apache.druid.storage.s3.ServerSideEncryptingAmazonS3;
|
||||||
import org.apache.druid.utils.CollectionUtils;
|
|
||||||
|
|
||||||
import javax.annotation.Nullable;
|
import javax.annotation.Nullable;
|
||||||
import java.io.File;
|
|
||||||
import java.net.URI;
|
import java.net.URI;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Objects;
|
|
||||||
import java.util.stream.Stream;
|
import java.util.stream.Stream;
|
||||||
import java.util.stream.StreamSupport;
|
import java.util.stream.StreamSupport;
|
||||||
|
|
||||||
public class S3InputSource extends AbstractInputSource implements SplittableInputSource<CloudObjectLocation>
|
public class S3InputSource extends CloudObjectInputSource<S3Entity>
|
||||||
{
|
{
|
||||||
private static final int MAX_LISTING_LENGTH = 1024;
|
private static final int MAX_LISTING_LENGTH = 1024;
|
||||||
|
|
||||||
private final ServerSideEncryptingAmazonS3 s3Client;
|
private final ServerSideEncryptingAmazonS3 s3Client;
|
||||||
private final List<URI> uris;
|
|
||||||
private final List<URI> prefixes;
|
|
||||||
private final List<CloudObjectLocation> objects;
|
|
||||||
|
|
||||||
@JsonCreator
|
@JsonCreator
|
||||||
public S3InputSource(
|
public S3InputSource(
|
||||||
|
@ -63,144 +53,42 @@ public class S3InputSource extends AbstractInputSource implements SplittableInpu
|
||||||
@JsonProperty("objects") @Nullable List<CloudObjectLocation> objects
|
@JsonProperty("objects") @Nullable List<CloudObjectLocation> objects
|
||||||
)
|
)
|
||||||
{
|
{
|
||||||
|
super(S3StorageDruidModule.SCHEME, uris, prefixes, objects);
|
||||||
this.s3Client = Preconditions.checkNotNull(s3Client, "s3Client");
|
this.s3Client = Preconditions.checkNotNull(s3Client, "s3Client");
|
||||||
this.uris = uris;
|
|
||||||
this.prefixes = prefixes;
|
|
||||||
this.objects = objects;
|
|
||||||
|
|
||||||
if (!CollectionUtils.isNullOrEmpty(objects)) {
|
|
||||||
throwIfIllegalArgs(!CollectionUtils.isNullOrEmpty(uris) || !CollectionUtils.isNullOrEmpty(prefixes));
|
|
||||||
} else if (!CollectionUtils.isNullOrEmpty(uris)) {
|
|
||||||
throwIfIllegalArgs(!CollectionUtils.isNullOrEmpty(prefixes));
|
|
||||||
uris.forEach(S3Utils::checkURI);
|
|
||||||
} else if (!CollectionUtils.isNullOrEmpty(prefixes)) {
|
|
||||||
prefixes.forEach(S3Utils::checkURI);
|
|
||||||
} else {
|
|
||||||
throwIfIllegalArgs(true);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
@JsonProperty
|
|
||||||
public List<URI> getUris()
|
|
||||||
{
|
|
||||||
return uris;
|
|
||||||
}
|
|
||||||
|
|
||||||
@JsonProperty
|
|
||||||
public List<URI> getPrefixes()
|
|
||||||
{
|
|
||||||
return prefixes;
|
|
||||||
}
|
|
||||||
|
|
||||||
@JsonProperty
|
|
||||||
public List<CloudObjectLocation> getObjects()
|
|
||||||
{
|
|
||||||
return objects;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public Stream<InputSplit<CloudObjectLocation>> createSplits(
|
protected S3Entity createEntity(InputSplit<CloudObjectLocation> split)
|
||||||
InputFormat inputFormat,
|
|
||||||
@Nullable SplitHintSpec splitHintSpec
|
|
||||||
)
|
|
||||||
{
|
{
|
||||||
if (objects != null) {
|
return new S3Entity(s3Client, split.get());
|
||||||
return objects.stream().map(InputSplit::new);
|
}
|
||||||
}
|
|
||||||
|
|
||||||
if (uris != null) {
|
|
||||||
return uris.stream().map(CloudObjectLocation::new).map(InputSplit::new);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected Stream<InputSplit<CloudObjectLocation>> getPrefixesSplitStream()
|
||||||
|
{
|
||||||
return StreamSupport.stream(getIterableObjectsFromPrefixes().spliterator(), false)
|
return StreamSupport.stream(getIterableObjectsFromPrefixes().spliterator(), false)
|
||||||
.map(S3Utils::summaryToCloudObjectLocation)
|
.map(S3Utils::summaryToCloudObjectLocation)
|
||||||
.map(InputSplit::new);
|
.map(InputSplit::new);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
|
||||||
public int estimateNumSplits(InputFormat inputFormat, @Nullable SplitHintSpec splitHintSpec)
|
|
||||||
{
|
|
||||||
if (objects != null) {
|
|
||||||
return objects.size();
|
|
||||||
}
|
|
||||||
|
|
||||||
if (uris != null) {
|
|
||||||
return uris.size();
|
|
||||||
}
|
|
||||||
|
|
||||||
return (int) StreamSupport.stream(getIterableObjectsFromPrefixes().spliterator(), false).count();
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public SplittableInputSource<CloudObjectLocation> withSplit(InputSplit<CloudObjectLocation> split)
|
public SplittableInputSource<CloudObjectLocation> withSplit(InputSplit<CloudObjectLocation> split)
|
||||||
{
|
{
|
||||||
return new S3InputSource(s3Client, null, null, ImmutableList.of(split.get()));
|
return new S3InputSource(s3Client, null, null, ImmutableList.of(split.get()));
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
|
||||||
public boolean needsFormat()
|
|
||||||
{
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
protected InputSourceReader formattableReader(
|
|
||||||
InputRowSchema inputRowSchema,
|
|
||||||
InputFormat inputFormat,
|
|
||||||
@Nullable File temporaryDirectory
|
|
||||||
)
|
|
||||||
{
|
|
||||||
return new InputEntityIteratingReader(
|
|
||||||
inputRowSchema,
|
|
||||||
inputFormat,
|
|
||||||
// formattableReader() is supposed to be called in each task that actually creates segments.
|
|
||||||
// The task should already have only one split in parallel indexing,
|
|
||||||
// while there's no need to make splits using splitHintSpec in sequential indexing.
|
|
||||||
createSplits(inputFormat, null).map(split -> new S3Entity(s3Client, split.get())),
|
|
||||||
temporaryDirectory
|
|
||||||
);
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public boolean equals(Object o)
|
|
||||||
{
|
|
||||||
if (this == o) {
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
if (o == null || getClass() != o.getClass()) {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
S3InputSource that = (S3InputSource) o;
|
|
||||||
return Objects.equals(uris, that.uris) &&
|
|
||||||
Objects.equals(prefixes, that.prefixes) &&
|
|
||||||
Objects.equals(objects, that.objects);
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public int hashCode()
|
|
||||||
{
|
|
||||||
return Objects.hash(uris, prefixes, objects);
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public String toString()
|
public String toString()
|
||||||
{
|
{
|
||||||
return "S3InputSource{" +
|
return "S3InputSource{" +
|
||||||
"uris=" + uris +
|
"uris=" + getUris() +
|
||||||
", prefixes=" + prefixes +
|
", prefixes=" + getPrefixes() +
|
||||||
", objects=" + objects +
|
", objects=" + getObjects() +
|
||||||
'}';
|
'}';
|
||||||
}
|
}
|
||||||
|
|
||||||
private Iterable<S3ObjectSummary> getIterableObjectsFromPrefixes()
|
private Iterable<S3ObjectSummary> getIterableObjectsFromPrefixes()
|
||||||
{
|
{
|
||||||
return () -> S3Utils.lazyFetchingObjectSummariesIterator(s3Client, prefixes.iterator(), MAX_LISTING_LENGTH);
|
return () -> S3Utils.lazyFetchingObjectSummariesIterator(s3Client, getPrefixes().iterator(), MAX_LISTING_LENGTH);
|
||||||
}
|
|
||||||
|
|
||||||
private void throwIfIllegalArgs(boolean clause) throws IllegalArgumentException
|
|
||||||
{
|
|
||||||
if (clause) {
|
|
||||||
throw new IllegalArgumentException("exactly one of either uris or prefixes or objects must be specified");
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -34,7 +34,6 @@ import com.google.common.base.Joiner;
|
||||||
import com.google.common.base.Predicate;
|
import com.google.common.base.Predicate;
|
||||||
import com.google.common.collect.Iterators;
|
import com.google.common.collect.Iterators;
|
||||||
import org.apache.druid.data.input.impl.CloudObjectLocation;
|
import org.apache.druid.data.input.impl.CloudObjectLocation;
|
||||||
import org.apache.druid.java.util.common.IAE;
|
|
||||||
import org.apache.druid.java.util.common.ISE;
|
import org.apache.druid.java.util.common.ISE;
|
||||||
import org.apache.druid.java.util.common.RE;
|
import org.apache.druid.java.util.common.RE;
|
||||||
import org.apache.druid.java.util.common.RetryUtils;
|
import org.apache.druid.java.util.common.RetryUtils;
|
||||||
|
@ -289,10 +288,8 @@ public class S3Utils
|
||||||
{
|
{
|
||||||
if (uri.getScheme().equalsIgnoreCase(S3StorageDruidModule.SCHEME_S3_ZIP)) {
|
if (uri.getScheme().equalsIgnoreCase(S3StorageDruidModule.SCHEME_S3_ZIP)) {
|
||||||
uri = URI.create(SCHEME + uri.toString().substring(S3StorageDruidModule.SCHEME_S3_ZIP.length()));
|
uri = URI.create(SCHEME + uri.toString().substring(S3StorageDruidModule.SCHEME_S3_ZIP.length()));
|
||||||
} else if (!SCHEME.equalsIgnoreCase(uri.getScheme())) {
|
|
||||||
throw new IAE("Invalid URI scheme [%s] must be [%s]", uri.toString(), SCHEME);
|
|
||||||
}
|
}
|
||||||
return uri;
|
return CloudObjectLocation.validateUriScheme(SCHEME, uri);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Copied from org.jets3t.service.model.StorageObject.isDirectoryPlaceholder()
|
// Copied from org.jets3t.service.model.StorageObject.isDirectoryPlaceholder()
|
||||||
|
|
Loading…
Reference in New Issue