Reduce list operation calls when pulling segments from S3 (#11899)

* Lazy lists

* Fix objectsummary init
This commit is contained in:
Atul Mohan 2021-11-10 19:13:46 -08:00 committed by GitHub
parent 5baa22148e
commit f9941c12c3
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 85 additions and 20 deletions

View File

@ -71,7 +71,8 @@ public class S3DataSegmentPuller implements URIDataPuller
this.s3Client = s3Client;
}
FileUtils.FileCopyResult getSegmentFiles(final CloudObjectLocation s3Coords, final File outDir) throws SegmentLoadingException
FileUtils.FileCopyResult getSegmentFiles(final CloudObjectLocation s3Coords, final File outDir)
throws SegmentLoadingException
{
log.info("Pulling index at path[%s] to outDir[%s]", s3Coords, outDir);
@ -149,16 +150,15 @@ public class S3DataSegmentPuller implements URIDataPuller
}
}
private FileObject buildFileObject(final URI uri) throws AmazonServiceException
public FileObject buildFileObject(final URI uri) throws AmazonServiceException
{
final CloudObjectLocation coords = new CloudObjectLocation(S3Utils.checkURI(uri));
final S3ObjectSummary objectSummary =
S3Utils.getSingleObjectSummary(s3Client, coords.getBucket(), coords.getPath());
final String path = uri.getPath();
return new FileObject()
{
S3Object s3Object = null;
S3ObjectSummary objectSummary = null;
@Override
public URI toUri()
@ -182,7 +182,7 @@ public class S3DataSegmentPuller implements URIDataPuller
try {
if (s3Object == null) {
// lazily promote to full GET
s3Object = s3Client.getObject(objectSummary.getBucketName(), objectSummary.getKey());
s3Object = s3Client.getObject(coords.getBucket(), coords.getPath());
}
final InputStream in = s3Object.getObjectContent();
@ -231,6 +231,13 @@ public class S3DataSegmentPuller implements URIDataPuller
@Override
public long getLastModified()
{
if (s3Object != null) {
return s3Object.getObjectMetadata().getLastModified().getTime();
}
if (objectSummary == null) {
objectSummary =
S3Utils.getSingleObjectSummary(s3Client, coords.getBucket(), coords.getPath());
}
return objectSummary.getLastModified().getTime();
}
@ -252,9 +259,7 @@ public class S3DataSegmentPuller implements URIDataPuller
* Returns the "version" (aka last modified timestamp) of the URI
*
* @param uri The URI to check the last timestamp
*
* @return The time in ms of the last modification of the URI in String format
*
* @throws IOException
*/
@Override

View File

@ -24,6 +24,7 @@ import com.amazonaws.services.s3.model.ListObjectsV2Request;
import com.amazonaws.services.s3.model.ListObjectsV2Result;
import com.amazonaws.services.s3.model.S3Object;
import com.amazonaws.services.s3.model.S3ObjectSummary;
import org.apache.commons.io.IOUtils;
import org.apache.druid.data.input.impl.CloudObjectLocation;
import org.apache.druid.java.util.common.FileUtils;
import org.apache.druid.java.util.common.StringUtils;
@ -38,6 +39,7 @@ import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.net.URI;
import java.nio.charset.StandardCharsets;
@ -107,18 +109,11 @@ public class S3DataSegmentPullerTest
objectSummary.setKey(keyPrefix + "/renames-0.gz");
objectSummary.setLastModified(new Date(0));
final ListObjectsV2Result listObjectsResult = new ListObjectsV2Result();
listObjectsResult.setKeyCount(1);
listObjectsResult.getObjectSummaries().add(objectSummary);
final File tmpDir = temporaryFolder.newFolder("gzTestDir");
EasyMock.expect(s3Client.doesObjectExist(EasyMock.eq(object0.getBucketName()), EasyMock.eq(object0.getKey())))
.andReturn(true)
.once();
EasyMock.expect(s3Client.listObjectsV2(EasyMock.anyObject(ListObjectsV2Request.class)))
.andReturn(listObjectsResult)
.once();
EasyMock.expect(s3Client.getObject(EasyMock.eq(object0.getBucketName()), EasyMock.eq(object0.getKey())))
.andReturn(object0)
.once();
@ -177,15 +172,9 @@ public class S3DataSegmentPullerTest
EasyMock.expect(s3Client.doesObjectExist(EasyMock.eq(object0.getBucketName()), EasyMock.eq(object0.getKey())))
.andReturn(true)
.once();
EasyMock.expect(s3Client.listObjectsV2(EasyMock.anyObject(ListObjectsV2Request.class)))
.andReturn(listObjectsResult)
.once();
EasyMock.expect(s3Client.getObject(EasyMock.eq(bucket), EasyMock.eq(object0.getKey())))
.andThrow(exception)
.once();
EasyMock.expect(s3Client.listObjectsV2(EasyMock.anyObject(ListObjectsV2Request.class)))
.andReturn(listObjectsResult)
.once();
EasyMock.expect(s3Client.getObject(EasyMock.eq(bucket), EasyMock.eq(object0.getKey())))
.andReturn(object0)
.once();
@ -206,4 +195,75 @@ public class S3DataSegmentPullerTest
Assert.assertEquals(value.length, expected.length());
}
/**
 * Checks that opening the input stream of a file object built by
 * {@code S3DataSegmentPuller.buildFileObject} fetches the object with a single
 * {@code getObject} call and yields the object's content.
 *
 * The S3 client is a strict mock with only {@code getObject} expected, so any other
 * client call (for example a list request) made while opening the stream fails the test.
 *
 * @throws IOException if the temporary fixture file cannot be written or read
 */
@Test
public void testS3ObjectStream() throws IOException
{
  final String bucket = "bucket";
  final String keyPrefix = "prefix/dir/0";
  final ServerSideEncryptingAmazonS3 s3Client = EasyMock.createStrictMock(ServerSideEncryptingAmazonS3.class);
  final byte[] value = bucket.getBytes(StandardCharsets.UTF_8);

  final File tmpFile = temporaryFolder.newFile("testObjectFile");
  try (OutputStream outputStream = new FileOutputStream(tmpFile)) {
    outputStream.write(value);
  }

  // Own the backing file stream here so it is released even if the puller never closes it.
  try (InputStream objectContent = new FileInputStream(tmpFile)) {
    final S3Object object0 = new S3Object();
    object0.setBucketName(bucket);
    object0.setKey(keyPrefix + "/test-object");
    object0.getObjectMetadata().setLastModified(new Date(0));
    object0.setObjectContent(objectContent);

    EasyMock.expect(s3Client.getObject(EasyMock.eq(object0.getBucketName()), EasyMock.eq(object0.getKey())))
            .andReturn(object0)
            .once();

    final S3DataSegmentPuller puller = new S3DataSegmentPuller(s3Client);
    EasyMock.replay(s3Client);

    final URI objectUri = URI.create(StringUtils.format("s3://%s/%s", bucket, object0.getKey()));
    try (InputStream stream = puller.buildFileObject(objectUri).openInputStream()) {
      // Verify before reading: the expected getObject call must already have happened on open.
      EasyMock.verify(s3Client);
      Assert.assertEquals(bucket, IOUtils.toString(stream, StandardCharsets.UTF_8));
    }
  }
}
/**
 * Checks that asking a file object built by {@code S3DataSegmentPuller.buildFileObject}
 * for its last-modified time, without opening it first, issues a single
 * {@code listObjectsV2} call and returns the timestamp from the listed object summary.
 *
 * The S3 client is a strict mock with only {@code listObjectsV2} expected, so any other
 * client call (for example a {@code getObject}) fails the test. No object content is
 * needed, so no temporary file or stream is created.
 *
 * @throws IOException kept for signature compatibility with the other tests in this class
 */
@Test
public void testS3ObjectModifiedDate() throws IOException
{
  final String bucket = "bucket";
  final String keyPrefix = "prefix/dir/0";
  final String key = keyPrefix + "/test-object";
  final ServerSideEncryptingAmazonS3 s3Client = EasyMock.createStrictMock(ServerSideEncryptingAmazonS3.class);

  final S3ObjectSummary objectSummary = new S3ObjectSummary();
  objectSummary.setBucketName(bucket);
  objectSummary.setKey(key);
  objectSummary.setLastModified(new Date(0));

  final ListObjectsV2Result result = new ListObjectsV2Result();
  result.setKeyCount(1);
  result.getObjectSummaries().add(objectSummary);

  EasyMock.expect(s3Client.listObjectsV2(EasyMock.anyObject(ListObjectsV2Request.class)))
          .andReturn(result)
          .once();

  final S3DataSegmentPuller puller = new S3DataSegmentPuller(s3Client);
  EasyMock.replay(s3Client);

  final URI objectUri = URI.create(StringUtils.format("s3://%s/%s", bucket, key));
  final long modifiedDate = puller.buildFileObject(objectUri).getLastModified();

  EasyMock.verify(s3Client);

  // The summary was stamped with the epoch, so the reported time must be 0 ms.
  Assert.assertEquals(0, modifiedDate);
}
}