Reduce upload buffer size in GoogleTaskLogs. (#16236)

* Reduce upload buffer size in GoogleTaskLogs.

Use a 1 MB upload buffer, rather than the API client's default of 15 MB. This is
mainly because MiddleManagers (MMs) may upload logs in parallel, and they typically have small heaps. The
default-sized 15 MB buffers add up quickly and can cause an MM to run out of memory.

* Make bufferSize a nullable Integer. Add tests.
This commit is contained in:
Gian Merlino 2024-04-08 12:54:31 -07:00 committed by GitHub
parent 4ff7e2c6c9
commit 5e5cf9af99
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
6 changed files with 101 additions and 18 deletions

View File

@ -83,7 +83,7 @@ public class GoogleDataSegmentPusher implements DataSegmentPusher
try { try {
RetryUtils.retry( RetryUtils.retry(
(RetryUtils.Task<Void>) () -> { (RetryUtils.Task<Void>) () -> {
storage.insert(config.getBucket(), path, new FileContent(contentType, file)); storage.insert(config.getBucket(), path, new FileContent(contentType, file), null);
return null; return null;
}, },
GoogleUtils::isRetryable, GoogleUtils::isRetryable,

View File

@ -44,6 +44,9 @@ import java.util.stream.Collectors;
public class GoogleStorage public class GoogleStorage
{ {
private static final Logger log = new Logger(GoogleStorage.class);
private static final HumanReadableBytes DEFAULT_WRITE_CHUNK_SIZE = new HumanReadableBytes("4MiB");
/** /**
* Some segment processing tools such as DataSegmentKiller are initialized when an ingestion job starts * Some segment processing tools such as DataSegmentKiller are initialized when an ingestion job starts
* if the extension is loaded, even when the implementation of DataSegmentKiller is not used. As a result, * if the extension is loaded, even when the implementation of DataSegmentKiller is not used. As a result,
@ -53,20 +56,34 @@ public class GoogleStorage
* <p> * <p>
* See OmniDataSegmentKiller for how DataSegmentKillers are initialized. * See OmniDataSegmentKiller for how DataSegmentKillers are initialized.
*/ */
private static final Logger log = new Logger(GoogleStorage.class);
private final Supplier<Storage> storage; private final Supplier<Storage> storage;
private final HumanReadableBytes DEFAULT_WRITE_CHUNK_SIZE = new HumanReadableBytes("4MiB");
public GoogleStorage(final Supplier<Storage> storage) public GoogleStorage(final Supplier<Storage> storage)
{ {
this.storage = storage; this.storage = storage;
} }
public void insert(final String bucket, final String path, AbstractInputStreamContent mediaContent) throws IOException /**
* Upload an object. From {@link Storage#createFrom(BlobInfo, InputStream, int, Storage.BlobWriteOption...)},
* "larger buffer sizes might improve the upload performance but require more memory."
*
* @param bucket target bucket
* @param path target path
* @param mediaContent content to upload
* @param bufferSize size of upload buffer, or null to use the upstream default (15 MB as of this writing)
*/
public void insert(
final String bucket,
final String path,
final AbstractInputStreamContent mediaContent,
@Nullable final Integer bufferSize
) throws IOException
{ {
if (bufferSize == null) {
storage.get().createFrom(getBlobInfo(bucket, path), mediaContent.getInputStream()); storage.get().createFrom(getBlobInfo(bucket, path), mediaContent.getInputStream());
} else {
storage.get().createFrom(getBlobInfo(bucket, path), mediaContent.getInputStream(), bufferSize);
}
} }
public InputStream getInputStream(final String bucket, final String path) throws IOException public InputStream getInputStream(final String bucket, final String path) throws IOException
@ -148,10 +165,10 @@ public class GoogleStorage
/** /**
* Deletes an object in a bucket on the specified path * Deletes an object in a bucket on the specified path
*
* A false response from GCS delete API is indicative of file not found. Any other error is raised as a StorageException * A false response from GCS delete API is indicative of file not found. Any other error is raised as a StorageException
* and should be explicitly handled. * and should be explicitly handled.
Ref: <a href="https://github.com/googleapis/java-storage/blob/v2.29.1/google-cloud-storage/src/main/java/com/google/cloud/storage/spi/v1/HttpStorageRpc.java">HttpStorageRpc.java</a> * Ref: <a href="https://github.com/googleapis/java-storage/blob/v2.29.1/google-cloud-storage/src/main/java/com/google/cloud/storage/spi/v1/HttpStorageRpc.java">HttpStorageRpc.java</a>
* *
* @param bucket GCS bucket * @param bucket GCS bucket
* @param path Object path * @param path Object path
@ -202,9 +219,12 @@ public class GoogleStorage
* Return the etag for an object. This is a value that changes whenever the object's data or metadata changes and is * Return the etag for an object. This is a value that changes whenever the object's data or metadata changes and is
* typically but not always the MD5 hash of the object. Ref: * typically but not always the MD5 hash of the object. Ref:
* <a href="https://cloud.google.com/storage/docs/hashes-etags#etags">ETags</a> * <a href="https://cloud.google.com/storage/docs/hashes-etags#etags">ETags</a>
*
* @param bucket * @param bucket
* @param path * @param path
*
* @return * @return
*
* @throws IOException * @throws IOException
*/ */
public String version(final String bucket, final String path) throws IOException public String version(final String bucket, final String path) throws IOException

View File

@ -39,6 +39,12 @@ public class GoogleTaskLogs implements TaskLogs
{ {
private static final Logger LOG = new Logger(GoogleTaskLogs.class); private static final Logger LOG = new Logger(GoogleTaskLogs.class);
/**
 * Upload buffer size, in bytes, passed to {@code storage.insert} when uploading task logs: 1 MB (1024 * 1024)
 * rather than the default of 15 MB in the API client. Mainly because MMs may upload logs in parallel, and
 * typically have small heaps. The default-sized 15 MB buffers add up quickly.
 */
static final int UPLOAD_BUFFER_SIZE = 1024 * 1024;
private final GoogleTaskLogsConfig config; private final GoogleTaskLogsConfig config;
private final GoogleStorage storage; private final GoogleStorage storage;
private final GoogleInputDataConfig inputDataConfig; private final GoogleInputDataConfig inputDataConfig;
@ -92,7 +98,7 @@ public class GoogleTaskLogs implements TaskLogs
try { try {
RetryUtils.retry( RetryUtils.retry(
(RetryUtils.Task<Void>) () -> { (RetryUtils.Task<Void>) () -> {
storage.insert(config.getBucket(), taskKey, mediaContent); storage.insert(config.getBucket(), taskKey, mediaContent, UPLOAD_BUFFER_SIZE);
return null; return null;
}, },
GoogleUtils::isRetryable, GoogleUtils::isRetryable,

View File

@ -19,9 +19,11 @@
package org.apache.druid.storage.google; package org.apache.druid.storage.google;
import com.google.api.client.http.AbstractInputStreamContent;
import com.google.api.gax.paging.Page; import com.google.api.gax.paging.Page;
import com.google.cloud.storage.Blob; import com.google.cloud.storage.Blob;
import com.google.cloud.storage.BlobId; import com.google.cloud.storage.BlobId;
import com.google.cloud.storage.BlobInfo;
import com.google.cloud.storage.Storage; import com.google.cloud.storage.Storage;
import com.google.cloud.storage.StorageException; import com.google.cloud.storage.StorageException;
import com.google.common.collect.ImmutableList; import com.google.common.collect.ImmutableList;
@ -31,7 +33,9 @@ import org.junit.Assert;
import org.junit.Before; import org.junit.Before;
import org.junit.Test; import org.junit.Test;
import java.io.ByteArrayInputStream;
import java.io.IOException; import java.io.IOException;
import java.io.InputStream;
import java.time.OffsetDateTime; import java.time.OffsetDateTime;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.List; import java.util.List;
@ -65,12 +69,51 @@ public class GoogleStorageTest
blob = EasyMock.mock(Blob.class); blob = EasyMock.mock(Blob.class);
} }
/**
 * With a null buffer size, insert must call the createFrom overload that takes no buffer size and must hand it
 * the exact stream supplied by the content object.
 */
@Test
public void testInsertDefaultBufferSize() throws IOException
{
  final ByteArrayInputStream inputStream = new ByteArrayInputStream(new byte[0]);
  final Capture<InputStream> inputStreamCapture = Capture.newInstance();
  final AbstractInputStreamContent httpContent = EasyMock.createMock(AbstractInputStreamContent.class);
  EasyMock.expect(httpContent.getInputStream()).andReturn(inputStream);
  EasyMock.expect(
      mockStorage.createFrom(
          EasyMock.eq(BlobInfo.newBuilder(BlobId.of(BUCKET, PATH)).build()),
          EasyMock.capture(inputStreamCapture)
      )
  ).andReturn(blob);
  EasyMock.replay(httpContent, mockStorage, blob);
  googleStorage.insert(BUCKET, PATH, httpContent, null);
  EasyMock.verify(httpContent, mockStorage, blob);
  // The capture was previously never inspected; check that the content's stream is what got uploaded.
  Assert.assertSame(inputStream, inputStreamCapture.getValue());
}
/**
 * With an explicit buffer size, insert must call the createFrom overload that accepts a buffer size, forwarding
 * that size unchanged together with the exact stream supplied by the content object.
 */
@Test
public void testInsertCustomBufferSize() throws IOException
{
  final int bufferSize = 100;
  final ByteArrayInputStream inputStream = new ByteArrayInputStream(new byte[0]);
  final Capture<InputStream> inputStreamCapture = Capture.newInstance();
  final AbstractInputStreamContent httpContent = EasyMock.createMock(AbstractInputStreamContent.class);
  EasyMock.expect(httpContent.getInputStream()).andReturn(inputStream);
  EasyMock.expect(
      mockStorage.createFrom(
          EasyMock.eq(BlobInfo.newBuilder(BlobId.of(BUCKET, PATH)).build()),
          EasyMock.capture(inputStreamCapture),
          EasyMock.eq(bufferSize)
      )
  ).andReturn(blob);
  EasyMock.replay(httpContent, mockStorage, blob);
  googleStorage.insert(BUCKET, PATH, httpContent, bufferSize);
  EasyMock.verify(httpContent, mockStorage, blob);
  // The capture was previously never inspected; check that the content's stream is what got uploaded.
  Assert.assertSame(inputStream, inputStreamCapture.getValue());
}
@Test @Test
public void testDeleteSuccess() public void testDeleteSuccess()
{ {
EasyMock.expect(mockStorage.delete(EasyMock.eq(BUCKET), EasyMock.eq(PATH))).andReturn(true); EasyMock.expect(mockStorage.delete(EasyMock.eq(BUCKET), EasyMock.eq(PATH))).andReturn(true);
EasyMock.replay(mockStorage); EasyMock.replay(mockStorage);
googleStorage.delete(BUCKET, PATH); googleStorage.delete(BUCKET, PATH);
EasyMock.verify(mockStorage);
} }
@Test @Test
@ -79,6 +122,7 @@ public class GoogleStorageTest
EasyMock.expect(mockStorage.delete(EasyMock.eq(BUCKET), EasyMock.eq(PATH))).andReturn(false); EasyMock.expect(mockStorage.delete(EasyMock.eq(BUCKET), EasyMock.eq(PATH))).andReturn(false);
EasyMock.replay(mockStorage); EasyMock.replay(mockStorage);
googleStorage.delete(BUCKET, PATH); googleStorage.delete(BUCKET, PATH);
EasyMock.verify(mockStorage);
} }
@Test @Test
@ -87,6 +131,7 @@ public class GoogleStorageTest
EasyMock.expect(mockStorage.delete(EasyMock.eq(BUCKET), EasyMock.eq(PATH))).andThrow(STORAGE_EXCEPTION); EasyMock.expect(mockStorage.delete(EasyMock.eq(BUCKET), EasyMock.eq(PATH))).andThrow(STORAGE_EXCEPTION);
EasyMock.replay(mockStorage); EasyMock.replay(mockStorage);
Assert.assertThrows(StorageException.class, () -> googleStorage.delete(BUCKET, PATH)); Assert.assertThrows(StorageException.class, () -> googleStorage.delete(BUCKET, PATH));
EasyMock.verify(mockStorage);
} }
@Test @Test
@ -107,7 +152,7 @@ public class GoogleStorageTest
assertTrue(paths.size() == recordedPaths.size() && paths.containsAll(recordedPaths) && recordedPaths.containsAll( assertTrue(paths.size() == recordedPaths.size() && paths.containsAll(recordedPaths) && recordedPaths.containsAll(
paths)); paths));
assertEquals(BUCKET, recordedBlobIds.get(0).getBucket()); assertEquals(BUCKET, recordedBlobIds.get(0).getBucket());
EasyMock.verify(mockStorage);
} }
@Test @Test
@ -129,7 +174,7 @@ public class GoogleStorageTest
assertTrue(paths.containsAll(recordedPaths)); assertTrue(paths.containsAll(recordedPaths));
assertTrue(recordedPaths.containsAll(paths)); assertTrue(recordedPaths.containsAll(paths));
assertEquals(BUCKET, recordedBlobIds.get(0).getBucket()); assertEquals(BUCKET, recordedBlobIds.get(0).getBucket());
EasyMock.verify(mockStorage);
} }
@Test @Test
@ -140,6 +185,7 @@ public class GoogleStorageTest
.andThrow(STORAGE_EXCEPTION); .andThrow(STORAGE_EXCEPTION);
EasyMock.replay(mockStorage); EasyMock.replay(mockStorage);
Assert.assertThrows(StorageException.class, () -> googleStorage.batchDelete(BUCKET, paths)); Assert.assertThrows(StorageException.class, () -> googleStorage.batchDelete(BUCKET, paths));
EasyMock.verify(mockStorage);
} }
@Test @Test
@ -164,6 +210,7 @@ public class GoogleStorageTest
new GoogleStorageObjectMetadata(BUCKET, PATH, SIZE, UPDATE_TIME.toEpochSecond() * 1000) new GoogleStorageObjectMetadata(BUCKET, PATH, SIZE, UPDATE_TIME.toEpochSecond() * 1000)
); );
EasyMock.verify(mockStorage);
} }
@Test @Test
@ -172,6 +219,7 @@ public class GoogleStorageTest
EasyMock.expect(mockStorage.get(EasyMock.eq(BUCKET), EasyMock.eq(PATH))).andReturn(blob); EasyMock.expect(mockStorage.get(EasyMock.eq(BUCKET), EasyMock.eq(PATH))).andReturn(blob);
EasyMock.replay(mockStorage); EasyMock.replay(mockStorage);
assertTrue(googleStorage.exists(BUCKET, PATH)); assertTrue(googleStorage.exists(BUCKET, PATH));
EasyMock.verify(mockStorage);
} }
@Test @Test
@ -180,6 +228,7 @@ public class GoogleStorageTest
EasyMock.expect(mockStorage.get(EasyMock.eq(BUCKET), EasyMock.eq(PATH))).andReturn(null); EasyMock.expect(mockStorage.get(EasyMock.eq(BUCKET), EasyMock.eq(PATH))).andReturn(null);
EasyMock.replay(mockStorage); EasyMock.replay(mockStorage);
assertFalse(googleStorage.exists(BUCKET, PATH)); assertFalse(googleStorage.exists(BUCKET, PATH));
EasyMock.verify(mockStorage);
} }
@Test @Test
@ -198,6 +247,7 @@ public class GoogleStorageTest
long size = googleStorage.size(BUCKET, PATH); long size = googleStorage.size(BUCKET, PATH);
assertEquals(size, SIZE); assertEquals(size, SIZE);
EasyMock.verify(mockStorage, blob);
} }
@Test @Test
@ -215,6 +265,7 @@ public class GoogleStorageTest
EasyMock.replay(mockStorage, blob); EasyMock.replay(mockStorage, blob);
assertEquals(etag, googleStorage.version(BUCKET, PATH)); assertEquals(etag, googleStorage.version(BUCKET, PATH));
EasyMock.verify(mockStorage, blob);
} }
@Test @Test
@ -279,5 +330,7 @@ public class GoogleStorageTest
assertEquals(objectPage.getObjectList().get(0), objectMetadata1); assertEquals(objectPage.getObjectList().get(0), objectMetadata1);
assertEquals(objectPage.getObjectList().get(1), objectMetadata2); assertEquals(objectPage.getObjectList().get(1), objectMetadata2);
assertEquals(objectPage.getNextPageToken(), nextPageToken); assertEquals(objectPage.getNextPageToken(), nextPageToken);
EasyMock.verify(mockStorage, blobPage, blob1, blob2);
} }
} }

View File

@ -91,7 +91,8 @@ public class GoogleTaskLogsTest extends EasyMockSupport
storage.insert( storage.insert(
EasyMock.eq(BUCKET), EasyMock.eq(BUCKET),
EasyMock.eq(PREFIX + "/" + TASKID), EasyMock.eq(PREFIX + "/" + TASKID),
EasyMock.anyObject(InputStreamContent.class) EasyMock.anyObject(InputStreamContent.class),
EasyMock.eq(GoogleTaskLogs.UPLOAD_BUFFER_SIZE)
); );
EasyMock.expectLastCall(); EasyMock.expectLastCall();
@ -120,7 +121,8 @@ public class GoogleTaskLogsTest extends EasyMockSupport
storage.insert( storage.insert(
EasyMock.eq(BUCKET), EasyMock.eq(BUCKET),
EasyMock.eq(PREFIX + "/" + TASKID), EasyMock.eq(PREFIX + "/" + TASKID),
EasyMock.anyObject(InputStreamContent.class) EasyMock.anyObject(InputStreamContent.class),
EasyMock.eq(GoogleTaskLogs.UPLOAD_BUFFER_SIZE)
); );
EasyMock.expectLastCall(); EasyMock.expectLastCall();

View File

@ -93,9 +93,11 @@ public class GcsTestUtil
{ {
LOG.info("Uploading file %s at path %s in bucket %s", filePath, GOOGLE_PREFIX, GOOGLE_BUCKET); LOG.info("Uploading file %s at path %s in bucket %s", filePath, GOOGLE_PREFIX, GOOGLE_BUCKET);
File file = new File(filePath); File file = new File(filePath);
googleStorageClient.insert(GOOGLE_BUCKET, googleStorageClient.insert(
GOOGLE_BUCKET,
GOOGLE_PREFIX + "/" + file.getName(), GOOGLE_PREFIX + "/" + file.getName(),
new FileContent(contentType, file) new FileContent(contentType, file),
null
); );
} }