reject publishing actions with a retriable error code if an earlier task is still publishing (#17509)

* Working queuing of publishing

* fix style

* Add unit tests

* add tests

* retry within the connector

* fix unit tests

* Update indexing-service/src/main/java/org/apache/druid/indexing/common/actions/LocalTaskActionClient.java

Co-authored-by: Kashif Faraz <kashif.faraz@gmail.com>

* Add comment

* fix style

* Fix unit tests

* style fix

---------

Co-authored-by: Kashif Faraz <kashif.faraz@gmail.com>
George Shiqi Wu 2024-12-12 10:37:53 -05:00 committed by GitHub
parent 1a38434d8d
commit aca56d6bb8
4 changed files with 9 additions and 6 deletions
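
In short, when a task's commit action finds that its start metadata is ahead of the last committed end state, that usually means an earlier task for the same stream is still in the middle of publishing, so the failure is now reported as retriable and retried rather than rejected outright. The sketch below is a simplified, hypothetical model of that retry flow; PublishOutcome, PublishAttempt, and publishWithRetries are illustrative names only and are not Druid APIs.

// Hypothetical model of the retry-on-retriable-failure flow described by this commit.
// None of these types are real Druid classes; they only illustrate the behaviour.
import java.util.concurrent.TimeUnit;

public class PublishRetryExample
{
  // Outcome of one publish attempt: success, a hard failure, or a retriable failure.
  record PublishOutcome(boolean success, boolean retryable, String error) {}

  interface PublishAttempt
  {
    PublishOutcome run();
  }

  static PublishOutcome publishWithRetries(PublishAttempt attempt, int maxRetries) throws InterruptedException
  {
    PublishOutcome outcome = attempt.run();
    int retries = 0;
    // Keep retrying while the failure is marked retriable, e.g. while an earlier task is still publishing.
    while (!outcome.success() && outcome.retryable() && retries < maxRetries) {
      TimeUnit.SECONDS.sleep(1);  // back off before the next attempt
      outcome = attempt.run();
      retries++;
    }
    return outcome;
  }
}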

@@ -69,6 +69,7 @@ public class LocalTaskActionClient implements TaskActionClient
       return result;
     }
     catch (Throwable t) {
+      log.error(t, "Failed to perform action[%s]", taskAction);
       throw new RuntimeException(t);
     }
   }
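
The hunk above adds error logging before the exception is rethrown, so the failing action is recorded in the task logs. A self-contained sketch of that log-and-rethrow pattern, using java.util.logging instead of Druid's logger, is shown below; LogAndRethrowExample and callAndLog are illustrative names, not part of Druid.

import java.util.function.Supplier;
import java.util.logging.Level;
import java.util.logging.Logger;

// Generic "log, then rethrow" wrapper mirroring the line added by this commit.
public class LogAndRethrowExample
{
  private static final Logger LOG = Logger.getLogger(LogAndRethrowExample.class.getName());

  public static <T> T callAndLog(String actionDescription, Supplier<T> action)
  {
    try {
      return action.get();
    }
    catch (Throwable t) {
      // Record which action failed before the exception propagates.
      LOG.log(Level.SEVERE, "Failed to perform action[" + actionDescription + "]", t);
      throw new RuntimeException(t);
    }
  }
}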

@@ -22,7 +22,6 @@ package org.apache.druid.indexing.common.actions;
 import com.google.common.collect.ImmutableList;
 import com.google.common.collect.ImmutableMap;
 import com.google.common.collect.ImmutableSet;
-import org.apache.druid.error.InvalidInput;
 import org.apache.druid.indexing.common.TaskLockType;
 import org.apache.druid.indexing.common.task.NoopTask;
 import org.apache.druid.indexing.common.task.Task;
@@ -32,6 +31,7 @@ import org.apache.druid.indexing.overlord.SegmentPublishResult;
 import org.apache.druid.indexing.overlord.Segments;
 import org.apache.druid.indexing.overlord.TimeChunkLockRequest;
 import org.apache.druid.java.util.common.Intervals;
+import org.apache.druid.metadata.RetryTransactionException;
 import org.apache.druid.timeline.DataSegment;
 import org.apache.druid.timeline.partition.LinearShardSpec;
 import org.assertj.core.api.Assertions;
@@ -151,7 +151,7 @@ public class SegmentTransactionalInsertActionTest
     Assert.assertEquals(
         SegmentPublishResult.fail(
-            InvalidInput.exception(
+            new RetryTransactionException(
                 "The new start metadata state[ObjectMetadata{theObject=[1]}] is"
                 + " ahead of the last committed end state[null]. Try resetting the supervisor."
             ).toString()

@@ -2687,7 +2687,9 @@ public class IndexerSQLMetadataStorageCoordinator implements IndexerMetadataStor
     if (startMetadataGreaterThanExisting && !startMetadataMatchesExisting) {
       // Offsets stored in startMetadata is greater than the last commited metadata.
-      return DataStoreMetadataUpdateResult.failure(
+      // This can happen because the previous task is still publishing its segments and can resolve once
+      // the previous task finishes publishing.
+      return DataStoreMetadataUpdateResult.retryableFailure(
           "The new start metadata state[%s] is ahead of the last committed"
           + " end state[%s]. Try resetting the supervisor.",
           startMetadata, oldCommitMetadataFromDb
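
The key distinction introduced by this hunk is between a terminal metadata-update failure and one that is expected to clear once the earlier task finishes publishing. Below is a minimal sketch of a result type carrying that distinction; the real DataStoreMetadataUpdateResult in Druid has a different shape, and all names here are illustrative only.

// Illustrative sketch only; the real DataStoreMetadataUpdateResult in Druid differs.
public class MetadataUpdateResultSketch
{
  private final boolean failed;
  private final boolean canRetry;
  private final String errorMsg;

  private MetadataUpdateResultSketch(boolean failed, boolean canRetry, String errorMsg)
  {
    this.failed = failed;
    this.canRetry = canRetry;
    this.errorMsg = errorMsg;
  }

  // Terminal failure: callers should give up and surface the error.
  static MetadataUpdateResultSketch failure(String messageFormat, Object... args)
  {
    return new MetadataUpdateResultSketch(true, false, String.format(messageFormat, args));
  }

  // Retryable failure: callers may retry, e.g. once the earlier task finishes publishing.
  static MetadataUpdateResultSketch retryableFailure(String messageFormat, Object... args)
  {
    return new MetadataUpdateResultSketch(true, true, String.format(messageFormat, args));
  }

  boolean isFailed() { return failed; }
  boolean canRetry() { return canRetry; }
  String getErrorMsg() { return errorMsg; }
}

A transaction layer can then choose to re-run the update when canRetry() is true, which appears to be what drives the changed expectations in the tests below.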

@@ -784,15 +784,15 @@ public class IndexerSQLMetadataStorageCoordinatorTest extends IndexerSqlMetadata
     );
     Assert.assertEquals(
         SegmentPublishResult.fail(
-            InvalidInput.exception(
+            new RetryTransactionException(
                 "The new start metadata state[ObjectMetadata{theObject={foo=bar}}] is ahead of the last committed"
                 + " end state[null]. Try resetting the supervisor."
             ).toString()),
         result1
     );
 
-    // Should only be tried once.
-    Assert.assertEquals(1, metadataUpdateCounter.get());
+    // Should be retried.
+    Assert.assertEquals(2, metadataUpdateCounter.get());
   }
 
   @Test
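
The assertion change from 1 to 2 indicates that, with a retryable failure, the metadata update is now attempted a second time before the test observes the final result. A self-contained JUnit-style sketch of counting attempts is shown below; metadataUpdateCounter exists in the real test, but everything else here is hypothetical.

import java.util.concurrent.atomic.AtomicInteger;
import org.junit.Assert;
import org.junit.Test;

// Hypothetical illustration of the 1 -> 2 expectation: a retryable failure
// causes one extra attempt before the failure is reported to the caller.
public class RetryCountExampleTest
{
  private final AtomicInteger metadataUpdateCounter = new AtomicInteger();

  private boolean tryUpdate()
  {
    metadataUpdateCounter.incrementAndGet();
    return false;  // simulate "start metadata is ahead of the committed state" on every attempt
  }

  @Test
  public void retryableFailureIsAttemptedTwice()
  {
    boolean ok = tryUpdate();
    if (!ok) {
      ok = tryUpdate();  // a single retry, as the retryable failure permits
    }
    Assert.assertEquals(2, metadataUpdateCounter.get());
  }
}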