Fix error handling after pause request in Kafka supervisor (#6754)

* Fix error handling after pause request in kafka supervisor

* fix test

* fix test
This commit is contained in:
Jihoon Son 2018-12-18 17:52:44 -08:00 committed by Gian Merlino
parent 9505074530
commit 4591c56afb
4 changed files with 74 additions and 28 deletions

View File

@ -25,6 +25,7 @@ import com.google.common.collect.ImmutableMap;
import com.google.common.util.concurrent.ListenableFuture; import com.google.common.util.concurrent.ListenableFuture;
import org.apache.druid.indexing.common.IndexTaskClient; import org.apache.druid.indexing.common.IndexTaskClient;
import org.apache.druid.indexing.common.TaskInfoProvider; import org.apache.druid.indexing.common.TaskInfoProvider;
import org.apache.druid.indexing.kafka.KafkaIndexTask.Status;
import org.apache.druid.java.util.common.ISE; import org.apache.druid.java.util.common.ISE;
import org.apache.druid.java.util.common.RE; import org.apache.druid.java.util.common.RE;
import org.apache.druid.java.util.common.StringUtils; import org.apache.druid.java.util.common.StringUtils;
@ -115,32 +116,49 @@ public class KafkaIndexTaskClient extends IndexTaskClient
true true
); );
if (response.getStatus().equals(HttpResponseStatus.OK)) { final HttpResponseStatus responseStatus = response.getStatus();
final String responseContent = response.getContent();
if (responseStatus.equals(HttpResponseStatus.OK)) {
log.info("Task [%s] paused successfully", id); log.info("Task [%s] paused successfully", id);
return deserialize(response.getContent(), new TypeReference<Map<Integer, Long>>() return deserialize(responseContent, new TypeReference<Map<Integer, Long>>()
{ {
}); });
} } else if (responseStatus.equals(HttpResponseStatus.ACCEPTED)) {
// The task received the pause request, but its status hasn't been changed yet.
while (true) { while (true) {
if (getStatus(id) == KafkaIndexTask.Status.PAUSED) { final Status status = getStatus(id);
if (status == KafkaIndexTask.Status.PAUSED) {
return getCurrentOffsets(id, true); return getCurrentOffsets(id, true);
} }
final Duration delay = newRetryPolicy().getAndIncrementRetryDelay(); final Duration delay = newRetryPolicy().getAndIncrementRetryDelay();
if (delay == null) { if (delay == null) {
log.error("Task [%s] failed to pause, aborting", id); throw new ISE(
throw new ISE("Task [%s] failed to pause, aborting", id); "Task [%s] failed to change its status from [%s] to [%s], aborting",
id,
status,
Status.PAUSED
);
} else { } else {
final long sleepTime = delay.getMillis(); final long sleepTime = delay.getMillis();
log.info( log.info(
"Still waiting for task [%s] to pause; will try again in [%s]", "Still waiting for task [%s] to change its status to [%s]; will try again in [%s]",
id, id,
Status.PAUSED,
new Duration(sleepTime).toString() new Duration(sleepTime).toString()
); );
Thread.sleep(sleepTime); Thread.sleep(sleepTime);
} }
} }
} else {
throw new ISE(
"Pause request for task [%s] failed with response [%s] : [%s]",
id,
responseStatus,
responseContent
);
}
} }
catch (NoTaskLocationException e) { catch (NoTaskLocationException e) {
log.error("Exception [%s] while pausing Task [%s]", e.getMessage(), id); log.error("Exception [%s] while pausing Task [%s]", e.getMessage(), id);

View File

@ -1628,13 +1628,39 @@ public class KafkaSupervisor implements Supervisor
// 3) Build a map of the highest offset read by any task in the group for each partition // 3) Build a map of the highest offset read by any task in the group for each partition
final Map<Integer, Long> endOffsets = new HashMap<>(); final Map<Integer, Long> endOffsets = new HashMap<>();
for (int i = 0; i < input.size(); i++) { for (int i = 0; i < input.size(); i++) {
Map<Integer, Long> result = input.get(i); final Map<Integer, Long> result = input.get(i);
final String taskId = pauseTaskIds.get(i);
if (result == null || result.isEmpty()) { // kill tasks that didn't return a value if (result == null) {
String taskId = pauseTaskIds.get(i); // Get the exception
killTask(taskId, "Task [%s] failed to respond to [pause] in a timely manner, killing task", taskId); final Throwable pauseException;
try {
// The below get should throw ExecutionException since result is null.
final Map<Integer, Long> pauseResult = pauseFutures.get(i).get();
throw new ISE(
"WTH? The pause request for task [%s] is supposed to fail, but returned [%s]",
taskId,
pauseResult
);
}
catch (InterruptedException e) {
throw new RuntimeException(e);
}
catch (ExecutionException e) {
pauseException = e.getCause() == null ? e : e.getCause();
}
killTask(
taskId,
"An exception occured while waiting for task [%s] to pause: [%s]",
taskId,
pauseException
);
taskGroup.tasks.remove(taskId); taskGroup.tasks.remove(taskId);
} else if (result.isEmpty()) {
killTask(taskId, "Task [%s] returned empty offsets after pause", taskId);
taskGroup.tasks.remove(taskId);
} else { // otherwise build a map of the highest offsets seen } else { // otherwise build a map of the highest offsets seen
for (Entry<Integer, Long> offset : result.entrySet()) { for (Entry<Integer, Long> offset : result.entrySet()) {
if (!endOffsets.containsKey(offset.getKey()) if (!endOffsets.containsKey(offset.getKey())

View File

@ -457,9 +457,10 @@ public class KafkaIndexTaskClientTest extends EasyMockSupport
Capture<Request> captured = Capture.newInstance(); Capture<Request> captured = Capture.newInstance();
Capture<Request> captured2 = Capture.newInstance(); Capture<Request> captured2 = Capture.newInstance();
Capture<Request> captured3 = Capture.newInstance(); Capture<Request> captured3 = Capture.newInstance();
// one time in IndexTaskClient.submitRequest() and another in KafkaIndexTaskClient.pause()
expect(responseHolder.getStatus()).andReturn(HttpResponseStatus.ACCEPTED).times(2) expect(responseHolder.getStatus()).andReturn(HttpResponseStatus.ACCEPTED).times(2)
.andReturn(HttpResponseStatus.OK).times(2); .andReturn(HttpResponseStatus.OK).anyTimes();
expect(responseHolder.getContent()).andReturn("\"PAUSED\"") expect(responseHolder.getContent()).andReturn("\"PAUSED\"").times(2)
.andReturn("{\"0\":1, \"1\":10}").anyTimes(); .andReturn("{\"0\":1, \"1\":10}").anyTimes();
expect(httpClient.go(capture(captured), anyObject(FullResponseHandler.class), eq(TEST_HTTP_TIMEOUT))).andReturn( expect(httpClient.go(capture(captured), anyObject(FullResponseHandler.class), eq(TEST_HTTP_TIMEOUT))).andReturn(
Futures.immediateFuture(responseHolder) Futures.immediateFuture(responseHolder)

View File

@ -1537,8 +1537,9 @@ public class KafkaSupervisorTest extends EasyMockSupport
.andReturn(Futures.immediateFailedFuture(new RuntimeException())).times(2); .andReturn(Futures.immediateFailedFuture(new RuntimeException())).times(2);
taskQueue.shutdown( taskQueue.shutdown(
EasyMock.contains("sequenceName-0"), EasyMock.contains("sequenceName-0"),
EasyMock.eq("Task [%s] failed to respond to [pause] in a timely manner, killing task"), EasyMock.eq("An exception occured while waiting for task [%s] to pause: [%s]"),
EasyMock.contains("sequenceName-0") EasyMock.contains("sequenceName-0"),
EasyMock.anyString()
); );
expectLastCall().times(2); expectLastCall().times(2);
expect(taskQueue.add(capture(captured))).andReturn(true).times(2); expect(taskQueue.add(capture(captured))).andReturn(true).times(2);