mirror of https://github.com/apache/druid.git
Fix error handling after pause request in Kafka supervisor (#6754)
* Fix error handling after pause request in kafka supervisor * fix test * fix test
This commit is contained in:
parent
9505074530
commit
4591c56afb
|
@ -25,6 +25,7 @@ import com.google.common.collect.ImmutableMap;
|
||||||
import com.google.common.util.concurrent.ListenableFuture;
|
import com.google.common.util.concurrent.ListenableFuture;
|
||||||
import org.apache.druid.indexing.common.IndexTaskClient;
|
import org.apache.druid.indexing.common.IndexTaskClient;
|
||||||
import org.apache.druid.indexing.common.TaskInfoProvider;
|
import org.apache.druid.indexing.common.TaskInfoProvider;
|
||||||
|
import org.apache.druid.indexing.kafka.KafkaIndexTask.Status;
|
||||||
import org.apache.druid.java.util.common.ISE;
|
import org.apache.druid.java.util.common.ISE;
|
||||||
import org.apache.druid.java.util.common.RE;
|
import org.apache.druid.java.util.common.RE;
|
||||||
import org.apache.druid.java.util.common.StringUtils;
|
import org.apache.druid.java.util.common.StringUtils;
|
||||||
|
@ -115,31 +116,48 @@ public class KafkaIndexTaskClient extends IndexTaskClient
|
||||||
true
|
true
|
||||||
);
|
);
|
||||||
|
|
||||||
if (response.getStatus().equals(HttpResponseStatus.OK)) {
|
final HttpResponseStatus responseStatus = response.getStatus();
|
||||||
|
final String responseContent = response.getContent();
|
||||||
|
|
||||||
|
if (responseStatus.equals(HttpResponseStatus.OK)) {
|
||||||
log.info("Task [%s] paused successfully", id);
|
log.info("Task [%s] paused successfully", id);
|
||||||
return deserialize(response.getContent(), new TypeReference<Map<Integer, Long>>()
|
return deserialize(responseContent, new TypeReference<Map<Integer, Long>>()
|
||||||
{
|
{
|
||||||
});
|
});
|
||||||
}
|
} else if (responseStatus.equals(HttpResponseStatus.ACCEPTED)) {
|
||||||
|
// The task received the pause request, but its status hasn't been changed yet.
|
||||||
|
while (true) {
|
||||||
|
final Status status = getStatus(id);
|
||||||
|
if (status == KafkaIndexTask.Status.PAUSED) {
|
||||||
|
return getCurrentOffsets(id, true);
|
||||||
|
}
|
||||||
|
|
||||||
while (true) {
|
final Duration delay = newRetryPolicy().getAndIncrementRetryDelay();
|
||||||
if (getStatus(id) == KafkaIndexTask.Status.PAUSED) {
|
if (delay == null) {
|
||||||
return getCurrentOffsets(id, true);
|
throw new ISE(
|
||||||
}
|
"Task [%s] failed to change its status from [%s] to [%s], aborting",
|
||||||
|
id,
|
||||||
final Duration delay = newRetryPolicy().getAndIncrementRetryDelay();
|
status,
|
||||||
if (delay == null) {
|
Status.PAUSED
|
||||||
log.error("Task [%s] failed to pause, aborting", id);
|
);
|
||||||
throw new ISE("Task [%s] failed to pause, aborting", id);
|
} else {
|
||||||
} else {
|
final long sleepTime = delay.getMillis();
|
||||||
final long sleepTime = delay.getMillis();
|
log.info(
|
||||||
log.info(
|
"Still waiting for task [%s] to change its status to [%s]; will try again in [%s]",
|
||||||
"Still waiting for task [%s] to pause; will try again in [%s]",
|
id,
|
||||||
id,
|
Status.PAUSED,
|
||||||
new Duration(sleepTime).toString()
|
new Duration(sleepTime).toString()
|
||||||
);
|
);
|
||||||
Thread.sleep(sleepTime);
|
Thread.sleep(sleepTime);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
} else {
|
||||||
|
throw new ISE(
|
||||||
|
"Pause request for task [%s] failed with response [%s] : [%s]",
|
||||||
|
id,
|
||||||
|
responseStatus,
|
||||||
|
responseContent
|
||||||
|
);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
catch (NoTaskLocationException e) {
|
catch (NoTaskLocationException e) {
|
||||||
|
|
|
@ -1628,13 +1628,39 @@ public class KafkaSupervisor implements Supervisor
|
||||||
// 3) Build a map of the highest offset read by any task in the group for each partition
|
// 3) Build a map of the highest offset read by any task in the group for each partition
|
||||||
final Map<Integer, Long> endOffsets = new HashMap<>();
|
final Map<Integer, Long> endOffsets = new HashMap<>();
|
||||||
for (int i = 0; i < input.size(); i++) {
|
for (int i = 0; i < input.size(); i++) {
|
||||||
Map<Integer, Long> result = input.get(i);
|
final Map<Integer, Long> result = input.get(i);
|
||||||
|
final String taskId = pauseTaskIds.get(i);
|
||||||
|
|
||||||
if (result == null || result.isEmpty()) { // kill tasks that didn't return a value
|
if (result == null) {
|
||||||
String taskId = pauseTaskIds.get(i);
|
// Get the exception
|
||||||
killTask(taskId, "Task [%s] failed to respond to [pause] in a timely manner, killing task", taskId);
|
final Throwable pauseException;
|
||||||
|
try {
|
||||||
|
// The below get should throw ExecutionException since result is null.
|
||||||
|
final Map<Integer, Long> pauseResult = pauseFutures.get(i).get();
|
||||||
|
throw new ISE(
|
||||||
|
"WTH? The pause request for task [%s] is supposed to fail, but returned [%s]",
|
||||||
|
taskId,
|
||||||
|
pauseResult
|
||||||
|
);
|
||||||
|
}
|
||||||
|
catch (InterruptedException e) {
|
||||||
|
throw new RuntimeException(e);
|
||||||
|
}
|
||||||
|
catch (ExecutionException e) {
|
||||||
|
pauseException = e.getCause() == null ? e : e.getCause();
|
||||||
|
}
|
||||||
|
|
||||||
|
killTask(
|
||||||
|
taskId,
|
||||||
|
"An exception occured while waiting for task [%s] to pause: [%s]",
|
||||||
|
taskId,
|
||||||
|
pauseException
|
||||||
|
);
|
||||||
taskGroup.tasks.remove(taskId);
|
taskGroup.tasks.remove(taskId);
|
||||||
|
|
||||||
|
} else if (result.isEmpty()) {
|
||||||
|
killTask(taskId, "Task [%s] returned empty offsets after pause", taskId);
|
||||||
|
taskGroup.tasks.remove(taskId);
|
||||||
} else { // otherwise build a map of the highest offsets seen
|
} else { // otherwise build a map of the highest offsets seen
|
||||||
for (Entry<Integer, Long> offset : result.entrySet()) {
|
for (Entry<Integer, Long> offset : result.entrySet()) {
|
||||||
if (!endOffsets.containsKey(offset.getKey())
|
if (!endOffsets.containsKey(offset.getKey())
|
||||||
|
|
|
@ -457,9 +457,10 @@ public class KafkaIndexTaskClientTest extends EasyMockSupport
|
||||||
Capture<Request> captured = Capture.newInstance();
|
Capture<Request> captured = Capture.newInstance();
|
||||||
Capture<Request> captured2 = Capture.newInstance();
|
Capture<Request> captured2 = Capture.newInstance();
|
||||||
Capture<Request> captured3 = Capture.newInstance();
|
Capture<Request> captured3 = Capture.newInstance();
|
||||||
|
// one time in IndexTaskClient.submitRequest() and another in KafkaIndexTaskClient.pause()
|
||||||
expect(responseHolder.getStatus()).andReturn(HttpResponseStatus.ACCEPTED).times(2)
|
expect(responseHolder.getStatus()).andReturn(HttpResponseStatus.ACCEPTED).times(2)
|
||||||
.andReturn(HttpResponseStatus.OK).times(2);
|
.andReturn(HttpResponseStatus.OK).anyTimes();
|
||||||
expect(responseHolder.getContent()).andReturn("\"PAUSED\"")
|
expect(responseHolder.getContent()).andReturn("\"PAUSED\"").times(2)
|
||||||
.andReturn("{\"0\":1, \"1\":10}").anyTimes();
|
.andReturn("{\"0\":1, \"1\":10}").anyTimes();
|
||||||
expect(httpClient.go(capture(captured), anyObject(FullResponseHandler.class), eq(TEST_HTTP_TIMEOUT))).andReturn(
|
expect(httpClient.go(capture(captured), anyObject(FullResponseHandler.class), eq(TEST_HTTP_TIMEOUT))).andReturn(
|
||||||
Futures.immediateFuture(responseHolder)
|
Futures.immediateFuture(responseHolder)
|
||||||
|
|
|
@ -1537,8 +1537,9 @@ public class KafkaSupervisorTest extends EasyMockSupport
|
||||||
.andReturn(Futures.immediateFailedFuture(new RuntimeException())).times(2);
|
.andReturn(Futures.immediateFailedFuture(new RuntimeException())).times(2);
|
||||||
taskQueue.shutdown(
|
taskQueue.shutdown(
|
||||||
EasyMock.contains("sequenceName-0"),
|
EasyMock.contains("sequenceName-0"),
|
||||||
EasyMock.eq("Task [%s] failed to respond to [pause] in a timely manner, killing task"),
|
EasyMock.eq("An exception occured while waiting for task [%s] to pause: [%s]"),
|
||||||
EasyMock.contains("sequenceName-0")
|
EasyMock.contains("sequenceName-0"),
|
||||||
|
EasyMock.anyString()
|
||||||
);
|
);
|
||||||
expectLastCall().times(2);
|
expectLastCall().times(2);
|
||||||
expect(taskQueue.add(capture(captured))).andReturn(true).times(2);
|
expect(taskQueue.add(capture(captured))).andReturn(true).times(2);
|
||||||
|
|
Loading…
Reference in New Issue