Fix bug with cancelling pending tasks when running kubernetes ingestion. (#16036)

* Fix bug

* Add new test
This commit is contained in:
George Shiqi Wu 2024-03-07 15:48:14 -05:00 committed by GitHub
parent 67ae0ff450
commit 80cab51d50
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 62 additions and 9 deletions

View File

@ -263,11 +263,14 @@ public class KubernetesTaskRunner implements TaskLogStreamer, TaskRunner
return;
}
workItem.shutdown();
if (!workItem.getResult().isDone()) {
return;
}
synchronized (tasks) {
tasks.remove(taskid);
}
workItem.shutdown();
}
@Override

View File

@ -61,7 +61,7 @@ public class KubernetesPeonClient
this.emitter = emitter;
}
public Pod launchPeonJobAndWaitForStart(Job job, Task task, long howLong, TimeUnit timeUnit)
public Pod launchPeonJobAndWaitForStart(Job job, Task task, long howLong, TimeUnit timeUnit) throws IllegalStateException
{
long start = System.currentTimeMillis();
// launch job
@ -74,12 +74,15 @@ public class KubernetesPeonClient
Pod result = client.pods().inNamespace(namespace).withName(mainPod.getMetadata().getName())
.waitUntilCondition(pod -> {
if (pod == null) {
return false;
return true;
}
return pod.getStatus() != null && pod.getStatus().getPodIP() != null;
}, howLong, timeUnit);
if (result == null) {
throw new IllegalStateException("K8s pod for the task [%s] appeared and disappeared. It can happen if the task was canceled");
}
long duration = System.currentTimeMillis() - start;
log.info("Took task %s %d ms for pod to startup", jobName, duration);
emitK8sPodMetrics(task, "k8s/peon/startup/time", duration);
return result;
});

View File

@ -75,6 +75,7 @@ public class KubernetesTaskRunnerTest extends EasyMockSupport
@Mock private KubernetesPeonClient peonClient;
@Mock private KubernetesPeonLifecycle kubernetesPeonLifecycle;
@Mock private ServiceEmitter emitter;
@Mock private ListenableFuture<TaskStatus> statusFuture;
private KubernetesTaskRunnerConfig config;
private KubernetesTaskRunner runner;
@ -330,16 +331,35 @@ public class KubernetesTaskRunnerTest extends EasyMockSupport
@Test
public void test_shutdown_withExistingTask_removesTaskFromMap()
{
KubernetesWorkItem workItem = new KubernetesWorkItem(task, null) {
KubernetesWorkItem workItem = new KubernetesWorkItem(task, statusFuture) {
@Override
protected synchronized void shutdown()
{
}
};
EasyMock.expect(statusFuture.isDone()).andReturn(true).anyTimes();
replayAll();
runner.tasks.put(task.getId(), workItem);
runner.shutdown(task.getId(), "");
Assert.assertTrue(runner.tasks.isEmpty());
verifyAll();
}
@Test
public void test_shutdown_withExistingTask_futureIncomplete_removesTaskFromMap()
{
KubernetesWorkItem workItem = new KubernetesWorkItem(task, statusFuture) {
@Override
protected synchronized void shutdown()
{
}
};
EasyMock.expect(statusFuture.isDone()).andReturn(false).anyTimes();
replayAll();
runner.tasks.put(task.getId(), workItem);
runner.shutdown(task.getId(), "");
Assert.assertEquals(1, runner.tasks.size());
verifyAll();
}
@Test

View File

@ -94,7 +94,7 @@ public class KubernetesPeonClientTest
}
@Test
void test_launchPeonJobAndWaitForStart_withDisappearingPod_throwsKubernetesClientTimeoutException()
void test_launchPeonJobAndWaitForStart_withDisappearingPod_throwIllegalStateExceptionn()
{
Job job = new JobBuilder()
.withNewMetadata()
@ -115,11 +115,38 @@ public class KubernetesPeonClientTest
).once();
Assertions.assertThrows(
KubernetesClientTimeoutException.class,
IllegalStateException.class,
() -> instance.launchPeonJobAndWaitForStart(job, NoopTask.create(), 1, TimeUnit.SECONDS)
);
}
@Test
void test_launchPeonJobAndWaitForStart_withPendingPod_throwIllegalStateExceptionn()
{
Job job = new JobBuilder()
.withNewMetadata()
.withName(KUBERNETES_JOB_NAME)
.endMetadata()
.build();
Pod pod = new PodBuilder()
.withNewMetadata()
.withName(POD_NAME)
.addToLabels("job-name", KUBERNETES_JOB_NAME)
.endMetadata()
.withNewStatus()
.withPodIP(null)
.endStatus()
.build();
client.pods().inNamespace(NAMESPACE).resource(pod).create();
Assertions.assertThrows(
KubernetesClientTimeoutException.class,
() -> instance.launchPeonJobAndWaitForStart(job, NoopTask.create(), 1, TimeUnit.SECONDS)
);
}
@Test
void test_waitForPeonJobCompletion_withSuccessfulJob_returnsJobResponseWithJobAndSucceededPeonPhase()
{