[ML] Make job closing robust against crashes in autodetect and other misbehavior (elastic/x-pack-elasticsearch#1480)
Set job to failed if autodetect manager fails closing, fix force closing of jobs that hang in closing state, set timeout when waiting for clusterstate update, disallow closing of failed jobs with normal close relates elastic/x-pack-elasticsearch#1453 Original commit: elastic/x-pack-elasticsearch@493cf85e22
This commit is contained in:
parent
392e67851e
commit
527dcfd98d
|
@ -29,6 +29,7 @@ import org.elasticsearch.common.io.stream.StreamOutput;
|
||||||
import org.elasticsearch.common.io.stream.Writeable;
|
import org.elasticsearch.common.io.stream.Writeable;
|
||||||
import org.elasticsearch.common.settings.Settings;
|
import org.elasticsearch.common.settings.Settings;
|
||||||
import org.elasticsearch.common.unit.TimeValue;
|
import org.elasticsearch.common.unit.TimeValue;
|
||||||
|
import org.elasticsearch.common.util.ArrayUtils;
|
||||||
import org.elasticsearch.common.util.concurrent.AbstractRunnable;
|
import org.elasticsearch.common.util.concurrent.AbstractRunnable;
|
||||||
import org.elasticsearch.common.util.concurrent.AtomicArray;
|
import org.elasticsearch.common.util.concurrent.AtomicArray;
|
||||||
import org.elasticsearch.common.xcontent.ObjectParser;
|
import org.elasticsearch.common.xcontent.ObjectParser;
|
||||||
|
@ -57,7 +58,6 @@ import org.elasticsearch.xpack.security.InternalClient;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.HashMap;
|
|
||||||
import java.util.HashSet;
|
import java.util.HashSet;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
|
@ -344,7 +344,7 @@ public class CloseJobAction extends Action<CloseJobAction.Request, CloseJobActio
|
||||||
|
|
||||||
List<String> openJobs = new ArrayList<>();
|
List<String> openJobs = new ArrayList<>();
|
||||||
List<String> closingJobs = new ArrayList<>();
|
List<String> closingJobs = new ArrayList<>();
|
||||||
resolveAndValidateJobId(request.getJobId(), state, openJobs, closingJobs);
|
resolveAndValidateJobId(request.getJobId(), state, openJobs, closingJobs, request.isForce());
|
||||||
request.setOpenJobIds(openJobs.toArray(new String[0]));
|
request.setOpenJobIds(openJobs.toArray(new String[0]));
|
||||||
request.setClosingJobIds(closingJobs.toArray(new String[0]));
|
request.setClosingJobIds(closingJobs.toArray(new String[0]));
|
||||||
if (request.openJobIds.length == 0 && request.closingJobIds.length == 0) {
|
if (request.openJobIds.length == 0 && request.closingJobIds.length == 0) {
|
||||||
|
@ -432,10 +432,11 @@ public class CloseJobAction extends Action<CloseJobAction.Request, CloseJobActio
|
||||||
private void forceCloseJob(ClusterState currentState, Request request, ActionListener<Response> listener) {
|
private void forceCloseJob(ClusterState currentState, Request request, ActionListener<Response> listener) {
|
||||||
PersistentTasksCustomMetaData tasks = currentState.getMetaData().custom(PersistentTasksCustomMetaData.TYPE);
|
PersistentTasksCustomMetaData tasks = currentState.getMetaData().custom(PersistentTasksCustomMetaData.TYPE);
|
||||||
|
|
||||||
final int numberOfJobs = request.openJobIds.length;
|
final int numberOfJobs = request.openJobIds.length + request.closingJobIds.length;
|
||||||
final AtomicInteger counter = new AtomicInteger();
|
final AtomicInteger counter = new AtomicInteger();
|
||||||
final AtomicArray<Exception> failures = new AtomicArray<>(numberOfJobs);
|
final AtomicArray<Exception> failures = new AtomicArray<>(numberOfJobs);
|
||||||
for (String jobId : request.openJobIds) {
|
|
||||||
|
for (String jobId : ArrayUtils.concat(request.openJobIds, request.closingJobIds)) {
|
||||||
PersistentTask<?> jobTask = MlMetadata.getJobTask(jobId, tasks);
|
PersistentTask<?> jobTask = MlMetadata.getJobTask(jobId, tasks);
|
||||||
if (jobTask != null) {
|
if (jobTask != null) {
|
||||||
auditor.info(jobId, Messages.JOB_AUDIT_FORCE_CLOSING);
|
auditor.info(jobId, Messages.JOB_AUDIT_FORCE_CLOSING);
|
||||||
|
@ -560,14 +561,16 @@ public class CloseJobAction extends Action<CloseJobAction.Request, CloseJobActio
|
||||||
* Expand the {@code jobId} parameter and add the job Id the the list arguments
|
* Expand the {@code jobId} parameter and add the job Id the the list arguments
|
||||||
* depending on job state.
|
* depending on job state.
|
||||||
*
|
*
|
||||||
* Opened or failed jobs are added to {@code openJobs} and closing jobs added to
|
* Opened jobs are added to {@code openJobs} and closing jobs added to {@code closingJobs}. Failed jobs are added
|
||||||
* {@code closingJobs}
|
* to {@code openJobs} if allowFailed is set otherwise an exception is thrown.
|
||||||
* @param jobId The job Id. If jobId == {@link Job#ALL} then expand the job list.
|
* @param jobId The job Id. If jobId == {@link Job#ALL} then expand the job list.
|
||||||
* @param state Cluster state
|
* @param state Cluster state
|
||||||
* @param openJobs Opened or failed jobs are added to this list
|
* @param openJobs Opened or failed jobs are added to this list
|
||||||
* @param closingJobs Closing jobs are added to this list
|
* @param closingJobs Closing jobs are added to this list
|
||||||
|
* @param allowFailed Whether failed jobs are allowed, if yes, they are added to {@code openJobs}
|
||||||
*/
|
*/
|
||||||
static void resolveAndValidateJobId(String jobId, ClusterState state, List<String> openJobs, List<String> closingJobs) {
|
static void resolveAndValidateJobId(String jobId, ClusterState state, List<String> openJobs, List<String> closingJobs,
|
||||||
|
boolean allowFailed) {
|
||||||
MlMetadata mlMetadata = state.metaData().custom(MlMetadata.TYPE);
|
MlMetadata mlMetadata = state.metaData().custom(MlMetadata.TYPE);
|
||||||
PersistentTasksCustomMetaData tasksMetaData = state.getMetaData().custom(PersistentTasksCustomMetaData.TYPE);
|
PersistentTasksCustomMetaData tasksMetaData = state.getMetaData().custom(PersistentTasksCustomMetaData.TYPE);
|
||||||
|
|
||||||
|
@ -575,26 +578,40 @@ public class CloseJobAction extends Action<CloseJobAction.Request, CloseJobActio
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
List<String> failedJobs = new ArrayList<>();
|
||||||
|
|
||||||
Consumer<String> jobIdProcessor = id -> {
|
Consumer<String> jobIdProcessor = id -> {
|
||||||
validateJobAndTaskState(id, mlMetadata, tasksMetaData);
|
validateJobAndTaskState(id, mlMetadata, tasksMetaData);
|
||||||
Job job = mlMetadata.getJobs().get(id);
|
Job job = mlMetadata.getJobs().get(id);
|
||||||
if (job.isDeleted()) {
|
if (job.isDeleted()) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
addJobAccordingToState(id, tasksMetaData, openJobs, closingJobs);
|
addJobAccordingToState(id, tasksMetaData, openJobs, closingJobs, failedJobs);
|
||||||
};
|
};
|
||||||
|
|
||||||
if (!Job.ALL.equals(jobId)) {
|
if (!Job.ALL.equals(jobId)) {
|
||||||
jobIdProcessor.accept(jobId);
|
jobIdProcessor.accept(jobId);
|
||||||
|
|
||||||
|
if (allowFailed == false && failedJobs.size() > 0) {
|
||||||
|
throw ExceptionsHelper.conflictStatusException("cannot close job [{}] because it failed, use force close", jobId);
|
||||||
|
}
|
||||||
|
|
||||||
} else {
|
} else {
|
||||||
for (Map.Entry<String, Job> jobEntry : mlMetadata.getJobs().entrySet()) {
|
for (Map.Entry<String, Job> jobEntry : mlMetadata.getJobs().entrySet()) {
|
||||||
jobIdProcessor.accept(jobEntry.getKey());
|
jobIdProcessor.accept(jobEntry.getKey());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (allowFailed == false && failedJobs.size() > 0) {
|
||||||
|
throw ExceptionsHelper.conflictStatusException("one or more jobs have state failed, use force close");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// allowFailed == true
|
||||||
|
openJobs.addAll(failedJobs);
|
||||||
|
}
|
||||||
|
|
||||||
private static void addJobAccordingToState(String jobId, PersistentTasksCustomMetaData tasksMetaData,
|
private static void addJobAccordingToState(String jobId, PersistentTasksCustomMetaData tasksMetaData,
|
||||||
List<String> openJobs, List<String> closingJobs) {
|
List<String> openJobs, List<String> closingJobs, List<String> failedJobs) {
|
||||||
|
|
||||||
JobState jobState = MlMetadata.getJobState(jobId, tasksMetaData);
|
JobState jobState = MlMetadata.getJobState(jobId, tasksMetaData);
|
||||||
switch (jobState) {
|
switch (jobState) {
|
||||||
|
@ -602,6 +619,8 @@ public class CloseJobAction extends Action<CloseJobAction.Request, CloseJobActio
|
||||||
closingJobs.add(jobId);
|
closingJobs.add(jobId);
|
||||||
break;
|
break;
|
||||||
case FAILED:
|
case FAILED:
|
||||||
|
failedJobs.add(jobId);
|
||||||
|
break;
|
||||||
case OPENED:
|
case OPENED:
|
||||||
openJobs.add(jobId);
|
openJobs.add(jobId);
|
||||||
break;
|
break;
|
||||||
|
|
|
@ -358,6 +358,7 @@ public class AutodetectProcessManager extends AbstractComponent {
|
||||||
communicator.close(restart, reason);
|
communicator.close(restart, reason);
|
||||||
} catch (Exception e) {
|
} catch (Exception e) {
|
||||||
logger.warn("Exception closing stopped process input stream", e);
|
logger.warn("Exception closing stopped process input stream", e);
|
||||||
|
setJobState(jobTask, JobState.FAILED);
|
||||||
throw ExceptionsHelper.serverError("Exception closing stopped process input stream", e);
|
throw ExceptionsHelper.serverError("Exception closing stopped process input stream", e);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -378,7 +379,7 @@ public class AutodetectProcessManager extends AbstractComponent {
|
||||||
return Optional.of(Duration.between(communicator.getProcessStartTime(), ZonedDateTime.now()));
|
return Optional.of(Duration.between(communicator.getProcessStartTime(), ZonedDateTime.now()));
|
||||||
}
|
}
|
||||||
|
|
||||||
private void setJobState(JobTask jobTask, JobState state) {
|
void setJobState(JobTask jobTask, JobState state) {
|
||||||
JobTaskStatus taskStatus = new JobTaskStatus(state, jobTask.getAllocationId());
|
JobTaskStatus taskStatus = new JobTaskStatus(state, jobTask.getAllocationId());
|
||||||
jobTask.updatePersistentStatus(taskStatus, new ActionListener<PersistentTask<?>>() {
|
jobTask.updatePersistentStatus(taskStatus, new ActionListener<PersistentTask<?>>() {
|
||||||
@Override
|
@Override
|
||||||
|
|
|
@ -167,7 +167,7 @@ public class PersistentTasksService extends AbstractComponent {
|
||||||
public void onTimeout(TimeValue timeout) {
|
public void onTimeout(TimeValue timeout) {
|
||||||
listener.onFailure(new IllegalStateException("timed out after " + timeout));
|
listener.onFailure(new IllegalStateException("timed out after " + timeout));
|
||||||
}
|
}
|
||||||
}, clusterState -> predicate.test(clusterState.metaData().custom(PersistentTasksCustomMetaData.TYPE)));
|
}, clusterState -> predicate.test(clusterState.metaData().custom(PersistentTasksCustomMetaData.TYPE)), timeout);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -138,9 +138,12 @@ public class CloseJobActionRequestTests extends AbstractStreamableXContentTestCa
|
||||||
List<String> openJobs = new ArrayList<>();
|
List<String> openJobs = new ArrayList<>();
|
||||||
List<String> closingJobs = new ArrayList<>();
|
List<String> closingJobs = new ArrayList<>();
|
||||||
|
|
||||||
CloseJobAction.resolveAndValidateJobId("_all", cs1, openJobs, closingJobs);
|
CloseJobAction.resolveAndValidateJobId("_all", cs1, openJobs, closingJobs, true);
|
||||||
assertEquals(Arrays.asList("job_id_1", "job_id_2", "job_id_3"), openJobs);
|
assertEquals(Arrays.asList("job_id_1", "job_id_2", "job_id_3"), openJobs);
|
||||||
assertEquals(Arrays.asList("job_id_4"), closingJobs);
|
assertEquals(Arrays.asList("job_id_4"), closingJobs);
|
||||||
|
|
||||||
|
expectThrows(ElasticsearchStatusException.class,
|
||||||
|
() -> CloseJobAction.resolveAndValidateJobId("_all", cs1, openJobs, closingJobs, false));
|
||||||
}
|
}
|
||||||
|
|
||||||
public void testResolve_givenJobId() {
|
public void testResolve_givenJobId() {
|
||||||
|
@ -158,7 +161,7 @@ public class CloseJobActionRequestTests extends AbstractStreamableXContentTestCa
|
||||||
List<String> openJobs = new ArrayList<>();
|
List<String> openJobs = new ArrayList<>();
|
||||||
List<String> closingJobs = new ArrayList<>();
|
List<String> closingJobs = new ArrayList<>();
|
||||||
|
|
||||||
CloseJobAction.resolveAndValidateJobId("job_id_1", cs1, openJobs, closingJobs);
|
CloseJobAction.resolveAndValidateJobId("job_id_1", cs1, openJobs, closingJobs, false);
|
||||||
assertEquals(Arrays.asList("job_id_1"), openJobs);
|
assertEquals(Arrays.asList("job_id_1"), openJobs);
|
||||||
assertEquals(Collections.emptyList(), closingJobs);
|
assertEquals(Collections.emptyList(), closingJobs);
|
||||||
|
|
||||||
|
@ -169,11 +172,35 @@ public class CloseJobActionRequestTests extends AbstractStreamableXContentTestCa
|
||||||
|
|
||||||
openJobs.clear();
|
openJobs.clear();
|
||||||
closingJobs.clear();
|
closingJobs.clear();
|
||||||
CloseJobAction.resolveAndValidateJobId("job_id_1", cs1, openJobs, closingJobs);
|
CloseJobAction.resolveAndValidateJobId("job_id_1", cs1, openJobs, closingJobs, false);
|
||||||
assertEquals(Collections.emptyList(), openJobs);
|
assertEquals(Collections.emptyList(), openJobs);
|
||||||
assertEquals(Collections.emptyList(), closingJobs);
|
assertEquals(Collections.emptyList(), closingJobs);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public void testResolve_givenJobIdFailed() {
|
||||||
|
MlMetadata.Builder mlBuilder = new MlMetadata.Builder();
|
||||||
|
mlBuilder.putJob(BaseMlIntegTestCase.createFareQuoteJob("job_id_failed").build(new Date()), false);
|
||||||
|
|
||||||
|
PersistentTasksCustomMetaData.Builder tasksBuilder = PersistentTasksCustomMetaData.builder();
|
||||||
|
addJobTask("job_id_failed", null, JobState.FAILED, tasksBuilder);
|
||||||
|
|
||||||
|
ClusterState cs1 = ClusterState.builder(new ClusterName("_name")).metaData(new MetaData.Builder()
|
||||||
|
.putCustom(MlMetadata.TYPE, mlBuilder.build()).putCustom(PersistentTasksCustomMetaData.TYPE, tasksBuilder.build())).build();
|
||||||
|
|
||||||
|
List<String> openJobs = new ArrayList<>();
|
||||||
|
List<String> closingJobs = new ArrayList<>();
|
||||||
|
|
||||||
|
CloseJobAction.resolveAndValidateJobId("job_id_failed", cs1, openJobs, closingJobs, true);
|
||||||
|
assertEquals(Arrays.asList("job_id_failed"), openJobs);
|
||||||
|
assertEquals(Collections.emptyList(), closingJobs);
|
||||||
|
|
||||||
|
openJobs.clear();
|
||||||
|
closingJobs.clear();
|
||||||
|
|
||||||
|
expectThrows(ElasticsearchStatusException.class,
|
||||||
|
() -> CloseJobAction.resolveAndValidateJobId("job_id_failed", cs1, openJobs, closingJobs, false));
|
||||||
|
}
|
||||||
|
|
||||||
public void testResolve_withSpecificJobIds() {
|
public void testResolve_withSpecificJobIds() {
|
||||||
MlMetadata.Builder mlBuilder = new MlMetadata.Builder();
|
MlMetadata.Builder mlBuilder = new MlMetadata.Builder();
|
||||||
mlBuilder.putJob(BaseMlIntegTestCase.createFareQuoteJob("job_id_closing").build(new Date()), false);
|
mlBuilder.putJob(BaseMlIntegTestCase.createFareQuoteJob("job_id_closing").build(new Date()), false);
|
||||||
|
@ -193,19 +220,19 @@ public class CloseJobActionRequestTests extends AbstractStreamableXContentTestCa
|
||||||
List<String> openJobs = new ArrayList<>();
|
List<String> openJobs = new ArrayList<>();
|
||||||
List<String> closingJobs = new ArrayList<>();
|
List<String> closingJobs = new ArrayList<>();
|
||||||
|
|
||||||
CloseJobAction.resolveAndValidateJobId("_all", cs1, openJobs, closingJobs);
|
CloseJobAction.resolveAndValidateJobId("_all", cs1, openJobs, closingJobs, false);
|
||||||
assertEquals(Arrays.asList("job_id_open"), openJobs);
|
assertEquals(Arrays.asList("job_id_open"), openJobs);
|
||||||
assertEquals(Arrays.asList("job_id_closing"), closingJobs);
|
assertEquals(Arrays.asList("job_id_closing"), closingJobs);
|
||||||
openJobs.clear();
|
openJobs.clear();
|
||||||
closingJobs.clear();
|
closingJobs.clear();
|
||||||
|
|
||||||
CloseJobAction.resolveAndValidateJobId("job_id_closing", cs1, openJobs, closingJobs);
|
CloseJobAction.resolveAndValidateJobId("job_id_closing", cs1, openJobs, closingJobs, false);
|
||||||
assertEquals(Collections.emptyList(), openJobs);
|
assertEquals(Collections.emptyList(), openJobs);
|
||||||
assertEquals(Arrays.asList("job_id_closing"), closingJobs);
|
assertEquals(Arrays.asList("job_id_closing"), closingJobs);
|
||||||
openJobs.clear();
|
openJobs.clear();
|
||||||
closingJobs.clear();
|
closingJobs.clear();
|
||||||
|
|
||||||
CloseJobAction.resolveAndValidateJobId("job_id_open", cs1, openJobs, closingJobs);
|
CloseJobAction.resolveAndValidateJobId("job_id_open", cs1, openJobs, closingJobs, false);
|
||||||
assertEquals(Arrays.asList("job_id_open"), openJobs);
|
assertEquals(Arrays.asList("job_id_open"), openJobs);
|
||||||
assertEquals(Collections.emptyList(), closingJobs);
|
assertEquals(Collections.emptyList(), closingJobs);
|
||||||
openJobs.clear();
|
openJobs.clear();
|
||||||
|
|
|
@ -5,6 +5,7 @@
|
||||||
*/
|
*/
|
||||||
package org.elasticsearch.xpack.ml.job.process.autodetect;
|
package org.elasticsearch.xpack.ml.job.process.autodetect;
|
||||||
|
|
||||||
|
import org.elasticsearch.ElasticsearchException;
|
||||||
import org.elasticsearch.client.Client;
|
import org.elasticsearch.client.Client;
|
||||||
import org.elasticsearch.common.CheckedConsumer;
|
import org.elasticsearch.common.CheckedConsumer;
|
||||||
import org.elasticsearch.common.settings.Settings;
|
import org.elasticsearch.common.settings.Settings;
|
||||||
|
@ -47,7 +48,6 @@ import java.io.ByteArrayInputStream;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.io.InputStream;
|
import java.io.InputStream;
|
||||||
import java.nio.charset.StandardCharsets;
|
import java.nio.charset.StandardCharsets;
|
||||||
import java.util.Arrays;
|
|
||||||
import java.util.Collections;
|
import java.util.Collections;
|
||||||
import java.util.Date;
|
import java.util.Date;
|
||||||
import java.util.HashSet;
|
import java.util.HashSet;
|
||||||
|
@ -322,6 +322,32 @@ public class AutodetectProcessManagerTests extends ESTestCase {
|
||||||
assertEquals("[foo] exception while flushing job", holder[0].getMessage());
|
assertEquals("[foo] exception while flushing job", holder[0].getMessage());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public void testCloseThrows() throws IOException {
|
||||||
|
AutodetectCommunicator communicator = mock(AutodetectCommunicator.class);
|
||||||
|
AutodetectProcessManager manager = createManager(communicator);
|
||||||
|
|
||||||
|
// let the communicator throw, simulating a problem with the underlying
|
||||||
|
// autodetect, e.g. a crash
|
||||||
|
doThrow(Exception.class).when(communicator).close(anyBoolean(), anyString());
|
||||||
|
|
||||||
|
// create a jobtask
|
||||||
|
JobTask jobTask = mock(JobTask.class);
|
||||||
|
when(jobTask.getJobId()).thenReturn("foo");
|
||||||
|
manager.openJob(jobTask, false, e -> {
|
||||||
|
});
|
||||||
|
manager.processData(jobTask, createInputStream(""), randomFrom(XContentType.values()), mock(DataLoadParams.class),
|
||||||
|
(dataCounts1, e) -> {
|
||||||
|
});
|
||||||
|
|
||||||
|
// job is created
|
||||||
|
assertEquals(1, manager.numberOfOpenJobs());
|
||||||
|
expectThrows(ElasticsearchException.class, () -> manager.closeJob(jobTask, false, null));
|
||||||
|
assertEquals(0, manager.numberOfOpenJobs());
|
||||||
|
|
||||||
|
verify(manager).setJobState(any(), eq(JobState.OPENED));
|
||||||
|
verify(manager).setJobState(any(), eq(JobState.FAILED));
|
||||||
|
}
|
||||||
|
|
||||||
public void testwriteUpdateProcessMessage() throws IOException {
|
public void testwriteUpdateProcessMessage() throws IOException {
|
||||||
AutodetectCommunicator communicator = mock(AutodetectCommunicator.class);
|
AutodetectCommunicator communicator = mock(AutodetectCommunicator.class);
|
||||||
AutodetectProcessManager manager = createManagerAndCallProcessData(communicator, "foo");
|
AutodetectProcessManager manager = createManagerAndCallProcessData(communicator, "foo");
|
||||||
|
|
Loading…
Reference in New Issue