[ML] Make job closing robust against crashes in autodetect and other misbehavior (elastic/x-pack-elasticsearch#1480)
Set the job state to failed if the autodetect manager fails while closing, fix force-closing of jobs that hang in the closing state, set a timeout when waiting for the cluster state update, and disallow closing failed jobs with a normal (non-force) close.

Relates elastic/x-pack-elasticsearch#1453

Original commit: elastic/x-pack-elasticsearch@493cf85e22
parent 392e67851e
commit 527dcfd98d
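The functional heart of the change is the new allowFailed handling in CloseJobAction.resolveAndValidateJobId, shown in the diff below: a normal close now refuses jobs in the failed state, while a force close treats them like open jobs so they still get shut down. The following is a minimal standalone sketch of that rule; the class name, method name, and the plain IllegalStateException are illustrative stand-ins for the real code, which uses ExceptionsHelper.conflictStatusException and Elasticsearch's cluster-state metadata.

import java.util.ArrayList;
import java.util.List;
import java.util.Map;

public class CloseValidationSketch {

    enum JobState { OPENED, CLOSING, FAILED }

    // Sort job ids into open/closing buckets. Failed jobs are collected
    // separately: a normal close rejects them, a force close folds them
    // into the open list so they are shut down like any other open job.
    static void resolve(Map<String, JobState> jobStates, boolean force,
                        List<String> openJobs, List<String> closingJobs) {
        List<String> failedJobs = new ArrayList<>();
        for (Map.Entry<String, JobState> entry : jobStates.entrySet()) {
            switch (entry.getValue()) {
                case OPENED:
                    openJobs.add(entry.getKey());
                    break;
                case CLOSING:
                    closingJobs.add(entry.getKey());
                    break;
                case FAILED:
                    failedJobs.add(entry.getKey());
                    break;
            }
        }
        if (force == false && failedJobs.isEmpty() == false) {
            throw new IllegalStateException(
                    "cannot close failed jobs " + failedJobs + ", use force close");
        }
        // force close: failed jobs are closed along with the open ones
        openJobs.addAll(failedJobs);
    }
}

In the same spirit, the force-close path in the diff iterates over the open and closing job ids combined (ArrayUtils.concat), so jobs that hang in the closing state can also be force-closed.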
@@ -29,6 +29,7 @@ import org.elasticsearch.common.io.stream.StreamOutput;
 import org.elasticsearch.common.io.stream.Writeable;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.common.unit.TimeValue;
+import org.elasticsearch.common.util.ArrayUtils;
 import org.elasticsearch.common.util.concurrent.AbstractRunnable;
 import org.elasticsearch.common.util.concurrent.AtomicArray;
 import org.elasticsearch.common.xcontent.ObjectParser;
@@ -57,7 +58,6 @@ import org.elasticsearch.xpack.security.InternalClient;

 import java.io.IOException;
 import java.util.ArrayList;
 import java.util.HashMap;
 import java.util.HashSet;
 import java.util.List;
 import java.util.Map;
@@ -344,7 +344,7 @@ public class CloseJobAction extends Action<CloseJobAction.Request, CloseJobActio

         List<String> openJobs = new ArrayList<>();
         List<String> closingJobs = new ArrayList<>();
-        resolveAndValidateJobId(request.getJobId(), state, openJobs, closingJobs);
+        resolveAndValidateJobId(request.getJobId(), state, openJobs, closingJobs, request.isForce());
         request.setOpenJobIds(openJobs.toArray(new String[0]));
         request.setClosingJobIds(closingJobs.toArray(new String[0]));
         if (request.openJobIds.length == 0 && request.closingJobIds.length == 0) {
@@ -432,10 +432,11 @@ public class CloseJobAction extends Action<CloseJobAction.Request, CloseJobActio
     private void forceCloseJob(ClusterState currentState, Request request, ActionListener<Response> listener) {
         PersistentTasksCustomMetaData tasks = currentState.getMetaData().custom(PersistentTasksCustomMetaData.TYPE);

-        final int numberOfJobs = request.openJobIds.length;
+        final int numberOfJobs = request.openJobIds.length + request.closingJobIds.length;
         final AtomicInteger counter = new AtomicInteger();
         final AtomicArray<Exception> failures = new AtomicArray<>(numberOfJobs);
+
-        for (String jobId : request.openJobIds) {
+        for (String jobId : ArrayUtils.concat(request.openJobIds, request.closingJobIds)) {
             PersistentTask<?> jobTask = MlMetadata.getJobTask(jobId, tasks);
             if (jobTask != null) {
                 auditor.info(jobId, Messages.JOB_AUDIT_FORCE_CLOSING);
@@ -560,14 +561,16 @@ public class CloseJobAction extends Action<CloseJobAction.Request, CloseJobActio
      * Expand the {@code jobId} parameter and add the job Id the the list arguments
      * depending on job state.
      *
-     * Opened or failed jobs are added to {@code openJobs} and closing jobs added to
-     * {@code closingJobs}
+     * Opened jobs are added to {@code openJobs} and closing jobs added to {@code closingJobs}. Failed jobs are added
+     * to {@code openJobs} if allowFailed is set otherwise an exception is thrown.
      * @param jobId The job Id. If jobId == {@link Job#ALL} then expand the job list.
      * @param state Cluster state
      * @param openJobs Opened or failed jobs are added to this list
      * @param closingJobs Closing jobs are added to this list
+     * @param allowFailed Whether failed jobs are allowed, if yes, they are added to {@code openJobs}
      */
-    static void resolveAndValidateJobId(String jobId, ClusterState state, List<String> openJobs, List<String> closingJobs) {
+    static void resolveAndValidateJobId(String jobId, ClusterState state, List<String> openJobs, List<String> closingJobs,
+                                        boolean allowFailed) {
         MlMetadata mlMetadata = state.metaData().custom(MlMetadata.TYPE);
         PersistentTasksCustomMetaData tasksMetaData = state.getMetaData().custom(PersistentTasksCustomMetaData.TYPE);

@@ -575,26 +578,40 @@ public class CloseJobAction extends Action<CloseJobAction.Request, CloseJobActio
             return;
         }

+        List<String> failedJobs = new ArrayList<>();
+
         Consumer<String> jobIdProcessor = id -> {
             validateJobAndTaskState(id, mlMetadata, tasksMetaData);
             Job job = mlMetadata.getJobs().get(id);
             if (job.isDeleted()) {
                 return;
             }
-            addJobAccordingToState(id, tasksMetaData, openJobs, closingJobs);
+            addJobAccordingToState(id, tasksMetaData, openJobs, closingJobs, failedJobs);
         };

         if (!Job.ALL.equals(jobId)) {
             jobIdProcessor.accept(jobId);
+
+            if (allowFailed == false && failedJobs.size() > 0) {
+                throw ExceptionsHelper.conflictStatusException("cannot close job [{}] because it failed, use force close", jobId);
+            }
+
         } else {
             for (Map.Entry<String, Job> jobEntry : mlMetadata.getJobs().entrySet()) {
                 jobIdProcessor.accept(jobEntry.getKey());
             }
+
+            if (allowFailed == false && failedJobs.size() > 0) {
+                throw ExceptionsHelper.conflictStatusException("one or more jobs have state failed, use force close");
+            }
         }
+
+        // allowFailed == true
+        openJobs.addAll(failedJobs);
     }

     private static void addJobAccordingToState(String jobId, PersistentTasksCustomMetaData tasksMetaData,
-                                               List<String> openJobs, List<String> closingJobs) {
+                                               List<String> openJobs, List<String> closingJobs, List<String> failedJobs) {

         JobState jobState = MlMetadata.getJobState(jobId, tasksMetaData);
         switch (jobState) {
@@ -602,6 +619,8 @@ public class CloseJobAction extends Action<CloseJobAction.Request, CloseJobActio
                 closingJobs.add(jobId);
                 break;
             case FAILED:
+                failedJobs.add(jobId);
+                break;
             case OPENED:
                 openJobs.add(jobId);
                 break;

@@ -358,6 +358,7 @@ public class AutodetectProcessManager extends AbstractComponent {
             communicator.close(restart, reason);
         } catch (Exception e) {
             logger.warn("Exception closing stopped process input stream", e);
+            setJobState(jobTask, JobState.FAILED);
             throw ExceptionsHelper.serverError("Exception closing stopped process input stream", e);
         }
     }
@@ -378,7 +379,7 @@ public class AutodetectProcessManager extends AbstractComponent {
         return Optional.of(Duration.between(communicator.getProcessStartTime(), ZonedDateTime.now()));
     }

-    private void setJobState(JobTask jobTask, JobState state) {
+    void setJobState(JobTask jobTask, JobState state) {
        JobTaskStatus taskStatus = new JobTaskStatus(state, jobTask.getAllocationId());
        jobTask.updatePersistentStatus(taskStatus, new ActionListener<PersistentTask<?>>() {
            @Override

@@ -167,7 +167,7 @@ public class PersistentTasksService extends AbstractComponent {
            public void onTimeout(TimeValue timeout) {
                listener.onFailure(new IllegalStateException("timed out after " + timeout));
            }
-        }, clusterState -> predicate.test(clusterState.metaData().custom(PersistentTasksCustomMetaData.TYPE)));
+        }, clusterState -> predicate.test(clusterState.metaData().custom(PersistentTasksCustomMetaData.TYPE)), timeout);
    }
 }

@@ -138,9 +138,12 @@ public class CloseJobActionRequestTests extends AbstractStreamableXContentTestCa
         List<String> openJobs = new ArrayList<>();
         List<String> closingJobs = new ArrayList<>();

-        CloseJobAction.resolveAndValidateJobId("_all", cs1, openJobs, closingJobs);
+        CloseJobAction.resolveAndValidateJobId("_all", cs1, openJobs, closingJobs, true);
         assertEquals(Arrays.asList("job_id_1", "job_id_2", "job_id_3"), openJobs);
         assertEquals(Arrays.asList("job_id_4"), closingJobs);
+
+        expectThrows(ElasticsearchStatusException.class,
+                () -> CloseJobAction.resolveAndValidateJobId("_all", cs1, openJobs, closingJobs, false));
     }

     public void testResolve_givenJobId() {
@@ -158,7 +161,7 @@ public class CloseJobActionRequestTests extends AbstractStreamableXContentTestCa
         List<String> openJobs = new ArrayList<>();
         List<String> closingJobs = new ArrayList<>();

-        CloseJobAction.resolveAndValidateJobId("job_id_1", cs1, openJobs, closingJobs);
+        CloseJobAction.resolveAndValidateJobId("job_id_1", cs1, openJobs, closingJobs, false);
         assertEquals(Arrays.asList("job_id_1"), openJobs);
         assertEquals(Collections.emptyList(), closingJobs);

@@ -169,11 +172,35 @@ public class CloseJobActionRequestTests extends AbstractStreamableXContentTestCa

         openJobs.clear();
         closingJobs.clear();
-        CloseJobAction.resolveAndValidateJobId("job_id_1", cs1, openJobs, closingJobs);
+        CloseJobAction.resolveAndValidateJobId("job_id_1", cs1, openJobs, closingJobs, false);
         assertEquals(Collections.emptyList(), openJobs);
         assertEquals(Collections.emptyList(), closingJobs);
     }

+    public void testResolve_givenJobIdFailed() {
+        MlMetadata.Builder mlBuilder = new MlMetadata.Builder();
+        mlBuilder.putJob(BaseMlIntegTestCase.createFareQuoteJob("job_id_failed").build(new Date()), false);
+
+        PersistentTasksCustomMetaData.Builder tasksBuilder = PersistentTasksCustomMetaData.builder();
+        addJobTask("job_id_failed", null, JobState.FAILED, tasksBuilder);
+
+        ClusterState cs1 = ClusterState.builder(new ClusterName("_name")).metaData(new MetaData.Builder()
+                .putCustom(MlMetadata.TYPE, mlBuilder.build()).putCustom(PersistentTasksCustomMetaData.TYPE, tasksBuilder.build())).build();
+
+        List<String> openJobs = new ArrayList<>();
+        List<String> closingJobs = new ArrayList<>();
+
+        CloseJobAction.resolveAndValidateJobId("job_id_failed", cs1, openJobs, closingJobs, true);
+        assertEquals(Arrays.asList("job_id_failed"), openJobs);
+        assertEquals(Collections.emptyList(), closingJobs);
+
+        openJobs.clear();
+        closingJobs.clear();
+
+        expectThrows(ElasticsearchStatusException.class,
+                () -> CloseJobAction.resolveAndValidateJobId("job_id_failed", cs1, openJobs, closingJobs, false));
+    }
+
     public void testResolve_withSpecificJobIds() {
         MlMetadata.Builder mlBuilder = new MlMetadata.Builder();
         mlBuilder.putJob(BaseMlIntegTestCase.createFareQuoteJob("job_id_closing").build(new Date()), false);
@@ -193,19 +220,19 @@ public class CloseJobActionRequestTests extends AbstractStreamableXContentTestCa
         List<String> openJobs = new ArrayList<>();
         List<String> closingJobs = new ArrayList<>();

-        CloseJobAction.resolveAndValidateJobId("_all", cs1, openJobs, closingJobs);
+        CloseJobAction.resolveAndValidateJobId("_all", cs1, openJobs, closingJobs, false);
         assertEquals(Arrays.asList("job_id_open"), openJobs);
         assertEquals(Arrays.asList("job_id_closing"), closingJobs);
         openJobs.clear();
         closingJobs.clear();

-        CloseJobAction.resolveAndValidateJobId("job_id_closing", cs1, openJobs, closingJobs);
+        CloseJobAction.resolveAndValidateJobId("job_id_closing", cs1, openJobs, closingJobs, false);
         assertEquals(Collections.emptyList(), openJobs);
         assertEquals(Arrays.asList("job_id_closing"), closingJobs);
         openJobs.clear();
         closingJobs.clear();

-        CloseJobAction.resolveAndValidateJobId("job_id_open", cs1, openJobs, closingJobs);
+        CloseJobAction.resolveAndValidateJobId("job_id_open", cs1, openJobs, closingJobs, false);
         assertEquals(Arrays.asList("job_id_open"), openJobs);
         assertEquals(Collections.emptyList(), closingJobs);
         openJobs.clear();

@@ -5,6 +5,7 @@
  */
 package org.elasticsearch.xpack.ml.job.process.autodetect;

+import org.elasticsearch.ElasticsearchException;
 import org.elasticsearch.client.Client;
 import org.elasticsearch.common.CheckedConsumer;
 import org.elasticsearch.common.settings.Settings;
@@ -47,7 +48,6 @@ import java.io.ByteArrayInputStream;
 import java.io.IOException;
 import java.io.InputStream;
 import java.nio.charset.StandardCharsets;
 import java.util.Arrays;
 import java.util.Collections;
 import java.util.Date;
 import java.util.HashSet;
@@ -322,6 +322,32 @@ public class AutodetectProcessManagerTests extends ESTestCase {
         assertEquals("[foo] exception while flushing job", holder[0].getMessage());
     }

+    public void testCloseThrows() throws IOException {
+        AutodetectCommunicator communicator = mock(AutodetectCommunicator.class);
+        AutodetectProcessManager manager = createManager(communicator);
+
+        // let the communicator throw, simulating a problem with the underlying
+        // autodetect, e.g. a crash
+        doThrow(Exception.class).when(communicator).close(anyBoolean(), anyString());
+
+        // create a jobtask
+        JobTask jobTask = mock(JobTask.class);
+        when(jobTask.getJobId()).thenReturn("foo");
+        manager.openJob(jobTask, false, e -> {
+        });
+        manager.processData(jobTask, createInputStream(""), randomFrom(XContentType.values()), mock(DataLoadParams.class),
+                (dataCounts1, e) -> {
+                });
+
+        // job is created
+        assertEquals(1, manager.numberOfOpenJobs());
+        expectThrows(ElasticsearchException.class, () -> manager.closeJob(jobTask, false, null));
+        assertEquals(0, manager.numberOfOpenJobs());
+
+        verify(manager).setJobState(any(), eq(JobState.OPENED));
+        verify(manager).setJobState(any(), eq(JobState.FAILED));
+    }
+
     public void testwriteUpdateProcessMessage() throws IOException {
         AutodetectCommunicator communicator = mock(AutodetectCommunicator.class);
         AutodetectProcessManager manager = createManagerAndCallProcessData(communicator, "foo");