[ML] Make job closing robust against crashes in autodetect and other misbehavior (elastic/x-pack-elasticsearch#1480)

Set job to failed if autodetect manager fails closing, fix force closing of jobs that hang in closing 
state, set timeout when waiting for clusterstate update, disallow closing of failed jobs with normal 
close

relates elastic/x-pack-elasticsearch#1453 

Original commit: elastic/x-pack-elasticsearch@493cf85e22
This commit is contained in:
Hendrik Muhs 2017-05-22 08:48:33 +02:00 committed by GitHub
parent 392e67851e
commit 527dcfd98d
5 changed files with 91 additions and 18 deletions

View File

@ -29,6 +29,7 @@ import org.elasticsearch.common.io.stream.StreamOutput;
import org.elasticsearch.common.io.stream.Writeable;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.unit.TimeValue;
import org.elasticsearch.common.util.ArrayUtils;
import org.elasticsearch.common.util.concurrent.AbstractRunnable;
import org.elasticsearch.common.util.concurrent.AtomicArray;
import org.elasticsearch.common.xcontent.ObjectParser;
@ -57,7 +58,6 @@ import org.elasticsearch.xpack.security.InternalClient;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
@ -344,7 +344,7 @@ public class CloseJobAction extends Action<CloseJobAction.Request, CloseJobActio
List<String> openJobs = new ArrayList<>();
List<String> closingJobs = new ArrayList<>();
resolveAndValidateJobId(request.getJobId(), state, openJobs, closingJobs);
resolveAndValidateJobId(request.getJobId(), state, openJobs, closingJobs, request.isForce());
request.setOpenJobIds(openJobs.toArray(new String[0]));
request.setClosingJobIds(closingJobs.toArray(new String[0]));
if (request.openJobIds.length == 0 && request.closingJobIds.length == 0) {
@ -432,10 +432,11 @@ public class CloseJobAction extends Action<CloseJobAction.Request, CloseJobActio
private void forceCloseJob(ClusterState currentState, Request request, ActionListener<Response> listener) {
PersistentTasksCustomMetaData tasks = currentState.getMetaData().custom(PersistentTasksCustomMetaData.TYPE);
final int numberOfJobs = request.openJobIds.length;
final int numberOfJobs = request.openJobIds.length + request.closingJobIds.length;
final AtomicInteger counter = new AtomicInteger();
final AtomicArray<Exception> failures = new AtomicArray<>(numberOfJobs);
for (String jobId : request.openJobIds) {
for (String jobId : ArrayUtils.concat(request.openJobIds, request.closingJobIds)) {
PersistentTask<?> jobTask = MlMetadata.getJobTask(jobId, tasks);
if (jobTask != null) {
auditor.info(jobId, Messages.JOB_AUDIT_FORCE_CLOSING);
@ -560,14 +561,16 @@ public class CloseJobAction extends Action<CloseJobAction.Request, CloseJobActio
* Expand the {@code jobId} parameter and add the job Id the the list arguments
* depending on job state.
*
* Opened or failed jobs are added to {@code openJobs} and closing jobs added to
* {@code closingJobs}
* Opened jobs are added to {@code openJobs} and closing jobs added to {@code closingJobs}. Failed jobs are added
* to {@code openJobs} if allowFailed is set otherwise an exception is thrown.
* @param jobId The job Id. If jobId == {@link Job#ALL} then expand the job list.
* @param state Cluster state
* @param openJobs Opened or failed jobs are added to this list
* @param closingJobs Closing jobs are added to this list
* @param allowFailed Whether failed jobs are allowed, if yes, they are added to {@code openJobs}
*/
static void resolveAndValidateJobId(String jobId, ClusterState state, List<String> openJobs, List<String> closingJobs) {
static void resolveAndValidateJobId(String jobId, ClusterState state, List<String> openJobs, List<String> closingJobs,
boolean allowFailed) {
MlMetadata mlMetadata = state.metaData().custom(MlMetadata.TYPE);
PersistentTasksCustomMetaData tasksMetaData = state.getMetaData().custom(PersistentTasksCustomMetaData.TYPE);
@ -575,26 +578,40 @@ public class CloseJobAction extends Action<CloseJobAction.Request, CloseJobActio
return;
}
List<String> failedJobs = new ArrayList<>();
Consumer<String> jobIdProcessor = id -> {
validateJobAndTaskState(id, mlMetadata, tasksMetaData);
Job job = mlMetadata.getJobs().get(id);
if (job.isDeleted()) {
return;
}
addJobAccordingToState(id, tasksMetaData, openJobs, closingJobs);
addJobAccordingToState(id, tasksMetaData, openJobs, closingJobs, failedJobs);
};
if (!Job.ALL.equals(jobId)) {
jobIdProcessor.accept(jobId);
if (allowFailed == false && failedJobs.size() > 0) {
throw ExceptionsHelper.conflictStatusException("cannot close job [{}] because it failed, use force close", jobId);
}
} else {
for (Map.Entry<String, Job> jobEntry : mlMetadata.getJobs().entrySet()) {
jobIdProcessor.accept(jobEntry.getKey());
}
if (allowFailed == false && failedJobs.size() > 0) {
throw ExceptionsHelper.conflictStatusException("one or more jobs have state failed, use force close");
}
}
// allowFailed == true
openJobs.addAll(failedJobs);
}
private static void addJobAccordingToState(String jobId, PersistentTasksCustomMetaData tasksMetaData,
List<String> openJobs, List<String> closingJobs) {
List<String> openJobs, List<String> closingJobs, List<String> failedJobs) {
JobState jobState = MlMetadata.getJobState(jobId, tasksMetaData);
switch (jobState) {
@ -602,6 +619,8 @@ public class CloseJobAction extends Action<CloseJobAction.Request, CloseJobActio
closingJobs.add(jobId);
break;
case FAILED:
failedJobs.add(jobId);
break;
case OPENED:
openJobs.add(jobId);
break;

View File

@ -358,6 +358,7 @@ public class AutodetectProcessManager extends AbstractComponent {
communicator.close(restart, reason);
} catch (Exception e) {
logger.warn("Exception closing stopped process input stream", e);
setJobState(jobTask, JobState.FAILED);
throw ExceptionsHelper.serverError("Exception closing stopped process input stream", e);
}
}
@ -378,7 +379,7 @@ public class AutodetectProcessManager extends AbstractComponent {
return Optional.of(Duration.between(communicator.getProcessStartTime(), ZonedDateTime.now()));
}
private void setJobState(JobTask jobTask, JobState state) {
void setJobState(JobTask jobTask, JobState state) {
JobTaskStatus taskStatus = new JobTaskStatus(state, jobTask.getAllocationId());
jobTask.updatePersistentStatus(taskStatus, new ActionListener<PersistentTask<?>>() {
@Override

View File

@ -167,7 +167,7 @@ public class PersistentTasksService extends AbstractComponent {
public void onTimeout(TimeValue timeout) {
listener.onFailure(new IllegalStateException("timed out after " + timeout));
}
}, clusterState -> predicate.test(clusterState.metaData().custom(PersistentTasksCustomMetaData.TYPE)));
}, clusterState -> predicate.test(clusterState.metaData().custom(PersistentTasksCustomMetaData.TYPE)), timeout);
}
}

View File

@ -138,9 +138,12 @@ public class CloseJobActionRequestTests extends AbstractStreamableXContentTestCa
List<String> openJobs = new ArrayList<>();
List<String> closingJobs = new ArrayList<>();
CloseJobAction.resolveAndValidateJobId("_all", cs1, openJobs, closingJobs);
CloseJobAction.resolveAndValidateJobId("_all", cs1, openJobs, closingJobs, true);
assertEquals(Arrays.asList("job_id_1", "job_id_2", "job_id_3"), openJobs);
assertEquals(Arrays.asList("job_id_4"), closingJobs);
expectThrows(ElasticsearchStatusException.class,
() -> CloseJobAction.resolveAndValidateJobId("_all", cs1, openJobs, closingJobs, false));
}
public void testResolve_givenJobId() {
@ -158,7 +161,7 @@ public class CloseJobActionRequestTests extends AbstractStreamableXContentTestCa
List<String> openJobs = new ArrayList<>();
List<String> closingJobs = new ArrayList<>();
CloseJobAction.resolveAndValidateJobId("job_id_1", cs1, openJobs, closingJobs);
CloseJobAction.resolveAndValidateJobId("job_id_1", cs1, openJobs, closingJobs, false);
assertEquals(Arrays.asList("job_id_1"), openJobs);
assertEquals(Collections.emptyList(), closingJobs);
@ -169,11 +172,35 @@ public class CloseJobActionRequestTests extends AbstractStreamableXContentTestCa
openJobs.clear();
closingJobs.clear();
CloseJobAction.resolveAndValidateJobId("job_id_1", cs1, openJobs, closingJobs);
CloseJobAction.resolveAndValidateJobId("job_id_1", cs1, openJobs, closingJobs, false);
assertEquals(Collections.emptyList(), openJobs);
assertEquals(Collections.emptyList(), closingJobs);
}
public void testResolve_givenJobIdFailed() {
MlMetadata.Builder mlBuilder = new MlMetadata.Builder();
mlBuilder.putJob(BaseMlIntegTestCase.createFareQuoteJob("job_id_failed").build(new Date()), false);
PersistentTasksCustomMetaData.Builder tasksBuilder = PersistentTasksCustomMetaData.builder();
addJobTask("job_id_failed", null, JobState.FAILED, tasksBuilder);
ClusterState cs1 = ClusterState.builder(new ClusterName("_name")).metaData(new MetaData.Builder()
.putCustom(MlMetadata.TYPE, mlBuilder.build()).putCustom(PersistentTasksCustomMetaData.TYPE, tasksBuilder.build())).build();
List<String> openJobs = new ArrayList<>();
List<String> closingJobs = new ArrayList<>();
CloseJobAction.resolveAndValidateJobId("job_id_failed", cs1, openJobs, closingJobs, true);
assertEquals(Arrays.asList("job_id_failed"), openJobs);
assertEquals(Collections.emptyList(), closingJobs);
openJobs.clear();
closingJobs.clear();
expectThrows(ElasticsearchStatusException.class,
() -> CloseJobAction.resolveAndValidateJobId("job_id_failed", cs1, openJobs, closingJobs, false));
}
public void testResolve_withSpecificJobIds() {
MlMetadata.Builder mlBuilder = new MlMetadata.Builder();
mlBuilder.putJob(BaseMlIntegTestCase.createFareQuoteJob("job_id_closing").build(new Date()), false);
@ -193,19 +220,19 @@ public class CloseJobActionRequestTests extends AbstractStreamableXContentTestCa
List<String> openJobs = new ArrayList<>();
List<String> closingJobs = new ArrayList<>();
CloseJobAction.resolveAndValidateJobId("_all", cs1, openJobs, closingJobs);
CloseJobAction.resolveAndValidateJobId("_all", cs1, openJobs, closingJobs, false);
assertEquals(Arrays.asList("job_id_open"), openJobs);
assertEquals(Arrays.asList("job_id_closing"), closingJobs);
openJobs.clear();
closingJobs.clear();
CloseJobAction.resolveAndValidateJobId("job_id_closing", cs1, openJobs, closingJobs);
CloseJobAction.resolveAndValidateJobId("job_id_closing", cs1, openJobs, closingJobs, false);
assertEquals(Collections.emptyList(), openJobs);
assertEquals(Arrays.asList("job_id_closing"), closingJobs);
openJobs.clear();
closingJobs.clear();
CloseJobAction.resolveAndValidateJobId("job_id_open", cs1, openJobs, closingJobs);
CloseJobAction.resolveAndValidateJobId("job_id_open", cs1, openJobs, closingJobs, false);
assertEquals(Arrays.asList("job_id_open"), openJobs);
assertEquals(Collections.emptyList(), closingJobs);
openJobs.clear();

View File

@ -5,6 +5,7 @@
*/
package org.elasticsearch.xpack.ml.job.process.autodetect;
import org.elasticsearch.ElasticsearchException;
import org.elasticsearch.client.Client;
import org.elasticsearch.common.CheckedConsumer;
import org.elasticsearch.common.settings.Settings;
@ -47,7 +48,6 @@ import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import java.util.Arrays;
import java.util.Collections;
import java.util.Date;
import java.util.HashSet;
@ -322,6 +322,32 @@ public class AutodetectProcessManagerTests extends ESTestCase {
assertEquals("[foo] exception while flushing job", holder[0].getMessage());
}
public void testCloseThrows() throws IOException {
AutodetectCommunicator communicator = mock(AutodetectCommunicator.class);
AutodetectProcessManager manager = createManager(communicator);
// let the communicator throw, simulating a problem with the underlying
// autodetect, e.g. a crash
doThrow(Exception.class).when(communicator).close(anyBoolean(), anyString());
// create a jobtask
JobTask jobTask = mock(JobTask.class);
when(jobTask.getJobId()).thenReturn("foo");
manager.openJob(jobTask, false, e -> {
});
manager.processData(jobTask, createInputStream(""), randomFrom(XContentType.values()), mock(DataLoadParams.class),
(dataCounts1, e) -> {
});
// job is created
assertEquals(1, manager.numberOfOpenJobs());
expectThrows(ElasticsearchException.class, () -> manager.closeJob(jobTask, false, null));
assertEquals(0, manager.numberOfOpenJobs());
verify(manager).setJobState(any(), eq(JobState.OPENED));
verify(manager).setJobState(any(), eq(JobState.FAILED));
}
public void testwriteUpdateProcessMessage() throws IOException {
AutodetectCommunicator communicator = mock(AutodetectCommunicator.class);
AutodetectProcessManager manager = createManagerAndCallProcessData(communicator, "foo");