Die with dignity

Today when a thread encounters a fatal unrecoverable error that
threatens the stability of the JVM, Elasticsearch marches on. This
includes out of memory errors, stack overflow errors and other errors
that leave the JVM in a questionable state. Instead, the Elasticsearch
JVM should die when these errors are encountered. This commit causes
this to be the case.

Relates #19272
This commit is contained in:
Jason Tedor 2016-07-07 14:44:03 -04:00 committed by GitHub
parent d3f8329a3d
commit e86aa29f67
12 changed files with 325 additions and 5 deletions

View File

@ -56,7 +56,7 @@ dependencies {
compile "org.apache.lucene:lucene-spatial3d:${versions.lucene}"
compile "org.apache.lucene:lucene-suggest:${versions.lucene}"
compile 'org.elasticsearch:securesm:1.0'
compile 'org.elasticsearch:securesm:1.1'
// utilities
compile 'net.sf.jopt-simple:jopt-simple:5.0.2'

View File

@ -246,6 +246,12 @@ final class Bootstrap {
// fail if somebody replaced the lucene jars
checkLucene();
// install the default uncaught exception handler; must be done before security is
// initialized as we do not want to grant the runtime permission
// setDefaultUncaughtExceptionHandler
Thread.setDefaultUncaughtExceptionHandler(
new ElasticsearchUncaughtExceptionHandler(() -> Node.NODE_NAME_SETTING.get(settings)));
INSTANCE.setup(true, settings, environment);
INSTANCE.start();

View File

@ -0,0 +1,94 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.bootstrap;
import org.apache.lucene.index.MergePolicy;
import org.elasticsearch.common.SuppressForbidden;
import org.elasticsearch.common.logging.ESLogger;
import org.elasticsearch.common.logging.Loggers;
import java.io.IOError;
import java.util.Objects;
import java.util.function.Supplier;
/**
 * Uncaught exception handler that halts the JVM when a thread dies from a fatal error and only
 * logs a warning for every other uncaught throwable.
 *
 * <p>A throwable is considered fatal when it is an {@link Error}, or when it is a
 * {@link MergePolicy.MergeException} whose direct cause is an {@link Error}.
 */
class ElasticsearchUncaughtExceptionHandler implements Thread.UncaughtExceptionHandler {

    /** Supplies the prefix handed to {@link Loggers#getLogger} each time a message is logged. */
    private final Supplier<String> loggingPrefixSupplier;

    /**
     * @param loggingPrefixSupplier supplier of the logging prefix; must not be {@code null}
     * @throws NullPointerException if {@code loggingPrefixSupplier} is {@code null}
     */
    ElasticsearchUncaughtExceptionHandler(final Supplier<String> loggingPrefixSupplier) {
        this.loggingPrefixSupplier = Objects.requireNonNull(loggingPrefixSupplier);
    }

    /**
     * Reports the throwable and, if it is fatal, halts the JVM with a status describing it.
     *
     * @param t the thread that terminated
     * @param e the throwable that terminated it
     */
    @Override
    public void uncaughtException(Thread t, Throwable e) {
        if (isFatalUncaught(e)) {
            try {
                onFatalUncaught(t.getName(), e);
            } finally {
                // we use specific error codes in case the above notification failed, at least we
                // will have some indication of the error bringing us down
                halt(exitStatus(e));
            }
        } else {
            onNonFatalUncaught(t.getName(), e);
        }
    }

    /**
     * Tells whether the given uncaught throwable must bring the JVM down.
     *
     * @param e the uncaught throwable
     * @return {@code true} if {@code e} is an {@link Error} or a merge exception caused by one
     */
    // visible for testing
    static boolean isFatalUncaught(Throwable e) {
        return isFatalCause(e) || (e instanceof MergePolicy.MergeException && isFatalCause(e.getCause()));
    }

    private static boolean isFatalCause(Throwable cause) {
        return cause instanceof Error;
    }

    /**
     * Returns the error that actually makes {@code e} fatal: the cause of a merge exception that
     * wraps an {@link Error}, otherwise {@code e} itself.
     */
    private static Throwable unwrapFatal(final Throwable e) {
        if (e instanceof MergePolicy.MergeException && isFatalCause(e.getCause())) {
            return e.getCause();
        }
        return e;
    }

    /**
     * Maps a fatal throwable to the status the JVM is halted with. The status is derived from
     * the underlying error, so an error wrapped in a merge exception yields the same status as
     * the bare error would.
     */
    private static int exitStatus(final Throwable e) {
        final Throwable fatal = unwrapFatal(e);
        if (fatal instanceof InternalError) {
            return 128;
        } else if (fatal instanceof OutOfMemoryError) {
            return 127;
        } else if (fatal instanceof StackOverflowError) {
            return 126;
        } else if (fatal instanceof UnknownError) {
            return 125;
        } else if (fatal instanceof IOError) {
            return 124;
        } else {
            return 1;
        }
    }

    /**
     * Logs a fatal uncaught throwable at error level.
     *
     * @param threadName name of the thread that terminated
     * @param t          the throwable as it was delivered to this handler
     */
    // visible for testing
    void onFatalUncaught(final String threadName, final Throwable t) {
        final ESLogger logger = Loggers.getLogger(ElasticsearchUncaughtExceptionHandler.class, loggingPrefixSupplier.get());
        logger.error("fatal error in thread [{}], exiting", t, threadName);
    }

    /**
     * Logs a non-fatal uncaught throwable at warn level.
     *
     * @param threadName name of the thread that terminated
     * @param t          the throwable as it was delivered to this handler
     */
    // visible for testing
    void onNonFatalUncaught(final String threadName, final Throwable t) {
        final ESLogger logger = Loggers.getLogger(ElasticsearchUncaughtExceptionHandler.class, loggingPrefixSupplier.get());
        logger.warn("uncaught exception in thread [{}]", t, threadName);
    }

    /**
     * Halts the JVM with the given status.
     *
     * @param status the exit status reported by the process
     */
    // visible for testing
    @SuppressForbidden(reason = "halt")
    void halt(int status) {
        // we halt to prevent shutdown hooks from running
        Runtime.getRuntime().halt(status);
    }

}

View File

@ -120,7 +120,7 @@ final class Security {
Policy.setPolicy(new ESPolicy(createPermissions(environment), getPluginPermissions(environment), filterBadDefaults));
// enable security manager
System.setSecurityManager(new SecureSM());
System.setSecurityManager(new SecureSM(new String[] { "org.elasticsearch.bootstrap." }));
// do some basic tests
selfTest();

View File

@ -24,7 +24,7 @@
//// SecurityManager impl:
//// Must have all permissions to properly perform access checks
grant codeBase "${codebase.securesm-1.0.jar}" {
grant codeBase "${codebase.securesm-1.1.jar}" {
permission java.security.AllPermission;
};

View File

@ -0,0 +1,152 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.bootstrap;
import org.apache.lucene.index.MergePolicy;
import org.elasticsearch.test.ESTestCase;
import org.junit.Before;
import java.io.IOError;
import java.io.IOException;
import java.io.UncheckedIOException;
import java.util.Collections;
import java.util.HashMap;
import java.util.Map;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicReference;
import static org.hamcrest.CoreMatchers.equalTo;
public class ElasticsearchUncaughtExceptionHandlerTests extends ESTestCase {
private Map<Class<? extends Error>, Integer> expectedStatus;
@Before
public void setUp() throws Exception {
super.setUp();
Map<Class<? extends Error>, Integer> expectedStatus = new HashMap<>();
expectedStatus.put(InternalError.class, 128);
expectedStatus.put(OutOfMemoryError.class, 127);
expectedStatus.put(StackOverflowError.class, 126);
expectedStatus.put(UnknownError.class, 125);
expectedStatus.put(IOError.class, 124);
this.expectedStatus = Collections.unmodifiableMap(expectedStatus);
}
public void testUncaughtError() throws InterruptedException {
final Error error = randomFrom(
new InternalError(),
new OutOfMemoryError(),
new StackOverflowError(),
new UnknownError(),
new IOError(new IOException("fatal")),
new Error() {});
final Thread thread = new Thread(() -> { throw error; });
final String name = randomAsciiOfLength(10);
thread.setName(name);
final AtomicBoolean halt = new AtomicBoolean();
final AtomicInteger observedStatus = new AtomicInteger();
final AtomicReference<String> threadNameReference = new AtomicReference<>();
final AtomicReference<Throwable> throwableReference = new AtomicReference<>();
thread.setUncaughtExceptionHandler(new ElasticsearchUncaughtExceptionHandler(() -> "testUncaughtError") {
@Override
void halt(int status) {
halt.set(true);
observedStatus.set(status);
}
@Override
void onFatalUncaught(String threadName, Throwable t) {
threadNameReference.set(threadName);
throwableReference.set(t);
}
@Override
void onNonFatalUncaught(String threadName, Throwable t) {
fail();
}
});
thread.start();
thread.join();
assertTrue(halt.get());
final int status;
if (expectedStatus.containsKey(error.getClass())) {
status = expectedStatus.get(error.getClass());
} else {
status = 1;
}
assertThat(observedStatus.get(), equalTo(status));
assertThat(threadNameReference.get(), equalTo(name));
assertThat(throwableReference.get(), equalTo(error));
}
public void testUncaughtException() throws InterruptedException {
final RuntimeException e = new RuntimeException("boom");
final Thread thread = new Thread(() -> { throw e; });
final String name = randomAsciiOfLength(10);
thread.setName(name);
final AtomicReference<String> threadNameReference = new AtomicReference<>();
final AtomicReference<Throwable> throwableReference = new AtomicReference<>();
thread.setUncaughtExceptionHandler(new ElasticsearchUncaughtExceptionHandler(() -> "testUncaughtException") {
@Override
void halt(int status) {
fail();
}
@Override
void onFatalUncaught(String threadName, Throwable t) {
fail();
}
@Override
void onNonFatalUncaught(String threadName, Throwable t) {
threadNameReference.set(threadName);
throwableReference.set(t);
}
});
thread.start();
thread.join();
assertThat(threadNameReference.get(), equalTo(name));
assertThat(throwableReference.get(), equalTo(e));
}
public void testIsFatalCause() {
assertFatal(new MergePolicy.MergeException(new OutOfMemoryError(), null));
assertFatal(new OutOfMemoryError());
assertFatal(new StackOverflowError());
assertFatal(new InternalError());
assertFatal(new UnknownError());
assertFatal(new IOError(new IOException()));
assertNonFatal(new RuntimeException());
assertNonFatal(new UncheckedIOException(new IOException()));
}
private void assertFatal(Throwable cause) {
assertTrue(ElasticsearchUncaughtExceptionHandler.isFatalUncaught(cause));
}
private void assertNonFatal(Throwable cause) {
assertFalse(ElasticsearchUncaughtExceptionHandler.isFatalUncaught(cause));
}
}

View File

@ -1 +0,0 @@
c0c6cf986ba0057390bfcc80c366a0e3157f944b

View File

@ -0,0 +1 @@
1e423447d020041534be94c0f31a49fbdc1f2950

View File

@ -55,3 +55,11 @@ from Elasticsearch.
Additionally, it was previously possible to set any setting in
Elasticsearch via JVM system properties. This has been removed from
Elasticsearch.
==== Dying on fatal errors
Previous versions of Elasticsearch would not halt the JVM if out of memory errors or other fatal
errors were encountered during the life of the Elasticsearch instance. Because such errors leave
the JVM in a questionable state, the best course of action is to halt the JVM when this occurs.
Starting in Elasticsearch 5.x, this is now the case. Operators should consider configuring their
Elasticsearch services so that they respawn automatically in the case of such a fatal crash.

View File

@ -47,3 +47,5 @@ include::setup/bootstrap-checks.asciidoc[]
include::setup/sysconfig.asciidoc[]
include::setup/upgrade.asciidoc[]
include::setup/stopping.asciidoc[]

View File

@ -0,0 +1,58 @@
[[stopping-elasticsearch]]
=== Stopping Elasticsearch
An orderly shutdown of Elasticsearch ensures that Elasticsearch has a chance to clean up and close
outstanding resources. For example, a node that is shut down in an orderly fashion will remove itself
from the cluster, sync translogs to disk, and perform other related cleanup activities. You can help
ensure an orderly shutdown by properly stopping Elasticsearch.
If you're running Elasticsearch as a service, you can stop Elasticsearch via the service management
functionality provided by your installation.
If you're running Elasticsearch directly, you can stop Elasticsearch by sending control-C if you're
running Elasticsearch in the console, or by sending `SIGTERM` to the Elasticsearch process on a
POSIX system. You can obtain the PID to send the signal to via various tools (e.g., `ps` or `jps`):
[source,sh]
--------------------------------------------------
$ jps | grep Elasticsearch
14542 Elasticsearch
--------------------------------------------------
From the Elasticsearch startup logs:
[source,sh]
--------------------------------------------------
[2016-07-07 12:26:18,908][INFO ][node ] [Reaper] version[5.0.0-alpha4], pid[15399], build[3f5b994/2016-06-27T16:23:46.861Z], OS[Mac OS X/10.11.5/x86_64], JVM[Oracle Corporation/Java HotSpot(TM) 64-Bit Server VM/1.8.0_92/25.92-b14]
--------------------------------------------------
Or by specifying a location to write a PID file to on startup (`-p <path>`):
[source,sh]
--------------------------------------------------
$ ./bin/elasticsearch -p /tmp/elasticsearch-pid -d
$ cat /tmp/elasticsearch-pid && echo
15516
$ kill -SIGTERM 15516
--------------------------------------------------
[[fatal-errors]]
[float]
=== Stopping on Fatal Errors
During the life of the Elasticsearch virtual machine, certain fatal errors could arise that put the
virtual machine in a questionable state. Such fatal errors include out of memory errors, internal
errors in the virtual machine, and serious I/O errors.
When Elasticsearch detects that the virtual machine has encountered such a fatal error, Elasticsearch
will attempt to log the error and then will halt the virtual machine. When Elasticsearch initiates
such a shutdown, it does not go through an orderly shutdown as described above. The Elasticsearch
process will also return with a special status code indicating the nature of the error.
[horizontal]
JVM internal error:: 128
Out of memory error:: 127
Stack overflow error:: 126
Unknown virtual machine error:: 125
Serious I/O error:: 124
Unknown fatal error:: 1

View File

@ -150,7 +150,7 @@ public class BootstrapForTesting {
return esPolicy.implies(domain, permission) || testFramework.implies(domain, permission);
}
});
System.setSecurityManager(new SecureSM(true));
System.setSecurityManager(SecureSM.createTestSecureSM());
Security.selfTest();
// guarantee plugin classes are initialized first, in case they have one-time hacks.