mirror of
https://github.com/honeymoose/OpenSearch.git
synced 2025-03-27 10:28:28 +00:00
Die with dignity
Today when a thread encounters a fatal unrecoverable error that threatens the stability of the JVM, Elasticsearch marches on. This includes out of memory errors, stack overflow errors and other errors that leave the JVM in a questionable state. Instead, the Elasticsearch JVM should die when these errors are encountered. This commit causes this to be the case. Relates #19272
This commit is contained in:
parent
d3f8329a3d
commit
e86aa29f67
@ -56,7 +56,7 @@ dependencies {
|
||||
compile "org.apache.lucene:lucene-spatial3d:${versions.lucene}"
|
||||
compile "org.apache.lucene:lucene-suggest:${versions.lucene}"
|
||||
|
||||
compile 'org.elasticsearch:securesm:1.0'
|
||||
compile 'org.elasticsearch:securesm:1.1'
|
||||
|
||||
// utilities
|
||||
compile 'net.sf.jopt-simple:jopt-simple:5.0.2'
|
||||
|
@ -246,6 +246,12 @@ final class Bootstrap {
|
||||
// fail if somebody replaced the lucene jars
|
||||
checkLucene();
|
||||
|
||||
// install the default uncaught exception handler; must be done before security is
|
||||
// initialized as we do not want to grant the runtime permission
|
||||
// setDefaultUncaughtExceptionHandler
|
||||
Thread.setDefaultUncaughtExceptionHandler(
|
||||
new ElasticsearchUncaughtExceptionHandler(() -> Node.NODE_NAME_SETTING.get(settings)));
|
||||
|
||||
INSTANCE.setup(true, settings, environment);
|
||||
|
||||
INSTANCE.start();
|
||||
|
@ -0,0 +1,94 @@
|
||||
/*
|
||||
* Licensed to Elasticsearch under one or more contributor
|
||||
* license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright
|
||||
* ownership. Elasticsearch licenses this file to you under
|
||||
* the Apache License, Version 2.0 (the "License"); you may
|
||||
* not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing,
|
||||
* software distributed under the License is distributed on an
|
||||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
* KIND, either express or implied. See the License for the
|
||||
* specific language governing permissions and limitations
|
||||
* under the License.
|
||||
*/
|
||||
|
||||
package org.elasticsearch.bootstrap;
|
||||
|
||||
import org.apache.lucene.index.MergePolicy;
|
||||
import org.elasticsearch.common.SuppressForbidden;
|
||||
import org.elasticsearch.common.logging.ESLogger;
|
||||
import org.elasticsearch.common.logging.Loggers;
|
||||
|
||||
import java.io.IOError;
|
||||
import java.util.Objects;
|
||||
import java.util.function.Supplier;
|
||||
|
||||
class ElasticsearchUncaughtExceptionHandler implements Thread.UncaughtExceptionHandler {
|
||||
|
||||
private final Supplier<String> loggingPrefixSupplier;
|
||||
|
||||
ElasticsearchUncaughtExceptionHandler(final Supplier<String> loggingPrefixSupplier) {
|
||||
this.loggingPrefixSupplier = Objects.requireNonNull(loggingPrefixSupplier);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void uncaughtException(Thread t, Throwable e) {
|
||||
if (isFatalUncaught(e)) {
|
||||
try {
|
||||
onFatalUncaught(t.getName(), e);
|
||||
} finally {
|
||||
// we use specific error codes in case the above notification failed, at least we
|
||||
// will have some indication of the error bringing us down
|
||||
if (e instanceof InternalError) {
|
||||
halt(128);
|
||||
} else if (e instanceof OutOfMemoryError) {
|
||||
halt(127);
|
||||
} else if (e instanceof StackOverflowError) {
|
||||
halt(126);
|
||||
} else if (e instanceof UnknownError) {
|
||||
halt(125);
|
||||
} else if (e instanceof IOError) {
|
||||
halt(124);
|
||||
} else {
|
||||
halt(1);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
onNonFatalUncaught(t.getName(), e);
|
||||
}
|
||||
}
|
||||
|
||||
// visible for testing
|
||||
static boolean isFatalUncaught(Throwable e) {
|
||||
return isFatalCause(e) || (e instanceof MergePolicy.MergeException && isFatalCause(e.getCause()));
|
||||
}
|
||||
|
||||
private static boolean isFatalCause(Throwable cause) {
|
||||
return cause instanceof Error;
|
||||
}
|
||||
|
||||
// visible for testing
|
||||
void onFatalUncaught(final String threadName, final Throwable t) {
|
||||
final ESLogger logger = Loggers.getLogger(ElasticsearchUncaughtExceptionHandler.class, loggingPrefixSupplier.get());
|
||||
logger.error("fatal error in thread [{}], exiting", t, threadName);
|
||||
}
|
||||
|
||||
// visible for testing
|
||||
void onNonFatalUncaught(final String threadName, final Throwable t) {
|
||||
final ESLogger logger = Loggers.getLogger(ElasticsearchUncaughtExceptionHandler.class, loggingPrefixSupplier.get());
|
||||
logger.warn("uncaught exception in thread [{}]", t, threadName);
|
||||
}
|
||||
|
||||
// visible for testing
|
||||
@SuppressForbidden(reason = "halt")
|
||||
void halt(int status) {
|
||||
// we halt to prevent shutdown hooks from running
|
||||
Runtime.getRuntime().halt(status);
|
||||
}
|
||||
|
||||
}
|
@ -120,7 +120,7 @@ final class Security {
|
||||
Policy.setPolicy(new ESPolicy(createPermissions(environment), getPluginPermissions(environment), filterBadDefaults));
|
||||
|
||||
// enable security manager
|
||||
System.setSecurityManager(new SecureSM());
|
||||
System.setSecurityManager(new SecureSM(new String[] { "org.elasticsearch.bootstrap." }));
|
||||
|
||||
// do some basic tests
|
||||
selfTest();
|
||||
|
@ -24,7 +24,7 @@
|
||||
//// SecurityManager impl:
|
||||
//// Must have all permissions to properly perform access checks
|
||||
|
||||
grant codeBase "${codebase.securesm-1.0.jar}" {
|
||||
grant codeBase "${codebase.securesm-1.1.jar}" {
|
||||
permission java.security.AllPermission;
|
||||
};
|
||||
|
||||
|
@ -0,0 +1,152 @@
|
||||
/*
|
||||
* Licensed to Elasticsearch under one or more contributor
|
||||
* license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright
|
||||
* ownership. Elasticsearch licenses this file to you under
|
||||
* the Apache License, Version 2.0 (the "License"); you may
|
||||
* not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing,
|
||||
* software distributed under the License is distributed on an
|
||||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
* KIND, either express or implied. See the License for the
|
||||
* specific language governing permissions and limitations
|
||||
* under the License.
|
||||
*/
|
||||
|
||||
package org.elasticsearch.bootstrap;
|
||||
|
||||
import org.apache.lucene.index.MergePolicy;
|
||||
import org.elasticsearch.test.ESTestCase;
|
||||
import org.junit.Before;
|
||||
|
||||
import java.io.IOError;
|
||||
import java.io.IOException;
|
||||
import java.io.UncheckedIOException;
|
||||
import java.util.Collections;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
import java.util.concurrent.atomic.AtomicBoolean;
|
||||
import java.util.concurrent.atomic.AtomicInteger;
|
||||
import java.util.concurrent.atomic.AtomicReference;
|
||||
|
||||
import static org.hamcrest.CoreMatchers.equalTo;
|
||||
|
||||
public class ElasticsearchUncaughtExceptionHandlerTests extends ESTestCase {
|
||||
|
||||
private Map<Class<? extends Error>, Integer> expectedStatus;
|
||||
|
||||
@Before
|
||||
public void setUp() throws Exception {
|
||||
super.setUp();
|
||||
Map<Class<? extends Error>, Integer> expectedStatus = new HashMap<>();
|
||||
expectedStatus.put(InternalError.class, 128);
|
||||
expectedStatus.put(OutOfMemoryError.class, 127);
|
||||
expectedStatus.put(StackOverflowError.class, 126);
|
||||
expectedStatus.put(UnknownError.class, 125);
|
||||
expectedStatus.put(IOError.class, 124);
|
||||
this.expectedStatus = Collections.unmodifiableMap(expectedStatus);
|
||||
}
|
||||
|
||||
public void testUncaughtError() throws InterruptedException {
|
||||
final Error error = randomFrom(
|
||||
new InternalError(),
|
||||
new OutOfMemoryError(),
|
||||
new StackOverflowError(),
|
||||
new UnknownError(),
|
||||
new IOError(new IOException("fatal")),
|
||||
new Error() {});
|
||||
final Thread thread = new Thread(() -> { throw error; });
|
||||
final String name = randomAsciiOfLength(10);
|
||||
thread.setName(name);
|
||||
final AtomicBoolean halt = new AtomicBoolean();
|
||||
final AtomicInteger observedStatus = new AtomicInteger();
|
||||
final AtomicReference<String> threadNameReference = new AtomicReference<>();
|
||||
final AtomicReference<Throwable> throwableReference = new AtomicReference<>();
|
||||
thread.setUncaughtExceptionHandler(new ElasticsearchUncaughtExceptionHandler(() -> "testUncaughtError") {
|
||||
|
||||
@Override
|
||||
void halt(int status) {
|
||||
halt.set(true);
|
||||
observedStatus.set(status);
|
||||
}
|
||||
|
||||
@Override
|
||||
void onFatalUncaught(String threadName, Throwable t) {
|
||||
threadNameReference.set(threadName);
|
||||
throwableReference.set(t);
|
||||
}
|
||||
|
||||
@Override
|
||||
void onNonFatalUncaught(String threadName, Throwable t) {
|
||||
fail();
|
||||
}
|
||||
|
||||
});
|
||||
thread.start();
|
||||
thread.join();
|
||||
assertTrue(halt.get());
|
||||
final int status;
|
||||
if (expectedStatus.containsKey(error.getClass())) {
|
||||
status = expectedStatus.get(error.getClass());
|
||||
} else {
|
||||
status = 1;
|
||||
}
|
||||
assertThat(observedStatus.get(), equalTo(status));
|
||||
assertThat(threadNameReference.get(), equalTo(name));
|
||||
assertThat(throwableReference.get(), equalTo(error));
|
||||
}
|
||||
|
||||
public void testUncaughtException() throws InterruptedException {
|
||||
final RuntimeException e = new RuntimeException("boom");
|
||||
final Thread thread = new Thread(() -> { throw e; });
|
||||
final String name = randomAsciiOfLength(10);
|
||||
thread.setName(name);
|
||||
final AtomicReference<String> threadNameReference = new AtomicReference<>();
|
||||
final AtomicReference<Throwable> throwableReference = new AtomicReference<>();
|
||||
thread.setUncaughtExceptionHandler(new ElasticsearchUncaughtExceptionHandler(() -> "testUncaughtException") {
|
||||
@Override
|
||||
void halt(int status) {
|
||||
fail();
|
||||
}
|
||||
|
||||
@Override
|
||||
void onFatalUncaught(String threadName, Throwable t) {
|
||||
fail();
|
||||
}
|
||||
|
||||
@Override
|
||||
void onNonFatalUncaught(String threadName, Throwable t) {
|
||||
threadNameReference.set(threadName);
|
||||
throwableReference.set(t);
|
||||
}
|
||||
});
|
||||
thread.start();
|
||||
thread.join();
|
||||
assertThat(threadNameReference.get(), equalTo(name));
|
||||
assertThat(throwableReference.get(), equalTo(e));
|
||||
}
|
||||
|
||||
public void testIsFatalCause() {
|
||||
assertFatal(new MergePolicy.MergeException(new OutOfMemoryError(), null));
|
||||
assertFatal(new OutOfMemoryError());
|
||||
assertFatal(new StackOverflowError());
|
||||
assertFatal(new InternalError());
|
||||
assertFatal(new UnknownError());
|
||||
assertFatal(new IOError(new IOException()));
|
||||
assertNonFatal(new RuntimeException());
|
||||
assertNonFatal(new UncheckedIOException(new IOException()));
|
||||
}
|
||||
|
||||
private void assertFatal(Throwable cause) {
|
||||
assertTrue(ElasticsearchUncaughtExceptionHandler.isFatalUncaught(cause));
|
||||
}
|
||||
|
||||
private void assertNonFatal(Throwable cause) {
|
||||
assertFalse(ElasticsearchUncaughtExceptionHandler.isFatalUncaught(cause));
|
||||
}
|
||||
|
||||
}
|
@ -1 +0,0 @@
|
||||
c0c6cf986ba0057390bfcc80c366a0e3157f944b
|
1
distribution/licenses/securesm-1.1.jar.sha1
Normal file
1
distribution/licenses/securesm-1.1.jar.sha1
Normal file
@ -0,0 +1 @@
|
||||
1e423447d020041534be94c0f31a49fbdc1f2950
|
@ -55,3 +55,11 @@ from Elasticsearch.
|
||||
Additionally, it was previously possible to set any setting in
|
||||
Elasticsearch via JVM system properties. This has been removed from
|
||||
Elasticsearch.
|
||||
|
||||
==== Dying on fatal errors
|
||||
|
||||
Previous versions of Elasticsearch would not halt the JVM if out of memory errors or other fatal
|
||||
errors were encountered during the life of the Elasticsearch instance. Because such errors leave
|
||||
the JVM in a questionable state, the best course of action is to halt the JVM when this occurs.
|
||||
Starting in Elasticsearch 5.x, this is now the case. Operators should consider configuring their
|
||||
Elasticsearch services so that they respawn automatically in the case of such a fatal crash.
|
||||
|
@ -47,3 +47,5 @@ include::setup/bootstrap-checks.asciidoc[]
|
||||
include::setup/sysconfig.asciidoc[]
|
||||
|
||||
include::setup/upgrade.asciidoc[]
|
||||
|
||||
include::setup/stopping.asciidoc[]
|
||||
|
58
docs/reference/setup/stopping.asciidoc
Normal file
58
docs/reference/setup/stopping.asciidoc
Normal file
@ -0,0 +1,58 @@
|
||||
[[stopping-elasticsearch]]
|
||||
=== Stopping Elasticsearch
|
||||
|
||||
An orderly shutdown of Elasticsearch ensures that Elasticsearch has a chance to clean up and close
|
||||
outstanding resources. For example, a node that is shut down in an orderly fashion will remove itself
|
||||
from the cluster, sync translogs to disk, and perform other related cleanup activities. You can help
|
||||
ensure an orderly shutdown by properly stopping Elasticsearch.
|
||||
|
||||
If you're running Elasticsearch as a service, you can stop Elasticsearch via the service management
|
||||
functionality provided by your installation.
|
||||
|
||||
If you're running Elasticsearch directly, you can stop Elasticsearch by sending control-C if you're
|
||||
running Elasticsearch in the console, or by sending `SIGTERM` to the Elasticsearch process on a
|
||||
POSIX system. You can obtain the PID to send the signal to via various tools (e.g., `ps` or `jps`):
|
||||
|
||||
[source,sh]
|
||||
--------------------------------------------------
|
||||
$ jps | grep Elasticsearch
|
||||
14542 Elasticsearch
|
||||
--------------------------------------------------
|
||||
|
||||
From the Elasticsearch startup logs:
|
||||
|
||||
[source,sh]
|
||||
--------------------------------------------------
|
||||
[2016-07-07 12:26:18,908][INFO ][node ] [Reaper] version[5.0.0-alpha4], pid[15399], build[3f5b994/2016-06-27T16:23:46.861Z], OS[Mac OS X/10.11.5/x86_64], JVM[Oracle Corporation/Java HotSpot(TM) 64-Bit Server VM/1.8.0_92/25.92-b14]
|
||||
--------------------------------------------------
|
||||
|
||||
Or by specifying a location to write a PID file to on startup (`-p <path>`):
|
||||
|
||||
[source,sh]
|
||||
--------------------------------------------------
|
||||
$ ./bin/elasticsearch -p /tmp/elasticsearch-pid -d
|
||||
$ cat /tmp/elasticsearch-pid && echo
|
||||
15516
|
||||
$ kill -SIGTERM 15516
|
||||
--------------------------------------------------
|
||||
|
||||
[[fatal-errors]]
|
||||
[float]
|
||||
=== Stopping on Fatal Errors
|
||||
|
||||
During the life of the Elasticsearch virtual machine, certain fatal errors could arise that put the
|
||||
virtual machine in a questionable state. Such fatal errors include out of memory errors, internal
|
||||
errors in the virtual machine, and serious I/O errors.
|
||||
|
||||
When Elasticsearch detects that the virtual machine has encountered such a fatal error, Elasticsearch
|
||||
will attempt to log the error and then will halt the virtual machine. When Elasticsearch initiates
|
||||
such a shutdown, it does not go through an orderly shutdown as described above. The Elasticsearch
|
||||
process will also return with a special status code indicating the nature of the error.
|
||||
|
||||
[horizontal]
|
||||
JVM internal error:: 128
|
||||
Out of memory error:: 127
|
||||
Stack overflow error:: 126
|
||||
Unknown virtual machine error:: 125
|
||||
Serious I/O error:: 124
|
||||
Unknown fatal error:: 1
|
@ -150,7 +150,7 @@ public class BootstrapForTesting {
|
||||
return esPolicy.implies(domain, permission) || testFramework.implies(domain, permission);
|
||||
}
|
||||
});
|
||||
System.setSecurityManager(new SecureSM(true));
|
||||
System.setSecurityManager(SecureSM.createTestSecureSM());
|
||||
Security.selfTest();
|
||||
|
||||
// guarantee plugin classes are initialized first, in case they have one-time hacks.
|
||||
|
Loading…
x
Reference in New Issue
Block a user