SOLR-14978 OOM Killer in Foreground (#2055)

Combine Docker and bin/solr OOM handling scripts, move OOM handling to foreground Solr as well.

Co-authored-by: Houston Putman <houstonputman@gmail.com>
This commit is contained in:
Mike Drob 2020-11-04 17:20:16 -06:00 committed by GitHub
parent 656ce93c3a
commit 7c1ff288b7
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
8 changed files with 54 additions and 83 deletions

View File

@ -148,6 +148,8 @@ Other Changes
* SOLR-14912: Clean up solr-extraction contrib to produce solr-extraction-* jar * SOLR-14912: Clean up solr-extraction contrib to produce solr-extraction-* jar
(instead of solr-cell-*). (Dawid Weiss) (instead of solr-cell-*). (Dawid Weiss)
* SOLR-14978: Enable OOM Killer Script in Solr Foreground. Simplify getting heap dumps on OOM. (Mike Drob, Houston Putman)
Bug Fixes Bug Fixes
--------------------- ---------------------
* SOLR-14546: Fix for a relatively hard to hit issue in OverseerTaskProcessor that could lead to out of order execution * SOLR-14546: Fix for a relatively hard to hit issue in OverseerTaskProcessor that could lead to out of order execution

View File

@ -17,14 +17,20 @@
SOLR_PORT=$1 SOLR_PORT=$1
SOLR_LOGS_DIR=$2 SOLR_LOGS_DIR=$2
SOLR_PID=`ps auxww | grep start.jar | grep $SOLR_PORT | grep -v grep | awk '{print $2}' | sort -r` SOLR_PID=$(ps auxww | grep start.jar | grep $SOLR_PORT | grep -v grep | awk '{print $2}' | sort -r)
if [ -z "$SOLR_PID" ]; then if [ -z "$SOLR_PID" ]; then
echo "Couldn't find Solr process running on port $SOLR_PORT!" echo "Couldn't find Solr process running on port $SOLR_PORT!"
exit exit
fi fi
NOW=$(date +"%F_%H_%M_%S") NOW=$(date +"%F_%H_%M_%S")
( (
echo "Running OOM killer script for process $SOLR_PID for Solr on port $SOLR_PORT" echo "Running OOM killer script for process $SOLR_PID for Solr on port $SOLR_PORT"
kill -9 $SOLR_PID if [[ "$SOLR_PID" == 1 ]]; then
echo "Killed process $SOLR_PID" # Under Docker when running as pid 1, SIGKILL is ignored, so use the default SIGTERM
kill "$SOLR_PID"
else
# On a real system, or in a container with tini or similar, it is safe to SIGKILL
kill -9 "$SOLR_PID"
fi
echo "Killed process $SOLR_PID"
) | tee $SOLR_LOGS_DIR/solr_oom_killer-$SOLR_PORT-$NOW.log ) | tee $SOLR_LOGS_DIR/solr_oom_killer-$SOLR_PORT-$NOW.log

View File

@ -2133,6 +2133,19 @@ if [ -z "$SOLR_TIMEZONE" ]; then
SOLR_TIMEZONE='UTC' SOLR_TIMEZONE='UTC'
fi fi
# Ensures that a directory exists and is writable, aborting the script otherwise.
# Arguments:
#   $1 - path of the directory to create (missing parents are created too)
#   $2 - human-readable description used in the error message (e.g. "Logs")
# Outputs: on failure, a one-line error message (preceded by a blank line) on stdout
# Exits:   with status 1 if the directory cannot be created or is not writable
function mk_writable_dir() {
  local DIRNAME="$1"
  local DESCRIPTION="$2"
  # printf with %s prints the path verbatim; 'echo -e' would expand any
  # backslash sequences (\t, \n, ...) that happen to be part of the path.
  if ! mkdir -p "$DIRNAME" 2> /dev/null ; then
    printf '\nERROR: %s directory %s could not be created. Exiting\n' "$DESCRIPTION" "$DIRNAME"
    exit 1
  fi
  if [ ! -w "$DIRNAME" ]; then
    printf '\nERROR: %s directory %s is not writable. Exiting\n' "$DESCRIPTION" "$DIRNAME"
    exit 1
  fi
}
# Launches Solr in foreground/background depending on parameters # Launches Solr in foreground/background depending on parameters
function start_solr() { function start_solr() {
@ -2172,6 +2185,15 @@ function start_solr() {
SOLR_OPTS+=($AUTHC_OPTS) SOLR_OPTS+=($AUTHC_OPTS)
fi fi
# If a heap dump directory is specified, enable it in SOLR_OPTS
if [[ -z "$SOLR_HEAP_DUMP_DIR" ]] && [[ "$SOLR_HEAP_DUMP" == "true" ]]; then
SOLR_HEAP_DUMP_DIR="${SOLR_LOGS_DIR}/dumps"
fi
if [[ -n "$SOLR_HEAP_DUMP_DIR" ]]; then
SOLR_OPTS+=("-XX:+HeapDumpOnOutOfMemoryError")
SOLR_OPTS+=("-XX:HeapDumpPath=$SOLR_HEAP_DUMP_DIR/solr-$(date +%s)-pid$$.hprof")
fi
if $verbose ; then if $verbose ; then
echo -e "\nStarting Solr using the following settings:" echo -e "\nStarting Solr using the following settings:"
echo -e " JAVA = $JAVA" echo -e " JAVA = $JAVA"
@ -2226,21 +2248,13 @@ function start_solr() {
# '-OmitStackTraceInFastThrow' ensures stack traces in errors, # '-OmitStackTraceInFastThrow' ensures stack traces in errors,
# users who don't care about useful error msgs can override in SOLR_OPTS with +OmitStackTraceInFastThrow # users who don't care about useful error msgs can override in SOLR_OPTS with +OmitStackTraceInFastThrow
"${SOLR_HOST_ARG[@]}" "-Duser.timezone=$SOLR_TIMEZONE" "-XX:-OmitStackTraceInFastThrow" \ "${SOLR_HOST_ARG[@]}" "-Duser.timezone=$SOLR_TIMEZONE" "-XX:-OmitStackTraceInFastThrow" \
"-XX:OnOutOfMemoryError=$SOLR_TIP/bin/oom_solr.sh $SOLR_PORT $SOLR_LOGS_DIR" \
"-Djetty.home=$SOLR_SERVER_DIR" "-Dsolr.solr.home=$SOLR_HOME" "-Dsolr.data.home=$SOLR_DATA_HOME" "-Dsolr.install.dir=$SOLR_TIP" \ "-Djetty.home=$SOLR_SERVER_DIR" "-Dsolr.solr.home=$SOLR_HOME" "-Dsolr.data.home=$SOLR_DATA_HOME" "-Dsolr.install.dir=$SOLR_TIP" \
"-Dsolr.default.confdir=$DEFAULT_CONFDIR" "${LOG4J_CONFIG[@]}" "${SOLR_OPTS[@]}" "${SECURITY_MANAGER_OPTS[@]}" "${SOLR_ADMIN_UI}") "-Dsolr.default.confdir=$DEFAULT_CONFDIR" "${LOG4J_CONFIG[@]}" "${SOLR_OPTS[@]}" "${SECURITY_MANAGER_OPTS[@]}" "${SOLR_ADMIN_UI}")
if [ "$SOLR_MODE" == "solrcloud" ]; then mk_writable_dir "$SOLR_LOGS_DIR" "Logs"
IN_CLOUD_MODE=" in SolrCloud mode" if [[ -n "$SOLR_HEAP_DUMP_DIR" ]]; then
fi mk_writable_dir "$SOLR_HEAP_DUMP_DIR" "Heap Dump"
mkdir -p "$SOLR_LOGS_DIR" 2>/dev/null
if [ $? -ne 0 ]; then
echo -e "\nERROR: Logs directory $SOLR_LOGS_DIR could not be created. Exiting"
exit 1
fi
if [ ! -w "$SOLR_LOGS_DIR" ]; then
echo -e "\nERROR: Logs directory $SOLR_LOGS_DIR is not writable. Exiting"
exit 1
fi fi
case "$SOLR_LOGS_DIR" in case "$SOLR_LOGS_DIR" in
contexts|etc|lib|modules|resources|scripts|solr|solr-webapp) contexts|etc|lib|modules|resources|scripts|solr|solr-webapp)
@ -2254,7 +2268,6 @@ function start_solr() {
else else
# run Solr in the background # run Solr in the background
nohup "$JAVA" "${SOLR_START_OPTS[@]}" $SOLR_ADDL_ARGS -Dsolr.log.muteconsole \ nohup "$JAVA" "${SOLR_START_OPTS[@]}" $SOLR_ADDL_ARGS -Dsolr.log.muteconsole \
"-XX:OnOutOfMemoryError=$SOLR_TIP/bin/oom_solr.sh $SOLR_PORT $SOLR_LOGS_DIR" \
-jar start.jar "${SOLR_JETTY_CONFIG[@]}" $SOLR_JETTY_ADDL_CONFIG \ -jar start.jar "${SOLR_JETTY_CONFIG[@]}" $SOLR_JETTY_ADDL_CONFIG \
1>"$SOLR_LOGS_DIR/solr-$SOLR_PORT-console.log" 2>&1 & echo $! > "$SOLR_PID_DIR/solr-$SOLR_PORT.pid" 1>"$SOLR_LOGS_DIR/solr-$SOLR_PORT-console.log" 2>&1 & echo $! > "$SOLR_PID_DIR/solr-$SOLR_PORT.pid"

View File

@ -248,3 +248,13 @@
# Sometimes it may be necessary to place a core or a backup on a different location or a different disk # Sometimes it may be necessary to place a core or a backup on a different location or a different disk
# This parameter lets you specify file system path(s) to explicitly allow. The special value of '*' will allow any path # This parameter lets you specify file system path(s) to explicitly allow. The special value of '*' will allow any path
#SOLR_OPTS="$SOLR_OPTS -Dsolr.allowPaths=/mnt/bigdisk,/other/path" #SOLR_OPTS="$SOLR_OPTS -Dsolr.allowPaths=/mnt/bigdisk,/other/path"
# Solr can attempt to take a heap dump on out of memory errors. To enable this, uncomment the line setting
# SOLR_HEAP_DUMP below. Heap dumps will be saved to SOLR_LOGS_DIR/dumps by default. Alternatively, you can specify any
# other directory, which will implicitly enable heap dumping. Dump name pattern will be solr-[timestamp]-pid[###].hprof.
# When using this feature, it is recommended to have an external service monitoring the given dir.
# If more fine grained control is required, you can manually add the appropriate flags to SOLR_OPTS
# See https://docs.oracle.com/en/java/javase/11/troubleshoot/command-line-options1.html
# You can test this behaviour by setting SOLR_HEAP=25m
#SOLR_HEAP_DUMP=true
#SOLR_HEAP_DUMP_DIR=/var/log/dumps

View File

@ -39,6 +39,9 @@ COPY --chown=0:0 scripts /opt/docker-solr/scripts
ARG SOLR_VERSION ARG SOLR_VERSION
# Used by solr-fg
ENV SOLR_VERSION $SOLR_VERSION
COPY --from=solr_package "/opt/solr-$SOLR_VERSION.tgz" "/opt/solr-$SOLR_VERSION.tgz" COPY --from=solr_package "/opt/solr-$SOLR_VERSION.tgz" "/opt/solr-$SOLR_VERSION.tgz"
RUN set -ex; \ RUN set -ex; \

View File

@ -1,38 +0,0 @@
#!/bin/bash
# Custom oom handler loosely based on
# https://github.com/apache/lucene-solr/blob/master/solr/bin/oom_solr.sh
# See solr-foreground for how to configure OOM behaviour
#
# Looks up the process whose command line matches "start.jar", signals it,
# and copies this script's output into a timestamped log file in SOLR_LOGS_DIR.

# Honour SOLR_LOGS_DIR when it is already set; otherwise use the first of the
# two known log directories that exists, and fail if neither is present.
if [[ -z "${SOLR_LOGS_DIR:-}" ]]; then
if [ -d /var/solr/logs ]; then
SOLR_LOGS_DIR=/var/solr/logs
elif [ -d /opt/solr/server/logs ]; then
SOLR_LOGS_DIR=/opt/solr/server/logs
else
echo "Cannot determine SOLR_LOGS_DIR!"
exit 1
fi
fi
# pgrep -f matches against the full command line, not just the process name.
SOLR_PID=$(pgrep -f start.jar)
if [[ -z "$SOLR_PID" ]]; then
echo "Couldn't find Solr process running!"
# Bare 'exit' reuses the status of the preceding echo rather than a failure code.
exit
fi
# Timestamp used to make the log file name unique per invocation.
NOW=$(date +"%F_%H_%M_%S")
# The subshell groups all output so that tee can both print it and log it.
(
echo "Running OOM killer script for Solr process $SOLR_PID"
if [[ "$SOLR_PID" == 1 ]]; then
# under Docker, when running as pid 1, a SIGKILL is ignored,
# so use the default SIGTERM
kill "$SOLR_PID"
sleep 2
# if that hasn't worked, send a second signal.
# NOTE(review): this comment originally said SIGKILL, but the command below
# sends SIGILL; given the remark above that SIGKILL is ignored for pid 1,
# confirm which signal was actually intended.
kill -SIGILL "$SOLR_PID"
else
# if we're running with `--init` or under tini or similar,
# follow the upstream behaviour
kill -9 "$SOLR_PID"
fi
# NOTE(review): SOLR_PORT is never assigned in this script; presumably it is
# inherited from the environment — verify, otherwise the file name has an empty port.
) | tee "$SOLR_LOGS_DIR/solr_oom_killer-$SOLR_PORT-$NOW.log"

View File

@ -7,31 +7,6 @@ if [[ "$VERBOSE" == "yes" ]]; then
set -x set -x
fi fi
# Additional arguments for the `solr -f` invocation, filled in from $OOM below.
EXTRA_ARGS=()

# OOM picks the out-of-memory behaviour; an unset or empty value means 'none'.
# Test with: docker run -p 8983:8983 -it -e OOM=script -e SOLR_JAVA_MEM="-Xms25m -Xmx25m" solr
: "${OOM:=none}"

if [[ "$OOM" == 'script' ]]; then
  # Run the bundled handler script when the JVM reports an OutOfMemoryError.
  EXTRA_ARGS+=(-a '-XX:OnOutOfMemoryError=/opt/docker-solr/scripts/oom_solr.sh')
elif [[ "$OOM" == 'exit' ]]; then
  # recommended
  EXTRA_ARGS+=(-a '-XX:+ExitOnOutOfMemoryError')
elif [[ "$OOM" == 'crash' ]]; then
  EXTRA_ARGS+=(-a '-XX:+CrashOnOutOfMemoryError')
elif [[ "$OOM" != 'none' ]]; then
  # Anything other than the four known values is a configuration error.
  echo "Unsupported value in OOM=$OOM"
  exit 1
fi
echo "Starting Solr $SOLR_VERSION" echo "Starting Solr $SOLR_VERSION"
# determine TINI default. If it is already set, assume the user knows what they want # determine TINI default. If it is already set, assume the user knows what they want
if [[ -z "${TINI:-}" ]]; then if [[ -z "${TINI:-}" ]]; then
@ -47,9 +22,9 @@ if [[ -z "${TINI:-}" ]]; then
fi fi
fi fi
if [[ "$TINI" == yes ]]; then if [[ "$TINI" == yes ]]; then
exec tini -- solr -f "$@" "${EXTRA_ARGS[@]}" exec tini -- solr -f "$@"
elif [[ "$TINI" == no ]]; then elif [[ "$TINI" == no ]]; then
exec solr -f "$@" "${EXTRA_ARGS[@]}" exec solr -f "$@"
else else
echo "invalid value TINI=$TINI" echo "invalid value TINI=$TINI"
exit 1 exit 1

View File

@ -36,7 +36,7 @@ There are several points to keep in mind:
* The G1GC garbage collector is currently preferred when using a JVM that supports it (Java 9 and later). * The G1GC garbage collector is currently preferred when using a JVM that supports it (Java 9 and later).
* Modern hardware can be configured with hundreds of gigabytes of physical RAM and many CPUs. It is often better in these cases to run multiple JVMs, each with a limited amount of memory allocated to their heaps. One way to achieve this is to run Solr as a https://hub.docker.com/_/solr?tab=tags[Docker container]. * Modern hardware can be configured with hundreds of gigabytes of physical RAM and many CPUs. It is often better in these cases to run multiple JVMs, each with a limited amount of memory allocated to their heaps. One way to achieve this is to run Solr as a https://hub.docker.com/_/solr?tab=tags[Docker container].
* It's good practice to periodically re-analyze the GC logs and/or monitor with <<metrics-reporting#metrics-reporting,Metrics Reporting>> to see if the memory usage has changed due to changes in your application, number of documents, etc. * It's good practice to periodically re-analyze the GC logs and/or monitor with <<metrics-reporting#metrics-reporting,Metrics Reporting>> to see if the memory usage has changed due to changes in your application, number of documents, etc.
* On *nix systems, we recommend that Solr be run with the "OOM killer script" (see `solr/bin/oom_solr.sh`). This will forcefully stop Solr when the heap is exhausted rather than continue in an indeterminate state. If `jstack` is available, a heap dump will also be taken before the process is killed. * On *nix systems, Solr will run with the "OOM killer script" (see `solr/bin/oom_solr.sh`). This will forcefully stop Solr when the heap is exhausted rather than continue in an indeterminate state. You can additionally request a heap dump on OOM through the values in `solr.in.sh`.
* All current (Java 11) garbage collectors can hit "stop the world" collections, which suspend the JVM until completed. If, through monitoring, these garbage collections are frequent and greater than your application can tolerate, additional tuning should be considered. "Stop the world" pauses greater than 5 seconds are rarely acceptable, and having them be less than 1 second is desirable. * All current (Java 11) garbage collectors can hit "stop the world" collections, which suspend the JVM until completed. If, through monitoring, these garbage collections are frequent and greater than your application can tolerate, additional tuning should be considered. "Stop the world" pauses greater than 5 seconds are rarely acceptable, and having them be less than 1 second is desirable.
Consult your JVM vendor's documentation for specifics in your particular case, the recommendations above are intended as starting points. Consult your JVM vendor's documentation for specifics in your particular case, the recommendations above are intended as starting points.