mirror of https://github.com/apache/lucene.git
SOLR-14978 OOM Killer in Foreground (#2055)
Combine Docker and bin/solr OOM handling scripts, move OOM handling to foreground Solr as well. Co-authored-by: Houston Putman <houstonputman@gmail.com>
This commit is contained in:
parent
656ce93c3a
commit
7c1ff288b7
|
@ -148,6 +148,8 @@ Other Changes
|
||||||
* SOLR-14912: Clean up solr-extraction contrib to produce solr-extraction-* jar
|
* SOLR-14912: Clean up solr-extraction contrib to produce solr-extraction-* jar
|
||||||
(instead of solr-cell-*). (Dawid Weiss)
|
(instead of solr-cell-*). (Dawid Weiss)
|
||||||
|
|
||||||
|
* SOLR-14978: Enable OOM Killer Script in Solr Foreground. Simplify getting heap dumps on OOM. (Mike Drob, Houston Putman)
|
||||||
|
|
||||||
Bug Fixes
|
Bug Fixes
|
||||||
---------------------
|
---------------------
|
||||||
* SOLR-14546: Fix for a relatively hard to hit issue in OverseerTaskProcessor that could lead to out of order execution
|
* SOLR-14546: Fix for a relatively hard to hit issue in OverseerTaskProcessor that could lead to out of order execution
|
||||||
|
|
|
@ -17,14 +17,20 @@
|
||||||
|
|
||||||
SOLR_PORT=$1
|
SOLR_PORT=$1
|
||||||
SOLR_LOGS_DIR=$2
|
SOLR_LOGS_DIR=$2
|
||||||
SOLR_PID=`ps auxww | grep start.jar | grep $SOLR_PORT | grep -v grep | awk '{print $2}' | sort -r`
|
SOLR_PID=$(ps auxww | grep start.jar | grep $SOLR_PORT | grep -v grep | awk '{print $2}' | sort -r)
|
||||||
if [ -z "$SOLR_PID" ]; then
|
if [ -z "$SOLR_PID" ]; then
|
||||||
echo "Couldn't find Solr process running on port $SOLR_PORT!"
|
echo "Couldn't find Solr process running on port $SOLR_PORT!"
|
||||||
exit
|
exit
|
||||||
fi
|
fi
|
||||||
NOW=$(date +"%F_%H_%M_%S")
|
NOW=$(date +"%F_%H_%M_%S")
|
||||||
(
|
(
|
||||||
echo "Running OOM killer script for process $SOLR_PID for Solr on port $SOLR_PORT"
|
echo "Running OOM killer script for process $SOLR_PID for Solr on port $SOLR_PORT"
|
||||||
kill -9 $SOLR_PID
|
if [[ "$SOLR_PID" == 1 ]]; then
|
||||||
echo "Killed process $SOLR_PID"
|
# Under Docker when running as pid 1, SIGKILL is ignored, so use the default SIGTERM
|
||||||
|
kill "$SOLR_PID"
|
||||||
|
else
|
||||||
|
# On a real system, or in a container with tini or similar, it is safe to SIGKILL
|
||||||
|
kill -9 "$SOLR_PID"
|
||||||
|
fi
|
||||||
|
echo "Killed process $SOLR_PID"
|
||||||
) | tee $SOLR_LOGS_DIR/solr_oom_killer-$SOLR_PORT-$NOW.log
|
) | tee $SOLR_LOGS_DIR/solr_oom_killer-$SOLR_PORT-$NOW.log
|
||||||
|
|
|
@ -2133,6 +2133,19 @@ if [ -z "$SOLR_TIMEZONE" ]; then
|
||||||
SOLR_TIMEZONE='UTC'
|
SOLR_TIMEZONE='UTC'
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
function mk_writable_dir() {
|
||||||
|
local DIRNAME="$1"
|
||||||
|
local DESCRIPTION="$2"
|
||||||
|
if ! mkdir -p "$DIRNAME" 2> /dev/null ; then
|
||||||
|
echo -e "\nERROR: $DESCRIPTION directory $DIRNAME could not be created. Exiting"
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
if [ ! -w "$DIRNAME" ]; then
|
||||||
|
echo -e "\nERROR: $DESCRIPTION directory $DIRNAME is not writable. Exiting"
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
}
|
||||||
|
|
||||||
# Launches Solr in foreground/background depending on parameters
|
# Launches Solr in foreground/background depending on parameters
|
||||||
function start_solr() {
|
function start_solr() {
|
||||||
|
|
||||||
|
@ -2172,6 +2185,15 @@ function start_solr() {
|
||||||
SOLR_OPTS+=($AUTHC_OPTS)
|
SOLR_OPTS+=($AUTHC_OPTS)
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
# If a heap dump directory is specified, enable it in SOLR_OPTS
|
||||||
|
if [[ -z "$SOLR_HEAP_DUMP_DIR" ]] && [[ "$SOLR_HEAP_DUMP" == "true" ]]; then
|
||||||
|
SOLR_HEAP_DUMP_DIR="${SOLR_LOGS_DIR}/dumps"
|
||||||
|
fi
|
||||||
|
if [[ -n "$SOLR_HEAP_DUMP_DIR" ]]; then
|
||||||
|
SOLR_OPTS+=("-XX:+HeapDumpOnOutOfMemoryError")
|
||||||
|
SOLR_OPTS+=("-XX:HeapDumpPath=$SOLR_HEAP_DUMP_DIR/solr-$(date +%s)-pid$$.hprof")
|
||||||
|
fi
|
||||||
|
|
||||||
if $verbose ; then
|
if $verbose ; then
|
||||||
echo -e "\nStarting Solr using the following settings:"
|
echo -e "\nStarting Solr using the following settings:"
|
||||||
echo -e " JAVA = $JAVA"
|
echo -e " JAVA = $JAVA"
|
||||||
|
@ -2226,21 +2248,13 @@ function start_solr() {
|
||||||
# '-OmitStackTraceInFastThrow' ensures stack traces in errors,
|
# '-OmitStackTraceInFastThrow' ensures stack traces in errors,
|
||||||
# users who don't care about useful error msgs can override in SOLR_OPTS with +OmitStackTraceInFastThrow
|
# users who don't care about useful error msgs can override in SOLR_OPTS with +OmitStackTraceInFastThrow
|
||||||
"${SOLR_HOST_ARG[@]}" "-Duser.timezone=$SOLR_TIMEZONE" "-XX:-OmitStackTraceInFastThrow" \
|
"${SOLR_HOST_ARG[@]}" "-Duser.timezone=$SOLR_TIMEZONE" "-XX:-OmitStackTraceInFastThrow" \
|
||||||
|
"-XX:OnOutOfMemoryError=$SOLR_TIP/bin/oom_solr.sh $SOLR_PORT $SOLR_LOGS_DIR" \
|
||||||
"-Djetty.home=$SOLR_SERVER_DIR" "-Dsolr.solr.home=$SOLR_HOME" "-Dsolr.data.home=$SOLR_DATA_HOME" "-Dsolr.install.dir=$SOLR_TIP" \
|
"-Djetty.home=$SOLR_SERVER_DIR" "-Dsolr.solr.home=$SOLR_HOME" "-Dsolr.data.home=$SOLR_DATA_HOME" "-Dsolr.install.dir=$SOLR_TIP" \
|
||||||
"-Dsolr.default.confdir=$DEFAULT_CONFDIR" "${LOG4J_CONFIG[@]}" "${SOLR_OPTS[@]}" "${SECURITY_MANAGER_OPTS[@]}" "${SOLR_ADMIN_UI}")
|
"-Dsolr.default.confdir=$DEFAULT_CONFDIR" "${LOG4J_CONFIG[@]}" "${SOLR_OPTS[@]}" "${SECURITY_MANAGER_OPTS[@]}" "${SOLR_ADMIN_UI}")
|
||||||
|
|
||||||
if [ "$SOLR_MODE" == "solrcloud" ]; then
|
mk_writable_dir "$SOLR_LOGS_DIR" "Logs"
|
||||||
IN_CLOUD_MODE=" in SolrCloud mode"
|
if [[ -n "$SOLR_HEAP_DUMP_DIR" ]]; then
|
||||||
fi
|
mk_writable_dir "$SOLR_HEAP_DUMP_DIR" "Heap Dump"
|
||||||
|
|
||||||
mkdir -p "$SOLR_LOGS_DIR" 2>/dev/null
|
|
||||||
if [ $? -ne 0 ]; then
|
|
||||||
echo -e "\nERROR: Logs directory $SOLR_LOGS_DIR could not be created. Exiting"
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
if [ ! -w "$SOLR_LOGS_DIR" ]; then
|
|
||||||
echo -e "\nERROR: Logs directory $SOLR_LOGS_DIR is not writable. Exiting"
|
|
||||||
exit 1
|
|
||||||
fi
|
fi
|
||||||
case "$SOLR_LOGS_DIR" in
|
case "$SOLR_LOGS_DIR" in
|
||||||
contexts|etc|lib|modules|resources|scripts|solr|solr-webapp)
|
contexts|etc|lib|modules|resources|scripts|solr|solr-webapp)
|
||||||
|
@ -2254,7 +2268,6 @@ function start_solr() {
|
||||||
else
|
else
|
||||||
# run Solr in the background
|
# run Solr in the background
|
||||||
nohup "$JAVA" "${SOLR_START_OPTS[@]}" $SOLR_ADDL_ARGS -Dsolr.log.muteconsole \
|
nohup "$JAVA" "${SOLR_START_OPTS[@]}" $SOLR_ADDL_ARGS -Dsolr.log.muteconsole \
|
||||||
"-XX:OnOutOfMemoryError=$SOLR_TIP/bin/oom_solr.sh $SOLR_PORT $SOLR_LOGS_DIR" \
|
|
||||||
-jar start.jar "${SOLR_JETTY_CONFIG[@]}" $SOLR_JETTY_ADDL_CONFIG \
|
-jar start.jar "${SOLR_JETTY_CONFIG[@]}" $SOLR_JETTY_ADDL_CONFIG \
|
||||||
1>"$SOLR_LOGS_DIR/solr-$SOLR_PORT-console.log" 2>&1 & echo $! > "$SOLR_PID_DIR/solr-$SOLR_PORT.pid"
|
1>"$SOLR_LOGS_DIR/solr-$SOLR_PORT-console.log" 2>&1 & echo $! > "$SOLR_PID_DIR/solr-$SOLR_PORT.pid"
|
||||||
|
|
||||||
|
|
|
@ -248,3 +248,13 @@
|
||||||
# Sometimes it may be necessary to place a core or a backup on a different location or a different disk
|
# Sometimes it may be necessary to place a core or a backup on a different location or a different disk
|
||||||
# This parameter lets you specify file system path(s) to explicitly allow. The special value of '*' will allow any path
|
# This parameter lets you specify file system path(s) to explicitly allow. The special value of '*' will allow any path
|
||||||
#SOLR_OPTS="$SOLR_OPTS -Dsolr.allowPaths=/mnt/bigdisk,/other/path"
|
#SOLR_OPTS="$SOLR_OPTS -Dsolr.allowPaths=/mnt/bigdisk,/other/path"
|
||||||
|
|
||||||
|
# Solr can attempt to take a heap dump on out of memory errors. To enable this, uncomment the line setting
|
||||||
|
# SOLR_HEAP_DUMP below. Heap dumps will be saved to SOLR_LOGS_DIR/dumps by default. Alternatively, you can specify any
|
||||||
|
# other directory, which will implicitly enable heap dumping. Dump name pattern will be solr-[timestamp]-pid[###].hprof
|
||||||
|
# When using this feature, it is recommended to have an external service monitoring the given dir.
|
||||||
|
# If more fine grained control is required, you can manually add the appropriate flags to SOLR_OPTS
|
||||||
|
# See https://docs.oracle.com/en/java/javase/11/troubleshoot/command-line-options1.html
|
||||||
|
# You can test this behaviour by setting SOLR_HEAP=25m
|
||||||
|
#SOLR_HEAP_DUMP=true
|
||||||
|
#SOLR_HEAP_DUMP_DIR=/var/log/dumps
|
||||||
|
|
|
@ -39,6 +39,9 @@ COPY --chown=0:0 scripts /opt/docker-solr/scripts
|
||||||
|
|
||||||
ARG SOLR_VERSION
|
ARG SOLR_VERSION
|
||||||
|
|
||||||
|
# Used by solr-fg
|
||||||
|
ENV SOLR_VERSION $SOLR_VERSION
|
||||||
|
|
||||||
COPY --from=solr_package "/opt/solr-$SOLR_VERSION.tgz" "/opt/solr-$SOLR_VERSION.tgz"
|
COPY --from=solr_package "/opt/solr-$SOLR_VERSION.tgz" "/opt/solr-$SOLR_VERSION.tgz"
|
||||||
|
|
||||||
RUN set -ex; \
|
RUN set -ex; \
|
||||||
|
|
|
@ -1,38 +0,0 @@
|
||||||
#!/bin/bash
|
|
||||||
|
|
||||||
# Custom oom handler loosely based on
|
|
||||||
# https://github.com/apache/lucene-solr/blob/master/solr/bin/oom_solr.sh
|
|
||||||
# See solr-foreground for how to configure OOM behaviour
|
|
||||||
|
|
||||||
if [[ -z "${SOLR_LOGS_DIR:-}" ]]; then
|
|
||||||
if [ -d /var/solr/logs ]; then
|
|
||||||
SOLR_LOGS_DIR=/var/solr/logs
|
|
||||||
elif [ -d /opt/solr/server/logs ]; then
|
|
||||||
SOLR_LOGS_DIR=/opt/solr/server/logs
|
|
||||||
else
|
|
||||||
echo "Cannot determine SOLR_LOGS_DIR!"
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
fi
|
|
||||||
SOLR_PID=$(pgrep -f start.jar)
|
|
||||||
if [[ -z "$SOLR_PID" ]]; then
|
|
||||||
echo "Couldn't find Solr process running!"
|
|
||||||
exit
|
|
||||||
fi
|
|
||||||
|
|
||||||
NOW=$(date +"%F_%H_%M_%S")
|
|
||||||
(
|
|
||||||
echo "Running OOM killer script for Solr process $SOLR_PID"
|
|
||||||
if [[ "$SOLR_PID" == 1 ]]; then
|
|
||||||
# under Docker, when running as pid 1, a SIGKILL is ignored,
|
|
||||||
# so use the default SIGTERM
|
|
||||||
kill "$SOLR_PID"
|
|
||||||
sleep 2
|
|
||||||
# if that hasn't worked, send SIGKILL
|
|
||||||
kill -SIGILL "$SOLR_PID"
|
|
||||||
else
|
|
||||||
# if we're running with `--init` or under tini or similar,
|
|
||||||
# follow the upstream behaviour
|
|
||||||
kill -9 "$SOLR_PID"
|
|
||||||
fi
|
|
||||||
) | tee "$SOLR_LOGS_DIR/solr_oom_killer-$SOLR_PORT-$NOW.log"
|
|
|
@ -7,31 +7,6 @@ if [[ "$VERBOSE" == "yes" ]]; then
|
||||||
set -x
|
set -x
|
||||||
fi
|
fi
|
||||||
|
|
||||||
EXTRA_ARGS=()
|
|
||||||
|
|
||||||
# Allow easy setting of the OOM behaviour
|
|
||||||
# Test with: docker run -p 8983:8983 -it -e OOM=script -e SOLR_JAVA_MEM="-Xms25m -Xmx25m" solr
|
|
||||||
if [[ -z "${OOM:-}" ]]; then
|
|
||||||
OOM='none'
|
|
||||||
fi
|
|
||||||
case "$OOM" in
|
|
||||||
'script')
|
|
||||||
EXTRA_ARGS+=(-a '-XX:OnOutOfMemoryError=/opt/docker-solr/scripts/oom_solr.sh')
|
|
||||||
;;
|
|
||||||
'exit')
|
|
||||||
# recommended
|
|
||||||
EXTRA_ARGS+=(-a '-XX:+ExitOnOutOfMemoryError')
|
|
||||||
;;
|
|
||||||
'crash')
|
|
||||||
EXTRA_ARGS+=(-a '-XX:+CrashOnOutOfMemoryError')
|
|
||||||
;;
|
|
||||||
'none'|'')
|
|
||||||
;;
|
|
||||||
*)
|
|
||||||
echo "Unsupported value in OOM=$OOM"
|
|
||||||
exit 1
|
|
||||||
esac
|
|
||||||
|
|
||||||
echo "Starting Solr $SOLR_VERSION"
|
echo "Starting Solr $SOLR_VERSION"
|
||||||
# determine TINI default. If it is already set, assume the user knows what they want
|
# determine TINI default. If it is already set, assume the user knows what they want
|
||||||
if [[ -z "${TINI:-}" ]]; then
|
if [[ -z "${TINI:-}" ]]; then
|
||||||
|
@ -47,9 +22,9 @@ if [[ -z "${TINI:-}" ]]; then
|
||||||
fi
|
fi
|
||||||
fi
|
fi
|
||||||
if [[ "$TINI" == yes ]]; then
|
if [[ "$TINI" == yes ]]; then
|
||||||
exec tini -- solr -f "$@" "${EXTRA_ARGS[@]}"
|
exec tini -- solr -f "$@"
|
||||||
elif [[ "$TINI" == no ]]; then
|
elif [[ "$TINI" == no ]]; then
|
||||||
exec solr -f "$@" "${EXTRA_ARGS[@]}"
|
exec solr -f "$@"
|
||||||
else
|
else
|
||||||
echo "invalid value TINI=$TINI"
|
echo "invalid value TINI=$TINI"
|
||||||
exit 1
|
exit 1
|
||||||
|
|
|
@ -36,7 +36,7 @@ There are several points to keep in mind:
|
||||||
* The G1GC garbage collector is currently preferred when using a JVM that supports it (Java 9 and later).
|
* The G1GC garbage collector is currently preferred when using a JVM that supports it (Java 9 and later).
|
||||||
* Modern hardware can be configured with hundreds of gigabytes of physical RAM and many CPUs. It is often better in these cases to run multiple JVMs, each with a limited amount of memory allocated to their heaps. One way to achieve this is to run Solr as a https://hub.docker.com/_/solr?tab=tags[Docker container].
|
* Modern hardware can be configured with hundreds of gigabytes of physical RAM and many CPUs. It is often better in these cases to run multiple JVMs, each with a limited amount of memory allocated to their heaps. One way to achieve this is to run Solr as a https://hub.docker.com/_/solr?tab=tags[Docker container].
|
||||||
* It's good practice to periodically re-analyze the GC logs and/or monitor with <<metrics-reporting#metrics-reporting,Metrics Reporting>> to see if the memory usage has changed due to changes in your application, number of documents, etc.
|
* It's good practice to periodically re-analyze the GC logs and/or monitor with <<metrics-reporting#metrics-reporting,Metrics Reporting>> to see if the memory usage has changed due to changes in your application, number of documents, etc.
|
||||||
* On *nix systems, we recommend that Solr be run with the "OOM killer script" (see `solr/bin/oom_solr.sh`). This will forcefully stop Solr when the heap is exhausted rather than continue in an indeterminate state. If `jstack` is available, a heap dump will also be taken before the process is killed.
|
* On *nix systems, Solr will run with the "OOM killer script" (see `solr/bin/oom_solr.sh`). This will forcefully stop Solr when the heap is exhausted rather than continue in an indeterminate state. You can additionally request a heap dump on OOM by setting `SOLR_HEAP_DUMP` or `SOLR_HEAP_DUMP_DIR` in `solr.in.sh`.
|
||||||
* All current (Java 11) garbage collectors can hit "stop the world" collections, which suspend the JVM until completed. If, through monitoring, these garbage collections are frequent and greater than your application can tolerate, additional tuning should be considered. "Stop the world" pauses greater than 5 seconds are rarely acceptable, and having them be less than 1 second is desirable.
|
* All current (Java 11) garbage collectors can hit "stop the world" collections, which suspend the JVM until completed. If, through monitoring, these garbage collections are frequent and greater than your application can tolerate, additional tuning should be considered. "Stop the world" pauses greater than 5 seconds are rarely acceptable, and having them be less than 1 second is desirable.
|
||||||
|
|
||||||
Consult your JVM vendor's documentation for specifics in your particular case, the recommendations above are intended as starting points.
|
Consult your JVM vendor's documentation for specifics in your particular case, the recommendations above are intended as starting points.
|
||||||
|
|
Loading…
Reference in New Issue