Reintroduce five-minute and fifteen-minute load averages on Linux

This commit reintroduces the five-minute and fifteen-minute load stats
on Linux, and changes the format of the load_average field back to an
array.
This commit is contained in:
Jason Tedor 2016-01-11 17:26:00 -05:00
parent 7873dddab6
commit 1de2081ed3
9 changed files with 121 additions and 34 deletions

View File

@ -40,7 +40,7 @@ import static org.elasticsearch.common.Strings.cleanPath;
* The environment of where things exist.
*/
@SuppressForbidden(reason = "configures paths for the system")
// TODO: move PathUtils to be package-private here instead of
// TODO: move PathUtils to be package-private here instead of
// public+forbidden api!
public class Environment {
@ -72,7 +72,7 @@ public class Environment {
/** Path to the PID file (can be null if no PID file is configured) **/
private final Path pidFile;
/** Path to the temporary file directory used by the JDK */
private final Path tmpFile = PathUtils.get(System.getProperty("java.io.tmpdir"));
@ -292,7 +292,7 @@ public class Environment {
public Path pidFile() {
return pidFile;
}
/** Path to the default temp directory used by the JDK */
public Path tmpFile() {
return tmpFile;
@ -317,7 +317,7 @@ public class Environment {
public static FileStore getFileStore(Path path) throws IOException {
return ESFileStore.getMatchingFileStore(path, fileStores);
}
/**
* Returns true if the path is writable.
* Acts just like {@link Files#isWritable(Path)}, except won't

View File

@ -20,11 +20,16 @@
package org.elasticsearch.monitor.os;
import org.apache.lucene.util.Constants;
import org.apache.lucene.util.SuppressForbidden;
import org.elasticsearch.common.io.PathUtils;
import org.elasticsearch.monitor.Probes;
import java.io.IOException;
import java.lang.management.ManagementFactory;
import java.lang.management.OperatingSystemMXBean;
import java.lang.reflect.Method;
import java.nio.file.Files;
import java.util.List;
public class OsProbe {
@ -103,19 +108,42 @@ public class OsProbe {
}
/**
* Returns the system load average for the last minute.
* Returns the system load averages
*/
public double getSystemLoadAverage() {
public double[] getSystemLoadAverage() {
if (Constants.LINUX) {
double[] loadAverage = readProcLoadavg("/proc/loadavg");
if (loadAverage != null) {
return loadAverage;
}
// fallback
}
if (getSystemLoadAverage == null) {
return -1;
return null;
}
try {
return (double) getSystemLoadAverage.invoke(osMxBean);
double oneMinuteLoadAverage = (double) getSystemLoadAverage.invoke(osMxBean);
return new double[] { oneMinuteLoadAverage, -1, -1 };
} catch (Throwable t) {
return -1;
return null;
}
}
/**
 * Reads the one-minute, five-minute and fifteen-minute load averages from
 * the given procfs file.
 *
 * Only the first line of the file is inspected; its first three
 * whitespace-separated fields are parsed as doubles.
 *
 * @param procLoadavg path of the file to read (the caller passes /proc/loadavg)
 * @return an array of length three holding the parsed values, or null if
 *         the file cannot be read, is empty, has fewer than three fields
 *         or contains a field that is not a number
 */
@SuppressForbidden(reason = "access /proc")
private static double[] readProcLoadavg(String procLoadavg) {
    try {
        List<String> lines = Files.readAllLines(PathUtils.get(procLoadavg));
        if (lines.isEmpty()) {
            return null;
        }
        // trim first so that leading whitespace does not produce an empty first field
        String[] fields = lines.get(0).trim().split("\\s+");
        if (fields.length < 3) {
            // malformed content; returning null lets the caller use its fallback
            return null;
        }
        return new double[] {
            Double.parseDouble(fields[0]),
            Double.parseDouble(fields[1]),
            Double.parseDouble(fields[2])
        };
    } catch (IOException | NumberFormatException e) {
        // do not fail Elasticsearch if something unexpected
        // happens here (unreadable file or unparsable content)
    }
    return null;
}
/**
 * Returns the result of {@code Probes.getLoadAndScaleToPercent} applied to
 * the {@code getSystemCpuLoad} accessor and the OS MX bean.
 *
 * NOTE(review): presumably the recent system-wide CPU usage as a
 * percentage; confirm the exact range and the "unsupported" value
 * against Probes.
 */
public short getSystemCpuPercent() {
    final short scaledPercent = Probes.getLoadAndScaleToPercent(getSystemCpuLoad, osMxBean);
    return scaledPercent;
}

View File

@ -87,7 +87,13 @@ public class OsStats implements Streamable, ToXContent {
if (cpu != null) {
builder.startObject(Fields.CPU);
builder.field(Fields.PERCENT, cpu.getPercent());
builder.field(Fields.LOAD_AVERAGE, cpu.getLoadAverage());
if (cpu.getLoadAverage() != null) {
builder.startArray(Fields.LOAD_AVERAGE);
builder.value(cpu.getLoadAverage()[0]);
builder.value(cpu.getLoadAverage()[1]);
builder.value(cpu.getLoadAverage()[2]);
builder.endArray();
}
builder.endObject();
}
@ -152,8 +158,9 @@ public class OsStats implements Streamable, ToXContent {
}
public static class Cpu implements Streamable {
short percent = -1;
double loadAverage = -1;
double[] loadAverage = null;
Cpu() {}
@ -166,20 +173,29 @@ public class OsStats implements Streamable, ToXContent {
@Override
public void readFrom(StreamInput in) throws IOException {
percent = in.readShort();
loadAverage = in.readDouble();
if (in.readBoolean()) {
loadAverage = in.readDoubleArray();
} else {
loadAverage = null;
}
}
@Override
public void writeTo(StreamOutput out) throws IOException {
out.writeShort(percent);
out.writeDouble(loadAverage);
if (loadAverage == null) {
out.writeBoolean(false);
} else {
out.writeBoolean(true);
out.writeDoubleArray(loadAverage);
}
}
/**
 * Returns the value of the {@code percent} field of this instance.
 */
public short getPercent() {
    return this.percent;
}
public double getLoadAverage() {
public double[] getLoadAverage() {
return loadAverage;
}
}

View File

@ -134,7 +134,9 @@ public class RestNodesAction extends AbstractCatAction {
table.addCell("file_desc.max", "default:false;alias:fdm,fileDescriptorMax;text-align:right;desc:max file descriptors");
table.addCell("cpu", "alias:cpu;text-align:right;desc:recent cpu usage");
table.addCell("load", "alias:l;text-align:right;desc:most recent load avg");
table.addCell("load_1m", "alias:l;text-align:right;desc:1m load avg");
table.addCell("load_5m", "alias:l;text-align:right;desc:5m load avg");
table.addCell("load_15m", "alias:l;text-align:right;desc:15m load avg");
table.addCell("uptime", "default:false;alias:u;text-align:right;desc:node uptime");
table.addCell("node.role", "alias:r,role,dc,nodeRole;desc:d:data node, c:client node");
table.addCell("master", "alias:m;desc:m:master-eligible, *:current master");
@ -263,7 +265,10 @@ public class RestNodesAction extends AbstractCatAction {
table.addCell(processStats == null ? null : processStats.getMaxFileDescriptors());
table.addCell(osStats == null ? null : Short.toString(osStats.getCpu().getPercent()));
table.addCell(osStats == null ? null : String.format(Locale.ROOT, "%.2f", osStats.getCpu().getLoadAverage()));
boolean hasLoadAverage = osStats != null && osStats.getCpu().getLoadAverage() != null;
table.addCell(!hasLoadAverage ? null : String.format(Locale.ROOT, "%.2f", osStats.getCpu().getLoadAverage()[0]));
table.addCell(!hasLoadAverage ? null : String.format(Locale.ROOT, "%.2f", osStats.getCpu().getLoadAverage()[1]));
table.addCell(!hasLoadAverage ? null : String.format(Locale.ROOT, "%.2f", osStats.getCpu().getLoadAverage()[2]));
table.addCell(jvmStats == null ? null : jvmStats.getUptime());
table.addCell(node.clientNode() ? "c" : node.dataNode() ? "d" : "-");
table.addCell(masterId == null ? "x" : masterId.equals(node.id()) ? "*" : node.masterNode() ? "m" : "-");

View File

@ -115,4 +115,7 @@ grant {
// needed by JDKESLoggerTests
permission java.util.logging.LoggingPermission "control";
// load averages on Linux
permission java.io.FilePermission "/proc/loadavg", "read";
};

View File

@ -50,12 +50,31 @@ public class OsProbeTests extends ESTestCase {
assertNotNull(stats);
assertThat(stats.getTimestamp(), greaterThan(0L));
assertThat(stats.getCpu().getPercent(), anyOf(equalTo((short) -1), is(both(greaterThanOrEqualTo((short) 0)).and(lessThanOrEqualTo((short) 100)))));
double[] loadAverage = stats.getCpu().loadAverage;
if (loadAverage != null) {
assertThat(loadAverage.length, equalTo(3));
}
if (Constants.WINDOWS) {
// Load average is always -1 on Windows platforms
assertThat(stats.getCpu().getLoadAverage(), equalTo((double) -1));
// load average is unavailable on Windows
if (loadAverage != null) {
assertThat(loadAverage[0], equalTo((double) -1));
assertThat(loadAverage[1], equalTo((double) -1));
assertThat(loadAverage[2], equalTo((double) -1));
}
} else if (Constants.LINUX) {
// we should be able to get the load average
assertNotNull(loadAverage);
assertThat(loadAverage[0], greaterThanOrEqualTo((double) 0));
assertThat(loadAverage[1], greaterThanOrEqualTo((double) 0));
assertThat(loadAverage[2], greaterThanOrEqualTo((double) 0));
} else {
// Load average can be negative if not available or not computed yet, otherwise it should be >= 0
assertThat(stats.getCpu().getLoadAverage(), anyOf(lessThan((double) 0), greaterThanOrEqualTo((double) 0)));
// one-minute load average is available, but five-minute and fifteen-minute load averages are not
// load average can be negative if not available or not computed yet, otherwise it should be >= 0
if (loadAverage != null) {
assertThat(loadAverage[0], anyOf(lessThan((double) 0), greaterThanOrEqualTo((double) 0)));
assertThat(loadAverage[1], equalTo((double) -1));
assertThat(loadAverage[2], equalTo((double) -1));
}
}
assertNotNull(stats.getMem());

View File

@ -132,7 +132,8 @@ the operating system:
Recent CPU usage for the whole system, or -1 if not supported
`os.cpu.load_average`::
System load average for the last minute, or -1 if not supported
Array of system load averages for the last one minute, five
minutes and fifteen minutes (value of -1 indicates not supported)
`os.mem.total_in_bytes`::
Total amount of physical memory in bytes

View File

@ -552,17 +552,32 @@ and high risk of being misused. The ability to change the thread pool type for a
that it is still possible to adjust relevant thread pool parameters for each of the thread pools (e.g., depending on
the thread pool type, `keep_alive`, `queue_size`, etc.).
=== Adding system CPU percent to OS stats
=== System CPU stats
The recent CPU usage (as a percent) has been added to the OS stats reported under the node stats API and the cat nodes
API. The breaking change here is that there is a new object in the "os" object in the node stats response. This object
is called "cpu" and includes "percent" and "load_average" as fields. This moves the "load_average" field that was
previously a top-level field in the "os" object to the "cpu" object. Additionally, the "cpu" field in the cat nodes API
response is output by default.
The recent CPU usage (as a percent) has been added to the OS stats
reported under the node stats API and the cat nodes API. The breaking
change here is that there is a new object in the "os" object in the node
stats response. This object is called "cpu" and includes "percent" and
"load_average" as fields. This moves the "load_average" field that was
previously a top-level field in the "os" object to the "cpu" object. The
format of the "load_average" field has changed to an array of length
three representing the one-minute, five-minute and fifteen-minute load
averages (a value of -1 for any of the array components indicates that the
corresponding metric is not available).
Finally, the API for org.elasticsearch.monitor.os.OsStats has changed. The `getLoadAverage` method has been removed. The
value for this can now be obtained from `OsStats.Cpu#getLoadAverage`. Additionally, the recent CPU usage can be obtained
from `OsStats.Cpu#getPercent`.
In the cat nodes API response, the "cpu" field is output by default. The
previous "load" field has been removed and is replaced by "load_1m",
"load_5m", and "load_15m" which represent the one-minute, five-minute
and fifteen-minute loads respectively. These values are output by
default, and a value of -1 indicates that the corresponding metric is
not available.
Finally, the API for org.elasticsearch.monitor.os.OsStats has
changed. The `getLoadAverage` method has been removed. The value for
this can now be obtained from `OsStats.Cpu#getLoadAverage` but it is no
longer a double and is instead an array containing the one-minute,
five-minute and fifteen-minute load averages. Additionally, the recent
CPU usage can be obtained from `OsStats.Cpu#getPercent`.
=== Fields option
Only stored fields are retrievable with this option.

View File

@ -6,8 +6,8 @@
- match:
$body: |
/ #host ip heap.percent ram.percent cpu load node.role master name
^ (\S+ \s+ (\d{1,3}\.){3}\d{1,3} \s+ \d+ \s+ \d* \s+ (-)?\d* \s+ (-)?\d*(\.\d+)? \s+ [-dc] \s+ [-*mx] \s+ (\S+\s?)+ \n)+ $/
/ #host ip heap.percent ram.percent cpu load_1m load_5m load_15m node.role master name
^ (\S+ \s+ (\d{1,3}\.){3}\d{1,3} \s+ \d+ \s+ \d* \s+ (-)?\d* \s+ ((-)?\d*(\.\d+)?)? \s+ ((-)?\d*(\.\d+)?)? \s+ ((-)?\d*(\.\d+)?)? \s+ [-dc] \s+ [-*mx] \s+ (\S+\s?)+ \n)+ $/
- do:
cat.nodes:
@ -15,8 +15,8 @@
- match:
$body: |
/^ host \s+ ip \s+ heap\.percent \s+ ram\.percent \s+ cpu \s+ load \s+ node\.role \s+ master \s+ name \n
(\S+ \s+ (\d{1,3}\.){3}\d{1,3} \s+ \d+ \s+ \d* \s+ (-)?\d* \s+ (-)?\d*(\.\d+)? \s+ [-dc] \s+ [-*mx] \s+ (\S+\s?)+ \n)+ $/
/^ host \s+ ip \s+ heap\.percent \s+ ram\.percent \s+ cpu \s+ load_1m \s+ load_5m \s+ load_15m \s+ node\.role \s+ master \s+ name \n
(\S+ \s+ (\d{1,3}\.){3}\d{1,3} \s+ \d+ \s+ \d* \s+ (-)?\d* \s+ ((-)?\d*(\.\d+)?)? \s+ ((-)?\d*(\.\d+)?)? \s+ ((-)?\d*(\.\d+)?)? \s+ [-dc] \s+ [-*mx] \s+ (\S+\s?)+ \n)+ $/
- do:
cat.nodes: