Reintroduce five-minute and fifteen-minute load averages on Linux
This commit reintroduces the five-minute and fifteen-minute load stats on Linux, and changes the format of the load_average field back to an array.
This commit is contained in:
parent
7873dddab6
commit
1de2081ed3
|
@ -40,7 +40,7 @@ import static org.elasticsearch.common.Strings.cleanPath;
|
|||
* The environment of where things exist.
|
||||
*/
|
||||
@SuppressForbidden(reason = "configures paths for the system")
|
||||
// TODO: move PathUtils to be package-private here instead of
|
||||
// TODO: move PathUtils to be package-private here instead of
|
||||
// public+forbidden api!
|
||||
public class Environment {
|
||||
|
||||
|
@ -72,7 +72,7 @@ public class Environment {
|
|||
|
||||
/** Path to the PID file (can be null if no PID file is configured) **/
|
||||
private final Path pidFile;
|
||||
|
||||
|
||||
/** Path to the temporary file directory used by the JDK */
|
||||
private final Path tmpFile = PathUtils.get(System.getProperty("java.io.tmpdir"));
|
||||
|
||||
|
@ -292,7 +292,7 @@ public class Environment {
|
|||
public Path pidFile() {
|
||||
return pidFile;
|
||||
}
|
||||
|
||||
|
||||
/** Path to the default temp directory used by the JDK */
|
||||
public Path tmpFile() {
|
||||
return tmpFile;
|
||||
|
@ -317,7 +317,7 @@ public class Environment {
|
|||
public static FileStore getFileStore(Path path) throws IOException {
|
||||
return ESFileStore.getMatchingFileStore(path, fileStores);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Returns true if the path is writable.
|
||||
* Acts just like {@link Files#isWritable(Path)}, except won't
|
||||
|
|
|
@ -20,11 +20,16 @@
|
|||
package org.elasticsearch.monitor.os;
|
||||
|
||||
import org.apache.lucene.util.Constants;
|
||||
import org.apache.lucene.util.SuppressForbidden;
|
||||
import org.elasticsearch.common.io.PathUtils;
|
||||
import org.elasticsearch.monitor.Probes;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.lang.management.ManagementFactory;
|
||||
import java.lang.management.OperatingSystemMXBean;
|
||||
import java.lang.reflect.Method;
|
||||
import java.nio.file.Files;
|
||||
import java.util.List;
|
||||
|
||||
public class OsProbe {
|
||||
|
||||
|
@ -103,19 +108,42 @@ public class OsProbe {
|
|||
}
|
||||
|
||||
/**
|
||||
* Returns the system load average for the last minute.
|
||||
* Returns the system load averages
|
||||
*/
|
||||
public double getSystemLoadAverage() {
|
||||
public double[] getSystemLoadAverage() {
|
||||
if (Constants.LINUX) {
|
||||
double[] loadAverage = readProcLoadavg("/proc/loadavg");
|
||||
if (loadAverage != null) {
|
||||
return loadAverage;
|
||||
}
|
||||
// fallback
|
||||
}
|
||||
if (getSystemLoadAverage == null) {
|
||||
return -1;
|
||||
return null;
|
||||
}
|
||||
try {
|
||||
return (double) getSystemLoadAverage.invoke(osMxBean);
|
||||
double oneMinuteLoadAverage = (double) getSystemLoadAverage.invoke(osMxBean);
|
||||
return new double[] { oneMinuteLoadAverage, -1, -1 };
|
||||
} catch (Throwable t) {
|
||||
return -1;
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
@SuppressForbidden(reason = "access /proc")
|
||||
private static double[] readProcLoadavg(String procLoadavg) {
|
||||
try {
|
||||
List<String> lines = Files.readAllLines(PathUtils.get(procLoadavg));
|
||||
if (!lines.isEmpty()) {
|
||||
String[] fields = lines.get(0).split("\\s+");
|
||||
return new double[] { Double.parseDouble(fields[0]), Double.parseDouble(fields[1]), Double.parseDouble(fields[2]) };
|
||||
}
|
||||
} catch (IOException e) {
|
||||
// do not fail Elasticsearch if something unexpected
|
||||
// happens here
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
public short getSystemCpuPercent() {
|
||||
return Probes.getLoadAndScaleToPercent(getSystemCpuLoad, osMxBean);
|
||||
}
|
||||
|
|
|
@ -87,7 +87,13 @@ public class OsStats implements Streamable, ToXContent {
|
|||
if (cpu != null) {
|
||||
builder.startObject(Fields.CPU);
|
||||
builder.field(Fields.PERCENT, cpu.getPercent());
|
||||
builder.field(Fields.LOAD_AVERAGE, cpu.getLoadAverage());
|
||||
if (cpu.getLoadAverage() != null) {
|
||||
builder.startArray(Fields.LOAD_AVERAGE);
|
||||
builder.value(cpu.getLoadAverage()[0]);
|
||||
builder.value(cpu.getLoadAverage()[1]);
|
||||
builder.value(cpu.getLoadAverage()[2]);
|
||||
builder.endArray();
|
||||
}
|
||||
builder.endObject();
|
||||
}
|
||||
|
||||
|
@ -152,8 +158,9 @@ public class OsStats implements Streamable, ToXContent {
|
|||
}
|
||||
|
||||
public static class Cpu implements Streamable {
|
||||
|
||||
short percent = -1;
|
||||
double loadAverage = -1;
|
||||
double[] loadAverage = null;
|
||||
|
||||
Cpu() {}
|
||||
|
||||
|
@ -166,20 +173,29 @@ public class OsStats implements Streamable, ToXContent {
|
|||
@Override
|
||||
public void readFrom(StreamInput in) throws IOException {
|
||||
percent = in.readShort();
|
||||
loadAverage = in.readDouble();
|
||||
if (in.readBoolean()) {
|
||||
loadAverage = in.readDoubleArray();
|
||||
} else {
|
||||
loadAverage = null;
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void writeTo(StreamOutput out) throws IOException {
|
||||
out.writeShort(percent);
|
||||
out.writeDouble(loadAverage);
|
||||
if (loadAverage == null) {
|
||||
out.writeBoolean(false);
|
||||
} else {
|
||||
out.writeBoolean(true);
|
||||
out.writeDoubleArray(loadAverage);
|
||||
}
|
||||
}
|
||||
|
||||
public short getPercent() {
|
||||
return percent;
|
||||
}
|
||||
|
||||
public double getLoadAverage() {
|
||||
public double[] getLoadAverage() {
|
||||
return loadAverage;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -134,7 +134,9 @@ public class RestNodesAction extends AbstractCatAction {
|
|||
table.addCell("file_desc.max", "default:false;alias:fdm,fileDescriptorMax;text-align:right;desc:max file descriptors");
|
||||
|
||||
table.addCell("cpu", "alias:cpu;text-align:right;desc:recent cpu usage");
|
||||
table.addCell("load", "alias:l;text-align:right;desc:most recent load avg");
|
||||
table.addCell("load_1m", "alias:l;text-align:right;desc:1m load avg");
|
||||
table.addCell("load_5m", "alias:l;text-align:right;desc:5m load avg");
|
||||
table.addCell("load_15m", "alias:l;text-align:right;desc:15m load avg");
|
||||
table.addCell("uptime", "default:false;alias:u;text-align:right;desc:node uptime");
|
||||
table.addCell("node.role", "alias:r,role,dc,nodeRole;desc:d:data node, c:client node");
|
||||
table.addCell("master", "alias:m;desc:m:master-eligible, *:current master");
|
||||
|
@ -263,7 +265,10 @@ public class RestNodesAction extends AbstractCatAction {
|
|||
table.addCell(processStats == null ? null : processStats.getMaxFileDescriptors());
|
||||
|
||||
table.addCell(osStats == null ? null : Short.toString(osStats.getCpu().getPercent()));
|
||||
table.addCell(osStats == null ? null : String.format(Locale.ROOT, "%.2f", osStats.getCpu().getLoadAverage()));
|
||||
boolean hasLoadAverage = osStats != null && osStats.getCpu().getLoadAverage() != null;
|
||||
table.addCell(!hasLoadAverage ? null : String.format(Locale.ROOT, "%.2f", osStats.getCpu().getLoadAverage()[0]));
|
||||
table.addCell(!hasLoadAverage ? null : String.format(Locale.ROOT, "%.2f", osStats.getCpu().getLoadAverage()[1]));
|
||||
table.addCell(!hasLoadAverage ? null : String.format(Locale.ROOT, "%.2f", osStats.getCpu().getLoadAverage()[2]));
|
||||
table.addCell(jvmStats == null ? null : jvmStats.getUptime());
|
||||
table.addCell(node.clientNode() ? "c" : node.dataNode() ? "d" : "-");
|
||||
table.addCell(masterId == null ? "x" : masterId.equals(node.id()) ? "*" : node.masterNode() ? "m" : "-");
|
||||
|
|
|
@ -115,4 +115,7 @@ grant {
|
|||
|
||||
// needed by JDKESLoggerTests
|
||||
permission java.util.logging.LoggingPermission "control";
|
||||
|
||||
// load averages on Linux
|
||||
permission java.io.FilePermission "/proc/loadavg", "read";
|
||||
};
|
||||
|
|
|
@ -50,12 +50,31 @@ public class OsProbeTests extends ESTestCase {
|
|||
assertNotNull(stats);
|
||||
assertThat(stats.getTimestamp(), greaterThan(0L));
|
||||
assertThat(stats.getCpu().getPercent(), anyOf(equalTo((short) -1), is(both(greaterThanOrEqualTo((short) 0)).and(lessThanOrEqualTo((short) 100)))));
|
||||
double[] loadAverage = stats.getCpu().loadAverage;
|
||||
if (loadAverage != null) {
|
||||
assertThat(loadAverage.length, equalTo(3));
|
||||
}
|
||||
if (Constants.WINDOWS) {
|
||||
// Load average is always -1 on Windows platforms
|
||||
assertThat(stats.getCpu().getLoadAverage(), equalTo((double) -1));
|
||||
// load average is unavailable on Windows
|
||||
if (loadAverage != null) {
|
||||
assertThat(loadAverage[0], equalTo((double) -1));
|
||||
assertThat(loadAverage[1], equalTo((double) -1));
|
||||
assertThat(loadAverage[2], equalTo((double) -1));
|
||||
}
|
||||
} else if (Constants.LINUX) {
|
||||
// we should be able to get the load average
|
||||
assertNotNull(loadAverage);
|
||||
assertThat(loadAverage[0], greaterThanOrEqualTo((double) 0));
|
||||
assertThat(loadAverage[1], greaterThanOrEqualTo((double) 0));
|
||||
assertThat(loadAverage[2], greaterThanOrEqualTo((double) 0));
|
||||
} else {
|
||||
// Load average can be negative if not available or not computed yet, otherwise it should be >= 0
|
||||
assertThat(stats.getCpu().getLoadAverage(), anyOf(lessThan((double) 0), greaterThanOrEqualTo((double) 0)));
|
||||
// one-minute load average is available, but five-minute and fifteen-minute load averages are not
|
||||
// load average can be negative if not available or not computed yet, otherwise it should be >= 0
|
||||
if (loadAverage != null) {
|
||||
assertThat(loadAverage[0], anyOf(lessThan((double) 0), greaterThanOrEqualTo((double) 0)));
|
||||
assertThat(loadAverage[1], equalTo((double) -1));
|
||||
assertThat(loadAverage[2], equalTo((double) -1));
|
||||
}
|
||||
}
|
||||
|
||||
assertNotNull(stats.getMem());
|
||||
|
|
|
@ -132,7 +132,8 @@ the operating system:
|
|||
Recent CPU usage for the whole system, or -1 if not supported
|
||||
|
||||
`os.cpu.load_average`::
|
||||
System load average for the last minute, or -1 if not supported
|
||||
Array of system load averages for the last one minute, five
|
||||
minutes and fifteen minutes (a value of -1 indicates not supported)
|
||||
|
||||
`os.mem.total_in_bytes`::
|
||||
Total amount of physical memory in bytes
|
||||
|
|
|
@ -552,17 +552,32 @@ and high risk of being misused. The ability to change the thread pool type for a
|
|||
that it is still possible to adjust relevant thread pool parameters for each of the thread pools (e.g., depending on
|
||||
the thread pool type, `keep_alive`, `queue_size`, etc.).
|
||||
|
||||
=== Adding system CPU percent to OS stats
|
||||
=== System CPU stats
|
||||
|
||||
The recent CPU usage (as a percent) has been added to the OS stats reported under the node stats API and the cat nodes
|
||||
API. The breaking change here is that there is a new object in the "os" object in the node stats response. This object
|
||||
is called "cpu" and includes "percent" and "load_average" as fields. This moves the "load_average" field that was
|
||||
previously a top-level field in the "os" object to the "cpu" object. Additionally, the "cpu" field in the cat nodes API
|
||||
response is output by default.
|
||||
The recent CPU usage (as a percent) has been added to the OS stats
|
||||
reported under the node stats API and the cat nodes API. The breaking
|
||||
change here is that there is a new object in the "os" object in the node
|
||||
stats response. This object is called "cpu" and includes "percent" and
|
||||
"load_average" as fields. This moves the "load_average" field that was
|
||||
previously a top-level field in the "os" object to the "cpu" object. The
|
||||
format of the "load_average" field has changed to an array of length
|
||||
three representing the one-minute, five-minute and fifteen-minute load
|
||||
averages (a value of -1 for any of the array components indicates that the
|
||||
corresponding metric is not available).
|
||||
|
||||
Finally, the API for org.elasticsearch.monitor.os.OsStats has changed. The `getLoadAverage` method has been removed. The
|
||||
value for this can now be obtained from `OsStats.Cpu#getLoadAverage`. Additionally, the recent CPU usage can be obtained
|
||||
from `OsStats.Cpu#getPercent`.
|
||||
In the cat nodes API response, the "cpu" field is output by default. The
|
||||
previous "load" field has been removed and is replaced by "load_1m",
|
||||
"load_5m", and "load_15m" which represent the one-minute, five-minute
|
||||
and fifteen-minute loads respectively. These values are output by
|
||||
default, and a value of -1 indicates that the corresponding metric is
|
||||
not available.
|
||||
|
||||
Finally, the API for org.elasticsearch.monitor.os.OsStats has
|
||||
changed. The `getLoadAverage` method has been removed. The value for
|
||||
this can now be obtained from `OsStats.Cpu#getLoadAverage` but it is no
|
||||
longer a double and is instead an array encapsulating the one-minute,
|
||||
five-minute and fifteen-minute load averages. Additionally, the recent
|
||||
CPU usage can be obtained from `OsStats.Cpu#getPercent`.
|
||||
|
||||
=== Fields option
|
||||
Only stored fields are retrievable with this option.
|
||||
|
|
|
@ -6,8 +6,8 @@
|
|||
|
||||
- match:
|
||||
$body: |
|
||||
/ #host ip heap.percent ram.percent cpu load node.role master name
|
||||
^ (\S+ \s+ (\d{1,3}\.){3}\d{1,3} \s+ \d+ \s+ \d* \s+ (-)?\d* \s+ (-)?\d*(\.\d+)? \s+ [-dc] \s+ [-*mx] \s+ (\S+\s?)+ \n)+ $/
|
||||
/ #host ip heap.percent ram.percent cpu load_1m load_5m load_15m node.role master name
|
||||
^ (\S+ \s+ (\d{1,3}\.){3}\d{1,3} \s+ \d+ \s+ \d* \s+ (-)?\d* \s+ ((-)?\d*(\.\d+)?)? \s+ ((-)?\d*(\.\d+)?)? \s+ ((-)?\d*(\.\d+)?)? \s+ [-dc] \s+ [-*mx] \s+ (\S+\s?)+ \n)+ $/
|
||||
|
||||
- do:
|
||||
cat.nodes:
|
||||
|
@ -15,8 +15,8 @@
|
|||
|
||||
- match:
|
||||
$body: |
|
||||
/^ host \s+ ip \s+ heap\.percent \s+ ram\.percent \s+ cpu \s+ load \s+ node\.role \s+ master \s+ name \n
|
||||
(\S+ \s+ (\d{1,3}\.){3}\d{1,3} \s+ \d+ \s+ \d* \s+ (-)?\d* \s+ (-)?\d*(\.\d+)? \s+ [-dc] \s+ [-*mx] \s+ (\S+\s?)+ \n)+ $/
|
||||
/^ host \s+ ip \s+ heap\.percent \s+ ram\.percent \s+ cpu \s+ load_1m \s+ load_5m \s+ load_15m \s+ node\.role \s+ master \s+ name \n
|
||||
(\S+ \s+ (\d{1,3}\.){3}\d{1,3} \s+ \d+ \s+ \d* \s+ (-)?\d* \s+ ((-)?\d*(\.\d+)?)? \s+ ((-)?\d*(\.\d+)?)? \s+ ((-)?\d*(\.\d+)?)? \s+ [-dc] \s+ [-*mx] \s+ (\S+\s?)+ \n)+ $/
|
||||
|
||||
- do:
|
||||
cat.nodes:
|
||||
|
|
Loading…
Reference in New Issue