Add cgroup memory usage/limit to OS stats on Linux (#26166)
This change adds cgroup memory usage/limit to the OS stats section of the node stats on Linux. This information is useful because in Docker containers the standard node stats report the host memory limit, not taking account of extra restrictions that may have been applied to the container. The original idea was to store these values as Long, truncating any values outside the range of long. However, this meant that in the relatively common case of no limit being applied, users would not see the same value in the OS stats as they see by querying Linux directly. So instead the values are stored as String. This change places a burden on consumers of the strings to convert the strings to numbers and decide what to do about extremely large values, but there will be very few consumers and they would need to have a policy for dealing with "no limit" in any case.
This commit is contained in:
parent
a18bd9caa2
commit
a292740b9e
|
@ -36,8 +36,6 @@ import java.nio.file.Path;
|
|||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
public class OsProbe {
|
||||
|
||||
|
@ -382,12 +380,70 @@ public class OsProbe {
|
|||
}
|
||||
|
||||
/**
|
||||
* Checks if cgroup stats are available by checking for the existence of {@code /proc/self/cgroup}, {@code /sys/fs/cgroup/cpu}, and
|
||||
* {@code /sys/fs/cgroup/cpuacct}.
|
||||
* The maximum amount of user memory (including file cache).
|
||||
* If there is no limit then some Linux versions return the maximum value that can be stored in an
|
||||
* unsigned 64 bit number, and this will overflow a long, hence the result type is <code>String</code>.
|
||||
* (The alternative would have been <code>BigInteger</code> but then it would not be possible to index
|
||||
* the OS stats document into Elasticsearch without losing information, as <code>BigInteger</code> is
|
||||
* not a supported Elasticsearch type.)
|
||||
*
|
||||
* @param controlGroup the control group for the Elasticsearch process for the {@code memory} subsystem
|
||||
* @return the maximum amount of user memory (including file cache)
|
||||
* @throws IOException if an I/O exception occurs reading {@code memory.limit_in_bytes} for the control group
|
||||
*/
|
||||
private String getCgroupMemoryLimitInBytes(final String controlGroup) throws IOException {
|
||||
// Hand back the raw line unchanged; it is deliberately not parsed into a number
return readSysFsCgroupMemoryLimitInBytes(controlGroup);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the line from {@code memory.limit_in_bytes} for the control group to which the Elasticsearch process belongs for the
|
||||
* {@code memory} subsystem. This line represents the maximum amount of user memory (including file cache).
|
||||
*
|
||||
* @param controlGroup the control group to which the Elasticsearch process belongs for the {@code memory} subsystem
|
||||
* @return the line from {@code memory.limit_in_bytes}
|
||||
* @throws IOException if an I/O exception occurs reading {@code memory.limit_in_bytes} for the control group
|
||||
*/
|
||||
@SuppressForbidden(reason = "access /sys/fs/cgroup/memory")
|
||||
// Package-private rather than private so that OsProbeTests can override it with a canned value
String readSysFsCgroupMemoryLimitInBytes(final String controlGroup) throws IOException {
|
||||
return readSingleLine(PathUtils.get("/sys/fs/cgroup/memory", controlGroup, "memory.limit_in_bytes"));
|
||||
}
|
||||
|
||||
/**
|
||||
* The total current memory usage by processes in the cgroup (in bytes).
|
||||
* The result type is <code>String</code> for consistency with the memory limit: when there is no limit some
|
||||
* Linux versions report the maximum unsigned 64 bit number as the limit, which would overflow a long.
|
||||
* (The alternative would have been <code>BigInteger</code> but then it would not be possible to index
|
||||
* the OS stats document into Elasticsearch without losing information, as <code>BigInteger</code> is
|
||||
* not a supported Elasticsearch type.)
|
||||
*
|
||||
* @param controlGroup the control group for the Elasticsearch process for the {@code memory} subsystem
|
||||
* @return the total current memory usage by processes in the cgroup (in bytes)
|
||||
* @throws IOException if an I/O exception occurs reading {@code memory.usage_in_bytes} for the control group
|
||||
*/
|
||||
private String getCgroupMemoryUsageInBytes(final String controlGroup) throws IOException {
|
||||
// Hand back the raw line unchanged; it is deliberately not parsed into a number
return readSysFsCgroupMemoryUsageInBytes(controlGroup);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the line from {@code memory.usage_in_bytes} for the control group to which the Elasticsearch process belongs for the
|
||||
* {@code memory} subsystem. This line represents the total current memory usage by processes in the cgroup (in bytes).
|
||||
*
|
||||
* @param controlGroup the control group to which the Elasticsearch process belongs for the {@code memory} subsystem
|
||||
* @return the line from {@code memory.usage_in_bytes}
|
||||
* @throws IOException if an I/O exception occurs reading {@code memory.usage_in_bytes} for the control group
|
||||
*/
|
||||
@SuppressForbidden(reason = "access /sys/fs/cgroup/memory")
|
||||
// Package-private rather than private so that OsProbeTests can override it with a canned value
String readSysFsCgroupMemoryUsageInBytes(final String controlGroup) throws IOException {
|
||||
return readSingleLine(PathUtils.get("/sys/fs/cgroup/memory", controlGroup, "memory.usage_in_bytes"));
|
||||
}
|
||||
|
||||
/**
|
||||
* Checks if cgroup stats are available by checking for the existence of {@code /proc/self/cgroup}, {@code /sys/fs/cgroup/cpu},
|
||||
* {@code /sys/fs/cgroup/cpuacct} and {@code /sys/fs/cgroup/memory}.
|
||||
*
|
||||
* @return {@code true} if the stats are available, otherwise {@code false}
|
||||
*/
|
||||
@SuppressForbidden(reason = "access /proc/self/cgroup, /sys/fs/cgroup/cpu, and /sys/fs/cgroup/cpuacct")
|
||||
@SuppressForbidden(reason = "access /proc/self/cgroup, /sys/fs/cgroup/cpu, /sys/fs/cgroup/cpuacct and /sys/fs/cgroup/memory")
|
||||
boolean areCgroupStatsAvailable() {
|
||||
if (!Files.exists(PathUtils.get("/proc/self/cgroup"))) {
|
||||
return false;
|
||||
|
@ -398,6 +454,9 @@ public class OsProbe {
|
|||
if (!Files.exists(PathUtils.get("/sys/fs/cgroup/cpuacct"))) {
|
||||
return false;
|
||||
}
|
||||
if (!Files.exists(PathUtils.get("/sys/fs/cgroup/memory"))) {
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
|
@ -424,13 +483,21 @@ public class OsProbe {
|
|||
final long cgroupCpuAcctCpuCfsQuotaMicros = getCgroupCpuAcctCpuCfsQuotaMicros(cpuControlGroup);
|
||||
final OsStats.Cgroup.CpuStat cpuStat = getCgroupCpuAcctCpuStat(cpuControlGroup);
|
||||
|
||||
final String memoryControlGroup = controllerMap.get("memory");
|
||||
assert memoryControlGroup != null;
|
||||
final String cgroupMemoryLimitInBytes = getCgroupMemoryLimitInBytes(memoryControlGroup);
|
||||
final String cgroupMemoryUsageInBytes = getCgroupMemoryUsageInBytes(memoryControlGroup);
|
||||
|
||||
return new OsStats.Cgroup(
|
||||
cpuAcctControlGroup,
|
||||
cgroupCpuAcctUsageNanos,
|
||||
cpuControlGroup,
|
||||
cgroupCpuAcctCpuCfsPeriodMicros,
|
||||
cgroupCpuAcctCpuCfsQuotaMicros,
|
||||
cpuStat);
|
||||
cpuStat,
|
||||
memoryControlGroup,
|
||||
cgroupMemoryLimitInBytes,
|
||||
cgroupMemoryUsageInBytes);
|
||||
}
|
||||
} catch (final IOException e) {
|
||||
logger.debug("error reading control group stats", e);
|
||||
|
|
|
@ -294,6 +294,10 @@ public class OsStats implements Writeable, ToXContentFragment {
|
|||
private final long cpuCfsPeriodMicros;
|
||||
private final long cpuCfsQuotaMicros;
|
||||
private final CpuStat cpuStat;
|
||||
// These will be null for nodes running versions prior to 6.1.0
|
||||
private final String memoryControlGroup;
|
||||
private final String memoryLimitInBytes;
|
||||
private final String memoryUsageInBytes;
|
||||
|
||||
/**
|
||||
* The control group for the {@code cpuacct} subsystem.
|
||||
|
@ -355,19 +359,57 @@ public class OsStats implements Writeable, ToXContentFragment {
|
|||
return cpuStat;
|
||||
}
|
||||
|
||||
/**
|
||||
* The control group for the {@code memory} subsystem.
|
||||
*
|
||||
* @return the control group; may be {@code null}, e.g. when received from a node running a version that does not send it
|
||||
*/
|
||||
public String getMemoryControlGroup() {
|
||||
return memoryControlGroup;
|
||||
}
|
||||
|
||||
/**
|
||||
* The maximum amount of user memory (including file cache).
|
||||
* This is stored as a <code>String</code> because the value can be too big to fit in a
|
||||
* <code>long</code>. (The alternative would have been <code>BigInteger</code> but then
|
||||
* it would not be possible to index the OS stats document into Elasticsearch without
|
||||
* losing information, as <code>BigInteger</code> is not a supported Elasticsearch type.)
|
||||
*
|
||||
* @return the maximum amount of user memory (including file cache); may be {@code null}, e.g. when received from a node running a version that does not send it
|
||||
*/
|
||||
public String getMemoryLimitInBytes() {
|
||||
return memoryLimitInBytes;
|
||||
}
|
||||
|
||||
/**
|
||||
* The total current memory usage by processes in the cgroup (in bytes).
|
||||
* This is stored as a <code>String</code> for consistency with <code>memoryLimitInBytes</code>.
|
||||
*
|
||||
* @return the total current memory usage by processes in the cgroup (in bytes); may be {@code null}, e.g. when received from a node running a version that does not send it
|
||||
*/
|
||||
public String getMemoryUsageInBytes() {
|
||||
return memoryUsageInBytes;
|
||||
}
|
||||
|
||||
public Cgroup(
|
||||
final String cpuAcctControlGroup,
|
||||
final long cpuAcctUsageNanos,
|
||||
final String cpuControlGroup,
|
||||
final long cpuCfsPeriodMicros,
|
||||
final long cpuCfsQuotaMicros,
|
||||
final CpuStat cpuStat) {
|
||||
final CpuStat cpuStat,
|
||||
final String memoryControlGroup,
|
||||
final String memoryLimitInBytes,
|
||||
final String memoryUsageInBytes) {
|
||||
this.cpuAcctControlGroup = Objects.requireNonNull(cpuAcctControlGroup);
|
||||
this.cpuAcctUsageNanos = cpuAcctUsageNanos;
|
||||
this.cpuControlGroup = Objects.requireNonNull(cpuControlGroup);
|
||||
this.cpuCfsPeriodMicros = cpuCfsPeriodMicros;
|
||||
this.cpuCfsQuotaMicros = cpuCfsQuotaMicros;
|
||||
this.cpuStat = Objects.requireNonNull(cpuStat);
|
||||
// Unlike the cpu/cpuacct values above, the memory values are not null-checked: they are
// allowed to be null, as is the case for stats received from nodes that do not send them
this.memoryControlGroup = memoryControlGroup;
|
||||
this.memoryLimitInBytes = memoryLimitInBytes;
|
||||
this.memoryUsageInBytes = memoryUsageInBytes;
|
||||
}
|
||||
|
||||
Cgroup(final StreamInput in) throws IOException {
|
||||
|
@ -377,6 +419,16 @@ public class OsStats implements Writeable, ToXContentFragment {
|
|||
cpuCfsPeriodMicros = in.readLong();
|
||||
cpuCfsQuotaMicros = in.readLong();
|
||||
cpuStat = new CpuStat(in);
|
||||
// TODO: change this to 6.1.0 after backporting
|
||||
if (in.getVersion().onOrAfter(Version.V_7_0_0_alpha1)) {
|
||||
memoryControlGroup = in.readOptionalString();
|
||||
memoryLimitInBytes = in.readOptionalString();
|
||||
memoryUsageInBytes = in.readOptionalString();
|
||||
} else {
|
||||
memoryControlGroup = null;
|
||||
memoryLimitInBytes = null;
|
||||
memoryUsageInBytes = null;
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -387,6 +439,12 @@ public class OsStats implements Writeable, ToXContentFragment {
|
|||
out.writeLong(cpuCfsPeriodMicros);
|
||||
out.writeLong(cpuCfsQuotaMicros);
|
||||
cpuStat.writeTo(out);
|
||||
// TODO: change this to 6.1.0 after backporting
|
||||
if (out.getVersion().onOrAfter(Version.V_7_0_0_alpha1)) {
|
||||
out.writeOptionalString(memoryControlGroup);
|
||||
out.writeOptionalString(memoryLimitInBytes);
|
||||
out.writeOptionalString(memoryUsageInBytes);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -407,6 +465,19 @@ public class OsStats implements Writeable, ToXContentFragment {
|
|||
cpuStat.toXContent(builder, params);
|
||||
}
|
||||
builder.endObject();
|
||||
if (memoryControlGroup != null) {
|
||||
builder.startObject("memory");
|
||||
{
|
||||
builder.field("control_group", memoryControlGroup);
|
||||
if (memoryLimitInBytes != null) {
|
||||
builder.field("limit_in_bytes", memoryLimitInBytes);
|
||||
}
|
||||
if (memoryUsageInBytes != null) {
|
||||
builder.field("usage_in_bytes", memoryUsageInBytes);
|
||||
}
|
||||
}
|
||||
builder.endObject();
|
||||
}
|
||||
}
|
||||
builder.endObject();
|
||||
return builder;
|
||||
|
|
|
@ -129,4 +129,6 @@ grant {
|
|||
permission java.io.FilePermission "/sys/fs/cgroup/cpu/-", "read";
|
||||
permission java.io.FilePermission "/sys/fs/cgroup/cpuacct", "read";
|
||||
permission java.io.FilePermission "/sys/fs/cgroup/cpuacct/-", "read";
|
||||
permission java.io.FilePermission "/sys/fs/cgroup/memory", "read";
|
||||
permission java.io.FilePermission "/sys/fs/cgroup/memory/-", "read";
|
||||
};
|
||||
|
|
|
@ -96,6 +96,12 @@ public class NodeStatsTests extends ESTestCase {
|
|||
assertEquals(
|
||||
nodeStats.getOs().getCgroup().getCpuStat().getTimeThrottledNanos(),
|
||||
deserializedNodeStats.getOs().getCgroup().getCpuStat().getTimeThrottledNanos());
|
||||
assertEquals(
|
||||
nodeStats.getOs().getCgroup().getMemoryLimitInBytes(),
|
||||
deserializedNodeStats.getOs().getCgroup().getMemoryLimitInBytes());
|
||||
assertEquals(
|
||||
nodeStats.getOs().getCgroup().getMemoryUsageInBytes(),
|
||||
deserializedNodeStats.getOs().getCgroup().getMemoryUsageInBytes());
|
||||
assertArrayEquals(nodeStats.getOs().getCpu().getLoadAverage(),
|
||||
deserializedNodeStats.getOs().getCpu().getLoadAverage(), 0);
|
||||
}
|
||||
|
@ -294,7 +300,10 @@ public class NodeStatsTests extends ESTestCase {
|
|||
randomAlphaOfLength(8),
|
||||
randomNonNegativeLong(),
|
||||
randomNonNegativeLong(),
|
||||
new OsStats.Cgroup.CpuStat(randomNonNegativeLong(), randomNonNegativeLong(), randomNonNegativeLong())));
|
||||
new OsStats.Cgroup.CpuStat(randomNonNegativeLong(), randomNonNegativeLong(), randomNonNegativeLong()),
|
||||
randomAlphaOfLength(8),
|
||||
Long.toString(randomNonNegativeLong()),
|
||||
Long.toString(randomNonNegativeLong())));
|
||||
}
|
||||
ProcessStats processStats = frequently() ?
|
||||
new ProcessStats(
|
||||
|
|
|
@ -22,6 +22,7 @@ package org.elasticsearch.monitor.os;
|
|||
import org.apache.lucene.util.Constants;
|
||||
import org.elasticsearch.test.ESTestCase;
|
||||
|
||||
import java.math.BigInteger;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
|
||||
|
@ -117,6 +118,12 @@ public class OsProbeTests extends ESTestCase {
|
|||
assertThat(stats.getCgroup().getCpuStat().getNumberOfElapsedPeriods(), greaterThanOrEqualTo(0L));
|
||||
assertThat(stats.getCgroup().getCpuStat().getNumberOfTimesThrottled(), greaterThanOrEqualTo(0L));
|
||||
assertThat(stats.getCgroup().getCpuStat().getTimeThrottledNanos(), greaterThanOrEqualTo(0L));
|
||||
// These could be null if transported from a node running an older version, but shouldn't be null on the current node
|
||||
assertThat(stats.getCgroup().getMemoryControlGroup(), notNullValue());
|
||||
assertThat(stats.getCgroup().getMemoryLimitInBytes(), notNullValue());
|
||||
assertThat(new BigInteger(stats.getCgroup().getMemoryLimitInBytes()), greaterThan(BigInteger.ZERO));
|
||||
assertThat(stats.getCgroup().getMemoryUsageInBytes(), notNullValue());
|
||||
assertThat(new BigInteger(stats.getCgroup().getMemoryUsageInBytes()), greaterThan(BigInteger.ZERO));
|
||||
}
|
||||
} else {
|
||||
assertNull(stats.getCgroup());
|
||||
|
@ -159,7 +166,7 @@ public class OsProbeTests extends ESTestCase {
|
|||
"9:net_cls,net_prio:/",
|
||||
"8:pids:/",
|
||||
"7:blkio:/",
|
||||
"6:memory:/",
|
||||
"6:memory:/" + hierarchy,
|
||||
"5:devices:/user.slice",
|
||||
"4:hugetlb:/",
|
||||
"3:perf_event:/",
|
||||
|
@ -194,6 +201,19 @@ public class OsProbeTests extends ESTestCase {
|
|||
"throttled_time 139298645489");
|
||||
}
|
||||
|
||||
@Override
|
||||
String readSysFsCgroupMemoryLimitInBytes(String controlGroup) {
|
||||
// The probe must pass in the memory control group taken from the fake cgroup listing ("6:memory:/" + hierarchy)
assertThat(controlGroup, equalTo("/" + hierarchy));
|
||||
// This is the highest value that can be stored in an unsigned 64 bit number, hence too big for long
|
||||
return "18446744073709551615";
|
||||
}
|
||||
|
||||
@Override
|
||||
String readSysFsCgroupMemoryUsageInBytes(String controlGroup) {
|
||||
// The probe must pass in the memory control group taken from the fake cgroup listing ("6:memory:/" + hierarchy)
assertThat(controlGroup, equalTo("/" + hierarchy));
|
||||
// Canned value standing in for the line that would be read from memory.usage_in_bytes
return "4796416";
|
||||
}
|
||||
|
||||
@Override
|
||||
boolean areCgroupStatsAvailable() {
|
||||
return areCgroupStatsAvailable;
|
||||
|
@ -213,6 +233,8 @@ public class OsProbeTests extends ESTestCase {
|
|||
assertThat(cgroup.getCpuStat().getNumberOfElapsedPeriods(), equalTo(17992L));
|
||||
assertThat(cgroup.getCpuStat().getNumberOfTimesThrottled(), equalTo(1311L));
|
||||
assertThat(cgroup.getCpuStat().getTimeThrottledNanos(), equalTo(139298645489L));
|
||||
// The memory stats are exposed as the exact strings read from the cgroup files (the stubbed readers
// return "18446744073709551615" and "4796416"), so they must be compared against strings, not longs
assertThat(cgroup.getMemoryLimitInBytes(), equalTo("18446744073709551615"));
|
||||
assertThat(cgroup.getMemoryUsageInBytes(), equalTo("4796416"));
|
||||
} else {
|
||||
assertNull(cgroup);
|
||||
}
|
||||
|
|
|
@ -42,7 +42,10 @@ public class OsStatsTests extends ESTestCase {
|
|||
randomAlphaOfLength(8),
|
||||
randomNonNegativeLong(),
|
||||
randomNonNegativeLong(),
|
||||
new OsStats.Cgroup.CpuStat(randomNonNegativeLong(), randomNonNegativeLong(), randomNonNegativeLong()));
|
||||
new OsStats.Cgroup.CpuStat(randomNonNegativeLong(), randomNonNegativeLong(), randomNonNegativeLong()),
|
||||
randomAlphaOfLength(8),
|
||||
Long.toString(randomNonNegativeLong()),
|
||||
Long.toString(randomNonNegativeLong()));
|
||||
OsStats osStats = new OsStats(System.currentTimeMillis(), cpu, mem, swap, cgroup);
|
||||
|
||||
try (BytesStreamOutput out = new BytesStreamOutput()) {
|
||||
|
@ -70,6 +73,8 @@ public class OsStatsTests extends ESTestCase {
|
|||
assertEquals(
|
||||
osStats.getCgroup().getCpuStat().getTimeThrottledNanos(),
|
||||
deserializedOsStats.getCgroup().getCpuStat().getTimeThrottledNanos());
|
||||
assertEquals(osStats.getCgroup().getMemoryLimitInBytes(), deserializedOsStats.getCgroup().getMemoryLimitInBytes());
|
||||
assertEquals(osStats.getCgroup().getMemoryUsageInBytes(), deserializedOsStats.getCgroup().getMemoryUsageInBytes());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -255,6 +255,25 @@ the operating system:
|
|||
The total amount of time (in nanoseconds) for which all tasks in
|
||||
the same cgroup as the Elasticsearch process have been throttled.
|
||||
|
||||
`os.cgroup.memory.control_group` (Linux only)::
|
||||
The `memory` control group to which the Elasticsearch process
|
||||
belongs
|
||||
|
||||
`os.cgroup.memory.limit_in_bytes` (Linux only)::
|
||||
The maximum amount of user memory (including file cache) allowed
|
||||
for all tasks in the same cgroup as the Elasticsearch process.
|
||||
This value can be too big to store in a `long`, so is returned as
|
||||
a string so that the value returned can exactly match what the
|
||||
underlying operating system interface returns. Any value that is
|
||||
too large to parse into a `long` almost certainly means no limit
|
||||
has been set for the cgroup.
|
||||
|
||||
`os.cgroup.memory.usage_in_bytes` (Linux only)::
|
||||
The total current memory usage (in bytes)
|
||||
by all tasks in the same cgroup as the Elasticsearch process.
|
||||
This value is stored as a string for consistency with
|
||||
`os.cgroup.memory.limit_in_bytes`.
|
||||
|
||||
NOTE: For the cgroup stats to be visible, cgroups must be compiled into
|
||||
the kernel, the `cpu` and `cpuacct` cgroup subsystems must be
|
||||
configured and stats must be readable from `/sys/fs/cgroup/cpu`
|
||||
|
|
Loading…
Reference in New Issue