Add cgroup memory usage/limit to OS stats on Linux (#26166)

This change adds cgroup memory usage/limit to the OS stats section of
the node stats on Linux.  This information is useful because in Docker
containers the standard node stats report the host memory limit, not
taking account of extra restrictions that may have been applied to the
container.

The original idea was to store these values as Long, truncating any values
outside the range of long.  However, this meant that in the relatively common
case of no limit being applied, users would not see the same value in the OS
stats as they see by querying Linux directly.  So instead the values are stored
as String.  This change places a burden on consumers of the strings to
convert the strings to numbers and decide what to do about extremely large
values, but there will be very few consumers and they would need to have a
policy for dealing with "no limit" in any case.
This commit is contained in:
David Roberts 2017-10-03 12:08:36 +01:00 committed by GitHub
parent a18bd9caa2
commit a292740b9e
7 changed files with 205 additions and 10 deletions

View File

@ -36,8 +36,6 @@ import java.nio.file.Path;
import java.util.HashMap; import java.util.HashMap;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
public class OsProbe { public class OsProbe {
@ -382,12 +380,70 @@ public class OsProbe {
} }
/** /**
* Checks if cgroup stats are available by checking for the existence of {@code /proc/self/cgroup}, {@code /sys/fs/cgroup/cpu}, and * The maximum amount of user memory (including file cache).
* {@code /sys/fs/cgroup/cpuacct}. * If there is no limit then some Linux versions return the maximum value that can be stored in an
* unsigned 64 bit number, and this will overflow a long, hence the result type is <code>String</code>.
* (The alternative would have been <code>BigInteger</code> but then it would not be possible to index
* the OS stats document into Elasticsearch without losing information, as <code>BigInteger</code> is
* not a supported Elasticsearch type.)
*
* @param controlGroup the control group for the Elasticsearch process for the {@code memory} subsystem
* @return the maximum amount of user memory (including file cache)
* @throws IOException if an I/O exception occurs reading {@code memory.limit_in_bytes} for the control group
*/
private String getCgroupMemoryLimitInBytes(final String controlGroup) throws IOException {
    // The line is handed back exactly as read; it is kept as a String because the value may not fit in a long.
    final String limitInBytes = readSysFsCgroupMemoryLimitInBytes(controlGroup);
    return limitInBytes;
}
/**
 * Reads the single line stored in {@code memory.limit_in_bytes} under {@code /sys/fs/cgroup/memory} for the
 * {@code memory} control group of the Elasticsearch process. The line holds the maximum amount of user memory
 * (including file cache) and is returned as text, without any numeric parsing.
 * <p>
 * The method is package-private so that tests can override it with canned values.
 *
 * @param controlGroup the {@code memory} subsystem control group to which the Elasticsearch process belongs
 * @return the line read from {@code memory.limit_in_bytes}
 * @throws IOException if an I/O exception occurs reading {@code memory.limit_in_bytes} for the control group
 */
@SuppressForbidden(reason = "access /sys/fs/cgroup/memory")
String readSysFsCgroupMemoryLimitInBytes(final String controlGroup) throws IOException {
    final Path limitFile = PathUtils.get("/sys/fs/cgroup/memory", controlGroup, "memory.limit_in_bytes");
    return readSingleLine(limitFile);
}
/**
* The total current memory usage by processes in the cgroup (in bytes).
* The value is returned as the raw <code>String</code> read from {@code memory.usage_in_bytes}, without
* numeric parsing, for consistency with the memory limit, which can be too big to fit in a <code>long</code>.
*
* @param controlGroup the control group for the Elasticsearch process for the {@code memory} subsystem
* @return the total current memory usage by processes in the cgroup (in bytes)
* @throws IOException if an I/O exception occurs reading {@code memory.usage_in_bytes} for the control group
*/
private String getCgroupMemoryUsageInBytes(final String controlGroup) throws IOException {
return readSysFsCgroupMemoryUsageInBytes(controlGroup);
}
/**
 * Reads the single line stored in {@code memory.usage_in_bytes} under {@code /sys/fs/cgroup/memory} for the
 * {@code memory} control group of the Elasticsearch process. The line holds the total current memory usage by
 * processes in the cgroup (in bytes) and is returned as text, without any numeric parsing.
 * <p>
 * The method is package-private so that tests can override it with canned values.
 *
 * @param controlGroup the {@code memory} subsystem control group to which the Elasticsearch process belongs
 * @return the line read from {@code memory.usage_in_bytes}
 * @throws IOException if an I/O exception occurs reading {@code memory.usage_in_bytes} for the control group
 */
@SuppressForbidden(reason = "access /sys/fs/cgroup/memory")
String readSysFsCgroupMemoryUsageInBytes(final String controlGroup) throws IOException {
    final Path usageFile = PathUtils.get("/sys/fs/cgroup/memory", controlGroup, "memory.usage_in_bytes");
    return readSingleLine(usageFile);
}
/**
* Checks if cgroup stats are available by checking for the existence of {@code /proc/self/cgroup}, {@code /sys/fs/cgroup/cpu},
* {@code /sys/fs/cgroup/cpuacct} and {@code /sys/fs/cgroup/memory}.
* *
* @return {@code true} if the stats are available, otherwise {@code false} * @return {@code true} if the stats are available, otherwise {@code false}
*/ */
@SuppressForbidden(reason = "access /proc/self/cgroup, /sys/fs/cgroup/cpu, and /sys/fs/cgroup/cpuacct") @SuppressForbidden(reason = "access /proc/self/cgroup, /sys/fs/cgroup/cpu, /sys/fs/cgroup/cpuacct and /sys/fs/cgroup/memory")
boolean areCgroupStatsAvailable() { boolean areCgroupStatsAvailable() {
if (!Files.exists(PathUtils.get("/proc/self/cgroup"))) { if (!Files.exists(PathUtils.get("/proc/self/cgroup"))) {
return false; return false;
@ -398,6 +454,9 @@ public class OsProbe {
if (!Files.exists(PathUtils.get("/sys/fs/cgroup/cpuacct"))) { if (!Files.exists(PathUtils.get("/sys/fs/cgroup/cpuacct"))) {
return false; return false;
} }
if (!Files.exists(PathUtils.get("/sys/fs/cgroup/memory"))) {
return false;
}
return true; return true;
} }
@ -424,13 +483,21 @@ public class OsProbe {
final long cgroupCpuAcctCpuCfsQuotaMicros = getCgroupCpuAcctCpuCfsQuotaMicros(cpuControlGroup); final long cgroupCpuAcctCpuCfsQuotaMicros = getCgroupCpuAcctCpuCfsQuotaMicros(cpuControlGroup);
final OsStats.Cgroup.CpuStat cpuStat = getCgroupCpuAcctCpuStat(cpuControlGroup); final OsStats.Cgroup.CpuStat cpuStat = getCgroupCpuAcctCpuStat(cpuControlGroup);
final String memoryControlGroup = controllerMap.get("memory");
assert memoryControlGroup != null;
final String cgroupMemoryLimitInBytes = getCgroupMemoryLimitInBytes(memoryControlGroup);
final String cgroupMemoryUsageInBytes = getCgroupMemoryUsageInBytes(memoryControlGroup);
return new OsStats.Cgroup( return new OsStats.Cgroup(
cpuAcctControlGroup, cpuAcctControlGroup,
cgroupCpuAcctUsageNanos, cgroupCpuAcctUsageNanos,
cpuControlGroup, cpuControlGroup,
cgroupCpuAcctCpuCfsPeriodMicros, cgroupCpuAcctCpuCfsPeriodMicros,
cgroupCpuAcctCpuCfsQuotaMicros, cgroupCpuAcctCpuCfsQuotaMicros,
cpuStat); cpuStat,
memoryControlGroup,
cgroupMemoryLimitInBytes,
cgroupMemoryUsageInBytes);
} }
} catch (final IOException e) { } catch (final IOException e) {
logger.debug("error reading control group stats", e); logger.debug("error reading control group stats", e);

View File

@ -294,6 +294,10 @@ public class OsStats implements Writeable, ToXContentFragment {
private final long cpuCfsPeriodMicros; private final long cpuCfsPeriodMicros;
private final long cpuCfsQuotaMicros; private final long cpuCfsQuotaMicros;
private final CpuStat cpuStat; private final CpuStat cpuStat;
// These will be null for nodes running versions prior to 6.1.0
private final String memoryControlGroup;
private final String memoryLimitInBytes;
private final String memoryUsageInBytes;
/** /**
* The control group for the {@code cpuacct} subsystem. * The control group for the {@code cpuacct} subsystem.
@ -355,19 +359,57 @@ public class OsStats implements Writeable, ToXContentFragment {
return cpuStat; return cpuStat;
} }
/**
 * Returns the control group for the {@code memory} subsystem.
 *
 * @return the control group; may be {@code null} when these stats were deserialized from a node whose
 *         version does not send the memory fields
 */
public String getMemoryControlGroup() {
    return this.memoryControlGroup;
}
/**
 * Returns the maximum amount of user memory (including file cache).
 * The value is held as a <code>String</code> because it can be too big to fit in a
 * <code>long</code>. (<code>BigInteger</code> was the alternative, but it is not a supported
 * Elasticsearch type, so the OS stats document could not then be indexed into Elasticsearch
 * without losing information.)
 *
 * @return the maximum amount of user memory (including file cache); may be {@code null} when these
 *         stats were deserialized from a node whose version does not send the memory fields
 */
public String getMemoryLimitInBytes() {
    return this.memoryLimitInBytes;
}
/**
 * Returns the total current memory usage by processes in the cgroup (in bytes).
 * The value is held as a <code>String</code> for consistency with <code>memoryLimitInBytes</code>.
 *
 * @return the total current memory usage by processes in the cgroup (in bytes); may be {@code null}
 *         when these stats were deserialized from a node whose version does not send the memory fields
 */
public String getMemoryUsageInBytes() {
    return this.memoryUsageInBytes;
}
public Cgroup( public Cgroup(
final String cpuAcctControlGroup, final String cpuAcctControlGroup,
final long cpuAcctUsageNanos, final long cpuAcctUsageNanos,
final String cpuControlGroup, final String cpuControlGroup,
final long cpuCfsPeriodMicros, final long cpuCfsPeriodMicros,
final long cpuCfsQuotaMicros, final long cpuCfsQuotaMicros,
final CpuStat cpuStat) { final CpuStat cpuStat,
final String memoryControlGroup,
final String memoryLimitInBytes,
final String memoryUsageInBytes) {
this.cpuAcctControlGroup = Objects.requireNonNull(cpuAcctControlGroup); this.cpuAcctControlGroup = Objects.requireNonNull(cpuAcctControlGroup);
this.cpuAcctUsageNanos = cpuAcctUsageNanos; this.cpuAcctUsageNanos = cpuAcctUsageNanos;
this.cpuControlGroup = Objects.requireNonNull(cpuControlGroup); this.cpuControlGroup = Objects.requireNonNull(cpuControlGroup);
this.cpuCfsPeriodMicros = cpuCfsPeriodMicros; this.cpuCfsPeriodMicros = cpuCfsPeriodMicros;
this.cpuCfsQuotaMicros = cpuCfsQuotaMicros; this.cpuCfsQuotaMicros = cpuCfsQuotaMicros;
this.cpuStat = Objects.requireNonNull(cpuStat); this.cpuStat = Objects.requireNonNull(cpuStat);
this.memoryControlGroup = memoryControlGroup;
this.memoryLimitInBytes = memoryLimitInBytes;
this.memoryUsageInBytes = memoryUsageInBytes;
} }
Cgroup(final StreamInput in) throws IOException { Cgroup(final StreamInput in) throws IOException {
@ -377,6 +419,16 @@ public class OsStats implements Writeable, ToXContentFragment {
cpuCfsPeriodMicros = in.readLong(); cpuCfsPeriodMicros = in.readLong();
cpuCfsQuotaMicros = in.readLong(); cpuCfsQuotaMicros = in.readLong();
cpuStat = new CpuStat(in); cpuStat = new CpuStat(in);
// TODO: change this to 6.1.0 after backporting
if (in.getVersion().onOrAfter(Version.V_7_0_0_alpha1)) {
memoryControlGroup = in.readOptionalString();
memoryLimitInBytes = in.readOptionalString();
memoryUsageInBytes = in.readOptionalString();
} else {
memoryControlGroup = null;
memoryLimitInBytes = null;
memoryUsageInBytes = null;
}
} }
@Override @Override
@ -387,6 +439,12 @@ public class OsStats implements Writeable, ToXContentFragment {
out.writeLong(cpuCfsPeriodMicros); out.writeLong(cpuCfsPeriodMicros);
out.writeLong(cpuCfsQuotaMicros); out.writeLong(cpuCfsQuotaMicros);
cpuStat.writeTo(out); cpuStat.writeTo(out);
// TODO: change this to 6.1.0 after backporting
if (out.getVersion().onOrAfter(Version.V_7_0_0_alpha1)) {
out.writeOptionalString(memoryControlGroup);
out.writeOptionalString(memoryLimitInBytes);
out.writeOptionalString(memoryUsageInBytes);
}
} }
@Override @Override
@ -407,6 +465,19 @@ public class OsStats implements Writeable, ToXContentFragment {
cpuStat.toXContent(builder, params); cpuStat.toXContent(builder, params);
} }
builder.endObject(); builder.endObject();
if (memoryControlGroup != null) {
builder.startObject("memory");
{
builder.field("control_group", memoryControlGroup);
if (memoryLimitInBytes != null) {
builder.field("limit_in_bytes", memoryLimitInBytes);
}
if (memoryUsageInBytes != null) {
builder.field("usage_in_bytes", memoryUsageInBytes);
}
}
builder.endObject();
}
} }
builder.endObject(); builder.endObject();
return builder; return builder;

View File

@ -129,4 +129,6 @@ grant {
permission java.io.FilePermission "/sys/fs/cgroup/cpu/-", "read"; permission java.io.FilePermission "/sys/fs/cgroup/cpu/-", "read";
permission java.io.FilePermission "/sys/fs/cgroup/cpuacct", "read"; permission java.io.FilePermission "/sys/fs/cgroup/cpuacct", "read";
permission java.io.FilePermission "/sys/fs/cgroup/cpuacct/-", "read"; permission java.io.FilePermission "/sys/fs/cgroup/cpuacct/-", "read";
permission java.io.FilePermission "/sys/fs/cgroup/memory", "read";
permission java.io.FilePermission "/sys/fs/cgroup/memory/-", "read";
}; };

View File

@ -96,6 +96,12 @@ public class NodeStatsTests extends ESTestCase {
assertEquals( assertEquals(
nodeStats.getOs().getCgroup().getCpuStat().getTimeThrottledNanos(), nodeStats.getOs().getCgroup().getCpuStat().getTimeThrottledNanos(),
deserializedNodeStats.getOs().getCgroup().getCpuStat().getTimeThrottledNanos()); deserializedNodeStats.getOs().getCgroup().getCpuStat().getTimeThrottledNanos());
assertEquals(
nodeStats.getOs().getCgroup().getMemoryLimitInBytes(),
deserializedNodeStats.getOs().getCgroup().getMemoryLimitInBytes());
assertEquals(
nodeStats.getOs().getCgroup().getMemoryUsageInBytes(),
deserializedNodeStats.getOs().getCgroup().getMemoryUsageInBytes());
assertArrayEquals(nodeStats.getOs().getCpu().getLoadAverage(), assertArrayEquals(nodeStats.getOs().getCpu().getLoadAverage(),
deserializedNodeStats.getOs().getCpu().getLoadAverage(), 0); deserializedNodeStats.getOs().getCpu().getLoadAverage(), 0);
} }
@ -294,7 +300,10 @@ public class NodeStatsTests extends ESTestCase {
randomAlphaOfLength(8), randomAlphaOfLength(8),
randomNonNegativeLong(), randomNonNegativeLong(),
randomNonNegativeLong(), randomNonNegativeLong(),
new OsStats.Cgroup.CpuStat(randomNonNegativeLong(), randomNonNegativeLong(), randomNonNegativeLong()))); new OsStats.Cgroup.CpuStat(randomNonNegativeLong(), randomNonNegativeLong(), randomNonNegativeLong()),
randomAlphaOfLength(8),
Long.toString(randomNonNegativeLong()),
Long.toString(randomNonNegativeLong())));
} }
ProcessStats processStats = frequently() ? ProcessStats processStats = frequently() ?
new ProcessStats( new ProcessStats(

View File

@ -22,6 +22,7 @@ package org.elasticsearch.monitor.os;
import org.apache.lucene.util.Constants; import org.apache.lucene.util.Constants;
import org.elasticsearch.test.ESTestCase; import org.elasticsearch.test.ESTestCase;
import java.math.BigInteger;
import java.util.Arrays; import java.util.Arrays;
import java.util.List; import java.util.List;
@ -117,6 +118,12 @@ public class OsProbeTests extends ESTestCase {
assertThat(stats.getCgroup().getCpuStat().getNumberOfElapsedPeriods(), greaterThanOrEqualTo(0L)); assertThat(stats.getCgroup().getCpuStat().getNumberOfElapsedPeriods(), greaterThanOrEqualTo(0L));
assertThat(stats.getCgroup().getCpuStat().getNumberOfTimesThrottled(), greaterThanOrEqualTo(0L)); assertThat(stats.getCgroup().getCpuStat().getNumberOfTimesThrottled(), greaterThanOrEqualTo(0L));
assertThat(stats.getCgroup().getCpuStat().getTimeThrottledNanos(), greaterThanOrEqualTo(0L)); assertThat(stats.getCgroup().getCpuStat().getTimeThrottledNanos(), greaterThanOrEqualTo(0L));
// These could be null if transported from a node running an older version, but shouldn't be null on the current node
assertThat(stats.getCgroup().getMemoryControlGroup(), notNullValue());
assertThat(stats.getCgroup().getMemoryLimitInBytes(), notNullValue());
assertThat(new BigInteger(stats.getCgroup().getMemoryLimitInBytes()), greaterThan(BigInteger.ZERO));
assertThat(stats.getCgroup().getMemoryUsageInBytes(), notNullValue());
assertThat(new BigInteger(stats.getCgroup().getMemoryUsageInBytes()), greaterThan(BigInteger.ZERO));
} }
} else { } else {
assertNull(stats.getCgroup()); assertNull(stats.getCgroup());
@ -159,7 +166,7 @@ public class OsProbeTests extends ESTestCase {
"9:net_cls,net_prio:/", "9:net_cls,net_prio:/",
"8:pids:/", "8:pids:/",
"7:blkio:/", "7:blkio:/",
"6:memory:/", "6:memory:/" + hierarchy,
"5:devices:/user.slice", "5:devices:/user.slice",
"4:hugetlb:/", "4:hugetlb:/",
"3:perf_event:/", "3:perf_event:/",
@ -194,6 +201,19 @@ public class OsProbeTests extends ESTestCase {
"throttled_time 139298645489"); "throttled_time 139298645489");
} }
@Override
String readSysFsCgroupMemoryLimitInBytes(String controlGroup) {
    final String expectedControlGroup = "/" + hierarchy;
    assertThat(controlGroup, equalTo(expectedControlGroup));
    // The largest value an unsigned 64 bit number can hold, which does not fit in a long
    final String unsigned64BitMax = "18446744073709551615";
    return unsigned64BitMax;
}
@Override
String readSysFsCgroupMemoryUsageInBytes(String controlGroup) {
    final String expectedControlGroup = "/" + hierarchy;
    assertThat(controlGroup, equalTo(expectedControlGroup));
    // Canned usage figure handed back as the raw line
    final String usageLine = "4796416";
    return usageLine;
}
@Override @Override
boolean areCgroupStatsAvailable() { boolean areCgroupStatsAvailable() {
return areCgroupStatsAvailable; return areCgroupStatsAvailable;
@ -213,6 +233,8 @@ public class OsProbeTests extends ESTestCase {
assertThat(cgroup.getCpuStat().getNumberOfElapsedPeriods(), equalTo(17992L)); assertThat(cgroup.getCpuStat().getNumberOfElapsedPeriods(), equalTo(17992L));
assertThat(cgroup.getCpuStat().getNumberOfTimesThrottled(), equalTo(1311L)); assertThat(cgroup.getCpuStat().getNumberOfTimesThrottled(), equalTo(1311L));
assertThat(cgroup.getCpuStat().getTimeThrottledNanos(), equalTo(139298645489L)); assertThat(cgroup.getCpuStat().getTimeThrottledNanos(), equalTo(139298645489L));
// The memory stats are exposed as the raw strings read from the cgroup files (the limit here is too big
// for a long), so they must be compared against strings rather than long values
assertThat(cgroup.getMemoryLimitInBytes(), equalTo("18446744073709551615"));
assertThat(cgroup.getMemoryUsageInBytes(), equalTo("4796416"));
} else { } else {
assertNull(cgroup); assertNull(cgroup);
} }

View File

@ -42,7 +42,10 @@ public class OsStatsTests extends ESTestCase {
randomAlphaOfLength(8), randomAlphaOfLength(8),
randomNonNegativeLong(), randomNonNegativeLong(),
randomNonNegativeLong(), randomNonNegativeLong(),
new OsStats.Cgroup.CpuStat(randomNonNegativeLong(), randomNonNegativeLong(), randomNonNegativeLong())); new OsStats.Cgroup.CpuStat(randomNonNegativeLong(), randomNonNegativeLong(), randomNonNegativeLong()),
randomAlphaOfLength(8),
Long.toString(randomNonNegativeLong()),
Long.toString(randomNonNegativeLong()));
OsStats osStats = new OsStats(System.currentTimeMillis(), cpu, mem, swap, cgroup); OsStats osStats = new OsStats(System.currentTimeMillis(), cpu, mem, swap, cgroup);
try (BytesStreamOutput out = new BytesStreamOutput()) { try (BytesStreamOutput out = new BytesStreamOutput()) {
@ -70,6 +73,8 @@ public class OsStatsTests extends ESTestCase {
assertEquals( assertEquals(
osStats.getCgroup().getCpuStat().getTimeThrottledNanos(), osStats.getCgroup().getCpuStat().getTimeThrottledNanos(),
deserializedOsStats.getCgroup().getCpuStat().getTimeThrottledNanos()); deserializedOsStats.getCgroup().getCpuStat().getTimeThrottledNanos());
assertEquals(osStats.getCgroup().getMemoryLimitInBytes(), deserializedOsStats.getCgroup().getMemoryLimitInBytes());
assertEquals(osStats.getCgroup().getMemoryUsageInBytes(), deserializedOsStats.getCgroup().getMemoryUsageInBytes());
} }
} }
} }

View File

@ -255,6 +255,25 @@ the operating system:
The total amount of time (in nanoseconds) for which all tasks in The total amount of time (in nanoseconds) for which all tasks in
the same cgroup as the Elasticsearch process have been throttled. the same cgroup as the Elasticsearch process have been throttled.
`os.cgroup.memory.control_group` (Linux only)::
The `memory` control group to which the Elasticsearch process
belongs.
`os.cgroup.memory.limit_in_bytes` (Linux only)::
The maximum amount of user memory (including file cache) allowed
for all tasks in the same cgroup as the Elasticsearch process.
This value can be too big to store in a `long`, so is returned as
a string so that the value returned can exactly match what the
underlying operating system interface returns. Any value that is
too large to parse into a `long` almost certainly means no limit
has been set for the cgroup.
`os.cgroup.memory.usage_in_bytes` (Linux only)::
The total current memory usage (in bytes) by all tasks in the
same cgroup as the Elasticsearch process. This value is stored
as a string for consistency with
`os.cgroup.memory.limit_in_bytes`.
NOTE: For the cgroup stats to be visible, cgroups must be compiled into NOTE: For the cgroup stats to be visible, cgroups must be compiled into
the kernal, the `cpu` and `cpuacct` cgroup subsystems must be the kernal, the `cpu` and `cpuacct` cgroup subsystems must be
configured and stats must be readable from `/sys/fs/cgroup/cpu` configured and stats must be readable from `/sys/fs/cgroup/cpu`