Return 0 for negative "free" and "total" memory reported by the OS (#42725)
* Return 0 for negative "free" and "total" memory reported by the OS.
We've had a situation where the MX bean reported negative values for the free memory of the OS. In those rare cases we want to return a value of 0 rather than blowing up later down the pipeline. In the event that there is a serialization or creation error with regard to memory use, this adds asserts so the failure will occur as soon as possible and give us a better location for investigation. Resolves #42157
* Fix test passing in invalid memory value
* Fix another test passing in invalid memory value
* Also change mem check in MachineLearning.machineMemoryFromStats
* Add background documentation for why we prevent negative return values
* Clarify comment a bit more
This commit is contained in:
parent
23a3471394
commit
d81ce9a647
|
@ -42,6 +42,24 @@ import java.util.regex.Matcher;
|
|||
import java.util.regex.Pattern;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
/**
|
||||
* The {@link OsProbe} class retrieves information about the physical and swap size of the machine
|
||||
* memory, as well as the system load average and cpu load.
|
||||
*
|
||||
* In some exceptional cases, it's possible the underlying native method used by
|
||||
* {@link #getFreePhysicalMemorySize()} and {@link #getTotalPhysicalMemorySize()} can return a
|
||||
* negative value. Because of this, we prevent those methods from returning negative values,
|
||||
* returning 0 instead.
|
||||
*
|
||||
* The OS can report a negative number in a number of cases:
|
||||
* - Non-supported OSes (HP-UX, or AIX)
|
||||
* - A failure of macOS to initialize host statistics
|
||||
* - An OS that does not support the {@code _SC_PHYS_PAGES} or {@code _SC_PAGE_SIZE} flags for the {@code sysconf()} libc call
|
||||
* - An overflow of the product of {@code _SC_PHYS_PAGES} and {@code _SC_PAGE_SIZE}
|
||||
* - An error case retrieving these values from a linux kernel
|
||||
* - A non-standard libc implementation not implementing the required values
|
||||
* For a more exhaustive explanation, see https://github.com/elastic/elasticsearch/pull/42725
|
||||
*/
|
||||
public class OsProbe {
|
||||
|
||||
private static final OperatingSystemMXBean osMxBean = ManagementFactory.getOperatingSystemMXBean();
|
||||
|
@ -67,12 +85,19 @@ public class OsProbe {
|
|||
*/
|
||||
public long getFreePhysicalMemorySize() {
|
||||
if (getFreePhysicalMemorySize == null) {
|
||||
return -1;
|
||||
logger.warn("getFreePhysicalMemorySize is not available");
|
||||
return 0;
|
||||
}
|
||||
try {
|
||||
return (long) getFreePhysicalMemorySize.invoke(osMxBean);
|
||||
final long freeMem = (long) getFreePhysicalMemorySize.invoke(osMxBean);
|
||||
if (freeMem < 0) {
|
||||
logger.warn("OS reported a negative free memory value [{}]", freeMem);
|
||||
return 0;
|
||||
}
|
||||
return freeMem;
|
||||
} catch (Exception e) {
|
||||
return -1;
|
||||
logger.warn("exception retrieving free physical memory", e);
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -81,12 +106,19 @@ public class OsProbe {
|
|||
*/
|
||||
public long getTotalPhysicalMemorySize() {
|
||||
if (getTotalPhysicalMemorySize == null) {
|
||||
return -1;
|
||||
logger.warn("getTotalPhysicalMemorySize is not available");
|
||||
return 0;
|
||||
}
|
||||
try {
|
||||
return (long) getTotalPhysicalMemorySize.invoke(osMxBean);
|
||||
final long totalMem = (long) getTotalPhysicalMemorySize.invoke(osMxBean);
|
||||
if (totalMem < 0) {
|
||||
logger.warn("OS reported a negative total memory value [{}]", totalMem);
|
||||
return 0;
|
||||
}
|
||||
return totalMem;
|
||||
} catch (Exception e) {
|
||||
return -1;
|
||||
logger.warn("exception retrieving total physical memory", e);
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -229,13 +229,17 @@ public class OsStats implements Writeable, ToXContentFragment {
|
|||
private final long free;
|
||||
|
||||
public Mem(long total, long free) {
|
||||
assert total >= 0 : "expected total memory to be positive, got: " + total;
|
||||
assert free >= 0 : "expected free memory to be positive, got: " + total;
|
||||
this.total = total;
|
||||
this.free = free;
|
||||
}
|
||||
|
||||
public Mem(StreamInput in) throws IOException {
|
||||
this.total = in.readLong();
|
||||
assert total >= 0 : "expected total memory to be positive, got: " + total;
|
||||
this.free = in.readLong();
|
||||
assert free >= 0 : "expected free memory to be positive, got: " + total;
|
||||
}
|
||||
|
||||
@Override
|
||||
|
|
|
@ -805,8 +805,8 @@ public class MachineLearning extends Plugin implements ActionPlugin, AnalysisPlu
|
|||
if (containerLimitStr != null) {
|
||||
BigInteger containerLimit = new BigInteger(containerLimitStr);
|
||||
if ((containerLimit.compareTo(BigInteger.valueOf(mem)) < 0 && containerLimit.compareTo(BigInteger.ZERO) > 0)
|
||||
// mem < 0 means the value couldn't be obtained for some reason
|
||||
|| (mem < 0 && containerLimit.compareTo(BigInteger.valueOf(Long.MAX_VALUE)) < 0)) {
|
||||
// mem <= 0 means the value couldn't be obtained for some reason
|
||||
|| (mem <= 0 && containerLimit.compareTo(BigInteger.valueOf(Long.MAX_VALUE)) < 0)) {
|
||||
mem = containerLimit.longValue();
|
||||
}
|
||||
}
|
||||
|
|
|
@ -97,8 +97,8 @@ public class MachineLearningTests extends ESTestCase {
|
|||
|
||||
public void testMachineMemory_givenStatsFailure() throws IOException {
|
||||
OsStats stats = mock(OsStats.class);
|
||||
when(stats.getMem()).thenReturn(new OsStats.Mem(-1, -1));
|
||||
assertEquals(-1L, MachineLearning.machineMemoryFromStats(stats));
|
||||
when(stats.getMem()).thenReturn(new OsStats.Mem(0, 0));
|
||||
assertEquals(0L, MachineLearning.machineMemoryFromStats(stats));
|
||||
}
|
||||
|
||||
public void testMachineMemory_givenNoCgroup() throws IOException {
|
||||
|
|
|
@ -329,7 +329,7 @@ public class NodeStatsMonitoringDocTests extends BaseFilteredMonitoringDocTestCa
|
|||
final OsStats.Cgroup osCgroup = new OsStats.Cgroup("_cpu_acct_ctrl_group", ++iota, "_cpu_ctrl_group", ++iota, ++iota, osCpuStat,
|
||||
"_memory_ctrl_group", "2000000000", "1000000000");
|
||||
|
||||
final OsStats.Mem osMem = new OsStats.Mem(no, no);
|
||||
final OsStats.Mem osMem = new OsStats.Mem(0, 0);
|
||||
final OsStats.Swap osSwap = new OsStats.Swap(no, no);
|
||||
final OsStats os = new OsStats(no, osCpu, osMem, osSwap, osCgroup);
|
||||
|
||||
|
|
Loading…
Reference in New Issue