From 0df76f0c7722cbec3a273af70093c7f14494927b Mon Sep 17 00:00:00 2001 From: Jason Tedor Date: Tue, 18 Oct 2016 23:21:13 -0400 Subject: [PATCH 1/8] Add basic cgroup CPU metrics This commit adds basic cgroup CPU metrics to the node stats API. --- .../org/elasticsearch/monitor/os/OsProbe.java | 157 +++++++++++++++- .../org/elasticsearch/monitor/os/OsStats.java | 170 +++++++++++++++++- .../elasticsearch/bootstrap/security.policy | 4 + .../cluster/node/stats/NodeStatsTests.java | 27 ++- .../monitor/os/OsProbeTests.java | 87 +++++++++ .../monitor/os/OsStatsTests.java | 24 ++- docs/reference/cluster/nodes-stats.asciidoc | 32 ++++ 7 files changed, 491 insertions(+), 10 deletions(-) diff --git a/core/src/main/java/org/elasticsearch/monitor/os/OsProbe.java b/core/src/main/java/org/elasticsearch/monitor/os/OsProbe.java index 847e38bbf8c31..7b24098d8d77a 100644 --- a/core/src/main/java/org/elasticsearch/monitor/os/OsProbe.java +++ b/core/src/main/java/org/elasticsearch/monitor/os/OsProbe.java @@ -33,7 +33,12 @@ import java.lang.management.OperatingSystemMXBean; import java.lang.reflect.Method; import java.nio.file.Files; +import java.util.Collections; +import java.util.HashMap; import java.util.List; +import java.util.Map; +import java.util.regex.Matcher; +import java.util.regex.Pattern; public class OsProbe { @@ -180,6 +185,124 @@ public short getSystemCpuPercent() { return Probes.getLoadAndScaleToPercent(getSystemCpuLoad, osMxBean); } + private Map getCpuAccountingCGroup() { + try { + final List lines = readProcSelfCgroup(); + if (!lines.isEmpty()) { + final Map controllerMap = new HashMap<>(); + final Pattern pattern = Pattern.compile("\\d+:(\\w+(?:,\\w+)?):(/.*)"); + for (final String line : lines) { + final Matcher matcher = pattern.matcher(line); + if (matcher.matches()) { + final String[] controllers = matcher.group(1).split(","); + for (final String controller : controllers) { + controllerMap.put(controller, matcher.group(2)); + } + } + } + return controllerMap; + } + } catch (final IOException e) { + // do not fail Elasticsearch if something unexpected happens here + } + + return Collections.emptyMap(); + } + + // visible for testing + List readProcSelfCgroup() throws IOException { + return Files.readAllLines(PathUtils.get("/proc/self/cgroup")); + } + + private long getCgroupCpuAcctUsageNanos(final String path) { + try { + final List lines = readSysFsCgroupCpuAcctCpuAcctUsage(path); + if (!lines.isEmpty()) { + return Long.parseLong(lines.get(0)); + } + } catch (IOException e) { + // do not fail Elasticsearch is something unexpected happens here + } + + return -1; + } + + // visible for testing + List readSysFsCgroupCpuAcctCpuAcctUsage(final String path) throws IOException { + return Files.readAllLines(PathUtils.get("/sys/fs/cgroup/cpuacct", path, "cpuacct.usage")); + } + + private long getCgroupCpuAcctCpuCfsPeriodMicros(final String path) { + try { + final List lines = readSysFsCgroupCpuAcctCpuCfsPeriod(path); + if (!lines.isEmpty()) { + return Long.parseLong(lines.get(0)); + } + } catch (IOException e) { + // do not fail Elasticsearch is something unexpected happens here + } + + return -1; + } + + // visible for testing + List readSysFsCgroupCpuAcctCpuCfsPeriod(final String path) throws IOException { + return Files.readAllLines(PathUtils.get("/sys/fs/cgroup/cpu", path, "cpu.cfs_period_us")); + } + + private long getCGroupCpuAcctCpuCfsQuotaMicros(final String path) { + try { + final List lines = readSysFsCgroupCpuAcctCpuAcctCfsQuota(path); + if (!lines.isEmpty()) { + return Long.parseLong(lines.get(0)); + } + } catch (IOException e) { + // do not fail Elasticsearch is something unexpected happens here + } + + return -1; + } + + // visible for testing + List readSysFsCgroupCpuAcctCpuAcctCfsQuota(final String path) throws IOException { + return Files.readAllLines(PathUtils.get("/sys/fs/cgroup/cpu", path, "cpu.cfs_quota_us")); + } + + private OsStats.Cgroup.CpuStat getCgroupCpuAcctCpuStat(final String path) { + try { + final List lines = readSysFsCgroupCpuAcctCpuStat(path); + long numberOfPeriods = -1; + long numberOfTimesThrottled = -1; + long timeThrottledNanos = -1; + if (!lines.isEmpty()) { + for (final String line : lines) { + final String[] fields = line.split("\\s+"); + switch(fields[0]) { + case "nr_periods": + numberOfPeriods = Long.parseLong(fields[1]); + break; + case "nr_throttled": + numberOfTimesThrottled = Long.parseLong(fields[1]); + break; + case "throttled_time": + timeThrottledNanos = Long.parseLong(fields[1]); + break; + } + } + } + return new OsStats.Cgroup.CpuStat(numberOfPeriods, numberOfTimesThrottled, timeThrottledNanos); + } catch (IOException e) { + // do not fail Elasticsearch is something unexpected happens here + } + + return null; + } + + // visible for testing + List readSysFsCgroupCpuAcctCpuStat(final String path) throws IOException { + return Files.readAllLines(PathUtils.get("/sys/fs/cgroup/cpu", path, "cpu.stat")); + } + private static class OsProbeHolder { private static final OsProbe INSTANCE = new OsProbe(); } @@ -199,10 +322,35 @@ public OsInfo osInfo(long refreshInterval, int allocatedProcessors) { } public OsStats osStats() { - OsStats.Cpu cpu = new OsStats.Cpu(getSystemCpuPercent(), getSystemLoadAverage()); - OsStats.Mem mem = new OsStats.Mem(getTotalPhysicalMemorySize(), getFreePhysicalMemorySize()); - OsStats.Swap swap = new OsStats.Swap(getTotalSwapSpaceSize(), getFreeSwapSpaceSize()); - return new OsStats(System.currentTimeMillis(), cpu, mem , swap); + final OsStats.Cpu cpu = new OsStats.Cpu(getSystemCpuPercent(), getSystemLoadAverage()); + final OsStats.Mem mem = new OsStats.Mem(getTotalPhysicalMemorySize(), getFreePhysicalMemorySize()); + final OsStats.Swap swap = new OsStats.Swap(getTotalSwapSpaceSize(), getFreeSwapSpaceSize()); + final OsStats.Cgroup cgroup; + if (shouldReadCgroups()) { + final Map controllerMap = getCpuAccountingCGroup(); + if (controllerMap.containsKey("cpu") && controllerMap.containsKey("cpuacct")) { + final String cpuAcctControlGroup = controllerMap.get("cpuacct"); + final String cpuControlGroup = controllerMap.get("cpu"); + cgroup = + new OsStats.Cgroup( + cpuAcctControlGroup, + getCgroupCpuAcctUsageNanos(cpuAcctControlGroup), + cpuControlGroup, + getCgroupCpuAcctCpuCfsPeriodMicros(cpuControlGroup), + getCGroupCpuAcctCpuCfsQuotaMicros(cpuControlGroup), + getCgroupCpuAcctCpuStat(cpuControlGroup)); + } else { + cgroup = null; + } + } else { + cgroup = null; + } + return new OsStats(System.currentTimeMillis(), cpu, mem, swap, cgroup); + } + + // visible for testing + boolean shouldReadCgroups() { + return Constants.LINUX; } /** @@ -217,4 +365,5 @@ private static Method getMethod(String methodName) { return null; } } + } diff --git a/core/src/main/java/org/elasticsearch/monitor/os/OsStats.java b/core/src/main/java/org/elasticsearch/monitor/os/OsStats.java index e07b92a6cb43b..103f2555fd4bb 100644 --- a/core/src/main/java/org/elasticsearch/monitor/os/OsStats.java +++ b/core/src/main/java/org/elasticsearch/monitor/os/OsStats.java @@ -36,12 +36,14 @@ public class OsStats implements Writeable, ToXContent { private final Cpu cpu; private final Mem mem; private final Swap swap; + private final Cgroup cgroup; - public OsStats(long timestamp, Cpu cpu, Mem mem, Swap swap) { + public OsStats(final long timestamp, final Cpu cpu, final Mem mem, final Swap swap, final Cgroup cgroup) { this.timestamp = timestamp; - this.cpu = Objects.requireNonNull(cpu, "cpu must not be null"); - this.mem = Objects.requireNonNull(mem, "mem must not be null");; - this.swap = Objects.requireNonNull(swap, "swap must not be null");; + this.cpu = Objects.requireNonNull(cpu); + this.mem = Objects.requireNonNull(mem); + this.swap = Objects.requireNonNull(swap); + this.cgroup = cgroup; } public OsStats(StreamInput in) throws IOException { @@ -49,6 +51,7 @@ public OsStats(StreamInput in) throws IOException { this.cpu = new Cpu(in); this.mem = new Mem(in); this.swap = new Swap(in); + this.cgroup = new Cgroup(in); } @Override @@ -57,6 +60,7 @@ public void writeTo(StreamOutput out) throws IOException { cpu.writeTo(out); mem.writeTo(out); swap.writeTo(out); + cgroup.writeTo(out); } public long getTimestamp() { @@ -73,6 +77,10 @@ public Swap getSwap() { return swap; } + public Cgroup getCgroup() { + return cgroup; + } + static final class Fields { static final String OS = "os"; static final String TIMESTAMP = "timestamp"; @@ -103,6 +111,7 @@ public XContentBuilder toXContent(XContentBuilder builder, Params params) throws cpu.toXContent(builder, params); mem.toXContent(builder, params); swap.toXContent(builder, params); + cgroup.toXContent(builder, params); builder.endObject(); return builder; } @@ -265,7 +274,160 @@ public XContentBuilder toXContent(XContentBuilder builder, Params params) throws } } + public static class Cgroup implements Writeable, ToXContent { + + private final String cpuAcctControlGroup; + private final long cpuAcctUsageNanos; + private final String cpuControlGroup; + private final long cpuCfsPeriodMicros; // completely fair scheduler enforcement period + private final long cpuCfsQuotaMicros; // completely fair scheduler quota + private final CpuStat cpuStat; + + public String getCpuAcctControlGroup() { + return cpuAcctControlGroup; + } + + public long getCpuAcctUsageNanos() { + return cpuAcctUsageNanos; + } + + public String getCpuControlGroup() { + return cpuControlGroup; + } + + public long getCpuCfsPeriodMicros() { + return cpuCfsPeriodMicros; + } + + public long getCpuCfsQuotaMicros() { + return cpuCfsQuotaMicros; + } + + public CpuStat getCpuStat() { + return cpuStat; + } + + public Cgroup( + final String cpuAcctControlGroup, + final long cpuAcctUsageNanos, + final String cpuControlGroup, + final long cpuCfsPeriodMicros, + final long cpuCfsQuotaMicros, + final CpuStat cpuStat) { + this.cpuAcctControlGroup = cpuAcctControlGroup; + this.cpuAcctUsageNanos = cpuAcctUsageNanos; + this.cpuControlGroup = cpuControlGroup; + this.cpuCfsPeriodMicros = cpuCfsPeriodMicros; + this.cpuCfsQuotaMicros = cpuCfsQuotaMicros; + this.cpuStat = cpuStat; + } + + Cgroup(final StreamInput in) throws IOException { + cpuAcctControlGroup = in.readString(); + cpuAcctUsageNanos = in.readLong(); + cpuControlGroup = in.readString(); + cpuCfsPeriodMicros = in.readLong(); + cpuCfsQuotaMicros = in.readLong(); + if (!in.readBoolean()) { + cpuStat = null; + } else { + cpuStat = new CpuStat(in); + } + } + + @Override + public void writeTo(StreamOutput out) throws IOException { + out.writeString(cpuAcctControlGroup); + out.writeLong(cpuAcctUsageNanos); + out.writeString(cpuControlGroup); + out.writeLong(cpuCfsPeriodMicros); + out.writeLong(cpuCfsQuotaMicros); + if (cpuStat == null) { + out.writeBoolean(false); + } else { + out.writeBoolean(true); + cpuStat.writeTo(out); + } + } + + @Override + public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException { + builder.startObject("cgroup"); + { + builder.startObject("cpuacct"); + { + builder.field("control_group", cpuAcctControlGroup); + builder.field("usage_nanos", cpuAcctUsageNanos); + } + builder.endObject(); + builder.startObject("cpu"); + { + builder.field("control_group", cpuControlGroup); + builder.field("cfs_period_micros", cpuCfsPeriodMicros); + builder.field("cfs_quota_micros", cpuCfsQuotaMicros); + cpuStat.toXContent(builder, params); + } + builder.endObject(); + } + builder.endObject(); + return builder; + } + + public static class CpuStat implements Writeable, ToXContent { + + private final long numberOfElapsedPeriods; + private final long numberOfTimesThrottled; + private final long timeThrottledNanos; + + public long getNumberOfElapsedPeriods() { + return numberOfElapsedPeriods; + } + + public long getNumberOfTimesThrottled() { + return numberOfTimesThrottled; + } + + public long getTimeThrottledNanos() { + return timeThrottledNanos; + } + + public CpuStat(final long numberOfElapsedPeriods, final long numberOfTimesThrottled, final long timeThrottledNanos) { + this.numberOfElapsedPeriods = numberOfElapsedPeriods; + this.numberOfTimesThrottled = numberOfTimesThrottled; + this.timeThrottledNanos = timeThrottledNanos; + } + + CpuStat(final StreamInput in) throws IOException { + numberOfElapsedPeriods = in.readLong(); + numberOfTimesThrottled = in.readLong(); + timeThrottledNanos = in.readLong(); + } + + @Override + public void writeTo(final StreamOutput out) throws IOException { + out.writeLong(numberOfElapsedPeriods); + out.writeLong(numberOfTimesThrottled); + out.writeLong(timeThrottledNanos); + } + + @Override + public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException { + builder.startObject("stat"); + { + builder.field("number_of_elapsed_periods", numberOfElapsedPeriods); + builder.field("number_of_times_throttled", numberOfTimesThrottled); + builder.field("time_throttled_nanos", timeThrottledNanos); + } + builder.endObject(); + return builder; + } + + } + + } + public static short calculatePercentage(long used, long max) { return max <= 0 ? 0 : (short) (Math.round((100d * used) / max)); } + } diff --git a/core/src/main/resources/org/elasticsearch/bootstrap/security.policy b/core/src/main/resources/org/elasticsearch/bootstrap/security.policy index 4c5c2768987ad..4eb716d10b367 100644 --- a/core/src/main/resources/org/elasticsearch/bootstrap/security.policy +++ b/core/src/main/resources/org/elasticsearch/bootstrap/security.policy @@ -121,4 +121,8 @@ grant { // io stats on Linux permission java.io.FilePermission "/proc/diskstats", "read"; + + // control group stats on Linux + permission java.io.FilePermission "/proc/self/cgroup", "read"; + permission java.io.FilePermission "/sys/fs/cgroup/cpuacct/-", "read"; }; diff --git a/core/src/test/java/org/elasticsearch/action/admin/cluster/node/stats/NodeStatsTests.java b/core/src/test/java/org/elasticsearch/action/admin/cluster/node/stats/NodeStatsTests.java index 88aaabe301ff5..7c82df1157082 100644 --- a/core/src/test/java/org/elasticsearch/action/admin/cluster/node/stats/NodeStatsTests.java +++ b/core/src/test/java/org/elasticsearch/action/admin/cluster/node/stats/NodeStatsTests.java @@ -72,6 +72,24 @@ public void testSerialization() throws IOException { assertEquals(nodeStats.getOs().getMem().getFreePercent(), deserializedNodeStats.getOs().getMem().getFreePercent()); assertEquals(nodeStats.getOs().getMem().getUsedPercent(), deserializedNodeStats.getOs().getMem().getUsedPercent()); assertEquals(nodeStats.getOs().getCpu().getPercent(), deserializedNodeStats.getOs().getCpu().getPercent()); + assertEquals( + nodeStats.getOs().getCgroup().getCpuAcctControlGroup(), + deserializedNodeStats.getOs().getCgroup().getCpuAcctControlGroup()); + assertEquals( + nodeStats.getOs().getCgroup().getCpuAcctUsageNanos(), + deserializedNodeStats.getOs().getCgroup().getCpuAcctUsageNanos()); + assertEquals( + nodeStats.getOs().getCgroup().getCpuCfsPeriodMicros(), + deserializedNodeStats.getOs().getCgroup().getCpuCfsPeriodMicros()); + assertEquals( + nodeStats.getOs().getCgroup().getCpuStat().getNumberOfElapsedPeriods(), + deserializedNodeStats.getOs().getCgroup().getCpuStat().getNumberOfElapsedPeriods()); + assertEquals( + nodeStats.getOs().getCgroup().getCpuStat().getNumberOfTimesThrottled(), + deserializedNodeStats.getOs().getCgroup().getCpuStat().getNumberOfTimesThrottled()); + assertEquals( + nodeStats.getOs().getCgroup().getCpuStat().getTimeThrottledNanos(), + deserializedNodeStats.getOs().getCgroup().getCpuStat().getTimeThrottledNanos()); assertArrayEquals(nodeStats.getOs().getCpu().getLoadAverage(), deserializedNodeStats.getOs().getCpu().getLoadAverage(), 0); } @@ -264,7 +282,14 @@ private static NodeStats createNodeStats() { } osStats = new OsStats(System.currentTimeMillis(), new OsStats.Cpu(randomShort(), loadAverages), new OsStats.Mem(randomLong(), randomLong()), - new OsStats.Swap(randomLong(), randomLong())); + new OsStats.Swap(randomLong(), randomLong()), + new OsStats.Cgroup( + randomAsciiOfLength(8), + randomPositiveLong(), + randomAsciiOfLength(8), + randomPositiveLong(), + randomPositiveLong(), + new OsStats.Cgroup.CpuStat(randomPositiveLong(), randomPositiveLong(), randomPositiveLong()))); } ProcessStats processStats = frequently() ? new ProcessStats(randomPositiveLong(), randomPositiveLong(), randomPositiveLong(), new ProcessStats.Cpu(randomShort(), randomPositiveLong()), diff --git a/core/src/test/java/org/elasticsearch/monitor/os/OsProbeTests.java b/core/src/test/java/org/elasticsearch/monitor/os/OsProbeTests.java index 9cef6671c3ef0..c4923ee8616fb 100644 --- a/core/src/test/java/org/elasticsearch/monitor/os/OsProbeTests.java +++ b/core/src/test/java/org/elasticsearch/monitor/os/OsProbeTests.java @@ -22,6 +22,11 @@ import org.apache.lucene.util.Constants; import org.elasticsearch.test.ESTestCase; +import java.io.IOException; +import java.util.Arrays; +import java.util.Collections; +import java.util.List; + import static org.hamcrest.Matchers.allOf; import static org.hamcrest.Matchers.anyOf; import static org.hamcrest.Matchers.both; @@ -30,8 +35,10 @@ import static org.hamcrest.Matchers.greaterThanOrEqualTo; import static org.hamcrest.Matchers.is; import static org.hamcrest.Matchers.lessThanOrEqualTo; +import static org.hamcrest.Matchers.notNullValue; public class OsProbeTests extends ESTestCase { + private final OsProbe probe = OsProbe.getInstance(); public void testOsInfo() { @@ -102,6 +109,20 @@ public void testOsStats() { assertThat(stats.getSwap().getFree().getBytes(), equalTo(0L)); assertThat(stats.getSwap().getUsed().getBytes(), equalTo(0L)); } + + if (Constants.LINUX) { + if (stats.getCgroup() != null) { + assertThat(stats.getCgroup().getCpuAcctControlGroup(), notNullValue()); + assertThat(stats.getCgroup().getCpuAcctUsageNanos(), greaterThan(0L)); + assertThat(stats.getCgroup().getCpuCfsQuotaMicros(), anyOf(equalTo(-1L), greaterThanOrEqualTo(0L))); + assertThat(stats.getCgroup().getCpuCfsPeriodMicros(), greaterThanOrEqualTo(0L)); + assertThat(stats.getCgroup().getCpuStat().getNumberOfElapsedPeriods(), greaterThanOrEqualTo(0L)); + assertThat(stats.getCgroup().getCpuStat().getNumberOfTimesThrottled(), greaterThanOrEqualTo(0L)); + assertThat(stats.getCgroup().getCpuStat().getTimeThrottledNanos(), greaterThanOrEqualTo(0L)); + } + } else { + assertNull(stats.getCgroup()); + } } public void testGetSystemLoadAverage() { @@ -125,4 +146,70 @@ String readProcLoadavg() { assertThat(systemLoadAverage[2], equalTo(Double.parseDouble("1.99"))); } + public void testCGroupProbe() { + + final String hierarchy = randomAsciiOfLength(16); + + final OsProbe probe = new OsProbe() { + + @Override + List readProcSelfCgroup() throws IOException { + return Arrays.asList( + "11:freezer:/", + "10:net_cls,net_prio:/", + "9:pids:/", + "8:cpuset:/", + "7:blkio:/", + "6:memory:/", + "5:devices:/user.slice", + "4:hugetlb:/", + "3:perf_event:/", + "2:cpu,cpuacct:/" + hierarchy, + "1:name=systemd:/user.slice/user-1000.slice/session-2359.scope"); + } + + @Override + List readSysFsCgroupCpuAcctCpuAcctUsage(String path) throws IOException { + assertThat(path, equalTo("/" + hierarchy)); + return Collections.singletonList("364869866063112"); + } + + @Override + List readSysFsCgroupCpuAcctCpuCfsPeriod(String path) throws IOException { + assertThat(path, equalTo("/" + hierarchy)); + return Collections.singletonList("100000"); + } + + @Override + List readSysFsCgroupCpuAcctCpuAcctCfsQuota(String path) throws IOException { + assertThat(path, equalTo("/" + hierarchy)); + return Collections.singletonList("50000"); + } + + @Override + List readSysFsCgroupCpuAcctCpuStat(String path) throws IOException { + return Arrays.asList( + "nr_periods 17992", + "nr_throttled 1311", + "throttled_time 139298645489"); + } + + @Override + boolean shouldReadCgroups() { + return true; + } + + }; + + final OsStats.Cgroup cgroup = probe.osStats().getCgroup(); + assertThat(cgroup.getCpuAcctControlGroup(), equalTo("/" + hierarchy)); + assertThat(cgroup.getCpuAcctUsageNanos(), equalTo(364869866063112L)); + assertThat(cgroup.getCpuControlGroup(), equalTo("/" + hierarchy)); + assertThat(cgroup.getCpuCfsPeriodMicros(), equalTo(100000L)); + assertThat(cgroup.getCpuCfsQuotaMicros(), equalTo(50000L)); + assertThat(cgroup.getCpuStat().getNumberOfElapsedPeriods(), equalTo(17992L)); + assertThat(cgroup.getCpuStat().getNumberOfTimesThrottled(), equalTo(1311L)); + assertThat(cgroup.getCpuStat().getTimeThrottledNanos(), equalTo(139298645489L)); + } + } diff --git a/core/src/test/java/org/elasticsearch/monitor/os/OsStatsTests.java b/core/src/test/java/org/elasticsearch/monitor/os/OsStatsTests.java index 30d527311b31f..8334f71e86a67 100644 --- a/core/src/test/java/org/elasticsearch/monitor/os/OsStatsTests.java +++ b/core/src/test/java/org/elasticsearch/monitor/os/OsStatsTests.java @@ -36,7 +36,14 @@ public void testSerialization() throws IOException { OsStats.Cpu cpu = new OsStats.Cpu(randomShort(), loadAverages); OsStats.Mem mem = new OsStats.Mem(randomLong(), randomLong()); OsStats.Swap swap = new OsStats.Swap(randomLong(), randomLong()); - OsStats osStats = new OsStats(System.currentTimeMillis(), cpu, mem, swap); + OsStats.Cgroup cgroup = new OsStats.Cgroup( + randomAsciiOfLength(8), + randomPositiveLong(), + randomAsciiOfLength(8), + randomPositiveLong(), + randomPositiveLong(), + new OsStats.Cgroup.CpuStat(randomPositiveLong(), randomPositiveLong(), randomPositiveLong())); + OsStats osStats = new OsStats(System.currentTimeMillis(), cpu, mem, swap, cgroup); try (BytesStreamOutput out = new BytesStreamOutput()) { osStats.writeTo(out); @@ -49,7 +56,22 @@ public void testSerialization() throws IOException { assertEquals(osStats.getMem().getTotal(), deserializedOsStats.getMem().getTotal()); assertEquals(osStats.getSwap().getFree(), deserializedOsStats.getSwap().getFree()); assertEquals(osStats.getSwap().getTotal(), deserializedOsStats.getSwap().getTotal()); + assertEquals(osStats.getCgroup().getCpuAcctControlGroup(), deserializedOsStats.getCgroup().getCpuAcctControlGroup()); + assertEquals(osStats.getCgroup().getCpuAcctUsageNanos(), deserializedOsStats.getCgroup().getCpuAcctUsageNanos()); + assertEquals(osStats.getCgroup().getCpuControlGroup(), deserializedOsStats.getCgroup().getCpuControlGroup()); + assertEquals(osStats.getCgroup().getCpuCfsPeriodMicros(), deserializedOsStats.getCgroup().getCpuCfsPeriodMicros()); + assertEquals(osStats.getCgroup().getCpuCfsQuotaMicros(), deserializedOsStats.getCgroup().getCpuCfsQuotaMicros()); + assertEquals( + osStats.getCgroup().getCpuStat().getNumberOfElapsedPeriods(), + deserializedOsStats.getCgroup().getCpuStat().getNumberOfElapsedPeriods()); + assertEquals( + osStats.getCgroup().getCpuStat().getNumberOfTimesThrottled(), + deserializedOsStats.getCgroup().getCpuStat().getNumberOfTimesThrottled()); + assertEquals( + osStats.getCgroup().getCpuStat().getTimeThrottledNanos(), + deserializedOsStats.getCgroup().getCpuStat().getTimeThrottledNanos()); } } } + } diff --git a/docs/reference/cluster/nodes-stats.asciidoc b/docs/reference/cluster/nodes-stats.asciidoc index e49e2f6b8aadc..a9b21d8ddaa7d 100644 --- a/docs/reference/cluster/nodes-stats.asciidoc +++ b/docs/reference/cluster/nodes-stats.asciidoc @@ -217,6 +217,38 @@ the operating system: `os.swap.used_in_bytes`:: Amount of used swap space in bytes +`os.cgroup.cpuacct.control_group` (Linux only):: + The `cpuacct` control group to which the Elasticsearch process + belongs + +`os.cgroup.cpuacct.usage` (Linux only):: + The total CPU time (in nanoseconds) consumed by all tasks in the + same cgroup as the Elasticsearch process + +`os.cgroup.cpu.control_group` (Linux only):: + The `cpu` control group to which the Elasticsearch process belongs + +`os.cgroup.cpu.cfs_period_micros` (Linux only):: + The period of time (in microseconds) for how regularly all tasks in + the same cgroup as the Elasticsearch process should have their + access to CPU resources reallocated. + +`os.cgroup.cpu.cfs_quota_micros` (Linux only):: + The total amount of time (in microseconds) for which all tasks in + the same cgroup as the Elasticsearch process can run during one + period `os.cgroup.cpu.cfs_period_micros` + +`os.cgroup.cpu.stat.number_of_elapsed_periods` (Linux only):: + The number of reporting periods (as specified by + `os.cgroup.cpu.cfs_period_micros`) that have elapsed + +`os.cgroup.cpu.stat.number_of_times_throttled` (Linux only):: + The number of times all tasks in the same cgroup as the + Elasticsearch process have been throttled. + +`os.cgroup.cpu.stat.time_throttled_nanos` (Linux only):: + The total amount of time (in nanoseconds) for which all tasks in + the same cgroup as the Elasticsearch process have been throttled. [float] [[process-stats]] From 9a67d4b75a98730862d98720f446e6b62e31e4aa Mon Sep 17 00:00:00 2001 From: Jason Tedor Date: Wed, 19 Oct 2016 16:25:49 -0400 Subject: [PATCH 2/8] Add forbidden API suppressions to cgroup reads This commit adds some forbidden API suppressions to methods in OsProbe that are reading from various cgroup virtual files. --- core/src/main/java/org/elasticsearch/monitor/os/OsProbe.java | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/core/src/main/java/org/elasticsearch/monitor/os/OsProbe.java b/core/src/main/java/org/elasticsearch/monitor/os/OsProbe.java index 7b24098d8d77a..11bb1b4c3b698 100644 --- a/core/src/main/java/org/elasticsearch/monitor/os/OsProbe.java +++ b/core/src/main/java/org/elasticsearch/monitor/os/OsProbe.java @@ -210,6 +210,7 @@ private Map getCpuAccountingCGroup() { } // visible for testing + @SuppressForbidden(reason = "access /proc/self/cgroup") List readProcSelfCgroup() throws IOException { return Files.readAllLines(PathUtils.get("/proc/self/cgroup")); } @@ -228,6 +229,7 @@ private long getCgroupCpuAcctUsageNanos(final String path) { } // visible for testing + @SuppressForbidden(reason = "access /sys/fs/cgroup/cpuacct") List readSysFsCgroupCpuAcctCpuAcctUsage(final String path) throws IOException { return Files.readAllLines(PathUtils.get("/sys/fs/cgroup/cpuacct", path, "cpuacct.usage")); } @@ -246,6 +248,7 @@ private long getCgroupCpuAcctCpuCfsPeriodMicros(final String path) { } // visible for testing + @SuppressForbidden(reason = "access /sys/fs/cgroup/cpu") List readSysFsCgroupCpuAcctCpuCfsPeriod(final String path) throws IOException { return Files.readAllLines(PathUtils.get("/sys/fs/cgroup/cpu", path, "cpu.cfs_period_us")); } @@ -264,6 +267,7 @@ private long getCGroupCpuAcctCpuCfsQuotaMicros(final String path) { } // visible for testing + @SuppressForbidden(reason = "access /sys/fs/cgroup/cpu") List readSysFsCgroupCpuAcctCpuAcctCfsQuota(final String path) throws IOException { return Files.readAllLines(PathUtils.get("/sys/fs/cgroup/cpu", path, "cpu.cfs_quota_us")); } @@ -299,6 +303,7 @@ private OsStats.Cgroup.CpuStat getCgroupCpuAcctCpuStat(final String path) { } // visible for testing + @SuppressForbidden(reason = "access /sys/fs/cgroup/cpu") List readSysFsCgroupCpuAcctCpuStat(final String path) throws IOException { return Files.readAllLines(PathUtils.get("/sys/fs/cgroup/cpu", path, "cpu.stat")); } From c502d6dc28a9b27aad74d1dbbd8aae38c285d8b3 Mon Sep 17 00:00:00 2001 From: Jason Tedor Date: Wed, 19 Oct 2016 17:08:44 -0400 Subject: [PATCH 3/8] Add file permission for /sys/fs/cgroup/cpu/- This commit adds a file permission for /sys/fs/cgroup/cpu/- so that the OS probe can read from here when reading CPU cgroup stats. --- .../main/resources/org/elasticsearch/bootstrap/security.policy | 1 + 1 file changed, 1 insertion(+) diff --git a/core/src/main/resources/org/elasticsearch/bootstrap/security.policy b/core/src/main/resources/org/elasticsearch/bootstrap/security.policy index 4eb716d10b367..dfc00dcb01ac8 100644 --- a/core/src/main/resources/org/elasticsearch/bootstrap/security.policy +++ b/core/src/main/resources/org/elasticsearch/bootstrap/security.policy @@ -124,5 +124,6 @@ grant { // control group stats on Linux permission java.io.FilePermission "/proc/self/cgroup", "read"; + permission java.io.FilePermission "/sys/fs/cgroup/cpu/-", "read"; permission java.io.FilePermission "/sys/fs/cgroup/cpuacct/-", "read"; }; From 65d3df20c2698e18f8243add3a8e28be8cb11742 Mon Sep 17 00:00:00 2001 From: Jason Tedor Date: Wed, 19 Oct 2016 22:07:21 -0400 Subject: [PATCH 4/8] Fix method name in OsProbe This commit fixes the name of a method in OsProbe. In particular, a method that previously got the control group for the current process for the cpuacct sub-system has since evolved to get all control groups for the current process. This commit reflects this by renaming this method. --- core/src/main/java/org/elasticsearch/monitor/os/OsProbe.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/core/src/main/java/org/elasticsearch/monitor/os/OsProbe.java b/core/src/main/java/org/elasticsearch/monitor/os/OsProbe.java index 11bb1b4c3b698..1530789d368e5 100644 --- a/core/src/main/java/org/elasticsearch/monitor/os/OsProbe.java +++ b/core/src/main/java/org/elasticsearch/monitor/os/OsProbe.java @@ -185,7 +185,7 @@ public short getSystemCpuPercent() { return Probes.getLoadAndScaleToPercent(getSystemCpuLoad, osMxBean); } - private Map getCpuAccountingCGroup() { + private Map getControlGroups() { try { final List lines = readProcSelfCgroup(); if (!lines.isEmpty()) { @@ -332,7 +332,7 @@ public OsStats osStats() { final OsStats.Swap swap = new OsStats.Swap(getTotalSwapSpaceSize(), getFreeSwapSpaceSize()); final OsStats.Cgroup cgroup; if (shouldReadCgroups()) { - final Map controllerMap = getCpuAccountingCGroup(); + final Map controllerMap = getControlGroups(); if (controllerMap.containsKey("cpu") && controllerMap.containsKey("cpuacct")) { final String cpuAcctControlGroup = controllerMap.get("cpuacct"); final String cpuControlGroup = controllerMap.get("cpu"); From 5bc48cd4052fe300a8567a1ad0ab194a5f7f3877 Mon Sep 17 00:00:00 2001 From: Jason Tedor Date: Thu, 20 Oct 2016 15:13:25 -0400 Subject: [PATCH 5/8] Add Javadocs for basic cgroup stats This commit adds Javadocs for the code that obtains the basic cgroup stats. --- .../org/elasticsearch/monitor/os/OsProbe.java | 349 +++++++++++++----- .../org/elasticsearch/monitor/os/OsStats.java | 68 +++- .../cluster/node/stats/NodeStatsTests.java | 6 + .../monitor/os/OsProbeTests.java | 32 +- 4 files changed, 338 insertions(+), 117 deletions(-) diff --git a/core/src/main/java/org/elasticsearch/monitor/os/OsProbe.java b/core/src/main/java/org/elasticsearch/monitor/os/OsProbe.java index 1530789d368e5..e4d607ebca5f1 100644 --- a/core/src/main/java/org/elasticsearch/monitor/os/OsProbe.java +++ b/core/src/main/java/org/elasticsearch/monitor/os/OsProbe.java @@ -20,8 +20,6 @@ package org.elasticsearch.monitor.os; import org.apache.logging.log4j.Logger; -import org.apache.logging.log4j.message.ParameterizedMessage; -import org.apache.logging.log4j.util.Supplier; import org.apache.lucene.util.Constants; import org.elasticsearch.common.SuppressForbidden; import org.elasticsearch.common.io.PathUtils; @@ -33,10 +31,13 @@ import java.lang.management.OperatingSystemMXBean; import java.lang.reflect.Method; import java.nio.file.Files; +import java.nio.file.Path; import java.util.Collections; import java.util.HashMap; import java.util.List; +import java.util.Locale; import java.util.Map; +import java.util.function.Supplier; import java.util.regex.Matcher; import java.util.regex.Pattern; @@ -141,7 +142,9 @@ final double[] getSystemLoadAverage() { try { return new double[]{Double.parseDouble(fields[0]), Double.parseDouble(fields[1]), Double.parseDouble(fields[2])}; } catch (final NumberFormatException e) { - logger.debug((Supplier) () -> new ParameterizedMessage("error parsing /proc/loadavg [{}]", procLoadAvg), e); + if (logger.isDebugEnabled()) { + logger.debug(String.format(Locale.ROOT, "error parsing /proc/loadavg [%s]", procLoadAvg), e); + } } } // fallback @@ -185,127 +188,292 @@ public short getSystemCpuPercent() { return Probes.getLoadAndScaleToPercent(getSystemCpuLoad, osMxBean); } - private Map getControlGroups() { + /** + * Reads all lines of a file and logs any {@link IOException}. + * + * @param path path to the file to read + * @return all the line or an empty list if an {@link IOException} + * occurred + */ + private List readAllLines(final Path path) { try { - final List lines = readProcSelfCgroup(); - if (!lines.isEmpty()) { - final Map controllerMap = new HashMap<>(); - final Pattern pattern = Pattern.compile("\\d+:(\\w+(?:,\\w+)?):(/.*)"); - for (final String line : lines) { - final Matcher matcher = pattern.matcher(line); - if (matcher.matches()) { - final String[] controllers = matcher.group(1).split(","); - for (final String controller : controllers) { - controllerMap.put(controller, matcher.group(2)); - } - } - } - return controllerMap; - } + return Files.readAllLines(path); } catch (final IOException e) { - // do not fail Elasticsearch if something unexpected happens here + if (logger.isDebugEnabled()) { + logger.debug("error reading " + path, e); + } + return Collections.emptyList(); } - - return Collections.emptyMap(); } - // visible for testing - @SuppressForbidden(reason = "access /proc/self/cgroup") - List readProcSelfCgroup() throws IOException { - return Files.readAllLines(PathUtils.get("/proc/self/cgroup")); + /** + * Reads a file containing a single line and logs any + * {@link IOException}. + * + * @param path path to the file to read + * @return the single line or {@code null} if an + * {@link IOException} occurred + */ + private String readSingleLine(final Path path) { + try { + final List lines = Files.readAllLines(path); + assert lines != null && lines.size() == 1; + return lines.get(0); + } catch (final IOException e) { + if (logger.isDebugEnabled()) { + logger.debug("error reading " + path, e); + } + return null; + } } - private long getCgroupCpuAcctUsageNanos(final String path) { - try { - final List lines = readSysFsCgroupCpuAcctCpuAcctUsage(path); - if (!lines.isEmpty()) { - return Long.parseLong(lines.get(0)); + /** + * Parses the input to a long and logs any + * {@link NumberFormatException} that occurs during parsing. + * + * @param line the line to parse + * @param message a debug message to log if parsing fails + * @return the parsed value + */ + private long parseSingleLineAsLong(final String line, final Supplier message) { + if (line != null) { + try { + return Long.parseLong(line); + } catch (final NumberFormatException e) { + if (logger.isDebugEnabled()) { + logger.debug(message.get(), e); + } + // fallback } - } catch (IOException e) { - // do not fail Elasticsearch is something unexpected happens here } return -1; } - // visible for testing - @SuppressForbidden(reason = "access /sys/fs/cgroup/cpuacct") - List readSysFsCgroupCpuAcctCpuAcctUsage(final String path) throws IOException { - return Files.readAllLines(PathUtils.get("/sys/fs/cgroup/cpuacct", path, "cpuacct.usage")); - } + // pattern for lines in /proc/self/cgroup + private static final Pattern CONTROL_GROUP_PATTERN = Pattern.compile("\\d+:([^:,]+(?:,[^:,]+)?):(/.*)"); - private long getCgroupCpuAcctCpuCfsPeriodMicros(final String path) { - try { - final List lines = readSysFsCgroupCpuAcctCpuCfsPeriod(path); - if (!lines.isEmpty()) { - return Long.parseLong(lines.get(0)); + /** + * A map of the control groups to which the Elasticsearch process + * belongs. Note that this is a map because the control groups can + * vary from subsystem to subsystem. Additionally, this map can not + * be cached because a running process can be reclassified. + * + * @return a map from subsystems to the control group for the + * Elasticsearch process. + */ + private Map getControlGroups() { + final List lines = readProcSelfCgroup(); + if (!lines.isEmpty()) { + final Map controllerMap = new HashMap<>(); + for (final String line : lines) { + final Matcher matcher = CONTROL_GROUP_PATTERN.matcher(line); + // note that Matcher#matches must be invoked as + // matching is lazy; this can not happen in an assert + // as assertions might not be enabled + if (!matcher.matches()) { + assert false : line; + } + // at this point we have captured the subsystems and the + // control group + final String[] controllers = matcher.group(1).split(","); + for (final String controller : controllers) { + controllerMap.put(controller, matcher.group(2)); + } } - } catch (IOException e) { - // do not fail Elasticsearch is something unexpected happens here + return controllerMap; } - return -1; + return Collections.emptyMap(); } - // visible for testing - @SuppressForbidden(reason = "access /sys/fs/cgroup/cpu") - List readSysFsCgroupCpuAcctCpuCfsPeriod(final String path) throws IOException { - return Files.readAllLines(PathUtils.get("/sys/fs/cgroup/cpu", path, "cpu.cfs_period_us")); + /** + * The lines from {@code /proc/self/cgroup}. This file represents + * the control groups to which the Elasticsearch process belongs. + * Each line in this file represents a control group hierarchy of + * the form + *

+ * {@code \d+:([^:,]+(?:,[^:,]+)?):(/.*)} + *

+ * with the first field representing the hierarchy ID, the second + * field representing a comma-separated list of the subsystems + * bound to the hierarchy, and the last field representing the + * control group. + * + * @return the lines from {@code /proc/self/cgroup} or an empty + * list. + */ + @SuppressForbidden(reason = "access /proc/self/cgroup") + List readProcSelfCgroup() { + return readAllLines(PathUtils.get("/proc/self/cgroup")); } - private long getCGroupCpuAcctCpuCfsQuotaMicros(final String path) { - try { - final List lines = readSysFsCgroupCpuAcctCpuAcctCfsQuota(path); - if (!lines.isEmpty()) { - return Long.parseLong(lines.get(0)); - } - } catch (IOException e) { - // do not fail Elasticsearch is something unexpected happens here - } + /** + * The total CPU time in nanoseconds consumed by all tasks in the + * cgroup to which the Elasticsearch process belongs for the + * {@code cpuacct} subsystem. + * + * @param controlGroup the control group for the Elasticsearch + * process for the {@code cpuacct} subsystem + * @return the total CPU time in nanoseconds, or -1 + */ + private long getCgroupCpuAcctUsageNanos(final String controlGroup) { + final String line = readSysFsCgroupCpuAcctCpuAcctUsage(controlGroup); + return parseSingleLineAsLong( + line, + () -> String.format(Locale.ROOT, "error parsing cpuacct.usage [%s] for control group [%s]", line, controlGroup)); + } - return -1; + /** + * Returns the line from {@code cpuacct.usage} for the control + * group to which the Elasticsearch process belongs for the + * {@code cpuacct} subsystem. This line represents the total CPU + * time in nanoseconds consumed by all tasks in the same control + * group. + * + * @param controlGroup the control group to which the Elasticsearch + * process belongs for the {@code cpuacct} + * subsystem + * @return the line from {@code cpuacct.usage} or {@code null} + */ + @SuppressForbidden(reason = "access /sys/fs/cgroup/cpuacct") + String readSysFsCgroupCpuAcctCpuAcctUsage(final String controlGroup) { + return readSingleLine(PathUtils.get("/sys/fs/cgroup/cpuacct", controlGroup, "cpuacct.usage")); + } + + /** + * The total period of time in microseconds for how frequently the + * Elasticsearch control group's access to CPU resources will be + * reallocated. + * + * @param controlGroup the control group for the Elasticsearch + * process for the {@code cpuacct} subsystem + * @return the CFS quota period in microseconds, or -1 + */ + private long getCgroupCpuAcctCpuCfsPeriodMicros(final String controlGroup) { + final String line = readSysFsCgroupCpuAcctCpuCfsPeriod(controlGroup); + return parseSingleLineAsLong( + line, + () -> String.format(Locale.ROOT, "error parsing cpu.cfs_period_us [%s] for control group [%s]", line, controlGroup)); } - // visible for testing + /** + * Returns the line from {@code cpu.cfs_period_us} for the control + * group to which the Elasticsearch process belongs for the + * {@code cpu} subsystem. This line represents the period of time + * in microseconds for how frequently the control group's access to + * CPU resources will be reallocated. + * + * @param controlGroup the control group to which the Elasticsearch + * process belongs for the {@code cpu} + * subsystem + * @return the line from {@code cpu.cfs_period_us} or {@code null} + */ @SuppressForbidden(reason = "access /sys/fs/cgroup/cpu") - List readSysFsCgroupCpuAcctCpuAcctCfsQuota(final String path) throws IOException { - return Files.readAllLines(PathUtils.get("/sys/fs/cgroup/cpu", path, "cpu.cfs_quota_us")); + String readSysFsCgroupCpuAcctCpuCfsPeriod(final String controlGroup) { + return readSingleLine(PathUtils.get("/sys/fs/cgroup/cpu", controlGroup, "cpu.cfs_period_us")); } - private OsStats.Cgroup.CpuStat getCgroupCpuAcctCpuStat(final String path) { - try { - final List lines = readSysFsCgroupCpuAcctCpuStat(path); + /** + * The total time in microseconds that all tasks in the + * Elasticsearch control group can run during one period as + * specified by {@code cpu.cfs_period_us}. + * + * @param controlGroup the control group for the Elasticsearch + * process for the {@code cpuacct} subsystem + * @return the CFS quota in microseconds, or -1 + */ + private long getCGroupCpuAcctCpuCfsQuotaMicros(final String controlGroup) { + final String line = readSysFsCgroupCpuAcctCpuAcctCfsQuota(controlGroup); + return parseSingleLineAsLong( + line, + () -> String.format(Locale.ROOT, "error parsing cpu.cfs_quota_us [%s] for control group [%s]", line, controlGroup)); + } + + /** + * Returns the line from {@code cpu.cfs_quota_us} for the control + * group to which the Elasticsearch process belongs for the + * {@code cpu} subsystem. This line represents the total time in + * microseconds that all tasks in the control group can run during + * one period as specified by {@code cpu.cfs_period_us}. + * + * @param controlGroup the control group to which the Elasticsearch + * process belongs for the {@code cpu} + * subsystem + * @return the line from {@code cpu.cfs_quota_us} or {@code null} + */ + @SuppressForbidden(reason = "access /sys/fs/cgroup/cpu") + String readSysFsCgroupCpuAcctCpuAcctCfsQuota(final String controlGroup) { + return readSingleLine(PathUtils.get("/sys/fs/cgroup/cpu", controlGroup, "cpu.cfs_quota_us")); + } + + /** + * The CPU time statistics for all tasks in the Elasticsearch + * control group. + * + * @param controlGroup the control group for the Elasticsearch + * process for the {@code cpuacct} subsystem + * @return the CPU time statistics or {@code null} + */ + private OsStats.Cgroup.CpuStat getCgroupCpuAcctCpuStat(final String controlGroup) { + final List lines = readSysFsCgroupCpuAcctCpuStat(controlGroup); + if (!lines.isEmpty()) { + assert lines.size() == 3; long numberOfPeriods = -1; long numberOfTimesThrottled = -1; long timeThrottledNanos = -1; - if (!lines.isEmpty()) { - for (final String line : lines) { - final String[] fields = line.split("\\s+"); - switch(fields[0]) { - case "nr_periods": - numberOfPeriods = Long.parseLong(fields[1]); - break; - case "nr_throttled": - numberOfTimesThrottled = Long.parseLong(fields[1]); - break; - case "throttled_time": - timeThrottledNanos = Long.parseLong(fields[1]); - break; - } + for (final String line : lines) { + final String[] fields = line.split("\\s+"); + switch (fields[0]) { + case "nr_periods": + numberOfPeriods = + parseSingleLineAsLong(fields[1], () -> String.format(Locale.ROOT, "error parsing nr_periods [%s]", line)); + break; + case "nr_throttled": + numberOfTimesThrottled = + parseSingleLineAsLong(fields[1], () -> String.format(Locale.ROOT, "error parsing nr_throttled [%s]", line)); + break; + case "throttled_time": + timeThrottledNanos = + parseSingleLineAsLong(fields[1], () -> String.format(Locale.ROOT, "error parsing throttled_time [%s]", line)); + break; } } + assert numberOfPeriods != -1; + assert numberOfTimesThrottled != -1; + assert timeThrottledNanos != -1; return new OsStats.Cgroup.CpuStat(numberOfPeriods, numberOfTimesThrottled, timeThrottledNanos); - } catch (IOException e) { - // do not fail Elasticsearch is something unexpected happens here } return null; } - // visible for testing + /** + * Returns the lines from {@code cpu.stat} for the control + * group to which the Elasticsearch process belongs for the + * {@code cpu} subsystem. These lines represent the CPU time + * statistics and have the form + * + * nr_periods \d+ + * nr_throttled \d+ + * throttled_time \d+ + * + * where {@code nr_periods} is the number of period intervals + * as specified by {@code cpu.cfs_period_us} that have elapsed, + * {@code nr_throttled} is the number of times tasks in the given + * control group have been throttled, and {@code throttled_time} is + * the total time in nanoseconds for which tasks in the given + * control group have been throttled. + * + * @param controlGroup the control group to which the Elasticsearch + * process belongs for the {@code cpu} + * subsystem + * + * @return the line from {@code cpu.cfs_quota_us} or {@code null} + */ @SuppressForbidden(reason = "access /sys/fs/cgroup/cpu") - List readSysFsCgroupCpuAcctCpuStat(final String path) throws IOException { - return Files.readAllLines(PathUtils.get("/sys/fs/cgroup/cpu", path, "cpu.stat")); + List readSysFsCgroupCpuAcctCpuStat(final String controlGroup) { + return readAllLines(PathUtils.get("/sys/fs/cgroup/cpu", controlGroup, "cpu.stat")); } private static class OsProbeHolder { @@ -331,11 +499,11 @@ public OsStats osStats() { final OsStats.Mem mem = new OsStats.Mem(getTotalPhysicalMemorySize(), getFreePhysicalMemorySize()); final OsStats.Swap swap = new OsStats.Swap(getTotalSwapSpaceSize(), getFreeSwapSpaceSize()); final OsStats.Cgroup cgroup; - if (shouldReadCgroups()) { + if (Constants.LINUX) { final Map controllerMap = getControlGroups(); - if (controllerMap.containsKey("cpu") && controllerMap.containsKey("cpuacct")) { - final String cpuAcctControlGroup = controllerMap.get("cpuacct"); + if (controllerMap.containsKey("cpuacct") && controllerMap.containsKey("cpu")) { final String cpuControlGroup = controllerMap.get("cpu"); + final String cpuAcctControlGroup = controllerMap.get("cpuacct"); cgroup = new OsStats.Cgroup( cpuAcctControlGroup, @@ -353,11 +521,6 @@ public OsStats osStats() { return new OsStats(System.currentTimeMillis(), cpu, mem, swap, cgroup); } - // visible for testing - boolean shouldReadCgroups() { - return Constants.LINUX; - } - /** * Returns a given method of the OperatingSystemMXBean, * or null if the method is not found or unavailable. diff --git a/core/src/main/java/org/elasticsearch/monitor/os/OsStats.java b/core/src/main/java/org/elasticsearch/monitor/os/OsStats.java index 103f2555fd4bb..ec6fdc6c1f0fb 100644 --- a/core/src/main/java/org/elasticsearch/monitor/os/OsStats.java +++ b/core/src/main/java/org/elasticsearch/monitor/os/OsStats.java @@ -274,35 +274,74 @@ public XContentBuilder toXContent(XContentBuilder builder, Params params) throws } } + /** + * Encapsulates basic cgroup statistics. + */ public static class Cgroup implements Writeable, ToXContent { private final String cpuAcctControlGroup; private final long cpuAcctUsageNanos; private final String cpuControlGroup; - private final long cpuCfsPeriodMicros; // completely fair scheduler enforcement period - private final long cpuCfsQuotaMicros; // completely fair scheduler quota + private final long cpuCfsPeriodMicros; + private final long cpuCfsQuotaMicros; private final CpuStat cpuStat; + /** + * The control group for the {@code cpuacct} subsystem. + * + * @return the control group + */ public String getCpuAcctControlGroup() { return cpuAcctControlGroup; } + /** + * The total CPU time consumed by all tasks in the + * {@code cpuacct} control group from + * {@link Cgroup#cpuAcctControlGroup}. + * + * @return the total CPU time in nanoseconds + */ public long getCpuAcctUsageNanos() { return cpuAcctUsageNanos; } + /** + * The control group for the {@code cpu} subsystem. + * + * @return the control group + */ public String getCpuControlGroup() { return cpuControlGroup; } + /** + * The period of time for how frequently the control group from + * {@link Cgroup#cpuControlGroup} has its access to CPU + * resources reallocated. + * + * @return the period of time in microseconds + */ public long getCpuCfsPeriodMicros() { return cpuCfsPeriodMicros; } + /** + * The total amount of time for which all tasks in the control + * group from {@link Cgroup#cpuControlGroup} can run in one + * period as represented by {@link Cgroup#cpuCfsPeriodMicros}. + * + * @return the total amount of time in microseconds + */ public long getCpuCfsQuotaMicros() { return cpuCfsQuotaMicros; } + /** + * The CPU time statistics. See {@link CpuStat}. + * + * @return the CPU time statistics. + */ public CpuStat getCpuStat() { return cpuStat; } @@ -336,7 +375,7 @@ public Cgroup( } @Override - public void writeTo(StreamOutput out) throws IOException { + public void writeTo(final StreamOutput out) throws IOException { out.writeString(cpuAcctControlGroup); out.writeLong(cpuAcctUsageNanos); out.writeString(cpuControlGroup); @@ -351,7 +390,7 @@ public void writeTo(StreamOutput out) throws IOException { } @Override - public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException { + public XContentBuilder toXContent(final XContentBuilder builder, final Params params) throws IOException { builder.startObject("cgroup"); { builder.startObject("cpuacct"); @@ -373,20 +412,41 @@ public XContentBuilder toXContent(XContentBuilder builder, Params params) throws return builder; } + /** + * Encapsulates CPU time statistics. + */ public static class CpuStat implements Writeable, ToXContent { private final long numberOfElapsedPeriods; private final long numberOfTimesThrottled; private final long timeThrottledNanos; + /** + * The number of elapsed periods. + * + * @return the number of elapsed periods as measured by + * {@code cpu.cfs_period_us} + */ public long getNumberOfElapsedPeriods() { return numberOfElapsedPeriods; } + /** + * The number of times tasks in the control group have been + * throttled. + * + * @return the number of times + */ public long getNumberOfTimesThrottled() { return numberOfTimesThrottled; } + /** + * The total time duration for which tasks in the control + * group have been throttled. + * + * @return the total time in nanoseconds + */ public long getTimeThrottledNanos() { return timeThrottledNanos; } diff --git a/core/src/test/java/org/elasticsearch/action/admin/cluster/node/stats/NodeStatsTests.java b/core/src/test/java/org/elasticsearch/action/admin/cluster/node/stats/NodeStatsTests.java index 7c82df1157082..6913abd81a4ae 100644 --- a/core/src/test/java/org/elasticsearch/action/admin/cluster/node/stats/NodeStatsTests.java +++ b/core/src/test/java/org/elasticsearch/action/admin/cluster/node/stats/NodeStatsTests.java @@ -78,9 +78,15 @@ public void testSerialization() throws IOException { assertEquals( nodeStats.getOs().getCgroup().getCpuAcctUsageNanos(), deserializedNodeStats.getOs().getCgroup().getCpuAcctUsageNanos()); + assertEquals( + nodeStats.getOs().getCgroup().getCpuControlGroup(), + deserializedNodeStats.getOs().getCgroup().getCpuControlGroup()); assertEquals( nodeStats.getOs().getCgroup().getCpuCfsPeriodMicros(), deserializedNodeStats.getOs().getCgroup().getCpuCfsPeriodMicros()); + assertEquals( + nodeStats.getOs().getCgroup().getCpuCfsQuotaMicros(), + deserializedNodeStats.getOs().getCgroup().getCpuCfsQuotaMicros()); assertEquals( nodeStats.getOs().getCgroup().getCpuStat().getNumberOfElapsedPeriods(), deserializedNodeStats.getOs().getCgroup().getCpuStat().getNumberOfElapsedPeriods()); diff --git a/core/src/test/java/org/elasticsearch/monitor/os/OsProbeTests.java b/core/src/test/java/org/elasticsearch/monitor/os/OsProbeTests.java index c4923ee8616fb..978a699bc16dd 100644 --- a/core/src/test/java/org/elasticsearch/monitor/os/OsProbeTests.java +++ b/core/src/test/java/org/elasticsearch/monitor/os/OsProbeTests.java @@ -22,9 +22,7 @@ import org.apache.lucene.util.Constants; import org.elasticsearch.test.ESTestCase; -import java.io.IOException; import java.util.Arrays; -import java.util.Collections; import java.util.List; import static org.hamcrest.Matchers.allOf; @@ -146,14 +144,13 @@ String readProcLoadavg() { assertThat(systemLoadAverage[2], equalTo(Double.parseDouble("1.99"))); } - public void testCGroupProbe() { - + public void testCgroupProbe() { final String hierarchy = randomAsciiOfLength(16); final OsProbe probe = new OsProbe() { @Override - List readProcSelfCgroup() throws IOException { + List readProcSelfCgroup() { return Arrays.asList( "11:freezer:/", "10:net_cls,net_prio:/", @@ -169,36 +166,31 @@ List readProcSelfCgroup() throws IOException { } @Override - List readSysFsCgroupCpuAcctCpuAcctUsage(String path) throws IOException { - assertThat(path, equalTo("/" + hierarchy)); - return Collections.singletonList("364869866063112"); + String readSysFsCgroupCpuAcctCpuAcctUsage(String controlGroup) { + assertThat(controlGroup, equalTo("/" + hierarchy)); + return "364869866063112"; } @Override - List readSysFsCgroupCpuAcctCpuCfsPeriod(String path) throws IOException { - assertThat(path, equalTo("/" + hierarchy)); - return Collections.singletonList("100000"); + String readSysFsCgroupCpuAcctCpuCfsPeriod(String controlGroup) { + assertThat(controlGroup, equalTo("/" + hierarchy)); + return "100000"; } @Override - List readSysFsCgroupCpuAcctCpuAcctCfsQuota(String path) throws IOException { - assertThat(path, equalTo("/" + hierarchy)); - return Collections.singletonList("50000"); + String readSysFsCgroupCpuAcctCpuAcctCfsQuota(String controlGroup) { + assertThat(controlGroup, equalTo("/" + hierarchy)); + return "50000"; } @Override - List readSysFsCgroupCpuAcctCpuStat(String path) throws IOException { + List readSysFsCgroupCpuAcctCpuStat(String controlGroup) { return Arrays.asList( "nr_periods 17992", "nr_throttled 1311", "throttled_time 139298645489"); } - @Override - boolean shouldReadCgroups() { - return true; - } - }; final OsStats.Cgroup cgroup = probe.osStats().getCgroup(); From 5e2b71f06a2ee350f2c34695f38146f108323edb Mon Sep 17 00:00:00 2001 From: Jason Tedor Date: Fri, 21 Oct 2016 05:40:30 -0400 Subject: [PATCH 6/8] Improve readability of control group matching This commit improves the readability of control group matching in OsProbe. --- core/src/main/java/org/elasticsearch/monitor/os/OsProbe.java | 5 ++--- .../test/java/org/elasticsearch/monitor/os/OsProbeTests.java | 2 ++ 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/core/src/main/java/org/elasticsearch/monitor/os/OsProbe.java b/core/src/main/java/org/elasticsearch/monitor/os/OsProbe.java index e4d607ebca5f1..3cd64f70eb9a5 100644 --- a/core/src/main/java/org/elasticsearch/monitor/os/OsProbe.java +++ b/core/src/main/java/org/elasticsearch/monitor/os/OsProbe.java @@ -271,9 +271,8 @@ private Map getControlGroups() { // note that Matcher#matches must be invoked as // matching is lazy; this can not happen in an assert // as assertions might not be enabled - if (!matcher.matches()) { - assert false : line; - } + final boolean matches = matcher.matches(); + assert matches : line; // at this point we have captured the subsystems and the // control group final String[] controllers = matcher.group(1).split(","); diff --git a/core/src/test/java/org/elasticsearch/monitor/os/OsProbeTests.java b/core/src/test/java/org/elasticsearch/monitor/os/OsProbeTests.java index 978a699bc16dd..f477ef3214ff4 100644 --- a/core/src/test/java/org/elasticsearch/monitor/os/OsProbeTests.java +++ b/core/src/test/java/org/elasticsearch/monitor/os/OsProbeTests.java @@ -145,6 +145,8 @@ String readProcLoadavg() { } public void testCgroupProbe() { + assumeTrue("test runs on Linux only", Constants.LINUX); + final String hierarchy = randomAsciiOfLength(16); final OsProbe probe = new OsProbe() { From 8ff1a875792fbc6ff47a82c556202ca4d180f796 Mon Sep 17 00:00:00 2001 From: Jason Tedor Date: Fri, 21 Oct 2016 07:28:36 -0400 Subject: [PATCH 7/8] Remove cgroup stats leniency This commit removes some leniency in the handling of I/O exceptions during the reading of cgroup stats. --- .../org/elasticsearch/monitor/os/OsProbe.java | 273 ++++++++---------- .../org/elasticsearch/monitor/os/OsStats.java | 15 +- 2 files changed, 119 insertions(+), 169 deletions(-) diff --git a/core/src/main/java/org/elasticsearch/monitor/os/OsProbe.java b/core/src/main/java/org/elasticsearch/monitor/os/OsProbe.java index 3cd64f70eb9a5..346a3915cbf34 100644 --- a/core/src/main/java/org/elasticsearch/monitor/os/OsProbe.java +++ b/core/src/main/java/org/elasticsearch/monitor/os/OsProbe.java @@ -32,12 +32,10 @@ import java.lang.reflect.Method; import java.nio.file.Files; import java.nio.file.Path; -import java.util.Collections; import java.util.HashMap; import java.util.List; import java.util.Locale; import java.util.Map; -import java.util.function.Supplier; import java.util.regex.Matcher; import java.util.regex.Pattern; @@ -189,65 +187,16 @@ public short getSystemCpuPercent() { } /** - * Reads all lines of a file and logs any {@link IOException}. + * Reads a file containing a single line. * * @param path path to the file to read - * @return all the line or an empty list if an {@link IOException} - * occurred + * @return the single line + * @throws IOException if an I/O exception occurs reading the file */ - private List readAllLines(final Path path) { - try { - return Files.readAllLines(path); - } catch (final IOException e) { - if (logger.isDebugEnabled()) { - logger.debug("error reading " + path, e); - } - return Collections.emptyList(); - } - } - - /** - * Reads a file containing a single line and logs any - * {@link IOException}. - * - * @param path path to the file to read - * @return the single line or {@code null} if an - * {@link IOException} occurred - */ - private String readSingleLine(final Path path) { - try { - final List lines = Files.readAllLines(path); - assert lines != null && lines.size() == 1; - return lines.get(0); - } catch (final IOException e) { - if (logger.isDebugEnabled()) { - logger.debug("error reading " + path, e); - } - return null; - } - } - - /** - * Parses the input to a long and logs any - * {@link NumberFormatException} that occurs during parsing. - * - * @param line the line to parse - * @param message a debug message to log if parsing fails - * @return the parsed value - */ - private long parseSingleLineAsLong(final String line, final Supplier message) { - if (line != null) { - try { - return Long.parseLong(line); - } catch (final NumberFormatException e) { - if (logger.isDebugEnabled()) { - logger.debug(message.get(), e); - } - // fallback - } - } - - return -1; + private String readSingleLine(final Path path) throws IOException { + final List lines = Files.readAllLines(path); + assert lines != null && lines.size() == 1; + return lines.get(0); } // pattern for lines in /proc/self/cgroup @@ -261,29 +210,27 @@ private long parseSingleLineAsLong(final String line, final Supplier mes * * @return a map from subsystems to the control group for the * Elasticsearch process. + * @throws IOException if an I/O exception occurs reading + * {@code /proc/self/cgroup} */ - private Map getControlGroups() { + private Map getControlGroups() throws IOException { final List lines = readProcSelfCgroup(); - if (!lines.isEmpty()) { - final Map controllerMap = new HashMap<>(); - for (final String line : lines) { - final Matcher matcher = CONTROL_GROUP_PATTERN.matcher(line); - // note that Matcher#matches must be invoked as - // matching is lazy; this can not happen in an assert - // as assertions might not be enabled - final boolean matches = matcher.matches(); - assert matches : line; - // at this point we have captured the subsystems and the - // control group - final String[] controllers = matcher.group(1).split(","); - for (final String controller : controllers) { - controllerMap.put(controller, matcher.group(2)); - } + final Map controllerMap = new HashMap<>(); + for (final String line : lines) { + final Matcher matcher = CONTROL_GROUP_PATTERN.matcher(line); + // note that Matcher#matches must be invoked as + // matching is lazy; this can not happen in an assert + // as assertions might not be enabled + final boolean matches = matcher.matches(); + assert matches : line; + // at this point we have captured the subsystems and the + // control group + final String[] controllers = matcher.group(1).split(","); + for (final String controller : controllers) { + controllerMap.put(controller, matcher.group(2)); } - return controllerMap; } - - return Collections.emptyMap(); + return controllerMap; } /** @@ -299,12 +246,15 @@ private Map getControlGroups() { * bound to the hierarchy, and the last field representing the * control group. * - * @return the lines from {@code /proc/self/cgroup} or an empty - * list. + * @return the lines from {@code /proc/self/cgroup} + * @throws IOException if an I/O exception occurs reading + * {@code /proc/self/cgroup} */ @SuppressForbidden(reason = "access /proc/self/cgroup") - List readProcSelfCgroup() { - return readAllLines(PathUtils.get("/proc/self/cgroup")); + List readProcSelfCgroup() throws IOException { + final List lines = Files.readAllLines(PathUtils.get("/proc/self/cgroup")); + assert lines != null && !lines.isEmpty(); + return lines; } /** @@ -314,13 +264,12 @@ List readProcSelfCgroup() { * * @param controlGroup the control group for the Elasticsearch * process for the {@code cpuacct} subsystem - * @return the total CPU time in nanoseconds, or -1 + * @return the total CPU time in nanoseconds + * @throws IOException if an I/O exception occurs reading + * {@code cpuacct.usage} for the control group */ - private long getCgroupCpuAcctUsageNanos(final String controlGroup) { - final String line = readSysFsCgroupCpuAcctCpuAcctUsage(controlGroup); - return parseSingleLineAsLong( - line, - () -> String.format(Locale.ROOT, "error parsing cpuacct.usage [%s] for control group [%s]", line, controlGroup)); + private long getCgroupCpuAcctUsageNanos(final String controlGroup) throws IOException { + return Long.parseLong(readSysFsCgroupCpuAcctCpuAcctUsage(controlGroup)); } /** @@ -333,10 +282,12 @@ private long getCgroupCpuAcctUsageNanos(final String controlGroup) { * @param controlGroup the control group to which the Elasticsearch * process belongs for the {@code cpuacct} * subsystem - * @return the line from {@code cpuacct.usage} or {@code null} + * @return the line from {@code cpuacct.usage} + * @throws IOException if an I/O exception occurs reading + * {@code cpuacct.usage} for the control group */ @SuppressForbidden(reason = "access /sys/fs/cgroup/cpuacct") - String readSysFsCgroupCpuAcctCpuAcctUsage(final String controlGroup) { + String readSysFsCgroupCpuAcctCpuAcctUsage(final String controlGroup) throws IOException { return readSingleLine(PathUtils.get("/sys/fs/cgroup/cpuacct", controlGroup, "cpuacct.usage")); } @@ -347,13 +298,12 @@ String readSysFsCgroupCpuAcctCpuAcctUsage(final String controlGroup) { * * @param controlGroup the control group for the Elasticsearch * process for the {@code cpuacct} subsystem - * @return the CFS quota period in microseconds, or -1 + * @return the CFS quota period in microseconds + * @throws IOException if an I/O exception occurs reading + * {@code cpu.cfs_period_us} for the control group */ - private long getCgroupCpuAcctCpuCfsPeriodMicros(final String controlGroup) { - final String line = readSysFsCgroupCpuAcctCpuCfsPeriod(controlGroup); - return parseSingleLineAsLong( - line, - () -> String.format(Locale.ROOT, "error parsing cpu.cfs_period_us [%s] for control group [%s]", line, controlGroup)); + private long getCgroupCpuAcctCpuCfsPeriodMicros(final String controlGroup) throws IOException { + return Long.parseLong(readSysFsCgroupCpuAcctCpuCfsPeriod(controlGroup)); } /** @@ -366,10 +316,12 @@ private long getCgroupCpuAcctCpuCfsPeriodMicros(final String controlGroup) { * @param controlGroup the control group to which the Elasticsearch * process belongs for the {@code cpu} * subsystem - * @return the line from {@code cpu.cfs_period_us} or {@code null} + * @return the line from {@code cpu.cfs_period_us} + * @throws IOException if an I/O exception occurs reading + * {@code cpu.cfs_period_us} for the control group */ @SuppressForbidden(reason = "access /sys/fs/cgroup/cpu") - String readSysFsCgroupCpuAcctCpuCfsPeriod(final String controlGroup) { + String readSysFsCgroupCpuAcctCpuCfsPeriod(final String controlGroup) throws IOException { return readSingleLine(PathUtils.get("/sys/fs/cgroup/cpu", controlGroup, "cpu.cfs_period_us")); } @@ -380,13 +332,12 @@ String readSysFsCgroupCpuAcctCpuCfsPeriod(final String controlGroup) { * * @param controlGroup the control group for the Elasticsearch * process for the {@code cpuacct} subsystem - * @return the CFS quota in microseconds, or -1 + * @return the CFS quota in microseconds + * @throws IOException if an I/O exception occurs reading + * {@code cpu.cfs_quota_us} for the control group */ - private long getCGroupCpuAcctCpuCfsQuotaMicros(final String controlGroup) { - final String line = readSysFsCgroupCpuAcctCpuAcctCfsQuota(controlGroup); - return parseSingleLineAsLong( - line, - () -> String.format(Locale.ROOT, "error parsing cpu.cfs_quota_us [%s] for control group [%s]", line, controlGroup)); + private long getCGroupCpuAcctCpuCfsQuotaMicros(final String controlGroup) throws IOException { + return Long.parseLong(readSysFsCgroupCpuAcctCpuAcctCfsQuota(controlGroup)); } /** @@ -399,10 +350,12 @@ private long getCGroupCpuAcctCpuCfsQuotaMicros(final String controlGroup) { * @param controlGroup the control group to which the Elasticsearch * process belongs for the {@code cpu} * subsystem - * @return the line from {@code cpu.cfs_quota_us} or {@code null} + * @return the line from {@code cpu.cfs_quota_us} + * @throws IOException if an I/O exception occurs reading + * {@code cpu.cfs_quota_us} for the control group */ @SuppressForbidden(reason = "access /sys/fs/cgroup/cpu") - String readSysFsCgroupCpuAcctCpuAcctCfsQuota(final String controlGroup) { + String readSysFsCgroupCpuAcctCpuAcctCfsQuota(final String controlGroup) throws IOException { return readSingleLine(PathUtils.get("/sys/fs/cgroup/cpu", controlGroup, "cpu.cfs_quota_us")); } @@ -412,39 +365,33 @@ String readSysFsCgroupCpuAcctCpuAcctCfsQuota(final String controlGroup) { * * @param controlGroup the control group for the Elasticsearch * process for the {@code cpuacct} subsystem - * @return the CPU time statistics or {@code null} + * @return the CPU time statistics + * @throws IOException if an I/O exception occurs reading + * {@code cpu.stat} for the control group */ - private OsStats.Cgroup.CpuStat getCgroupCpuAcctCpuStat(final String controlGroup) { + private OsStats.Cgroup.CpuStat getCgroupCpuAcctCpuStat(final String controlGroup) throws IOException { final List lines = readSysFsCgroupCpuAcctCpuStat(controlGroup); - if (!lines.isEmpty()) { - assert lines.size() == 3; - long numberOfPeriods = -1; - long numberOfTimesThrottled = -1; - long timeThrottledNanos = -1; - for (final String line : lines) { - final String[] fields = line.split("\\s+"); - switch (fields[0]) { - case "nr_periods": - numberOfPeriods = - parseSingleLineAsLong(fields[1], () -> String.format(Locale.ROOT, "error parsing nr_periods [%s]", line)); - break; - case "nr_throttled": - numberOfTimesThrottled = - parseSingleLineAsLong(fields[1], () -> String.format(Locale.ROOT, "error parsing nr_throttled [%s]", line)); - break; - case "throttled_time": - timeThrottledNanos = - parseSingleLineAsLong(fields[1], () -> String.format(Locale.ROOT, "error parsing throttled_time [%s]", line)); - break; - } + long numberOfPeriods = -1; + long numberOfTimesThrottled = -1; + long timeThrottledNanos = -1; + for (final String line : lines) { + final String[] fields = line.split("\\s+"); + switch (fields[0]) { + case "nr_periods": + numberOfPeriods = Long.parseLong(fields[1]); + break; + case "nr_throttled": + numberOfTimesThrottled = Long.parseLong(fields[1]); + break; + case "throttled_time": + timeThrottledNanos = Long.parseLong(fields[1]); + break; } - assert numberOfPeriods != -1; - assert numberOfTimesThrottled != -1; - assert timeThrottledNanos != -1; - return new OsStats.Cgroup.CpuStat(numberOfPeriods, numberOfTimesThrottled, timeThrottledNanos); } - - return null; + assert numberOfPeriods != -1; + assert numberOfTimesThrottled != -1; + assert timeThrottledNanos != -1; + return new OsStats.Cgroup.CpuStat(numberOfPeriods, numberOfTimesThrottled, timeThrottledNanos); } /** @@ -468,11 +415,41 @@ private OsStats.Cgroup.CpuStat getCgroupCpuAcctCpuStat(final String controlGroup * process belongs for the {@code cpu} * subsystem * - * @return the line from {@code cpu.cfs_quota_us} or {@code null} + * @return the lines from {@code cpu.stat} + * @throws IOException if an I/O exception occurs reading + * {@code cpu.stat} for the control group */ @SuppressForbidden(reason = "access /sys/fs/cgroup/cpu") - List readSysFsCgroupCpuAcctCpuStat(final String controlGroup) { - return readAllLines(PathUtils.get("/sys/fs/cgroup/cpu", controlGroup, "cpu.stat")); + List readSysFsCgroupCpuAcctCpuStat(final String controlGroup) throws IOException { + final List lines = Files.readAllLines(PathUtils.get("/sys/fs/cgroup/cpu", controlGroup, "cpu.stat")); + assert lines != null && lines.size() == 3; + return lines; + } + + /** + * Basic cgroup stats. + * + * @return basic cgroup stats, or {@code null} if an I/O exception + * occurred reading the cgroup stats + */ + private OsStats.Cgroup getCgroup() { + try { + final Map controllerMap = getControlGroups(); + final String cpuControlGroup = controllerMap.get("cpu"); + final String cpuAcctControlGroup = controllerMap.get("cpuacct"); + return new OsStats.Cgroup( + cpuAcctControlGroup, + getCgroupCpuAcctUsageNanos(cpuAcctControlGroup), + cpuControlGroup, + getCgroupCpuAcctCpuCfsPeriodMicros(cpuControlGroup), + getCGroupCpuAcctCpuCfsQuotaMicros(cpuControlGroup), + getCgroupCpuAcctCpuStat(cpuControlGroup)); + } catch (final IOException e) { + if (logger.isDebugEnabled()) { + logger.debug("error reading control group stats", e); + } + return null; + } } private static class OsProbeHolder { @@ -484,6 +461,7 @@ public static OsProbe getInstance() { } OsProbe() { + } private final Logger logger = ESLoggerFactory.getLogger(getClass()); @@ -497,26 +475,7 @@ public OsStats osStats() { final OsStats.Cpu cpu = new OsStats.Cpu(getSystemCpuPercent(), getSystemLoadAverage()); final OsStats.Mem mem = new OsStats.Mem(getTotalPhysicalMemorySize(), getFreePhysicalMemorySize()); final OsStats.Swap swap = new OsStats.Swap(getTotalSwapSpaceSize(), getFreeSwapSpaceSize()); - final OsStats.Cgroup cgroup; - if (Constants.LINUX) { - final Map controllerMap = getControlGroups(); - if (controllerMap.containsKey("cpuacct") && controllerMap.containsKey("cpu")) { - final String cpuControlGroup = controllerMap.get("cpu"); - final String cpuAcctControlGroup = controllerMap.get("cpuacct"); - cgroup = - new OsStats.Cgroup( - cpuAcctControlGroup, - getCgroupCpuAcctUsageNanos(cpuAcctControlGroup), - cpuControlGroup, - getCgroupCpuAcctCpuCfsPeriodMicros(cpuControlGroup), - getCGroupCpuAcctCpuCfsQuotaMicros(cpuControlGroup), - getCgroupCpuAcctCpuStat(cpuControlGroup)); - } else { - cgroup = null; - } - } else { - cgroup = null; - } + final OsStats.Cgroup cgroup = Constants.LINUX ? getCgroup() : null; return new OsStats(System.currentTimeMillis(), cpu, mem, swap, cgroup); } diff --git a/core/src/main/java/org/elasticsearch/monitor/os/OsStats.java b/core/src/main/java/org/elasticsearch/monitor/os/OsStats.java index ec6fdc6c1f0fb..f8797dc6d7c5e 100644 --- a/core/src/main/java/org/elasticsearch/monitor/os/OsStats.java +++ b/core/src/main/java/org/elasticsearch/monitor/os/OsStats.java @@ -358,7 +358,7 @@ public Cgroup( this.cpuControlGroup = cpuControlGroup; this.cpuCfsPeriodMicros = cpuCfsPeriodMicros; this.cpuCfsQuotaMicros = cpuCfsQuotaMicros; - this.cpuStat = cpuStat; + this.cpuStat = Objects.requireNonNull(cpuStat); } Cgroup(final StreamInput in) throws IOException { @@ -367,11 +367,7 @@ public Cgroup( cpuControlGroup = in.readString(); cpuCfsPeriodMicros = in.readLong(); cpuCfsQuotaMicros = in.readLong(); - if (!in.readBoolean()) { - cpuStat = null; - } else { - cpuStat = new CpuStat(in); - } + cpuStat = new CpuStat(in); } @Override @@ -381,12 +377,7 @@ public void writeTo(final StreamOutput out) throws IOException { out.writeString(cpuControlGroup); out.writeLong(cpuCfsPeriodMicros); out.writeLong(cpuCfsQuotaMicros); - if (cpuStat == null) { - out.writeBoolean(false); - } else { - out.writeBoolean(true); - cpuStat.writeTo(out); - } + cpuStat.writeTo(out); } @Override From 9f790a791f567f80cae223987ef62a1607e77daa Mon Sep 17 00:00:00 2001 From: Jason Tedor Date: Fri, 21 Oct 2016 10:58:44 -0400 Subject: [PATCH 8/8] Handle when cgroups are unavailable in OsStats This commit fixes an issue when cgroups are not available and thus the cgroup field is null. This can happen on non-Linux systems, for example, or Linux kernels where cgroups are not compiled in. --- .../main/java/org/elasticsearch/monitor/os/OsStats.java | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/core/src/main/java/org/elasticsearch/monitor/os/OsStats.java b/core/src/main/java/org/elasticsearch/monitor/os/OsStats.java index f8797dc6d7c5e..8a7a842e9de1d 100644 --- a/core/src/main/java/org/elasticsearch/monitor/os/OsStats.java +++ b/core/src/main/java/org/elasticsearch/monitor/os/OsStats.java @@ -51,7 +51,7 @@ public OsStats(StreamInput in) throws IOException { this.cpu = new Cpu(in); this.mem = new Mem(in); this.swap = new Swap(in); - this.cgroup = new Cgroup(in); + this.cgroup = in.readOptionalWriteable(Cgroup::new); } @Override @@ -60,7 +60,7 @@ public void writeTo(StreamOutput out) throws IOException { cpu.writeTo(out); mem.writeTo(out); swap.writeTo(out); - cgroup.writeTo(out); + out.writeOptionalWriteable(cgroup); } public long getTimestamp() { @@ -111,7 +111,9 @@ public XContentBuilder toXContent(XContentBuilder builder, Params params) throws cpu.toXContent(builder, params); mem.toXContent(builder, params); swap.toXContent(builder, params); - cgroup.toXContent(builder, params); + if (cgroup != null) { + cgroup.toXContent(builder, params); + } builder.endObject(); return builder; }