diff --git a/core/src/main/java/org/elasticsearch/monitor/os/OsProbe.java b/core/src/main/java/org/elasticsearch/monitor/os/OsProbe.java index 346a3915cbf34..49c24fad9f4d5 100644 --- a/core/src/main/java/org/elasticsearch/monitor/os/OsProbe.java +++ b/core/src/main/java/org/elasticsearch/monitor/os/OsProbe.java @@ -32,6 +32,7 @@ import java.lang.reflect.Method; import java.nio.file.Files; import java.nio.file.Path; +import java.util.Collections; import java.util.HashMap; import java.util.List; import java.util.Locale; @@ -211,7 +212,7 @@ private String readSingleLine(final Path path) throws IOException { * @return a map from subsystems to the control group for the * Elasticsearch process. * @throws IOException if an I/O exception occurs reading - * {@code /proc/self/cgroup} + * {@code /proc/self/cgroup} */ private Map getControlGroups() throws IOException { final List lines = readProcSelfCgroup(); @@ -248,7 +249,7 @@ private Map getControlGroups() throws IOException { * * @return the lines from {@code /proc/self/cgroup} * @throws IOException if an I/O exception occurs reading - * {@code /proc/self/cgroup} + * {@code /proc/self/cgroup} */ @SuppressForbidden(reason = "access /proc/self/cgroup") List readProcSelfCgroup() throws IOException { @@ -266,7 +267,7 @@ List readProcSelfCgroup() throws IOException { * process for the {@code cpuacct} subsystem * @return the total CPU time in nanoseconds * @throws IOException if an I/O exception occurs reading - * {@code cpuacct.usage} for the control group + * {@code cpuacct.usage} for the control group */ private long getCgroupCpuAcctUsageNanos(final String controlGroup) throws IOException { return Long.parseLong(readSysFsCgroupCpuAcctCpuAcctUsage(controlGroup)); @@ -284,7 +285,7 @@ private long getCgroupCpuAcctUsageNanos(final String controlGroup) throws IOExce * subsystem * @return the line from {@code cpuacct.usage} * @throws IOException if an I/O exception occurs reading - * {@code cpuacct.usage} for the control group + * 
{@code cpuacct.usage} for the control group */ @SuppressForbidden(reason = "access /sys/fs/cgroup/cpuacct") String readSysFsCgroupCpuAcctCpuAcctUsage(final String controlGroup) throws IOException { @@ -300,7 +301,7 @@ String readSysFsCgroupCpuAcctCpuAcctUsage(final String controlGroup) throws IOEx * process for the {@code cpuacct} subsystem * @return the CFS quota period in microseconds * @throws IOException if an I/O exception occurs reading - * {@code cpu.cfs_period_us} for the control group + * {@code cpu.cfs_period_us} for the control group */ private long getCgroupCpuAcctCpuCfsPeriodMicros(final String controlGroup) throws IOException { return Long.parseLong(readSysFsCgroupCpuAcctCpuCfsPeriod(controlGroup)); @@ -318,7 +319,7 @@ private long getCgroupCpuAcctCpuCfsPeriodMicros(final String controlGroup) throw * subsystem * @return the line from {@code cpu.cfs_period_us} * @throws IOException if an I/O exception occurs reading - * {@code cpu.cfs_period_us} for the control group + * {@code cpu.cfs_period_us} for the control group */ @SuppressForbidden(reason = "access /sys/fs/cgroup/cpu") String readSysFsCgroupCpuAcctCpuCfsPeriod(final String controlGroup) throws IOException { @@ -334,9 +335,9 @@ String readSysFsCgroupCpuAcctCpuCfsPeriod(final String controlGroup) throws IOEx * process for the {@code cpuacct} subsystem * @return the CFS quota in microseconds * @throws IOException if an I/O exception occurs reading - * {@code cpu.cfs_quota_us} for the control group + * {@code cpu.cfs_quota_us} for the control group */ - private long getCGroupCpuAcctCpuCfsQuotaMicros(final String controlGroup) throws IOException { + private long getCgroupCpuAcctCpuCfsQuotaMicros(final String controlGroup) throws IOException { return Long.parseLong(readSysFsCgroupCpuAcctCpuAcctCfsQuota(controlGroup)); } @@ -352,7 +353,7 @@ private long getCGroupCpuAcctCpuCfsQuotaMicros(final String controlGroup) throws * subsystem * @return the line from {@code cpu.cfs_quota_us} * @throws IOException 
if an I/O exception occurs reading - * {@code cpu.cfs_quota_us} for the control group + * {@code cpu.cfs_quota_us} for the control group */ @SuppressForbidden(reason = "access /sys/fs/cgroup/cpu") String readSysFsCgroupCpuAcctCpuAcctCfsQuota(final String controlGroup) throws IOException { @@ -367,7 +368,7 @@ String readSysFsCgroupCpuAcctCpuAcctCfsQuota(final String controlGroup) throws I * process for the {@code cpuacct} subsystem * @return the CPU time statistics * @throws IOException if an I/O exception occurs reading - * {@code cpu.stat} for the control group + * {@code cpu.stat} for the control group */ private OsStats.Cgroup.CpuStat getCgroupCpuAcctCpuStat(final String controlGroup) throws IOException { final List lines = readSysFsCgroupCpuAcctCpuStat(controlGroup); @@ -399,11 +400,11 @@ private OsStats.Cgroup.CpuStat getCgroupCpuAcctCpuStat(final String controlGroup * group to which the Elasticsearch process belongs for the * {@code cpu} subsystem. These lines represent the CPU time * statistics and have the form - * + *

* nr_periods \d+ * nr_throttled \d+ * throttled_time \d+ - * + *

* where {@code nr_periods} is the number of period intervals * as specified by {@code cpu.cfs_period_us} that have elapsed, * {@code nr_throttled} is the number of times tasks in the given @@ -414,10 +415,9 @@ private OsStats.Cgroup.CpuStat getCgroupCpuAcctCpuStat(final String controlGroup * @param controlGroup the control group to which the Elasticsearch * process belongs for the {@code cpu} * subsystem - * * @return the lines from {@code cpu.stat} * @throws IOException if an I/O exception occurs reading - * {@code cpu.stat} for the control group + * {@code cpu.stat} for the control group */ @SuppressForbidden(reason = "access /sys/fs/cgroup/cpu") List readSysFsCgroupCpuAcctCpuStat(final String controlGroup) throws IOException { @@ -426,6 +426,27 @@ List readSysFsCgroupCpuAcctCpuStat(final String controlGroup) throws IOE return lines; } + /** + * Checks if cgroup stats are available by checking for the existence of {@code /proc/self/cgroup}, + * {@code /sys/fs/cgroup/cpu}, and {@code /sys/fs/cgroup/cpuacct}. + * + * @return {@code true} if the stats are available, otherwise + * {@code false} + */ + @SuppressForbidden(reason = "access /proc/self/cgroup, /sys/fs/cgroup/cpu, and /sys/fs/cgroup/cpuacct") + private boolean areCgroupStatsAvailable() { + if (!Files.exists(PathUtils.get("/proc/self/cgroup"))) { + return false; + } + if (!Files.exists(PathUtils.get("/sys/fs/cgroup/cpu"))) { + return false; + } + if (!Files.exists(PathUtils.get("/sys/fs/cgroup/cpuacct"))) { + return false; + } + return true; + } + /** * Basic cgroup stats. 
* @@ -434,16 +455,30 @@ List readSysFsCgroupCpuAcctCpuStat(final String controlGroup) throws IOE */ private OsStats.Cgroup getCgroup() { try { - final Map controllerMap = getControlGroups(); - final String cpuControlGroup = controllerMap.get("cpu"); - final String cpuAcctControlGroup = controllerMap.get("cpuacct"); - return new OsStats.Cgroup( - cpuAcctControlGroup, - getCgroupCpuAcctUsageNanos(cpuAcctControlGroup), - cpuControlGroup, - getCgroupCpuAcctCpuCfsPeriodMicros(cpuControlGroup), - getCGroupCpuAcctCpuCfsQuotaMicros(cpuControlGroup), - getCgroupCpuAcctCpuStat(cpuControlGroup)); + if (!areCgroupStatsAvailable()) { + return null; + } else { + final Map controllerMap = getControlGroups(); + assert !controllerMap.isEmpty(); + + final String cpuAcctControlGroup = controllerMap.get("cpuacct"); + assert cpuAcctControlGroup != null; + final long cgroupCpuAcctUsageNanos = getCgroupCpuAcctUsageNanos(cpuAcctControlGroup); + + final String cpuControlGroup = controllerMap.get("cpu"); + assert cpuControlGroup != null; + final long cgroupCpuAcctCpuCfsPeriodMicros = getCgroupCpuAcctCpuCfsPeriodMicros(cpuControlGroup); + final long cgroupCpuAcctCpuCfsQuotaMicros = getCgroupCpuAcctCpuCfsQuotaMicros(cpuControlGroup); + final OsStats.Cgroup.CpuStat cpuStat = getCgroupCpuAcctCpuStat(cpuControlGroup); + + return new OsStats.Cgroup( + cpuAcctControlGroup, + cgroupCpuAcctUsageNanos, + cpuControlGroup, + cgroupCpuAcctCpuCfsPeriodMicros, + cgroupCpuAcctCpuCfsQuotaMicros, + cpuStat); + } } catch (final IOException e) { if (logger.isDebugEnabled()) { logger.debug("error reading control group stats", e); diff --git a/core/src/main/java/org/elasticsearch/monitor/os/OsStats.java b/core/src/main/java/org/elasticsearch/monitor/os/OsStats.java index 8a7a842e9de1d..fa3c6aa861def 100644 --- a/core/src/main/java/org/elasticsearch/monitor/os/OsStats.java +++ b/core/src/main/java/org/elasticsearch/monitor/os/OsStats.java @@ -355,9 +355,9 @@ public Cgroup( final long cpuCfsPeriodMicros, final 
long cpuCfsQuotaMicros, final CpuStat cpuStat) { - this.cpuAcctControlGroup = cpuAcctControlGroup; + this.cpuAcctControlGroup = Objects.requireNonNull(cpuAcctControlGroup); this.cpuAcctUsageNanos = cpuAcctUsageNanos; - this.cpuControlGroup = cpuControlGroup; + this.cpuControlGroup = Objects.requireNonNull(cpuControlGroup); this.cpuCfsPeriodMicros = cpuCfsPeriodMicros; this.cpuCfsQuotaMicros = cpuCfsQuotaMicros; this.cpuStat = Objects.requireNonNull(cpuStat); diff --git a/core/src/main/resources/org/elasticsearch/bootstrap/security.policy b/core/src/main/resources/org/elasticsearch/bootstrap/security.policy index dfc00dcb01ac8..cbd1f93491b68 100644 --- a/core/src/main/resources/org/elasticsearch/bootstrap/security.policy +++ b/core/src/main/resources/org/elasticsearch/bootstrap/security.policy @@ -124,6 +124,8 @@ grant { // control group stats on Linux permission java.io.FilePermission "/proc/self/cgroup", "read"; + permission java.io.FilePermission "/sys/fs/cgroup/cpu", "read"; permission java.io.FilePermission "/sys/fs/cgroup/cpu/-", "read"; + permission java.io.FilePermission "/sys/fs/cgroup/cpuacct", "read"; permission java.io.FilePermission "/sys/fs/cgroup/cpuacct/-", "read"; }; diff --git a/docs/reference/cluster/nodes-stats.asciidoc b/docs/reference/cluster/nodes-stats.asciidoc index a9b21d8ddaa7d..94d954f066444 100644 --- a/docs/reference/cluster/nodes-stats.asciidoc +++ b/docs/reference/cluster/nodes-stats.asciidoc @@ -250,6 +250,11 @@ the operating system: The total amount of time (in nanoseconds) for which all tasks in the same cgroup as the Elasticsearch process have been throttled. +NOTE: For the cgroup stats to be visible, cgroups must be compiled into +the kernel, the `cpu` and `cpuacct` cgroup subsystems must be +configured and stats must be readable from `/sys/fs/cgroup/cpu` +and `/sys/fs/cgroup/cpuacct`. + [float] [[process-stats]] ==== Process statistics