Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
83 changes: 59 additions & 24 deletions core/src/main/java/org/elasticsearch/monitor/os/OsProbe.java
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@
import java.lang.reflect.Method;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
Expand Down Expand Up @@ -211,7 +212,7 @@ private String readSingleLine(final Path path) throws IOException {
* @return a map from subsystems to the control group for the
* Elasticsearch process.
* @throws IOException if an I/O exception occurs reading
* {@code /proc/self/cgroup}
* {@code /proc/self/cgroup}
*/
private Map<String, String> getControlGroups() throws IOException {
final List<String> lines = readProcSelfCgroup();
Expand Down Expand Up @@ -248,7 +249,7 @@ private Map<String, String> getControlGroups() throws IOException {
*
* @return the lines from {@code /proc/self/cgroup}
* @throws IOException if an I/O exception occurs reading
* {@code /proc/self/cgroup}
* {@code /proc/self/cgroup}
*/
@SuppressForbidden(reason = "access /proc/self/cgroup")
List<String> readProcSelfCgroup() throws IOException {
Expand All @@ -266,7 +267,7 @@ List<String> readProcSelfCgroup() throws IOException {
* process for the {@code cpuacct} subsystem
* @return the total CPU time in nanoseconds
* @throws IOException if an I/O exception occurs reading
* {@code cpuacct.usage} for the control group
* {@code cpuacct.usage} for the control group
*/
private long getCgroupCpuAcctUsageNanos(final String controlGroup) throws IOException {
return Long.parseLong(readSysFsCgroupCpuAcctCpuAcctUsage(controlGroup));
Expand All @@ -284,7 +285,7 @@ private long getCgroupCpuAcctUsageNanos(final String controlGroup) throws IOExce
* subsystem
* @return the line from {@code cpuacct.usage}
* @throws IOException if an I/O exception occurs reading
* {@code cpuacct.usage} for the control group
* {@code cpuacct.usage} for the control group
*/
@SuppressForbidden(reason = "access /sys/fs/cgroup/cpuacct")
String readSysFsCgroupCpuAcctCpuAcctUsage(final String controlGroup) throws IOException {
Expand All @@ -300,7 +301,7 @@ String readSysFsCgroupCpuAcctCpuAcctUsage(final String controlGroup) throws IOEx
* process for the {@code cpuacct} subsystem
* @return the CFS quota period in microseconds
* @throws IOException if an I/O exception occurs reading
* {@code cpu.cfs_period_us} for the control group
* {@code cpu.cfs_period_us} for the control group
*/
private long getCgroupCpuAcctCpuCfsPeriodMicros(final String controlGroup) throws IOException {
return Long.parseLong(readSysFsCgroupCpuAcctCpuCfsPeriod(controlGroup));
Expand All @@ -318,7 +319,7 @@ private long getCgroupCpuAcctCpuCfsPeriodMicros(final String controlGroup) throw
* subsystem
* @return the line from {@code cpu.cfs_period_us}
* @throws IOException if an I/O exception occurs reading
* {@code cpu.cfs_period_us} for the control group
* {@code cpu.cfs_period_us} for the control group
*/
@SuppressForbidden(reason = "access /sys/fs/cgroup/cpu")
String readSysFsCgroupCpuAcctCpuCfsPeriod(final String controlGroup) throws IOException {
Expand All @@ -334,9 +335,9 @@ String readSysFsCgroupCpuAcctCpuCfsPeriod(final String controlGroup) throws IOEx
* process for the {@code cpuacct} subsystem
* @return the CFS quota in microseconds
* @throws IOException if an I/O exception occurs reading
* {@code cpu.cfs_quota_us} for the control group
* {@code cpu.cfs_quota_us} for the control group
*/
private long getCGroupCpuAcctCpuCfsQuotaMicros(final String controlGroup) throws IOException {
private long getCgroupCpuAcctCpuCfsQuotaMicros(final String controlGroup) throws IOException {
return Long.parseLong(readSysFsCgroupCpuAcctCpuAcctCfsQuota(controlGroup));
}

Expand All @@ -352,7 +353,7 @@ private long getCGroupCpuAcctCpuCfsQuotaMicros(final String controlGroup) throws
* subsystem
* @return the line from {@code cpu.cfs_quota_us}
* @throws IOException if an I/O exception occurs reading
* {@code cpu.cfs_quota_us} for the control group
* {@code cpu.cfs_quota_us} for the control group
*/
@SuppressForbidden(reason = "access /sys/fs/cgroup/cpu")
String readSysFsCgroupCpuAcctCpuAcctCfsQuota(final String controlGroup) throws IOException {
Expand All @@ -367,7 +368,7 @@ String readSysFsCgroupCpuAcctCpuAcctCfsQuota(final String controlGroup) throws I
* process for the {@code cpuacct} subsystem
* @return the CPU time statistics
* @throws IOException if an I/O exception occurs reading
* {@code cpu.stat} for the control group
* {@code cpu.stat} for the control group
*/
private OsStats.Cgroup.CpuStat getCgroupCpuAcctCpuStat(final String controlGroup) throws IOException {
final List<String> lines = readSysFsCgroupCpuAcctCpuStat(controlGroup);
Expand Down Expand Up @@ -399,11 +400,11 @@ private OsStats.Cgroup.CpuStat getCgroupCpuAcctCpuStat(final String controlGroup
* group to which the Elasticsearch process belongs for the
* {@code cpu} subsystem. These lines represent the CPU time
* statistics and have the form
*
* <p>
* nr_periods \d+
* nr_throttled \d+
* throttled_time \d+
*
* <p>
* where {@code nr_periods} is the number of period intervals
* as specified by {@code cpu.cfs_period_us} that have elapsed,
* {@code nr_throttled} is the number of times tasks in the given
Expand All @@ -414,10 +415,9 @@ private OsStats.Cgroup.CpuStat getCgroupCpuAcctCpuStat(final String controlGroup
* @param controlGroup the control group to which the Elasticsearch
* process belongs for the {@code cpu}
* subsystem
*
* @return the lines from {@code cpu.stat}
* @throws IOException if an I/O exception occurs reading
* {@code cpu.stat} for the control group
* {@code cpu.stat} for the control group
*/
@SuppressForbidden(reason = "access /sys/fs/cgroup/cpu")
List<String> readSysFsCgroupCpuAcctCpuStat(final String controlGroup) throws IOException {
Expand All @@ -426,6 +426,27 @@ List<String> readSysFsCgroupCpuAcctCpuStat(final String controlGroup) throws IOE
return lines;
}

/**
* Checks if cgroup stats are available by checking for the existence of {@code /proc/self/cgroup},
* {@code /sys/fs/cgroup/cpu}, and {@code /sys/fs/cgroup/cpuacct}.
*
* @return {@code true} if the stats are available, otherwise
* {@code false}
*/
@SuppressForbidden(reason = "access /proc/self/cgroup, /sys/fs/cgroup/cpu, and /sys/fs/cgroup/cpuacct")
private boolean areCgroupStatsAvailable() {
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

👍

if (!Files.exists(PathUtils.get("/proc/self/cgroup"))) {
return false;
}
if (!Files.exists(PathUtils.get("/sys/fs/cgroup/cpu"))) {
return false;
}
if (!Files.exists(PathUtils.get("/sys/fs/cgroup/cpuacct"))) {
return false;
}
return true;
}

/**
* Basic cgroup stats.
*
Expand All @@ -434,16 +455,30 @@ List<String> readSysFsCgroupCpuAcctCpuStat(final String controlGroup) throws IOE
*/
private OsStats.Cgroup getCgroup() {
try {
final Map<String, String> controllerMap = getControlGroups();
final String cpuControlGroup = controllerMap.get("cpu");
final String cpuAcctControlGroup = controllerMap.get("cpuacct");
return new OsStats.Cgroup(
cpuAcctControlGroup,
getCgroupCpuAcctUsageNanos(cpuAcctControlGroup),
cpuControlGroup,
getCgroupCpuAcctCpuCfsPeriodMicros(cpuControlGroup),
getCGroupCpuAcctCpuCfsQuotaMicros(cpuControlGroup),
getCgroupCpuAcctCpuStat(cpuControlGroup));
if (!areCgroupStatsAvailable()) {
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

👍

return null;
} else {
final Map<String, String> controllerMap = getControlGroups();
assert !controllerMap.isEmpty();

final String cpuAcctControlGroup = controllerMap.get("cpuacct");
assert cpuAcctControlGroup != null;
final long cgroupCpuAcctUsageNanos = getCgroupCpuAcctUsageNanos(cpuAcctControlGroup);

final String cpuControlGroup = controllerMap.get("cpu");
assert cpuControlGroup != null;
final long cgroupCpuAcctCpuCfsPeriodMicros = getCgroupCpuAcctCpuCfsPeriodMicros(cpuControlGroup);
final long cgroupCpuAcctCpuCfsQuotaMicros = getCgroupCpuAcctCpuCfsQuotaMicros(cpuControlGroup);
final OsStats.Cgroup.CpuStat cpuStat = getCgroupCpuAcctCpuStat(cpuControlGroup);

return new OsStats.Cgroup(
cpuAcctControlGroup,
cgroupCpuAcctUsageNanos,
cpuControlGroup,
cgroupCpuAcctCpuCfsPeriodMicros,
cgroupCpuAcctCpuCfsQuotaMicros,
cpuStat);
}
} catch (final IOException e) {
if (logger.isDebugEnabled()) {
logger.debug("error reading control group stats", e);
Expand Down
4 changes: 2 additions & 2 deletions core/src/main/java/org/elasticsearch/monitor/os/OsStats.java
Original file line number Diff line number Diff line change
Expand Up @@ -355,9 +355,9 @@ public Cgroup(
final long cpuCfsPeriodMicros,
final long cpuCfsQuotaMicros,
final CpuStat cpuStat) {
this.cpuAcctControlGroup = cpuAcctControlGroup;
this.cpuAcctControlGroup = Objects.requireNonNull(cpuAcctControlGroup);
this.cpuAcctUsageNanos = cpuAcctUsageNanos;
this.cpuControlGroup = cpuControlGroup;
this.cpuControlGroup = Objects.requireNonNull(cpuControlGroup);
this.cpuCfsPeriodMicros = cpuCfsPeriodMicros;
this.cpuCfsQuotaMicros = cpuCfsQuotaMicros;
this.cpuStat = Objects.requireNonNull(cpuStat);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,8 @@ grant {

// control group stats on Linux
permission java.io.FilePermission "/proc/self/cgroup", "read";
permission java.io.FilePermission "/sys/fs/cgroup/cpu", "read";
permission java.io.FilePermission "/sys/fs/cgroup/cpu/-", "read";
permission java.io.FilePermission "/sys/fs/cgroup/cpuacct", "read";
permission java.io.FilePermission "/sys/fs/cgroup/cpuacct/-", "read";
};
5 changes: 5 additions & 0 deletions docs/reference/cluster/nodes-stats.asciidoc
Original file line number Diff line number Diff line change
Expand Up @@ -250,6 +250,11 @@ the operating system:
The total amount of time (in nanoseconds) for which all tasks in
the same cgroup as the Elasticsearch process have been throttled.

NOTE: For the cgroup stats to be visible, cgroups must be compiled into
the kernel, the `cpu` and `cpuacct` cgroup subsystems must be
configured, and stats must be readable from `/sys/fs/cgroup/cpu`
and `/sys/fs/cgroup/cpuacct`.

[float]
[[process-stats]]
==== Process statistics
Expand Down