Add Pressure Stall Information Metrics #3649

Open · wants to merge 7 commits into base: master
Changes from 6 commits
1 change: 1 addition & 0 deletions cmd/cadvisor_test.go
@@ -112,6 +112,7 @@ func TestToIncludedMetrics(t *testing.T) {
container.ResctrlMetrics: struct{}{},
container.CPUSetMetrics: struct{}{},
container.OOMMetrics: struct{}{},
container.PressureMetrics: struct{}{},
},
container.AllMetrics,
{},
2 changes: 2 additions & 0 deletions container/factory.go
@@ -66,6 +66,7 @@ const (
ResctrlMetrics MetricKind = "resctrl"
CPUSetMetrics MetricKind = "cpuset"
OOMMetrics MetricKind = "oom_event"
PressureMetrics MetricKind = "pressure"
)

// AllMetrics represents all kinds of metrics that cAdvisor supported.
@@ -91,6 +92,7 @@ var AllMetrics = MetricSet{
ResctrlMetrics: struct{}{},
CPUSetMetrics: struct{}{},
OOMMetrics: struct{}{},
PressureMetrics: struct{}{},
}

// AllNetworkMetrics represents all network metrics that cAdvisor supports.
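Because the new kind is registered in AllMetrics, it participates in the same metric include/exclude plumbing as the existing kinds (which is what the cmd/cadvisor_test.go change above asserts). A minimal caller-side sketch using only the MetricSet API visible in this diff — the "included" variable is hypothetical:

included := container.MetricSet{
	container.OOMMetrics:      struct{}{},
	container.PressureMetrics: struct{}{},
}
if included.Has(container.PressureMetrics) {
	// collect and export pressure stall information for this container
}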
19 changes: 19 additions & 0 deletions container/libcontainer/handler.go
@@ -771,6 +771,7 @@ func setCPUStats(s *cgroups.Stats, ret *info.ContainerStats, withPerCPU bool) {
ret.Cpu.CFS.Periods = s.CpuStats.ThrottlingData.Periods
ret.Cpu.CFS.ThrottledPeriods = s.CpuStats.ThrottlingData.ThrottledPeriods
ret.Cpu.CFS.ThrottledTime = s.CpuStats.ThrottlingData.ThrottledTime
setPSIStats(s.CpuStats.PSI, &ret.Cpu.PSI)

if !withPerCPU {
return
@@ -792,13 +793,15 @@ func setDiskIoStats(s *cgroups.Stats, ret *info.ContainerStats) {
ret.DiskIo.IoWaitTime = diskStatsCopy(s.BlkioStats.IoWaitTimeRecursive)
ret.DiskIo.IoMerged = diskStatsCopy(s.BlkioStats.IoMergedRecursive)
ret.DiskIo.IoTime = diskStatsCopy(s.BlkioStats.IoTimeRecursive)
setPSIStats(s.BlkioStats.PSI, &ret.DiskIo.PSI)
}

func setMemoryStats(s *cgroups.Stats, ret *info.ContainerStats) {
ret.Memory.Usage = s.MemoryStats.Usage.Usage
ret.Memory.MaxUsage = s.MemoryStats.Usage.MaxUsage
ret.Memory.Failcnt = s.MemoryStats.Usage.Failcnt
ret.Memory.KernelUsage = s.MemoryStats.KernelUsage.Usage
setPSIStats(s.MemoryStats.PSI, &ret.Memory.PSI)

if cgroups.IsCgroup2UnifiedMode() {
ret.Memory.Cache = s.MemoryStats.Stats["file"]
@@ -884,6 +887,22 @@ func setHugepageStats(s *cgroups.Stats, ret *info.ContainerStats) {
}
}

func setPSIData(d *cgroups.PSIData, ret *info.PSIData) {
if d != nil {
ret.Total = d.Total
ret.Avg10 = d.Avg10
ret.Avg60 = d.Avg60
ret.Avg300 = d.Avg300
}
}

func setPSIStats(s *cgroups.PSIStats, ret *info.PSIStats) {
if s != nil {
setPSIData(&s.Full, &ret.Full)
setPSIData(&s.Some, &ret.Some)
}
}

// read from pids path not cpu
func setThreadsStats(s *cgroups.Stats, ret *info.ContainerStats) {
if s != nil {
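For context (not part of this diff): the cgroups.PSIStats values consumed by setPSIStats are parsed by the libcontainer cgroups package from the cgroup v2 pressure files (cpu.pressure, memory.pressure, io.pressure), whose contents look roughly like this:

some avg10=0.60 avg60=0.40 avg300=0.20 total=200
full avg10=0.30 avg60=0.20 avg300=0.10 total=100

The avg* fields are percentages over 10/60/300-second windows and total is a cumulative stall time in microseconds; the setters above copy these values through unchanged.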
28 changes: 28 additions & 0 deletions container/libcontainer/handler_test.go
@@ -110,6 +110,20 @@ func TestSetCPUStats(t *testing.T) {
UsageInKernelmode: 734746 * nanosecondsInSeconds / clockTicks,
UsageInUsermode: 2767637 * nanosecondsInSeconds / clockTicks,
},
PSI: &cgroups.PSIStats{
Full: cgroups.PSIData{
Avg10: 0.3,
Avg60: 0.2,
Avg300: 0.1,
Total: 100,
},
Some: cgroups.PSIData{
Avg10: 0.6,
Avg60: 0.4,
Avg300: 0.2,
Total: 200,
},
},
},
}
var ret info.ContainerStats
@@ -123,6 +137,20 @@
System: s.CpuStats.CpuUsage.UsageInKernelmode,
Total: 33802947350272,
},
PSI: info.PSIStats{
Full: info.PSIData{
Avg10: 0.3,
Avg60: 0.2,
Avg300: 0.1,
Total: 100,
},
Some: info.PSIData{
Avg10: 0.6,
Avg60: 0.4,
Avg300: 0.2,
Total: 200,
},
},
},
}

26 changes: 25 additions & 1 deletion info/v1/container.go
@@ -261,6 +261,26 @@ func (ci *ContainerInfo) StatsEndTime() time.Time {
return ret
}

// PSI statistics for an individual resource.
type PSIStats struct {
// PSI data for all tasks in the cgroup.
Full PSIData `json:"full,omitempty"`
// PSI data for some tasks in the cgroup.
Some PSIData `json:"some,omitempty"`
}

type PSIData struct {
// Total time that tasks in the cgroup have waited due to congestion.
// Unit: microseconds.
Total uint64 `json:"total"`
// The share of time (in %) tasks have waited due to congestion, averaged over a 10-second window.
Avg10 float64 `json:"avg10"`
// The share of time (in %) tasks have waited due to congestion, averaged over a 60-second window.
Avg60 float64 `json:"avg60"`
// The share of time (in %) tasks have waited due to congestion, averaged over a 300-second window.
Avg300 float64 `json:"avg300"`
}

// This mirrors kernel internal structure.
type LoadStats struct {
// Number of sleeping tasks.
@@ -334,7 +354,8 @@ type CpuStats struct {
// from LoadStats.NrRunning.
LoadAverage int32 `json:"load_average"`
// from LoadStats.NrUninterruptible
LoadDAverage int32 `json:"load_d_average"`
PSI PSIStats `json:"psi"`
}

type PerDiskStats struct {
@@ -353,6 +374,7 @@ type DiskIoStats struct {
IoWaitTime []PerDiskStats `json:"io_wait_time,omitempty"`
IoMerged []PerDiskStats `json:"io_merged,omitempty"`
IoTime []PerDiskStats `json:"io_time,omitempty"`
PSI PSIStats `json:"psi"`
}

type HugetlbStats struct {
@@ -411,6 +433,8 @@ type MemoryStats struct {

ContainerData MemoryStatsMemoryData `json:"container_data,omitempty"`
HierarchicalData MemoryStatsMemoryData `json:"hierarchical_data,omitempty"`

PSI PSIStats `json:"psi"`
}

type CPUSetStats struct {
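As a rough illustration of the resulting v1 API shape, marshalling a populated PSIStats with the tags above yields output like the following (a sketch assuming the usual github.com/google/cadvisor/info/v1 import path, not captured from this branch):

package main

import (
	"encoding/json"
	"fmt"

	v1 "github.com/google/cadvisor/info/v1"
)

func main() {
	psi := v1.PSIStats{
		Full: v1.PSIData{Total: 100, Avg10: 0.3, Avg60: 0.2, Avg300: 0.1},
		Some: v1.PSIData{Total: 200, Avg10: 0.6, Avg60: 0.4, Avg300: 0.2},
	}
	out, _ := json.Marshal(psi)
	fmt.Println(string(out))
	// {"full":{"total":100,"avg10":0.3,"avg60":0.2,"avg300":0.1},"some":{"total":200,"avg10":0.6,"avg60":0.4,"avg300":0.2}}
}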
48 changes: 47 additions & 1 deletion metrics/prometheus.go
@@ -33,9 +33,14 @@ import (
// asFloat64 converts a uint64 into a float64.
func asFloat64(v uint64) float64 { return float64(v) }

// asMicrosecondsToSeconds converts microseconds into a float64 representing seconds.
func asMicrosecondsToSeconds(v uint64) float64 {
return float64(v) / 1e6
}

// asNanosecondsToSeconds converts nanoseconds into a float64 representing seconds.
func asNanosecondsToSeconds(v uint64) float64 {
return float64(v) / float64(time.Second)
return float64(v) / 1e9
}

// fsValues is a helper method for assembling per-filesystem stats.
@@ -1746,6 +1751,47 @@ func NewPrometheusCollector(i infoProvider, f ContainerLabelsFunc, includedMetri
})
}

if includedMetrics.Has(container.PressureMetrics) {
c.containerMetrics = append(c.containerMetrics, []containerMetric{
{
name: "container_pressure_cpu_waiting_seconds_total",
help: "Total time duration tasks in the container have waited due to CPU congestion.",
valueType: prometheus.CounterValue,
getValues: func(s *info.ContainerStats) metricValues {
return metricValues{{value: asMicrosecondsToSeconds(s.Cpu.PSI.Some.Total), timestamp: s.Timestamp}}
},
}, {
name: "container_pressure_memory_stalled_seconds_total",
help: "Total time duration no tasks in the container could make progress due to memory congestion.",
valueType: prometheus.CounterValue,
getValues: func(s *info.ContainerStats) metricValues {
return metricValues{{value: asMicrosecondsToSeconds(s.Memory.PSI.Full.Total), timestamp: s.Timestamp}}
},
}, {
name: "container_pressure_memory_waiting_seconds_total",
help: "Total time duration tasks in the container have waited due to memory congestion.",
valueType: prometheus.CounterValue,
getValues: func(s *info.ContainerStats) metricValues {
return metricValues{{value: asMicrosecondsToSeconds(s.Memory.PSI.Some.Total), timestamp: s.Timestamp}}
},
}, {
name: "container_pressure_io_stalled_seconds_total",
help: "Total time duration no tasks in the container could make progress due to IO congestion.",
valueType: prometheus.CounterValue,
getValues: func(s *info.ContainerStats) metricValues {
return metricValues{{value: asMicrosecondsToSeconds(s.DiskIo.PSI.Full.Total), timestamp: s.Timestamp}}
},
}, {
name: "container_pressure_io_waiting_seconds_total",
help: "Total time duration tasks in the container have waited due to IO congestion.",
valueType: prometheus.CounterValue,
getValues: func(s *info.ContainerStats) metricValues {
return metricValues{{value: asMicrosecondsToSeconds(s.DiskIo.PSI.Some.Total), timestamp: s.Timestamp}}
},
},
}...)
}

return c
}

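Unit sanity check for the metrics above: PSI totals are microsecond counters, so asMicrosecondsToSeconds divides by 1e6. With the fake CPU Some.Total of 200 defined in prometheus_fake.go below, the exported value is 200 / 1e6 = 0.0002 seconds, matching container_pressure_cpu_waiting_seconds_total in the expected testdata; the memory and IO fixtures (1000/2000 and 1100/2200) line up with 0.001/0.002 and 0.0011/0.0022 the same way.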
42 changes: 42 additions & 0 deletions metrics/prometheus_fake.go
@@ -328,6 +328,20 @@ func (p testSubcontainersInfoProvider) GetRequestedContainersInfo(string, v2.Req
},
LoadAverage: 2,
LoadDAverage: 2,
PSI: info.PSIStats{
Full: info.PSIData{
Avg10: 0.3,
Avg60: 0.2,
Avg300: 0.1,
Total: 100,
},
Some: info.PSIData{
Avg10: 0.6,
Avg60: 0.4,
Avg300: 0.2,
Total: 200,
},
},
},
Memory: info.MemoryStats{
Usage: 8,
@@ -358,6 +372,20 @@ func (p testSubcontainersInfoProvider) GetRequestedContainersInfo(string, v2.Req
MappedFile: 16,
KernelUsage: 17,
Swap: 8192,
PSI: info.PSIStats{
Full: info.PSIData{
Avg10: 0.3,
Avg60: 0.2,
Avg300: 0.1,
Total: 1000,
},
Some: info.PSIData{
Avg10: 0.6,
Avg60: 0.4,
Avg300: 0.2,
Total: 2000,
},
},
},
Hugetlb: map[string]info.HugetlbStats{
"2Mi": {
@@ -550,6 +578,20 @@ func (p testSubcontainersInfoProvider) GetRequestedContainersInfo(string, v2.Req
"Write": 6,
},
}},
PSI: info.PSIStats{
Full: info.PSIData{
Avg10: 0.3,
Avg60: 0.2,
Avg300: 0.1,
Total: 1100,
},
Some: info.PSIData{
Avg10: 0.6,
Avg60: 0.4,
Avg300: 0.2,
Total: 2200,
},
},
},
Filesystem: []info.FsStats{
{
15 changes: 15 additions & 0 deletions metrics/testdata/prometheus_metrics
@@ -381,6 +381,21 @@ container_perf_uncore_events_total{container_env_foo_env="prod",container_label_
# TYPE container_perf_uncore_events_scaling_ratio gauge
container_perf_uncore_events_scaling_ratio{container_env_foo_env="prod",container_label_foo_label="bar",event="cas_count_read",id="testcontainer",image="test",name="testcontaineralias",pmu="uncore_imc_0",socket="0",zone_name="hello"} 1 1395066363000
container_perf_uncore_events_scaling_ratio{container_env_foo_env="prod",container_label_foo_label="bar",event="cas_count_read",id="testcontainer",image="test",name="testcontaineralias",pmu="uncore_imc_0",socket="1",zone_name="hello"} 1 1395066363000
# HELP container_pressure_cpu_waiting_seconds_total Total time duration tasks in the container have waited due to CPU congestion.
# TYPE container_pressure_cpu_waiting_seconds_total counter
container_pressure_cpu_waiting_seconds_total{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 0.0002 1395066363000
# HELP container_pressure_io_stalled_seconds_total Total time duration no tasks in the container could make progress due to IO congestion.
# TYPE container_pressure_io_stalled_seconds_total counter
container_pressure_io_stalled_seconds_total{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 0.0011 1395066363000
# HELP container_pressure_io_waiting_seconds_total Total time duration tasks in the container have waited due to IO congestion.
# TYPE container_pressure_io_waiting_seconds_total counter
container_pressure_io_waiting_seconds_total{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 0.0022 1395066363000
# HELP container_pressure_memory_stalled_seconds_total Total time duration no tasks in the container could make progress due to memory congestion.
# TYPE container_pressure_memory_stalled_seconds_total counter
container_pressure_memory_stalled_seconds_total{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 0.001 1395066363000
# HELP container_pressure_memory_waiting_seconds_total Total time duration tasks in the container have waited due to memory congestion.
# TYPE container_pressure_memory_waiting_seconds_total counter
container_pressure_memory_waiting_seconds_total{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 0.002 1395066363000
# HELP container_processes Number of processes running inside the container.
# TYPE container_processes gauge
container_processes{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 1 1395066363000
15 changes: 15 additions & 0 deletions metrics/testdata/prometheus_metrics_whitelist_filtered
@@ -381,6 +381,21 @@ container_perf_uncore_events_total{container_env_foo_env="prod",event="cas_count
# TYPE container_perf_uncore_events_scaling_ratio gauge
container_perf_uncore_events_scaling_ratio{container_env_foo_env="prod",event="cas_count_read",id="testcontainer",image="test",name="testcontaineralias",pmu="uncore_imc_0",socket="0",zone_name="hello"} 1 1395066363000
container_perf_uncore_events_scaling_ratio{container_env_foo_env="prod",event="cas_count_read",id="testcontainer",image="test",name="testcontaineralias",pmu="uncore_imc_0",socket="1",zone_name="hello"} 1 1395066363000
# HELP container_pressure_cpu_waiting_seconds_total Total time duration tasks in the container have waited due to CPU congestion.
# TYPE container_pressure_cpu_waiting_seconds_total counter
container_pressure_cpu_waiting_seconds_total{container_env_foo_env="prod",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 0.0002 1395066363000
# HELP container_pressure_io_stalled_seconds_total Total time duration no tasks in the container could make progress due to IO congestion.
# TYPE container_pressure_io_stalled_seconds_total counter
container_pressure_io_stalled_seconds_total{container_env_foo_env="prod",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 0.0011 1395066363000
# HELP container_pressure_io_waiting_seconds_total Total time duration tasks in the container have waited due to IO congestion.
# TYPE container_pressure_io_waiting_seconds_total counter
container_pressure_io_waiting_seconds_total{container_env_foo_env="prod",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 0.0022 1395066363000
# HELP container_pressure_memory_stalled_seconds_total Total time duration no tasks in the container could make progress due to memory congestion.
# TYPE container_pressure_memory_stalled_seconds_total counter
container_pressure_memory_stalled_seconds_total{container_env_foo_env="prod",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 0.001 1395066363000
# HELP container_pressure_memory_waiting_seconds_total Total time duration tasks in the container have waited due to memory congestion.
# TYPE container_pressure_memory_waiting_seconds_total counter
container_pressure_memory_waiting_seconds_total{container_env_foo_env="prod",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 0.002 1395066363000
# HELP container_processes Number of processes running inside the container.
# TYPE container_processes gauge
container_processes{container_env_foo_env="prod",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 1 1395066363000