Add Pressure Stall Information Metrics #3649

Open · wants to merge 7 commits into base: master
Changes from 6 commits
1 change: 1 addition & 0 deletions cmd/cadvisor_test.go
@@ -112,6 +112,7 @@ func TestToIncludedMetrics(t *testing.T) {
container.ResctrlMetrics: struct{}{},
container.CPUSetMetrics: struct{}{},
container.OOMMetrics: struct{}{},
container.PressureMetrics: struct{}{},
},
container.AllMetrics,
{},
2 changes: 2 additions & 0 deletions container/factory.go
@@ -66,6 +66,7 @@ const (
ResctrlMetrics MetricKind = "resctrl"
CPUSetMetrics MetricKind = "cpuset"
OOMMetrics MetricKind = "oom_event"
PressureMetrics MetricKind = "pressure"
)

// AllMetrics represents all kinds of metrics that cAdvisor supported.
@@ -91,6 +92,7 @@ var AllMetrics = MetricSet{
ResctrlMetrics: struct{}{},
CPUSetMetrics: struct{}{},
OOMMetrics: struct{}{},
PressureMetrics: struct{}{},
}

// AllNetworkMetrics represents all network metrics that cAdvisor supports.
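Because the new kind is registered in AllMetrics, it participates in the same metric include/exclude plumbing as the existing kinds (which is what the cmd/cadvisor_test.go change above asserts). A minimal caller-side sketch using only the MetricSet API visible in this diff — the "included" variable is hypothetical:

included := container.MetricSet{
	container.OOMMetrics:      struct{}{},
	container.PressureMetrics: struct{}{},
}
if included.Has(container.PressureMetrics) {
	// collect and export pressure stall information for this container
}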
19 changes: 19 additions & 0 deletions container/libcontainer/handler.go
@@ -771,6 +771,7 @@ func setCPUStats(s *cgroups.Stats, ret *info.ContainerStats, withPerCPU bool) {
ret.Cpu.CFS.Periods = s.CpuStats.ThrottlingData.Periods
ret.Cpu.CFS.ThrottledPeriods = s.CpuStats.ThrottlingData.ThrottledPeriods
ret.Cpu.CFS.ThrottledTime = s.CpuStats.ThrottlingData.ThrottledTime
setPSIStats(s.CpuStats.PSI, &ret.Cpu.PSI)

if !withPerCPU {
return
@@ -792,13 +793,15 @@ func setDiskIoStats(s *cgroups.Stats, ret *info.ContainerStats) {
ret.DiskIo.IoWaitTime = diskStatsCopy(s.BlkioStats.IoWaitTimeRecursive)
ret.DiskIo.IoMerged = diskStatsCopy(s.BlkioStats.IoMergedRecursive)
ret.DiskIo.IoTime = diskStatsCopy(s.BlkioStats.IoTimeRecursive)
setPSIStats(s.BlkioStats.PSI, &ret.DiskIo.PSI)
}

func setMemoryStats(s *cgroups.Stats, ret *info.ContainerStats) {
ret.Memory.Usage = s.MemoryStats.Usage.Usage
ret.Memory.MaxUsage = s.MemoryStats.Usage.MaxUsage
ret.Memory.Failcnt = s.MemoryStats.Usage.Failcnt
ret.Memory.KernelUsage = s.MemoryStats.KernelUsage.Usage
setPSIStats(s.MemoryStats.PSI, &ret.Memory.PSI)

if cgroups.IsCgroup2UnifiedMode() {
ret.Memory.Cache = s.MemoryStats.Stats["file"]
@@ -884,6 +887,22 @@ func setHugepageStats(s *cgroups.Stats, ret *info.ContainerStats) {
}
}

func setPSIData(d *cgroups.PSIData, ret *info.PSIData) {
if d != nil {
ret.Total = d.Total
ret.Avg10 = d.Avg10
ret.Avg60 = d.Avg60
ret.Avg300 = d.Avg300
}
}

func setPSIStats(s *cgroups.PSIStats, ret *info.PSIStats) {
if s != nil {
setPSIData(&s.Full, &ret.Full)
setPSIData(&s.Some, &ret.Some)
}
}

// read from pids path not cpu
func setThreadsStats(s *cgroups.Stats, ret *info.ContainerStats) {
if s != nil {
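For context (not part of this diff): the cgroups.PSIStats values consumed by setPSIStats are parsed by the libcontainer cgroups package from the cgroup v2 pressure files (cpu.pressure, memory.pressure, io.pressure), whose contents look roughly like this:

some avg10=0.60 avg60=0.40 avg300=0.20 total=200
full avg10=0.30 avg60=0.20 avg300=0.10 total=100

The avg* fields are percentages over 10/60/300-second windows and total is a cumulative stall time in microseconds; the setters above copy these values through unchanged.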
28 changes: 28 additions & 0 deletions container/libcontainer/handler_test.go
@@ -110,6 +110,20 @@ func TestSetCPUStats(t *testing.T) {
UsageInKernelmode: 734746 * nanosecondsInSeconds / clockTicks,
UsageInUsermode: 2767637 * nanosecondsInSeconds / clockTicks,
},
PSI: &cgroups.PSIStats{
Full: cgroups.PSIData{
Avg10: 0.3,
Avg60: 0.2,
Avg300: 0.1,
Total: 100,
},
Some: cgroups.PSIData{
Avg10: 0.6,
Avg60: 0.4,
Avg300: 0.2,
Total: 200,
},
},
},
}
var ret info.ContainerStats
@@ -123,6 +137,20 @@
System: s.CpuStats.CpuUsage.UsageInKernelmode,
Total: 33802947350272,
},
PSI: info.PSIStats{
Full: info.PSIData{
Avg10: 0.3,
Avg60: 0.2,
Avg300: 0.1,
Total: 100,
},
Some: info.PSIData{
Avg10: 0.6,
Avg60: 0.4,
Avg300: 0.2,
Total: 200,
},
},
},
}

26 changes: 25 additions & 1 deletion info/v1/container.go
@@ -261,6 +261,26 @@ func (ci *ContainerInfo) StatsEndTime() time.Time {
return ret
}

// PSI statistics for an individual resource.
type PSIStats struct {
// PSI data for all tasks in the cgroup.
Full PSIData `json:"full,omitempty"`
// PSI data for some tasks in the cgroup.
Some PSIData `json:"some,omitempty"`
}

type PSIData struct {
// Total time that tasks in the cgroup have waited due to congestion.
// Unit: microseconds.
Total uint64 `json:"total"`
// The share of time (in %) tasks have waited due to congestion, averaged over a 10-second window.
Avg10 float64 `json:"avg10"`
// The share of time (in %) tasks have waited due to congestion, averaged over a 60-second window.
Avg60 float64 `json:"avg60"`
// The share of time (in %) tasks have waited due to congestion, averaged over a 300-second window.
Avg300 float64 `json:"avg300"`
}

// This mirrors kernel internal structure.
type LoadStats struct {
// Number of sleeping tasks.
@@ -334,7 +354,8 @@ type CpuStats struct {
// from LoadStats.NrRunning.
LoadAverage int32 `json:"load_average"`
// from LoadStats.NrUninterruptible
LoadDAverage int32 `json:"load_d_average"`
PSI PSIStats `json:"psi"`
}

type PerDiskStats struct {
@@ -353,6 +374,7 @@ type DiskIoStats struct {
IoWaitTime []PerDiskStats `json:"io_wait_time,omitempty"`
IoMerged []PerDiskStats `json:"io_merged,omitempty"`
IoTime []PerDiskStats `json:"io_time,omitempty"`
PSI PSIStats `json:"psi"`
}

type HugetlbStats struct {
@@ -411,6 +433,8 @@ type MemoryStats struct {

ContainerData MemoryStatsMemoryData `json:"container_data,omitempty"`
HierarchicalData MemoryStatsMemoryData `json:"hierarchical_data,omitempty"`

PSI PSIStats `json:"psi"`
}

type CPUSetStats struct {
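As a rough illustration of the resulting v1 API shape, marshalling a populated PSIStats with the tags above yields output like the following (a sketch assuming the usual github.com/google/cadvisor/info/v1 import path, not captured from this branch):

package main

import (
	"encoding/json"
	"fmt"

	v1 "github.com/google/cadvisor/info/v1"
)

func main() {
	psi := v1.PSIStats{
		Full: v1.PSIData{Total: 100, Avg10: 0.3, Avg60: 0.2, Avg300: 0.1},
		Some: v1.PSIData{Total: 200, Avg10: 0.6, Avg60: 0.4, Avg300: 0.2},
	}
	out, _ := json.Marshal(psi)
	fmt.Println(string(out))
	// {"full":{"total":100,"avg10":0.3,"avg60":0.2,"avg300":0.1},"some":{"total":200,"avg10":0.6,"avg60":0.4,"avg300":0.2}}
}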
48 changes: 47 additions & 1 deletion metrics/prometheus.go
@@ -33,9 +33,14 @@ import (
// asFloat64 converts a uint64 into a float64.
func asFloat64(v uint64) float64 { return float64(v) }

// asMicrosecondsToSeconds converts microseconds into a float64 representing seconds.
func asMicrosecondsToSeconds(v uint64) float64 {
return float64(v) / 1e6
}

// asNanosecondsToSeconds converts nanoseconds into a float64 representing seconds.
func asNanosecondsToSeconds(v uint64) float64 {
return float64(v) / float64(time.Second)
return float64(v) / 1e9
}

// fsValues is a helper method for assembling per-filesystem stats.
@@ -1746,6 +1751,47 @@ func NewPrometheusCollector(i infoProvider, f ContainerLabelsFunc, includedMetri
})
}

if includedMetrics.Has(container.PressureMetrics) {
c.containerMetrics = append(c.containerMetrics, []containerMetric{
{
name: "container_pressure_cpu_waiting_seconds_total",
help: "Total time duration tasks in the container have waited due to CPU congestion.",
valueType: prometheus.CounterValue,
getValues: func(s *info.ContainerStats) metricValues {
return metricValues{{value: asMicrosecondsToSeconds(s.Cpu.PSI.Some.Total), timestamp: s.Timestamp}}
},
}, {
name: "container_pressure_memory_stalled_seconds_total",
help: "Total time duration no tasks in the container could make progress due to memory congestion.",
valueType: prometheus.CounterValue,
getValues: func(s *info.ContainerStats) metricValues {
return metricValues{{value: asMicrosecondsToSeconds(s.Memory.PSI.Full.Total), timestamp: s.Timestamp}}
},
}, {
name: "container_pressure_memory_waiting_seconds_total",
help: "Total time duration tasks in the container have waited due to memory congestion.",
valueType: prometheus.CounterValue,
getValues: func(s *info.ContainerStats) metricValues {
return metricValues{{value: asMicrosecondsToSeconds(s.Memory.PSI.Some.Total), timestamp: s.Timestamp}}
},
}, {
name: "container_pressure_io_stalled_seconds_total",
help: "Total time duration no tasks in the container could make progress due to IO congestion.",
valueType: prometheus.CounterValue,
getValues: func(s *info.ContainerStats) metricValues {
return metricValues{{value: asMicrosecondsToSeconds(s.DiskIo.PSI.Full.Total), timestamp: s.Timestamp}}
},
}, {
name: "container_pressure_io_waiting_seconds_total",
help: "Total time duration tasks in the container have waited due to IO congestion.",
valueType: prometheus.CounterValue,
getValues: func(s *info.ContainerStats) metricValues {
return metricValues{{value: asMicrosecondsToSeconds(s.DiskIo.PSI.Some.Total), timestamp: s.Timestamp}}
},
},
}...)
}

return c
}

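Unit sanity check for the metrics above: PSI totals are microsecond counters, so asMicrosecondsToSeconds divides by 1e6. With the fake CPU Some.Total of 200 defined in prometheus_fake.go below, the exported value is 200 / 1e6 = 0.0002 seconds, matching container_pressure_cpu_waiting_seconds_total in the expected testdata; the memory and IO fixtures (1000/2000 and 1100/2200) line up with 0.001/0.002 and 0.0011/0.0022 the same way.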
42 changes: 42 additions & 0 deletions metrics/prometheus_fake.go
@@ -328,6 +328,20 @@ func (p testSubcontainersInfoProvider) GetRequestedContainersInfo(string, v2.Req
},
LoadAverage: 2,
LoadDAverage: 2,
PSI: info.PSIStats{
Full: info.PSIData{
Avg10: 0.3,
Avg60: 0.2,
Avg300: 0.1,
Total: 100,
},
Some: info.PSIData{
Avg10: 0.6,
Avg60: 0.4,
Avg300: 0.2,
Total: 200,
},
},
},
Memory: info.MemoryStats{
Usage: 8,
@@ -358,6 +372,20 @@ func (p testSubcontainersInfoProvider) GetRequestedContainersInfo(string, v2.Req
MappedFile: 16,
KernelUsage: 17,
Swap: 8192,
PSI: info.PSIStats{
Full: info.PSIData{
Avg10: 0.3,
Avg60: 0.2,
Avg300: 0.1,
Total: 1000,
},
Some: info.PSIData{
Avg10: 0.6,
Avg60: 0.4,
Avg300: 0.2,
Total: 2000,
},
},
},
Hugetlb: map[string]info.HugetlbStats{
"2Mi": {
@@ -550,6 +578,20 @@ func (p testSubcontainersInfoProvider) GetRequestedContainersInfo(string, v2.Req
"Write": 6,
},
}},
PSI: info.PSIStats{
Full: info.PSIData{
Avg10: 0.3,
Avg60: 0.2,
Avg300: 0.1,
Total: 1100,
},
Some: info.PSIData{
Avg10: 0.6,
Avg60: 0.4,
Avg300: 0.2,
Total: 2200,
},
},
},
Filesystem: []info.FsStats{
{
15 changes: 15 additions & 0 deletions metrics/testdata/prometheus_metrics
@@ -381,6 +381,21 @@ container_perf_uncore_events_total{container_env_foo_env="prod",container_label_
# TYPE container_perf_uncore_events_scaling_ratio gauge
container_perf_uncore_events_scaling_ratio{container_env_foo_env="prod",container_label_foo_label="bar",event="cas_count_read",id="testcontainer",image="test",name="testcontaineralias",pmu="uncore_imc_0",socket="0",zone_name="hello"} 1 1395066363000
container_perf_uncore_events_scaling_ratio{container_env_foo_env="prod",container_label_foo_label="bar",event="cas_count_read",id="testcontainer",image="test",name="testcontaineralias",pmu="uncore_imc_0",socket="1",zone_name="hello"} 1 1395066363000
# HELP container_pressure_cpu_waiting_seconds_total Total time duration tasks in the container have waited due to CPU congestion.
# TYPE container_pressure_cpu_waiting_seconds_total counter
container_pressure_cpu_waiting_seconds_total{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 0.0002 1395066363000
# HELP container_pressure_io_stalled_seconds_total Total time duration no tasks in the container could make progress due to IO congestion.
# TYPE container_pressure_io_stalled_seconds_total counter
container_pressure_io_stalled_seconds_total{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 0.0011 1395066363000
# HELP container_pressure_io_waiting_seconds_total Total time duration tasks in the container have waited due to IO congestion.
# TYPE container_pressure_io_waiting_seconds_total counter
container_pressure_io_waiting_seconds_total{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 0.0022 1395066363000
# HELP container_pressure_memory_stalled_seconds_total Total time duration no tasks in the container could make progress due to memory congestion.
# TYPE container_pressure_memory_stalled_seconds_total counter
container_pressure_memory_stalled_seconds_total{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 0.001 1395066363000
# HELP container_pressure_memory_waiting_seconds_total Total time duration tasks in the container have waited due to memory congestion.
# TYPE container_pressure_memory_waiting_seconds_total counter
container_pressure_memory_waiting_seconds_total{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 0.002 1395066363000
# HELP container_processes Number of processes running inside the container.
# TYPE container_processes gauge
container_processes{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 1 1395066363000
15 changes: 15 additions & 0 deletions metrics/testdata/prometheus_metrics_whitelist_filtered
@@ -381,6 +381,21 @@ container_perf_uncore_events_total{container_env_foo_env="prod",event="cas_count
# TYPE container_perf_uncore_events_scaling_ratio gauge
container_perf_uncore_events_scaling_ratio{container_env_foo_env="prod",event="cas_count_read",id="testcontainer",image="test",name="testcontaineralias",pmu="uncore_imc_0",socket="0",zone_name="hello"} 1 1395066363000
container_perf_uncore_events_scaling_ratio{container_env_foo_env="prod",event="cas_count_read",id="testcontainer",image="test",name="testcontaineralias",pmu="uncore_imc_0",socket="1",zone_name="hello"} 1 1395066363000
# HELP container_pressure_cpu_waiting_seconds_total Total time duration tasks in the container have waited due to CPU congestion.
# TYPE container_pressure_cpu_waiting_seconds_total counter
container_pressure_cpu_waiting_seconds_total{container_env_foo_env="prod",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 0.0002 1395066363000
# HELP container_pressure_io_stalled_seconds_total Total time duration no tasks in the container could make progress due to IO congestion.
# TYPE container_pressure_io_stalled_seconds_total counter
container_pressure_io_stalled_seconds_total{container_env_foo_env="prod",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 0.0011 1395066363000
# HELP container_pressure_io_waiting_seconds_total Total time duration tasks in the container have waited due to IO congestion.
# TYPE container_pressure_io_waiting_seconds_total counter
container_pressure_io_waiting_seconds_total{container_env_foo_env="prod",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 0.0022 1395066363000
# HELP container_pressure_memory_stalled_seconds_total Total time duration no tasks in the container could make progress due to memory congestion.
# TYPE container_pressure_memory_stalled_seconds_total counter
container_pressure_memory_stalled_seconds_total{container_env_foo_env="prod",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 0.001 1395066363000
# HELP container_pressure_memory_waiting_seconds_total Total time duration tasks in the container have waited due to memory congestion.
# TYPE container_pressure_memory_waiting_seconds_total counter
container_pressure_memory_waiting_seconds_total{container_env_foo_env="prod",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 0.002 1395066363000
# HELP container_processes Number of processes running inside the container.
# TYPE container_processes gauge
container_processes{container_env_foo_env="prod",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 1 1395066363000