Skip to content

Commit

Permalink
container: do not fail hard, if single containers can't be scraped
Browse files Browse the repository at this point in the history
Signed-off-by: Jan-Otto Kröpke <[email protected]>
  • Loading branch information
jkroepke committed Aug 15, 2024
1 parent 7bb16d2 commit 387fab4
Showing 1 changed file with 128 additions and 93 deletions.
221 changes: 128 additions & 93 deletions pkg/collector/container/container.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
package container

import (
"errors"
"fmt"
"strings"

"github.com/Microsoft/hcsshim"
Expand Down Expand Up @@ -192,6 +194,7 @@ func (c *Collector) Build() error {
[]string{"container_id"},
nil,
)

return nil
}

Expand All @@ -200,24 +203,19 @@ func (c *Collector) Build() error {
func (c *Collector) Collect(_ *types.ScrapeContext, ch chan<- prometheus.Metric) error {
if err := c.collect(ch); err != nil {
_ = level.Error(c.logger).Log("msg", "failed collecting collector metrics", "err", err)

return err
}
return nil
}

// containerClose closes the container resource.
func (c *Collector) containerClose(container hcsshim.Container) {
err := container.Close()
if err != nil {
_ = level.Error(c.logger).Log("err", err)
}
return nil
}

func (c *Collector) collect(ch chan<- prometheus.Metric) error {
// Types Container is passed to get the containers compute systems only
containers, err := hcsshim.GetContainers(hcsshim.ComputeSystemQuery{Types: []string{"Container"}})
if err != nil {
_ = level.Error(c.logger).Log("msg", "Err in Getting containers", "err", err)

return err
}

Expand All @@ -228,102 +226,138 @@ func (c *Collector) collect(ch chan<- prometheus.Metric) error {
prometheus.GaugeValue,
float64(count),
)

if count == 0 {
return nil
}

containerPrefixes := make(map[string]string)
hasErrors := false

for _, containerDetails := range containers {
// https://stackoverflow.com/questions/45617758/proper-way-to-release-resources-with-defer-in-a-loop
func() {
container, err := hcsshim.OpenContainer(containerDetails.ID)
if container != nil {
defer c.containerClose(container)
}
if err != nil {
_ = level.Error(c.logger).Log("msg", "err in opening container", "containerId", containerDetails.ID, "err", err)
return
containerIdWithPrefix := getContainerIdWithPrefix(containerDetails)

if err = c.collectContainer(ch, containerDetails, containerIdWithPrefix); err != nil {
if hcsshim.IsNotExist(err) {
_ = level.Debug(c.logger).Log("msg", "err in fetching container statistics", "containerId", containerDetails.ID, "err", err)
} else {
_ = level.Error(c.logger).Log("msg", "err in fetching container statistics", "containerId", containerDetails.ID, "err", err)
hasErrors = true
}

cstats, err := container.Statistics()
if err != nil {
_ = level.Error(c.logger).Log("msg", "err in fetching container Statistics", "containerId", containerDetails.ID, "err", err)
return
}
continue
}

containerIdWithPrefix := getContainerIdWithPrefix(containerDetails)
containerPrefixes[containerDetails.ID] = containerIdWithPrefix
containerPrefixes[containerDetails.ID] = containerIdWithPrefix
}

ch <- prometheus.MustNewConstMetric(
c.containerAvailable,
prometheus.CounterValue,
1,
containerIdWithPrefix,
)
ch <- prometheus.MustNewConstMetric(
c.usageCommitBytes,
prometheus.GaugeValue,
float64(cstats.Memory.UsageCommitBytes),
containerIdWithPrefix,
)
ch <- prometheus.MustNewConstMetric(
c.usageCommitPeakBytes,
prometheus.GaugeValue,
float64(cstats.Memory.UsageCommitPeakBytes),
containerIdWithPrefix,
)
ch <- prometheus.MustNewConstMetric(
c.usagePrivateWorkingSetBytes,
prometheus.GaugeValue,
float64(cstats.Memory.UsagePrivateWorkingSetBytes),
containerIdWithPrefix,
)
ch <- prometheus.MustNewConstMetric(
c.runtimeTotal,
prometheus.CounterValue,
float64(cstats.Processor.TotalRuntime100ns)*perflib.TicksToSecondScaleFactor,
containerIdWithPrefix,
)
ch <- prometheus.MustNewConstMetric(
c.runtimeUser,
prometheus.CounterValue,
float64(cstats.Processor.RuntimeUser100ns)*perflib.TicksToSecondScaleFactor,
containerIdWithPrefix,
)
ch <- prometheus.MustNewConstMetric(
c.runtimeKernel,
prometheus.CounterValue,
float64(cstats.Processor.RuntimeKernel100ns)*perflib.TicksToSecondScaleFactor,
containerIdWithPrefix,
)
ch <- prometheus.MustNewConstMetric(
c.readCountNormalized,
prometheus.CounterValue,
float64(cstats.Storage.ReadCountNormalized),
containerIdWithPrefix,
)
ch <- prometheus.MustNewConstMetric(
c.readSizeBytes,
prometheus.CounterValue,
float64(cstats.Storage.ReadSizeBytes),
containerIdWithPrefix,
)
ch <- prometheus.MustNewConstMetric(
c.writeCountNormalized,
prometheus.CounterValue,
float64(cstats.Storage.WriteCountNormalized),
containerIdWithPrefix,
)
ch <- prometheus.MustNewConstMetric(
c.writeSizeBytes,
prometheus.CounterValue,
float64(cstats.Storage.WriteSizeBytes),
containerIdWithPrefix,
)
}()
if err = c.collectNetworkMetrics(ch, containerPrefixes); err != nil {
return fmt.Errorf("error in fetching container network statistics: %w", err)
}

if hasErrors {
return errors.New("errors while fetching container statistics")
}

return nil
}

func (c *Collector) collectContainer(ch chan<- prometheus.Metric, containerDetails hcsshim.ContainerProperties, containerIdWithPrefix string) error {
container, err := hcsshim.OpenContainer(containerDetails.ID)
if err != nil {
return fmt.Errorf("error in opening container: %w", err)
}

defer func() {
if container == nil {
return
}

if err := container.Close(); err != nil {
_ = level.Error(c.logger).Log("err", fmt.Errorf("error in closing container: %w", err))
}
}()

containerStats, err := container.Statistics()
if err != nil {
return fmt.Errorf("error in fetching container statistics: %w", err)
}

ch <- prometheus.MustNewConstMetric(
c.containerAvailable,
prometheus.CounterValue,
1,
containerIdWithPrefix,
)
ch <- prometheus.MustNewConstMetric(
c.usageCommitBytes,
prometheus.GaugeValue,
float64(containerStats.Memory.UsageCommitBytes),
containerIdWithPrefix,
)
ch <- prometheus.MustNewConstMetric(
c.usageCommitPeakBytes,
prometheus.GaugeValue,
float64(containerStats.Memory.UsageCommitPeakBytes),
containerIdWithPrefix,
)
ch <- prometheus.MustNewConstMetric(
c.usagePrivateWorkingSetBytes,
prometheus.GaugeValue,
float64(containerStats.Memory.UsagePrivateWorkingSetBytes),
containerIdWithPrefix,
)
ch <- prometheus.MustNewConstMetric(
c.runtimeTotal,
prometheus.CounterValue,
float64(containerStats.Processor.TotalRuntime100ns)*perflib.TicksToSecondScaleFactor,
containerIdWithPrefix,
)
ch <- prometheus.MustNewConstMetric(
c.runtimeUser,
prometheus.CounterValue,
float64(containerStats.Processor.RuntimeUser100ns)*perflib.TicksToSecondScaleFactor,
containerIdWithPrefix,
)
ch <- prometheus.MustNewConstMetric(
c.runtimeKernel,
prometheus.CounterValue,
float64(containerStats.Processor.RuntimeKernel100ns)*perflib.TicksToSecondScaleFactor,
containerIdWithPrefix,
)
ch <- prometheus.MustNewConstMetric(
c.readCountNormalized,
prometheus.CounterValue,
float64(containerStats.Storage.ReadCountNormalized),
containerIdWithPrefix,
)
ch <- prometheus.MustNewConstMetric(
c.readSizeBytes,
prometheus.CounterValue,
float64(containerStats.Storage.ReadSizeBytes),
containerIdWithPrefix,
)
ch <- prometheus.MustNewConstMetric(
c.writeCountNormalized,
prometheus.CounterValue,
float64(containerStats.Storage.WriteCountNormalized),
containerIdWithPrefix,
)
ch <- prometheus.MustNewConstMetric(
c.writeSizeBytes,
prometheus.CounterValue,
float64(containerStats.Storage.WriteSizeBytes),
containerIdWithPrefix,
)

return nil
}

// collectNetworkMetrics collects network metrics for containers.
// With HNSv2, the network stats must be collected from hcsshim.HNSListEndpointRequest.
// Network statistics from the container.Statistics() are providing data only, if HNSv1 is used.
// Ref: https://github.com/prometheus-community/windows_exporter/pull/1218
func (c *Collector) collectNetworkMetrics(ch chan<- prometheus.Metric, containerPrefixes map[string]string) error {
hnsEndpoints, err := hcsshim.HNSListEndpointRequest()
if err != nil {
_ = level.Warn(c.logger).Log("msg", "Failed to collect network stats for containers")
Expand All @@ -344,13 +378,14 @@ func (c *Collector) collect(ch chan<- prometheus.Metric) error {

for _, containerId := range endpoint.SharedContainers {
containerIdWithPrefix, ok := containerPrefixes[containerId]
endpointId := strings.ToUpper(endpoint.Id)

if !ok {
_ = level.Warn(c.logger).Log("msg", "Failed to collect network stats for container "+containerId)
_ = level.Debug(c.logger).Log("msg", "Failed to collect network stats for container "+containerId)
continue
}

endpointId := strings.ToUpper(endpoint.Id)

ch <- prometheus.MustNewConstMetric(
c.bytesReceived,
prometheus.CounterValue,
Expand Down

0 comments on commit 387fab4

Please sign in to comment.