From f02e7c68d0936a4ebf9ad6ab0763c4722c571690 Mon Sep 17 00:00:00 2001 From: Cam Date: Wed, 13 May 2020 11:17:24 -0700 Subject: [PATCH] Change default docker metric gathering behavior 1. Change default docker metric gathering behavior from streaming metrics to polling. 2. Change the default polling interval to half of the TACS publishing interval (currently 20s), so that every publish interval we have two docker metrics. 3. Change the minimum polling interval to 5s to prevent customers from configuring polling to be just as resource-intensive as streaming metrics. These changes are being made because we have found that docker streaming stats consume considerable resources from the agent, dockerd daemon, and containerd daemon. --- README.md | 4 ++-- agent/config/config.go | 18 ++++++++++-------- agent/config/config_test.go | 15 ++++++++++++--- 3 files changed, 24 insertions(+), 13 deletions(-) diff --git a/README.md b/README.md index de97ee47ee3..9417fba40e7 100644 --- a/README.md +++ b/README.md @@ -139,8 +139,8 @@ additional details on each available environment variable. | `ECS_DATADIR` | /data/ | The container path where state is checkpointed for use across agent restarts. Note that on Linux, when you specify this, you will need to make sure that the Agent container has a bind mount of `$ECS_HOST_DATA_DIR/data:$ECS_DATADIR` with the corresponding values of `ECS_HOST_DATA_DIR` and `ECS_DATADIR`. | /data/ | `C:\ProgramData\Amazon\ECS\data` | `ECS_UPDATES_ENABLED` | <true | false> | Whether to exit for an updater to apply updates when requested. | false | false | | `ECS_DISABLE_METRICS` | <true | false> | Whether to disable metrics gathering for tasks. | false | true | -| `ECS_POLL_METRICS` | <true | false> | Whether to poll or stream when gathering metrics for tasks. | false | false | -| `ECS_POLLING_METRICS_WAIT_DURATION` | 30s | Time to wait to poll for new metrics for a task. 
Only used when ECS_POLL_METRICS is true | 15s | 15s | +| `ECS_POLL_METRICS` | <true | false> | Whether to poll or stream when gathering metrics for tasks. WARNING: setting this to false on an instance with many containers can result in very high CPU utilization by the agent, dockerd, and containerd. | `true` | `true` | +| `ECS_POLLING_METRICS_WAIT_DURATION` | 10s | Time to wait between polling for metrics for a task. Not used when ECS_POLL_METRICS is false. Maximum value is 20s and minimum value is 5s. | 10s | 10s | | `ECS_RESERVED_MEMORY` | 32 | Memory, in MiB, to reserve for use by things other than containers managed by Amazon ECS. | 0 | 0 | | `ECS_AVAILABLE_LOGGING_DRIVERS` | `["awslogs","fluentd","gelf","json-file","journald","logentries","splunk","syslog"]` | Which logging drivers are available on the container instance. | `["json-file","none"]` | `["json-file","none"]` | | `ECS_DISABLE_PRIVILEGED` | `true` | Whether launching privileged containers is disabled on the container instance. | `false` | `false` | diff --git a/agent/config/config.go b/agent/config/config.go index fa1ad0c066c..8256c9bed9b 100644 --- a/agent/config/config.go +++ b/agent/config/config.go @@ -62,7 +62,7 @@ const ( // DefaultPollingMetricsWaitDuration specifies the default value for polling metrics wait duration // This is only used when PollMetrics is set to true - DefaultPollingMetricsWaitDuration = 15 * time.Second + DefaultPollingMetricsWaitDuration = 10 * time.Second // defaultDockerStopTimeout specifies the value for container stop timeout duration defaultDockerStopTimeout = 30 * time.Second @@ -97,11 +97,11 @@ const ( // minimumPollingMetricsWaitDuration specifies the minimum duration to wait before polling for new stats // from docker. 
This is only used when PollMetrics is set to true - minimumPollingMetricsWaitDuration = 1 * time.Second + minimumPollingMetricsWaitDuration = 5 * time.Second // maximumPollingMetricsWaitDuration specifies the maximum duration to wait before polling for new stats // from docker. This is only used when PollMetrics is set to true - maximumPollingMetricsWaitDuration = 20 * time.Second + maximumPollingMetricsWaitDuration = DefaultContainerMetricsPublishInterval // minimumDockerStopTimeout specifies the minimum value for docker StopContainer API minimumDockerStopTimeout = 1 * time.Second @@ -346,13 +346,15 @@ func (cfg *Config) validateAndOverrideBounds() error { func (cfg *Config) pollMetricsOverrides() { if cfg.PollMetrics { if cfg.PollingMetricsWaitDuration < minimumPollingMetricsWaitDuration { - seelog.Warnf("Invalid value for polling metrics wait duration, will be overridden with the default value: %s. Parsed value: %v, minimum value: %v.", DefaultPollingMetricsWaitDuration.String(), cfg.PollingMetricsWaitDuration, minimumPollingMetricsWaitDuration) - cfg.PollingMetricsWaitDuration = DefaultPollingMetricsWaitDuration + seelog.Warnf("ECS_POLLING_METRICS_WAIT_DURATION parsed value (%s) is less than the minimum of %s. Setting polling interval to minimum.", + cfg.PollingMetricsWaitDuration, minimumPollingMetricsWaitDuration) + cfg.PollingMetricsWaitDuration = minimumPollingMetricsWaitDuration } if cfg.PollingMetricsWaitDuration > maximumPollingMetricsWaitDuration { - seelog.Warnf("Invalid value for polling metrics wait duration, will be overridden with the default value: %s. Parsed value: %v, maximum value: %v.", DefaultPollingMetricsWaitDuration.String(), cfg.PollingMetricsWaitDuration, maximumPollingMetricsWaitDuration) - cfg.PollingMetricsWaitDuration = DefaultPollingMetricsWaitDuration + seelog.Warnf("ECS_POLLING_METRICS_WAIT_DURATION parsed value (%s) is greater than the maximum of %s. 
Setting polling interval to maximum.", + cfg.PollingMetricsWaitDuration, maximumPollingMetricsWaitDuration) + cfg.PollingMetricsWaitDuration = maximumPollingMetricsWaitDuration } } } @@ -543,7 +545,7 @@ func environmentConfig() (Config, error) { SharedVolumeMatchFullConfig: utils.ParseBool(os.Getenv("ECS_SHARED_VOLUME_MATCH_FULL_CONFIG"), false), ContainerInstanceTags: containerInstanceTags, ContainerInstancePropagateTagsFrom: parseContainerInstancePropagateTagsFrom(), - PollMetrics: utils.ParseBool(os.Getenv("ECS_POLL_METRICS"), false), + PollMetrics: utils.ParseBool(os.Getenv("ECS_POLL_METRICS"), true), PollingMetricsWaitDuration: parseEnvVariableDuration("ECS_POLLING_METRICS_WAIT_DURATION"), DisableDockerHealthCheck: utils.ParseBool(os.Getenv("ECS_DISABLE_DOCKER_HEALTH_CHECK"), false), GPUSupportEnabled: utils.ParseBool(os.Getenv("ECS_ENABLE_GPU_SUPPORT"), false), diff --git a/agent/config/config_test.go b/agent/config/config_test.go index 9917eb32d10..9917742383e 100644 --- a/agent/config/config_test.go +++ b/agent/config/config_test.go @@ -239,7 +239,7 @@ func TestInvalidLoggingDriver(t *testing.T) { func TestDefaultPollMetricsWithoutECSDataDir(t *testing.T) { conf, err := environmentConfig() assert.NoError(t, err) - assert.False(t, conf.PollMetrics) + assert.True(t, conf.PollMetrics) } func TestDefaultCheckpointWithoutECSDataDir(t *testing.T) { @@ -357,16 +357,25 @@ func TestInvalidValueMaxPollingMetricsWaitDuration(t *testing.T) { defer setTestEnv("ECS_POLLING_METRICS_WAIT_DURATION", "21s")() conf, err := NewConfig(ec2.NewBlackholeEC2MetadataClient()) assert.NoError(t, err) - assert.Equal(t, conf.PollingMetricsWaitDuration, DefaultPollingMetricsWaitDuration, "Wrong value for PollingMetricsWaitDuration") + assert.Equal(t, maximumPollingMetricsWaitDuration, conf.PollingMetricsWaitDuration, "Wrong value for PollingMetricsWaitDuration") } func TestInvalidValueMinPollingMetricsWaitDuration(t *testing.T) { + defer setTestRegion()() + defer 
setTestEnv("ECS_POLL_METRICS", "true")() + defer setTestEnv("ECS_POLLING_METRICS_WAIT_DURATION", "1s")() + conf, err := NewConfig(ec2.NewBlackholeEC2MetadataClient()) + assert.NoError(t, err) + assert.Equal(t, minimumPollingMetricsWaitDuration, conf.PollingMetricsWaitDuration, "Wrong value for PollingMetricsWaitDuration") +} + +func TestInvalidValuePollingMetricsWaitDuration(t *testing.T) { defer setTestRegion()() defer setTestEnv("ECS_POLL_METRICS", "true")() defer setTestEnv("ECS_POLLING_METRICS_WAIT_DURATION", "0s")() conf, err := NewConfig(ec2.NewBlackholeEC2MetadataClient()) assert.NoError(t, err) - assert.Equal(t, conf.PollingMetricsWaitDuration, DefaultPollingMetricsWaitDuration, "Wrong value for PollingMetricsWaitDuration") + assert.Equal(t, DefaultPollingMetricsWaitDuration, conf.PollingMetricsWaitDuration, "Wrong value for PollingMetricsWaitDuration") } func TestInvalidFormatParseEnvVariableUint16(t *testing.T) {