Skip to content

Commit

Permalink
Change default docker metric gathering behavior
Browse files Browse the repository at this point in the history
1. Change default docker metric gathering behavior from streaming
metrics to polling.
2. Change the default polling interval to half of the TACS publishing
interval (currently 20s), so that every publish interval we have two
docker metrics.
3. Change the minimum polling interval to 5s to prevent customers from
configuring polling to be just as resource-intensive as streaming
metrics.

These changes are being made because we have found that docker streaming
stats consumes considerable resources from the agent, dockerd daemon, and
containerd daemon.
  • Loading branch information
sparrc committed May 13, 2020
1 parent f135e43 commit f02e7c6
Show file tree
Hide file tree
Showing 3 changed files with 24 additions and 13 deletions.
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -139,8 +139,8 @@ additional details on each available environment variable.
| `ECS_DATADIR` | /data/ | The container path where state is checkpointed for use across agent restarts. Note that on Linux, when you specify this, you will need to make sure that the Agent container has a bind mount of `$ECS_HOST_DATA_DIR/data:$ECS_DATADIR` with the corresponding values of `ECS_HOST_DATA_DIR` and `ECS_DATADIR`. | /data/ | `C:\ProgramData\Amazon\ECS\data`
| `ECS_UPDATES_ENABLED` | <true | false> | Whether to exit for an updater to apply updates when requested. | false | false |
| `ECS_DISABLE_METRICS` | <true | false> | Whether to disable metrics gathering for tasks. | false | true |
| `ECS_POLL_METRICS` | <true | false> | Whether to poll or stream when gathering metrics for tasks. | false | false |
| `ECS_POLLING_METRICS_WAIT_DURATION` | 30s | Time to wait to poll for new metrics for a task. Only used when ECS_POLL_METRICS is true | 15s | 15s |
| `ECS_POLL_METRICS` | <true | false> | Whether to poll or stream when gathering metrics for tasks. WARNING: setting this to false on an instance with many containers can result in very high CPU utilization by the agent, dockerd, and containerd. | `true` | `true` |
| `ECS_POLLING_METRICS_WAIT_DURATION` | 10s | Time to wait between polling for metrics for a task. Not used when ECS_POLL_METRICS is false. Maximum value is 20s and minimum value is 5s. | 10s | 10s |
| `ECS_RESERVED_MEMORY` | 32 | Memory, in MiB, to reserve for use by things other than containers managed by Amazon ECS. | 0 | 0 |
| `ECS_AVAILABLE_LOGGING_DRIVERS` | `["awslogs","fluentd","gelf","json-file","journald","logentries","splunk","syslog"]` | Which logging drivers are available on the container instance. | `["json-file","none"]` | `["json-file","none"]` |
| `ECS_DISABLE_PRIVILEGED` | `true` | Whether launching privileged containers is disabled on the container instance. | `false` | `false` |
Expand Down
18 changes: 10 additions & 8 deletions agent/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ const (

// DefaultPollingMetricsWaitDuration specifies the default value for polling metrics wait duration
// This is only used when PollMetrics is set to true
DefaultPollingMetricsWaitDuration = 15 * time.Second
DefaultPollingMetricsWaitDuration = 10 * time.Second

// defaultDockerStopTimeout specifies the value for container stop timeout duration
defaultDockerStopTimeout = 30 * time.Second
Expand Down Expand Up @@ -97,11 +97,11 @@ const (

// minimumPollingMetricsWaitDuration specifies the minimum duration to wait before polling for new stats
// from docker. This is only used when PollMetrics is set to true
minimumPollingMetricsWaitDuration = 1 * time.Second
minimumPollingMetricsWaitDuration = 5 * time.Second

// maximumPollingMetricsWaitDuration specifies the maximum duration to wait before polling for new stats
// from docker. This is only used when PollMetrics is set to true
maximumPollingMetricsWaitDuration = 20 * time.Second
maximumPollingMetricsWaitDuration = DefaultContainerMetricsPublishInterval

// minimumDockerStopTimeout specifies the minimum value for docker StopContainer API
minimumDockerStopTimeout = 1 * time.Second
Expand Down Expand Up @@ -346,13 +346,15 @@ func (cfg *Config) validateAndOverrideBounds() error {
func (cfg *Config) pollMetricsOverrides() {
if cfg.PollMetrics {
if cfg.PollingMetricsWaitDuration < minimumPollingMetricsWaitDuration {
seelog.Warnf("Invalid value for polling metrics wait duration, will be overridden with the default value: %s. Parsed value: %v, minimum value: %v.", DefaultPollingMetricsWaitDuration.String(), cfg.PollingMetricsWaitDuration, minimumPollingMetricsWaitDuration)
cfg.PollingMetricsWaitDuration = DefaultPollingMetricsWaitDuration
seelog.Warnf("ECS_POLLING_METRICS_WAIT_DURATION parsed value (%s) is less than the minimum of %s. Setting polling interval to minimum.",
cfg.PollingMetricsWaitDuration, minimumPollingMetricsWaitDuration)
cfg.PollingMetricsWaitDuration = minimumPollingMetricsWaitDuration
}

if cfg.PollingMetricsWaitDuration > maximumPollingMetricsWaitDuration {
seelog.Warnf("Invalid value for polling metrics wait duration, will be overridden with the default value: %s. Parsed value: %v, maximum value: %v.", DefaultPollingMetricsWaitDuration.String(), cfg.PollingMetricsWaitDuration, maximumPollingMetricsWaitDuration)
cfg.PollingMetricsWaitDuration = DefaultPollingMetricsWaitDuration
seelog.Warnf("ECS_POLLING_METRICS_WAIT_DURATION parsed value (%s) is greater than the maximum of %s. Setting polling interval to maximum.",
cfg.PollingMetricsWaitDuration, maximumPollingMetricsWaitDuration)
cfg.PollingMetricsWaitDuration = maximumPollingMetricsWaitDuration
}
}
}
Expand Down Expand Up @@ -543,7 +545,7 @@ func environmentConfig() (Config, error) {
SharedVolumeMatchFullConfig: utils.ParseBool(os.Getenv("ECS_SHARED_VOLUME_MATCH_FULL_CONFIG"), false),
ContainerInstanceTags: containerInstanceTags,
ContainerInstancePropagateTagsFrom: parseContainerInstancePropagateTagsFrom(),
PollMetrics: utils.ParseBool(os.Getenv("ECS_POLL_METRICS"), false),
PollMetrics: utils.ParseBool(os.Getenv("ECS_POLL_METRICS"), true),
PollingMetricsWaitDuration: parseEnvVariableDuration("ECS_POLLING_METRICS_WAIT_DURATION"),
DisableDockerHealthCheck: utils.ParseBool(os.Getenv("ECS_DISABLE_DOCKER_HEALTH_CHECK"), false),
GPUSupportEnabled: utils.ParseBool(os.Getenv("ECS_ENABLE_GPU_SUPPORT"), false),
Expand Down
15 changes: 12 additions & 3 deletions agent/config/config_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -239,7 +239,7 @@ func TestInvalidLoggingDriver(t *testing.T) {
func TestDefaultPollMetricsWithoutECSDataDir(t *testing.T) {
conf, err := environmentConfig()
assert.NoError(t, err)
assert.False(t, conf.PollMetrics)
assert.True(t, conf.PollMetrics)
}

func TestDefaultCheckpointWithoutECSDataDir(t *testing.T) {
Expand Down Expand Up @@ -357,16 +357,25 @@ func TestInvalidValueMaxPollingMetricsWaitDuration(t *testing.T) {
defer setTestEnv("ECS_POLLING_METRICS_WAIT_DURATION", "21s")()
conf, err := NewConfig(ec2.NewBlackholeEC2MetadataClient())
assert.NoError(t, err)
assert.Equal(t, conf.PollingMetricsWaitDuration, DefaultPollingMetricsWaitDuration, "Wrong value for PollingMetricsWaitDuration")
assert.Equal(t, maximumPollingMetricsWaitDuration, conf.PollingMetricsWaitDuration, "Wrong value for PollingMetricsWaitDuration")
}

func TestInvalidValueMinPollingMetricsWaitDuration(t *testing.T) {
defer setTestRegion()()
defer setTestEnv("ECS_POLL_METRICS", "true")()
defer setTestEnv("ECS_POLLING_METRICS_WAIT_DURATION", "1s")()
conf, err := NewConfig(ec2.NewBlackholeEC2MetadataClient())
assert.NoError(t, err)
assert.Equal(t, minimumPollingMetricsWaitDuration, conf.PollingMetricsWaitDuration, "Wrong value for PollingMetricsWaitDuration")
}

func TestInvalidValuePollingMetricsWaitDuration(t *testing.T) {
defer setTestRegion()()
defer setTestEnv("ECS_POLL_METRICS", "true")()
defer setTestEnv("ECS_POLLING_METRICS_WAIT_DURATION", "0s")()
conf, err := NewConfig(ec2.NewBlackholeEC2MetadataClient())
assert.NoError(t, err)
assert.Equal(t, conf.PollingMetricsWaitDuration, DefaultPollingMetricsWaitDuration, "Wrong value for PollingMetricsWaitDuration")
assert.Equal(t, DefaultPollingMetricsWaitDuration, conf.PollingMetricsWaitDuration, "Wrong value for PollingMetricsWaitDuration")
}

func TestInvalidFormatParseEnvVariableUint16(t *testing.T) {
Expand Down

0 comments on commit f02e7c6

Please sign in to comment.