Skip to content

Commit

Permalink
Change default docker metric gathering behavior (#2452)
Browse files Browse the repository at this point in the history
* Change default docker metric gathering behavior

1. Change default docker metric gathering behavior from streaming
metrics to polling.
2. Change the default polling interval to half of the TACS publishing
interval (currently 20s), so that every publish interval we have two
docker metrics.
3. Change the minimum polling interval to 5s to prevent customers from
configuring polling to be just as resource-intensive as streaming
metrics.

These changes are being made because we have found that docker streaming
stats consumes considerable resources from the agent, dockerd daemon, and
containerd daemon.

* Task stats endpoint needs to be populated immediately

to avoid changing behavior from streaming stats, we need to populate the
stats endpoint immediately when the stats engine starts. So instead of
jittering the first stats gather we need to just do it immediately.

* Pinning windows images (ltsc201*) to known working version

5/12 release broke our tests
  • Loading branch information
sparrc authored May 18, 2020
1 parent c267c9a commit afa5b19
Show file tree
Hide file tree
Showing 5 changed files with 64 additions and 53 deletions.
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -139,8 +139,8 @@ additional details on each available environment variable.
| `ECS_DATADIR` | /data/ | The container path where state is checkpointed for use across agent restarts. Note that on Linux, when you specify this, you will need to make sure that the Agent container has a bind mount of `$ECS_HOST_DATA_DIR/data:$ECS_DATADIR` with the corresponding values of `ECS_HOST_DATA_DIR` and `ECS_DATADIR`. | /data/ | `C:\ProgramData\Amazon\ECS\data`
| `ECS_UPDATES_ENABLED` | <true | false> | Whether to exit for an updater to apply updates when requested. | false | false |
| `ECS_DISABLE_METRICS` | <true | false> | Whether to disable metrics gathering for tasks. | false | true |
| `ECS_POLL_METRICS` | <true | false> | Whether to poll or stream when gathering metrics for tasks. | false | false |
| `ECS_POLLING_METRICS_WAIT_DURATION` | 30s | Time to wait to poll for new metrics for a task. Only used when ECS_POLL_METRICS is true | 15s | 15s |
| `ECS_POLL_METRICS` | <true | false> | Whether to poll or stream when gathering metrics for tasks. This defaulted to `false` previous to agent version 1.40.0. WARNING: setting this to false on an instance with many containers can result in very high CPU utilization by the agent, dockerd, and containerd. | `true` | `true` |
| `ECS_POLLING_METRICS_WAIT_DURATION` | 10s | Time to wait between polling for metrics for a task. Not used when ECS_POLL_METRICS is false. Maximum value is 20s and minimum value is 5s. If user sets above maximum it will be set to max, and if below minimum it will be set to min. | 10s | 10s |
| `ECS_RESERVED_MEMORY` | 32 | Memory, in MiB, to reserve for use by things other than containers managed by Amazon ECS. | 0 | 0 |
| `ECS_AVAILABLE_LOGGING_DRIVERS` | `["awslogs","fluentd","gelf","json-file","journald","logentries","splunk","syslog"]` | Which logging drivers are available on the container instance. | `["json-file","none"]` | `["json-file","none"]` |
| `ECS_DISABLE_PRIVILEGED` | `true` | Whether launching privileged containers is disabled on the container instance. | `false` | `false` |
Expand Down
18 changes: 10 additions & 8 deletions agent/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ const (

// DefaultPollingMetricsWaitDuration specifies the default value for polling metrics wait duration
// This is only used when PollMetrics is set to true
DefaultPollingMetricsWaitDuration = 15 * time.Second
DefaultPollingMetricsWaitDuration = DefaultContainerMetricsPublishInterval / 2

// defaultDockerStopTimeout specifies the value for container stop timeout duration
defaultDockerStopTimeout = 30 * time.Second
Expand Down Expand Up @@ -97,11 +97,11 @@ const (

// minimumPollingMetricsWaitDuration specifies the minimum duration to wait before polling for new stats
// from docker. This is only used when PollMetrics is set to true
minimumPollingMetricsWaitDuration = 1 * time.Second
minimumPollingMetricsWaitDuration = 5 * time.Second

// maximumPollingMetricsWaitDuration specifies the maximum duration to wait before polling for new stats
// from docker. This is only used when PollMetrics is set to true
maximumPollingMetricsWaitDuration = 20 * time.Second
maximumPollingMetricsWaitDuration = DefaultContainerMetricsPublishInterval

// minimumDockerStopTimeout specifies the minimum value for docker StopContainer API
minimumDockerStopTimeout = 1 * time.Second
Expand Down Expand Up @@ -350,13 +350,15 @@ func (cfg *Config) validateAndOverrideBounds() error {
func (cfg *Config) pollMetricsOverrides() {
if cfg.PollMetrics {
if cfg.PollingMetricsWaitDuration < minimumPollingMetricsWaitDuration {
seelog.Warnf("Invalid value for polling metrics wait duration, will be overridden with the default value: %s. Parsed value: %v, minimum value: %v.", DefaultPollingMetricsWaitDuration.String(), cfg.PollingMetricsWaitDuration, minimumPollingMetricsWaitDuration)
cfg.PollingMetricsWaitDuration = DefaultPollingMetricsWaitDuration
seelog.Warnf("ECS_POLLING_METRICS_WAIT_DURATION parsed value (%s) is less than the minimum of %s. Setting polling interval to minimum.",
cfg.PollingMetricsWaitDuration, minimumPollingMetricsWaitDuration)
cfg.PollingMetricsWaitDuration = minimumPollingMetricsWaitDuration
}

if cfg.PollingMetricsWaitDuration > maximumPollingMetricsWaitDuration {
seelog.Warnf("Invalid value for polling metrics wait duration, will be overridden with the default value: %s. Parsed value: %v, maximum value: %v.", DefaultPollingMetricsWaitDuration.String(), cfg.PollingMetricsWaitDuration, maximumPollingMetricsWaitDuration)
cfg.PollingMetricsWaitDuration = DefaultPollingMetricsWaitDuration
seelog.Warnf("ECS_POLLING_METRICS_WAIT_DURATION parsed value (%s) is greater than the maximum of %s. Setting polling interval to maximum.",
cfg.PollingMetricsWaitDuration, maximumPollingMetricsWaitDuration)
cfg.PollingMetricsWaitDuration = maximumPollingMetricsWaitDuration
}
}
}
Expand Down Expand Up @@ -547,7 +549,7 @@ func environmentConfig() (Config, error) {
SharedVolumeMatchFullConfig: utils.ParseBool(os.Getenv("ECS_SHARED_VOLUME_MATCH_FULL_CONFIG"), false),
ContainerInstanceTags: containerInstanceTags,
ContainerInstancePropagateTagsFrom: parseContainerInstancePropagateTagsFrom(),
PollMetrics: utils.ParseBool(os.Getenv("ECS_POLL_METRICS"), false),
PollMetrics: utils.ParseBool(os.Getenv("ECS_POLL_METRICS"), true),
PollingMetricsWaitDuration: parseEnvVariableDuration("ECS_POLLING_METRICS_WAIT_DURATION"),
DisableDockerHealthCheck: utils.ParseBool(os.Getenv("ECS_DISABLE_DOCKER_HEALTH_CHECK"), false),
GPUSupportEnabled: utils.ParseBool(os.Getenv("ECS_ENABLE_GPU_SUPPORT"), false),
Expand Down
15 changes: 12 additions & 3 deletions agent/config/config_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -239,7 +239,7 @@ func TestInvalidLoggingDriver(t *testing.T) {
func TestDefaultPollMetricsWithoutECSDataDir(t *testing.T) {
conf, err := environmentConfig()
assert.NoError(t, err)
assert.False(t, conf.PollMetrics)
assert.True(t, conf.PollMetrics)
}

func TestDefaultCheckpointWithoutECSDataDir(t *testing.T) {
Expand Down Expand Up @@ -357,16 +357,25 @@ func TestInvalidValueMaxPollingMetricsWaitDuration(t *testing.T) {
defer setTestEnv("ECS_POLLING_METRICS_WAIT_DURATION", "21s")()
conf, err := NewConfig(ec2.NewBlackholeEC2MetadataClient())
assert.NoError(t, err)
assert.Equal(t, conf.PollingMetricsWaitDuration, DefaultPollingMetricsWaitDuration, "Wrong value for PollingMetricsWaitDuration")
assert.Equal(t, maximumPollingMetricsWaitDuration, conf.PollingMetricsWaitDuration, "Wrong value for PollingMetricsWaitDuration")
}

func TestInvalidValueMinPollingMetricsWaitDuration(t *testing.T) {
defer setTestRegion()()
defer setTestEnv("ECS_POLL_METRICS", "true")()
defer setTestEnv("ECS_POLLING_METRICS_WAIT_DURATION", "1s")()
conf, err := NewConfig(ec2.NewBlackholeEC2MetadataClient())
assert.NoError(t, err)
assert.Equal(t, minimumPollingMetricsWaitDuration, conf.PollingMetricsWaitDuration, "Wrong value for PollingMetricsWaitDuration")
}

func TestInvalidValuePollingMetricsWaitDuration(t *testing.T) {
defer setTestRegion()()
defer setTestEnv("ECS_POLL_METRICS", "true")()
defer setTestEnv("ECS_POLLING_METRICS_WAIT_DURATION", "0s")()
conf, err := NewConfig(ec2.NewBlackholeEC2MetadataClient())
assert.NoError(t, err)
assert.Equal(t, conf.PollingMetricsWaitDuration, DefaultPollingMetricsWaitDuration, "Wrong value for PollingMetricsWaitDuration")
assert.Equal(t, DefaultPollingMetricsWaitDuration, conf.PollingMetricsWaitDuration, "Wrong value for PollingMetricsWaitDuration")
}

func TestInvalidFormatParseEnvVariableUint16(t *testing.T) {
Expand Down
72 changes: 36 additions & 36 deletions agent/dockerclient/dockerapi/docker_client.go
Original file line number Diff line number Diff line change
Expand Up @@ -1369,56 +1369,56 @@ func (dg *dockerGoClient) Stats(ctx context.Context, id string, inactivityTimeou
}()
} else {
seelog.Infof("DockerGoClient: Starting to Poll for metrics for container %s", id)
// firstStatC jitters the time at which containers ask for their first stat.
// We use this channel to 'seed' the queue with docker stats so that containers
// can publish metrics more quickly.
firstStatC := time.After(retry.AddJitter(time.Nanosecond, dg.config.PollingMetricsWaitDuration/2))
// sleeping here jitters the time at which the ticker is created, so that
// containers do not synchronize on calling the docker stats api.
// the max sleep is 80% of the polling interval so that we have a chance to
// get two stats in the first publishing interval.
time.Sleep(retry.AddJitter(time.Nanosecond, dg.config.PollingMetricsWaitDuration*8/10))
statPollTicker := time.NewTicker(dg.config.PollingMetricsWaitDuration)

go func() {
defer cancelRequest()
defer close(statsC)
// we need to start by getting container stats so that the task stats
// endpoint will be populated immediately.
stats, err := getContainerStatsNotStreamed(client, subCtx, id)
if err != nil {
errC <- err
return
}
statsC <- stats

// sleeping here jitters the time at which the ticker is created, so that
// containers do not synchronize on calling the docker stats api.
// the max sleep is 80% of the polling interval so that we have a chance to
// get two stats in the first publishing interval.
time.Sleep(retry.AddJitter(time.Nanosecond, dg.config.PollingMetricsWaitDuration*8/10))
statPollTicker := time.NewTicker(dg.config.PollingMetricsWaitDuration)
defer statPollTicker.Stop()

for {
// this select statement is waiting on either the stat polling ticker channel
// or the firstStat time.After channel to fire. firstStat will fire
// first and then afterwards we will always grab stats on the ticker.
select {
case _, ok := <-statPollTicker.C:
if !ok {
return
}
case <-firstStatC:
}

stream := false
resp, err = client.ContainerStats(subCtx, id, stream)
if err != nil {
errC <- fmt.Errorf("DockerGoClient: Unable to retrieve stats for container %s: %v", id, err)
return
}

decoder := json.NewDecoder(resp.Body)
data := new(types.StatsJSON)
err := decoder.Decode(data)
for range statPollTicker.C {
stats, err := getContainerStatsNotStreamed(client, subCtx, id)
if err != nil {
errC <- fmt.Errorf("DockerGoClient: Unable to decode stats for container %s: %v", id, err)
errC <- err
return
}

statsC <- data
statsC <- stats
}
}()
}

return statsC, errC
}

func getContainerStatsNotStreamed(client sdkclient.Client, ctx context.Context, id string) (*types.StatsJSON, error) {
resp, err := client.ContainerStats(ctx, id, false)
if err != nil {
return nil, fmt.Errorf("DockerGoClient: Unable to retrieve stats for container %s: %v", id, err)
}

decoder := json.NewDecoder(resp.Body)
stats := &types.StatsJSON{}
err = decoder.Decode(stats)
if err != nil {
return nil, fmt.Errorf("DockerGoClient: Unable to decode stats for container %s: %v", id, err)
}

return stats, nil
}

func (dg *dockerGoClient) RemoveImage(ctx context.Context, imageName string, timeout time.Duration) error {
ctx, cancel := context.WithTimeout(ctx, timeout)
defer cancel()
Expand Down
8 changes: 4 additions & 4 deletions scripts/run-integ-tests.ps1
Original file line number Diff line number Diff line change
Expand Up @@ -16,11 +16,11 @@ Param (
)

if ($Platform -like "windows2016") {
$BaseImageName="mcr.microsoft.com/windows/servercore:ltsc2016"
$BaseImageNameWithDigest="mcr.microsoft.com/windows/servercore@sha256:91368f3cff77ad42259ccb3bf3d1a4e145cf5fa9e486f23999d32711c2913f3e"
$BaseImageName="mcr.microsoft.com/windows/servercore@sha256:42be24b8810c861cc1b3fe75c5e99f75061cb45fdbae1de46d151c18cc8e6a9a"
$BaseImageNameWithDigest="mcr.microsoft.com/windows/servercore@sha256:42be24b8810c861cc1b3fe75c5e99f75061cb45fdbae1de46d151c18cc8e6a9a"
} elseif ($Platform -like "windows2019") {
$BaseImageName="mcr.microsoft.com/windows/servercore:ltsc2019"
$BaseImageNameWithDigest="mcr.microsoft.com/windows/servercore@sha256:e20960b4c06acee08af55164e3abc37b39cdc128ce2f5fcdf3397c738cb91069"
$BaseImageName="mcr.microsoft.com/windows/servercore@sha256:cc6d6da31014dceab4daee8b5a8da4707233f4ef42eaf071e45cee044ac738f4"
$BaseImageNameWithDigest="mcr.microsoft.com/windows/servercore@sha256:cc6d6da31014dceab4daee8b5a8da4707233f4ef42eaf071e45cee044ac738f4"
} else {
echo "Invalid platform parameter"
exit 1
Expand Down

0 comments on commit afa5b19

Please sign in to comment.