Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

V1.52.2 stage #2875

Merged
merged 5 commits into from
May 21, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,9 @@
# Changelog
## 1.52.2
* Enhancement - validate agent config file path permission on Windows [#2866](https://github.com/aws/amazon-ecs-agent/pull/2866)
* Bug - fix potential goroutine leak when closing websocket connections [#2854](https://github.com/aws/amazon-ecs-agent/pull/2854)
* Bug - fixes a bug where a task can be stuck in RUNNING indefinitely when a container can't be stopped due to an unresolved docker [bug](https://github.com/moby/moby/issues/41587) (see also the open [PR](https://github.com/moby/moby/pull/41588) in moby to fix the bug).
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could we add this log to the PR summary? Thank you!

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

added


## 1.52.1
* Enhancement - Register Windows ECS Instances using specific OSFamilyType [#2859](https://github.com/aws/amazon-ecs-agent/pull/2859)
* Enhancement - Add retries while retrieving instance-id using EC2 Instance metadata service api [#2861](https://github.com/aws/amazon-ecs-agent/pull/2861)
Expand Down
2 changes: 1 addition & 1 deletion VERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
1.52.1
1.52.2
9 changes: 9 additions & 0 deletions agent/Gopkg.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

5 changes: 4 additions & 1 deletion agent/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -437,8 +437,11 @@ func (cfg *Config) complete() bool {
}

func fileConfig() (Config, error) {
fileName := utils.DefaultIfBlank(os.Getenv("ECS_AGENT_CONFIG_FILE_PATH"), defaultConfigFileName)
cfg := Config{}
fileName, err := getConfigFileName()
if err != nil {
return cfg, nil
}

file, err := os.Open(fileName)
if err != nil {
Expand Down
4 changes: 4 additions & 0 deletions agent/config/config_unix.go
Original file line number Diff line number Diff line change
Expand Up @@ -117,3 +117,7 @@ func (cfg *Config) platformString() string {
}
return ""
}

// getConfigFileName returns the agent config file path: the value of the
// ECS_AGENT_CONFIG_FILE_PATH environment variable when set, otherwise the
// platform default. No ownership validation is performed on this platform,
// so the error result is always nil.
func getConfigFileName() (string, error) {
	fileName := os.Getenv("ECS_AGENT_CONFIG_FILE_PATH")
	return utils.DefaultIfBlank(fileName, defaultConfigFileName), nil
}
70 changes: 70 additions & 0 deletions agent/config/config_windows.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,13 @@ import (
"path/filepath"
"time"

"golang.org/x/sys/windows"

"github.com/aws/amazon-ecs-agent/agent/dockerclient"
"github.com/aws/amazon-ecs-agent/agent/utils"

"github.com/cihub/seelog"
"github.com/hectane/go-acl/api"
)

const (
Expand Down Expand Up @@ -55,6 +60,9 @@ const (
minimumContainerCreateTimeout = 1 * time.Minute
// default image pull inactivity time is extra time needed on container extraction
defaultImagePullInactivityTimeout = 3 * time.Minute
// adminSid is the security ID for the admin group on Windows
// Reference: https://docs.microsoft.com/en-us/troubleshoot/windows-server/identity/security-identifiers-in-windows
adminSid = "S-1-5-32-544"
)

// DefaultConfig returns the default configuration for Windows
Expand Down Expand Up @@ -144,3 +152,65 @@ func (cfg *Config) platformOverrides() {
func (cfg *Config) platformString() string {
return ""
}

// getNamedSecurityInfo is a package-level indirection over the go-acl API
// call so that tests can substitute a fake implementation.
var getNamedSecurityInfo = api.GetNamedSecurityInfo

// validateConfigFile returns true if the config file at configFileName is
// owned by the Windows builtin Administrators group (see adminSid), and
// false otherwise; a non-admin-owned file is skipped when merging config.
// Reference: https://github.com/hectane/go-acl#using-the-api-directly
func validateConfigFile(configFileName string) (bool, error) {
	var (
		// sid receives the file owner's security identifier.
		// (renamed from the non-idiomatic capitalized local `Sid`)
		sid    *windows.SID
		handle windows.Handle
	)
	err := getNamedSecurityInfo(
		configFileName,
		api.SE_FILE_OBJECT,
		api.OWNER_SECURITY_INFORMATION,
		&sid,
		nil,
		nil,
		nil,
		&handle,
	)
	if err != nil {
		return false, err
	}
	// The security descriptor is allocated by the API; the caller must free it.
	defer windows.LocalFree(handle)

	id, err := sid.String()
	if err != nil {
		return false, err
	}

	if id == adminSid {
		return true, nil
	}
	seelog.Debugf("Non-admin cfg file owner with SID: %v, skip merging into agent config", id)
	return false, nil
}

// osStat is a package-level indirection over os.Stat so tests can stub it.
var osStat = os.Stat

// getConfigFileName returns the agent config file path to load. If the
// ECS_AGENT_CONFIG_FILE_PATH environment variable is set, its value is
// returned without validation; otherwise the default config file is used,
// but only after verifying that it exists and is owned by the admin group
// (see validateConfigFile).
func getConfigFileName() (string, error) {
	fileName := os.Getenv("ECS_AGENT_CONFIG_FILE_PATH")
	// validate the config file only if above env var is not set
	if len(fileName) == 0 {
		fileName = defaultConfigFileName
		// check if the default config file exists before validating it
		if _, err := osStat(fileName); err != nil {
			return "", err
		}

		isValidFile, err := validateConfigFile(fileName)
		if err != nil {
			seelog.Errorf("Unable to validate cfg file: %v, err: %v", fileName, err)
			return "", err
		}
		if !isValidFile {
			seelog.Error("Invalid cfg file")
			// BUG FIX: previously this returned `err`, which is provably nil
			// on this path (the err != nil case returned above), so a failed
			// validation looked like success with an empty filename. Return
			// an explicit error instead; a non-admin owner is effectively a
			// permission problem, so reuse the os.ErrPermission sentinel.
			return "", os.ErrPermission
		}
	}
	return fileName, nil
}
68 changes: 68 additions & 0 deletions agent/config/config_windows_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,12 +16,17 @@
package config

import (
"errors"
"os"
"testing"
"time"

"golang.org/x/sys/windows"

"github.com/aws/amazon-ecs-agent/agent/dockerclient"
"github.com/aws/amazon-ecs-agent/agent/ec2"

"github.com/hectane/go-acl/api"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
Expand Down Expand Up @@ -134,3 +139,66 @@ func TestMemoryUnboundedWindowsDisabled(t *testing.T) {
assert.NoError(t, err)
assert.False(t, cfg.PlatformVariables.MemoryUnbounded.Enabled())
}

// TestGetConfigFileName covers the env-var override path (no validation),
// the default-file happy path, and both validation failure modes.
func TestGetConfigFileName(t *testing.T) {
	configFileName := "/foo/bar/config.json"
	testCases := []struct {
		name             string
		envVarVal        string
		expectedFileName string
		expectedError    error
		cfgFileSid       string
	}{
		{
			name:             "config file via env var, no errors",
			envVarVal:        configFileName,
			expectedFileName: configFileName,
			expectedError:    nil,
			cfgFileSid:       "",
		},
		{
			name:             "default config file without env var, no errors",
			envVarVal:        "",
			expectedFileName: defaultConfigFileName,
			expectedError:    nil,
			cfgFileSid:       adminSid,
		},
		{
			name:             "unable to validate cfg file error",
			envVarVal:        "",
			expectedFileName: "",
			expectedError:    errors.New("Unable to validate cfg file"),
			cfgFileSid:       "random-sid",
		},
		{
			name:             "invalid cfg file error",
			envVarVal:        "",
			expectedFileName: "",
			expectedError:    errors.New("Invalid cfg file"),
			cfgFileSid:       "S-1-5-7",
		},
	}
	// Restore the stubbed package-level hooks once all cases have run.
	defer func() {
		osStat = os.Stat
		getNamedSecurityInfo = api.GetNamedSecurityInfo
	}()
	// Unset once after the loop rather than deferring per iteration:
	// defers inside a loop accumulate until the function returns.
	defer os.Unsetenv("ECS_AGENT_CONFIG_FILE_PATH")

	for _, tc := range testCases {
		t.Run(tc.name, func(t *testing.T) {
			os.Setenv("ECS_AGENT_CONFIG_FILE_PATH", tc.envVarVal)

			// Pretend the default config file always exists.
			osStat = func(name string) (os.FileInfo, error) {
				return nil, nil
			}

			// Fake the Windows security API: report the per-case owner SID
			// and return the per-case error.
			getNamedSecurityInfo = func(fileName string, fileType int32, secInfo uint32, owner,
				group **windows.SID, dacl, sacl, secDesc *windows.Handle) error {
				*owner, _ = windows.StringToSid(tc.cfgFileSid)
				return tc.expectedError
			}

			fileName, err := getConfigFileName()
			assert.Equal(t, tc.expectedFileName, fileName)
			assert.Equal(t, tc.expectedError, err)
		})
	}
}
32 changes: 31 additions & 1 deletion agent/engine/docker_task_engine.go
Original file line number Diff line number Diff line change
Expand Up @@ -101,8 +101,16 @@ const (
FluentAWSVPCHostValue = "127.0.0.1"

defaultMonitorExecAgentsInterval = 15 * time.Minute

stopContainerBackoffMin = time.Second
stopContainerBackoffMax = time.Second * 5
stopContainerBackoffJitter = 0.2
stopContainerBackoffMultiplier = 1.3
stopContainerMaxRetryCount = 3
)

var newExponentialBackoff = retry.NewExponentialBackoff

// DockerTaskEngine is a state machine for managing a task and its containers
// in ECS.
//
Expand Down Expand Up @@ -1535,7 +1543,29 @@ func (engine *DockerTaskEngine) stopContainer(task *apitask.Task, container *api
apiTimeoutStopContainer = engine.cfg.DockerStopTimeout
}

return engine.client.StopContainer(engine.ctx, dockerID, apiTimeoutStopContainer)
return engine.stopDockerContainer(dockerID, container.Name, apiTimeoutStopContainer)
}

// stopDockerContainer attempts to stop the container, retrying only in case of time out errors.
// If the maximum number of retries is reached, the container is marked as stopped. This is because docker sometimes
// deadlocks when trying to stop a container but the actual container process is stopped.
// for more information, see: https://github.com/moby/moby/issues/41587
func (engine *DockerTaskEngine) stopDockerContainer(dockerID, containerName string, apiTimeoutStopContainer time.Duration) dockerapi.DockerContainerMetadata {
	backoff := newExponentialBackoff(stopContainerBackoffMin, stopContainerBackoffMax, stopContainerBackoffJitter, stopContainerBackoffMultiplier)
	var md dockerapi.DockerContainerMetadata
	attempt := 0
	for {
		md = engine.client.StopContainer(engine.ctx, dockerID, apiTimeoutStopContainer)
		// Anything other than a docker timeout (success included) ends the retries.
		if md.Error == nil || md.Error.ErrorName() != dockerapi.DockerTimeoutErrorName {
			return md
		}
		attempt++
		if attempt == stopContainerMaxRetryCount {
			break
		}
		time.Sleep(backoff.Duration())
	}
	// Every attempt timed out; give up and let the caller treat the container as stopped.
	seelog.Warnf("Unable to stop container (%s) after %d attempts that timed out; giving up and marking container as stopped anyways", containerName, stopContainerMaxRetryCount)
	return md
}

func (engine *DockerTaskEngine) removeContainer(task *apitask.Task, container *apicontainer.Container) error {
Expand Down
39 changes: 4 additions & 35 deletions agent/engine/docker_task_engine_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -897,7 +897,6 @@ func TestTaskTransitionWhenStopContainerTimesout(t *testing.T) {
Duration: 30 * time.Second,
},
}
dockerEventSent := make(chan int)
for _, container := range sleepTask.Containers {
imageManager.EXPECT().AddAllImageStates(gomock.Any()).AnyTimes()
client.EXPECT().PullImage(gomock.Any(), container.Image, nil, gomock.Any()).Return(dockerapi.DockerContainerMetadata{})
Expand All @@ -918,27 +917,15 @@ func TestTaskTransitionWhenStopContainerTimesout(t *testing.T) {
}()
}).Return(dockerapi.DockerContainerMetadata{DockerID: containerID}),

// StopContainer times out
client.EXPECT().StopContainer(gomock.Any(), containerID, gomock.Any()).Return(containerStopTimeoutError),
// Since task is not in steady state, progressContainers causes
// another invocation of StopContainer. Return a timeout error
// for that as well.
client.EXPECT().StopContainer(gomock.Any(), containerID, gomock.Any()).Do(
func(ctx interface{}, id string, timeout time.Duration) {
go func() {
dockerEventSent <- 1
// Emit 'ContainerStopped' event to the container event stream
// This should cause the container and the task to transition
// to 'STOPPED'
eventStream <- createDockerEvent(apicontainerstatus.ContainerStopped)
}()
}).Return(containerStopTimeoutError).MinTimes(1),
// Validate that timeouts are retried exactly 3 times
client.EXPECT().StopContainer(gomock.Any(), containerID, gomock.Any()).
Return(containerStopTimeoutError).
Times(3),
)
}

err := taskEngine.Init(ctx)
assert.NoError(t, err)
stateChangeEvents := taskEngine.StateChangeEvents()

go taskEngine.AddTask(sleepTask)
// wait for task running
Expand All @@ -948,24 +935,6 @@ func TestTaskTransitionWhenStopContainerTimesout(t *testing.T) {
updateSleepTask.SetDesiredStatus(apitaskstatus.TaskStopped)
go taskEngine.AddTask(updateSleepTask)

// StopContainer timeout error shouldn't cause container/task status change
// until receive stop event from docker event stream
select {
case <-stateChangeEvents:
t.Error("Should not get task events")
case <-dockerEventSent:
t.Logf("Send docker stop event")
go func() {
for {
select {
case <-dockerEventSent:
case <-ctx.Done():
return
}
}
}()
}

// StopContainer was called again and received stop event from docker event stream
// Expect it to go to stopped
waitForStopEvents(t, taskEngine.StateChangeEvents(), false, false)
Expand Down
20 changes: 1 addition & 19 deletions agent/engine/task_manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -930,24 +930,6 @@ func (mtask *managedTask) handleEventError(containerChange dockerContainerChange
func (mtask *managedTask) handleContainerStoppedTransitionError(event dockerapi.DockerContainerChangeEvent,
container *apicontainer.Container,
currentKnownStatus apicontainerstatus.ContainerStatus) bool {
// If we were trying to transition to stopped and had a timeout error
// from docker, reset the known status to the current status and return
// This ensures that we don't emit a containerstopped event; a
// terminal container event from docker event stream will instead be
// responsible for the transition. Alternatively, the steadyState check
// could also trigger the progress and have another go at stopping the
// container
if event.Error.ErrorName() == dockerapi.DockerTimeoutErrorName {
logger.Info("Error stopping container; ignoring state change", logger.Fields{
field.TaskARN: mtask.Arn,
field.Container: container.Name,
field.RuntimeID: container.GetRuntimeID(),
"ErrorName": event.Error.ErrorName(),
field.Error: event.Error.Error(),
})
container.SetKnownStatus(currentKnownStatus)
return false
}
// If docker returned a transient error while trying to stop a container,
// reset the known status to the current status and return
cannotStopContainerError, ok := event.Error.(cannotStopContainerError)
Expand All @@ -969,7 +951,7 @@ func (mtask *managedTask) handleContainerStoppedTransitionError(event dockerapi.
// enough) and get on with it
// This can happen in cases where the container we tried to stop
// was already stopped or did not exist at all.
logger.Warn("Error stopping the container; ignoring state change", logger.Fields{
logger.Warn("Error stopping the container; marking it as stopped anyway", logger.Fields{
field.TaskARN: mtask.Arn,
field.Container: container.Name,
field.RuntimeID: container.GetRuntimeID(),
Expand Down
4 changes: 2 additions & 2 deletions agent/engine/task_manager_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -73,8 +73,8 @@ func TestHandleEventError(t *testing.T) {
CurrentContainerKnownStatus: apicontainerstatus.ContainerRunning,
Error: &dockerapi.DockerTimeoutError{},
ExpectedContainerKnownStatusSet: true,
ExpectedContainerKnownStatus: apicontainerstatus.ContainerRunning,
ExpectedOK: false,
ExpectedContainerKnownStatus: apicontainerstatus.ContainerStopped,
ExpectedOK: true,
},
{
Name: "Retriable error with stop",
Expand Down

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading