Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merge feature ecs-anywhere-gpu support to dev #3040

Merged
merged 7 commits into from
Sep 28, 2021
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 21 additions & 8 deletions agent/api/task/task.go
Original file line number Diff line number Diff line change
@@ -458,8 +458,12 @@ func (task *Task) addGPUResource(cfg *config.Config) error {
container.GPUIDs = append(container.GPUIDs, association.Name)
}
}
task.populateGPUEnvironmentVariables()
task.NvidiaRuntime = cfg.NvidiaRuntime
// For external instances, GPU IDs are handled by resources struct
// For internal instances, GPU IDs are handled by env var
if !cfg.External.Enabled() {
task.populateGPUEnvironmentVariables()
task.NvidiaRuntime = cfg.NvidiaRuntime
}
}
return nil
}
@@ -1466,7 +1470,7 @@ func (task *Task) dockerHostConfig(container *apicontainer.Container, dockerCont
return nil, &apierrors.HostConfigError{Msg: err.Error()}
}

resources := task.getDockerResources(container)
resources := task.getDockerResources(container, cfg)

// Populate hostConfig
hostConfig := &dockercontainer.HostConfig{
@@ -1531,11 +1535,13 @@ func (task *Task) dockerHostConfig(container *apicontainer.Container, dockerCont
func (task *Task) overrideContainerRuntime(container *apicontainer.Container, hostCfg *dockercontainer.HostConfig,
cfg *config.Config) *apierrors.HostConfigError {
if task.isGPUEnabled() && task.shouldRequireNvidiaRuntime(container) {
if task.NvidiaRuntime == "" {
return &apierrors.HostConfigError{Msg: "Runtime is not set for GPU containers"}
if !cfg.External.Enabled() {
if task.NvidiaRuntime == "" {
return &apierrors.HostConfigError{Msg: "Runtime is not set for GPU containers"}
}
seelog.Debugf("Setting runtime as %s for container %s", task.NvidiaRuntime, container.Name)
hostCfg.Runtime = task.NvidiaRuntime
}
seelog.Debugf("Setting runtime as %s for container %s", task.NvidiaRuntime, container.Name)
hostCfg.Runtime = task.NvidiaRuntime
}

if cfg.InferentiaSupportEnabled && container.RequireNeuronRuntime() {
@@ -1546,7 +1552,7 @@ func (task *Task) overrideContainerRuntime(container *apicontainer.Container, ho
}

// Requires an *apicontainer.Container and returns the Resources for the HostConfig struct
func (task *Task) getDockerResources(container *apicontainer.Container) dockercontainer.Resources {
func (task *Task) getDockerResources(container *apicontainer.Container, cfg *config.Config) dockercontainer.Resources {
// Convert MB to B and set Memory
dockerMem := int64(container.Memory * 1024 * 1024)
if dockerMem != 0 && dockerMem < apicontainer.DockerContainerMinimumMemoryInBytes {
@@ -1560,6 +1566,13 @@ func (task *Task) getDockerResources(container *apicontainer.Container) dockerco
Memory: dockerMem,
CPUShares: cpuShare,
}
if cfg.External.Enabled() && cfg.GPUSupportEnabled {
deviceRequest := dockercontainer.DeviceRequest{
Capabilities: [][]string{[]string{"gpu"}},
DeviceIDs: container.GPUIDs,
}
resources.DeviceRequests = []dockercontainer.DeviceRequest{deviceRequest}
}
return resources
}

57 changes: 53 additions & 4 deletions agent/api/task/task_test.go
Original file line number Diff line number Diff line change
@@ -568,7 +568,8 @@ func TestGetDockerResources(t *testing.T) {
},
},
}
resources := testTask.getDockerResources(testTask.Containers[0])
cfg := &config.Config{}
resources := testTask.getDockerResources(testTask.Containers[0], cfg)
assert.Equal(t, int64(10), resources.CPUShares, "Wrong number of CPUShares")
assert.Equal(t, int64(268435456), resources.Memory, "Wrong amount of memory")
}
@@ -586,7 +587,8 @@ func TestGetDockerResourcesCPUTooLow(t *testing.T) {
},
},
}
resources := testTask.getDockerResources(testTask.Containers[0])
cfg := &config.Config{}
resources := testTask.getDockerResources(testTask.Containers[0], cfg)
assert.Equal(t, int64(268435456), resources.Memory, "Wrong amount of memory")

// Minimum requirement of 2 CPU Shares
@@ -608,7 +610,8 @@ func TestGetDockerResourcesMemoryTooLow(t *testing.T) {
},
},
}
resources := testTask.getDockerResources(testTask.Containers[0])
cfg := &config.Config{}
resources := testTask.getDockerResources(testTask.Containers[0], cfg)
assert.Equal(t, int64(10), resources.CPUShares, "Wrong number of CPUShares")
assert.Equal(t, int64(apicontainer.DockerContainerMinimumMemoryInBytes), resources.Memory,
"Wrong amount of memory")
@@ -626,11 +629,57 @@ func TestGetDockerResourcesUnspecifiedMemory(t *testing.T) {
},
},
}
resources := testTask.getDockerResources(testTask.Containers[0])
cfg := &config.Config{}
resources := testTask.getDockerResources(testTask.Containers[0], cfg)
assert.Equal(t, int64(10), resources.CPUShares, "Wrong number of CPUShares")
assert.Equal(t, int64(0), resources.Memory, "Wrong amount of memory")
}

// TestGetDockerResourcesExternalGPUInstance verifies that, when the agent is
// configured as an external instance with GPU support enabled,
// getDockerResources attaches a single "gpu" device request that carries the
// container's GPU IDs, in addition to the usual CPU and memory settings.
func TestGetDockerResourcesExternalGPUInstance(t *testing.T) {
	container := &apicontainer.Container{
		Name:   "c1",
		CPU:    uint(10),
		Memory: uint(256),
		GPUIDs: []string{"gpu1"},
	}
	testTask := &Task{
		Arn:        "arn:aws:ecs:us-east-1:012345678910:task/c09f0188-7f87-4b0f-bfc3-16296622b6fe",
		Family:     "myFamily",
		Version:    "1",
		Containers: []*apicontainer.Container{container},
	}
	cfg := &config.Config{
		GPUSupportEnabled: true,
	}
	cfg.External.Value = config.ExplicitlyEnabled

	resources := testTask.getDockerResources(testTask.Containers[0], cfg)

	assert.Equal(t, int64(10), resources.CPUShares, "Wrong number of CPUShares")
	assert.Equal(t, int64(268435456), resources.Memory, "Wrong amount of memory")
	// Check the length before indexing so a regression produces a test
	// failure rather than an index-out-of-range panic.
	if assert.Len(t, resources.DeviceRequests, 1, "Expected exactly one GPU device request") {
		deviceRequest := resources.DeviceRequests[0]
		assert.Equal(t, [][]string{{"gpu"}}, deviceRequest.Capabilities, "Wrong device capabilities")
		assert.Equal(t, container.GPUIDs, deviceRequest.DeviceIDs, "Wrong GPU IDs assigned")
	}
}

// TestGetDockerResourcesInternalGPUInstance verifies that, when GPU support is
// enabled but the agent is not configured as an external instance,
// getDockerResources does not attach any device requests; only the CPU and
// memory settings are populated.
func TestGetDockerResourcesInternalGPUInstance(t *testing.T) {
	container := &apicontainer.Container{
		Name:   "c1",
		CPU:    uint(10),
		Memory: uint(256),
		GPUIDs: []string{"gpu1"},
	}
	testTask := &Task{
		Arn:        "arn:aws:ecs:us-east-1:012345678910:task/c09f0188-7f87-4b0f-bfc3-16296622b6fe",
		Family:     "myFamily",
		Version:    "1",
		Containers: []*apicontainer.Container{container},
	}
	// cfg.External is left at its zero value, so the external-instance
	// branch of getDockerResources is not expected to run.
	cfg := &config.Config{
		GPUSupportEnabled: true,
	}

	resources := testTask.getDockerResources(testTask.Containers[0], cfg)

	assert.Equal(t, int64(10), resources.CPUShares, "Wrong number of CPUShares")
	assert.Equal(t, int64(268435456), resources.Memory, "Wrong amount of memory")
	assert.Empty(t, resources.DeviceRequests, "GPU IDs to be handled by env var for internal instance")
}

func TestPostUnmarshalTaskWithDockerVolumes(t *testing.T) {
autoprovision := true
ctrl := gomock.NewController(t)