Skip to content

Commit 38ac49e

Browse files
committed
Added Inferentia support config and Neuron runtime override.
Added an agent config field, InferentiaSupportEnabled, populated from the ECS_ENABLE_INF_SUPPORT environment variable. For a container that specifies AWS_NEURON_VISIBLE_DEVICES, if InferentiaSupportEnabled is on, the agent overrides the container's runtime with the Neuron Docker runtime required to use Inferentia devices. This lets us use the Neuron runtime only for containers that need Inferentia devices, and only when that runtime is installed on the AMI (which is indicated by the ECS_ENABLE_INF_SUPPORT config that we will add together with installing the Neuron runtime).
1 parent ef91b76 commit 38ac49e

File tree

10 files changed

+151
-56
lines changed

10 files changed

+151
-56
lines changed

agent/api/container/container.go

+11
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,9 @@ const (
7777

7878
// TargetLogDriver is to show secret target being "LOG_DRIVER", the default will be "CONTAINER"
7979
SecretTargetLogDriver = "LOG_DRIVER"
80+
81+
// neuronVisibleDevicesEnvVar is the env which indicates that the container wants to use inferentia devices.
82+
neuronVisibleDevicesEnvVar = "AWS_NEURON_VISIBLE_DEVICES"
8083
)
8184

8285
// DockerConfig represents additional metadata about a container to run. It's
@@ -1120,3 +1123,11 @@ func (c *Container) GetEnvironmentFiles() []EnvironmentFile {
11201123

11211124
return c.EnvironmentFiles
11221125
}
1126+
1127+
// RequireNeuronRuntime reports whether the container's environment contains the
// neuronVisibleDevicesEnvVar key ("AWS_NEURON_VISIBLE_DEVICES"). The container's
// read lock is held while the environment map is inspected.
func (c *Container) RequireNeuronRuntime() bool {
1128+
c.lock.RLock()
1129+
defer c.lock.RUnlock()
1130+
1131+
// Only the presence of the key matters; its value is ignored.
_, ok := c.Environment[neuronVisibleDevicesEnvVar]
1132+
return ok
1133+
}

agent/api/container/container_test.go

+7
Original file line numberDiff line numberDiff line change
@@ -750,3 +750,10 @@ func TestMergeEnvironmentVariablesFromEnvfiles(t *testing.T) {
750750
})
751751
}
752752
}
753+
754+
// TestRequireNeuronRuntime checks that RequireNeuronRuntime returns true for a
// container whose environment sets neuronVisibleDevicesEnvVar (here to "all").
func TestRequireNeuronRuntime(t *testing.T) {
755+
c := &Container{
756+
Environment: map[string]string{neuronVisibleDevicesEnvVar: "all"},
757+
}
758+
assert.True(t, c.RequireNeuronRuntime())
759+
}

agent/api/task/task.go

+26-9
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,9 @@ const (
7777
NvidiaVisibleDevicesEnvVar = "NVIDIA_VISIBLE_DEVICES"
7878
GPUAssociationType = "gpu"
7979

80+
// neuronRuntime is the name of the neuron docker runtime.
81+
neuronRuntime = "neuron"
82+
8083
ContainerOrderingCreateCondition = "CREATE"
8184
ContainerOrderingStartCondition = "START"
8285

@@ -1430,8 +1433,8 @@ func (task *Task) dockerExposedPorts(container *apicontainer.Container) nat.Port
14301433
}
14311434

14321435
// DockerHostConfig construct the configuration recognized by docker
1433-
func (task *Task) DockerHostConfig(container *apicontainer.Container, dockerContainerMap map[string]*apicontainer.DockerContainer, apiVersion dockerclient.DockerVersion) (*dockercontainer.HostConfig, *apierrors.HostConfigError) {
1434-
return task.dockerHostConfig(container, dockerContainerMap, apiVersion)
1436+
func (task *Task) DockerHostConfig(container *apicontainer.Container, dockerContainerMap map[string]*apicontainer.DockerContainer, apiVersion dockerclient.DockerVersion, cfg *config.Config) (*dockercontainer.HostConfig, *apierrors.HostConfigError) {
1437+
return task.dockerHostConfig(container, dockerContainerMap, apiVersion, cfg)
14351438
}
14361439

14371440
// ApplyExecutionRoleLogsAuth will check whether the task has execution role
@@ -1459,7 +1462,7 @@ func (task *Task) ApplyExecutionRoleLogsAuth(hostConfig *dockercontainer.HostCon
14591462
return nil
14601463
}
14611464

1462-
func (task *Task) dockerHostConfig(container *apicontainer.Container, dockerContainerMap map[string]*apicontainer.DockerContainer, apiVersion dockerclient.DockerVersion) (*dockercontainer.HostConfig, *apierrors.HostConfigError) {
1465+
func (task *Task) dockerHostConfig(container *apicontainer.Container, dockerContainerMap map[string]*apicontainer.DockerContainer, apiVersion dockerclient.DockerVersion, cfg *config.Config) (*dockercontainer.HostConfig, *apierrors.HostConfigError) {
14631466
dockerLinkArr, err := task.dockerLinks(container, dockerContainerMap)
14641467
if err != nil {
14651468
return nil, &apierrors.HostConfigError{Msg: err.Error()}
@@ -1488,12 +1491,8 @@ func (task *Task) dockerHostConfig(container *apicontainer.Container, dockerCont
14881491
Resources: resources,
14891492
}
14901493

1491-
if task.isGPUEnabled() && task.shouldRequireNvidiaRuntime(container) {
1492-
if task.NvidiaRuntime == "" {
1493-
return nil, &apierrors.HostConfigError{Msg: "Runtime is not set for GPU containers"}
1494-
}
1495-
seelog.Debugf("Setting runtime as %s for container %s", task.NvidiaRuntime, container.Name)
1496-
hostConfig.Runtime = task.NvidiaRuntime
1494+
if err := task.overrideContainerRuntime(container, hostConfig, cfg); err != nil {
1495+
return nil, err
14971496
}
14981497

14991498
if container.DockerConfig.HostConfig != nil {
@@ -1537,6 +1536,24 @@ func (task *Task) dockerHostConfig(container *apicontainer.Container, dockerCont
15371536
return hostConfig, nil
15381537
}
15391538

1539+
// overrideContainerRuntime overrides the runtime for the container in host config if needed.
// If the task is GPU-enabled and the container requires the NVIDIA runtime, hostCfg.Runtime is
// set to task.NvidiaRuntime; an empty task.NvidiaRuntime is reported as a HostConfigError.
// If cfg.InferentiaSupportEnabled is set and the container requires the neuron runtime,
// hostCfg.Runtime is set to neuronRuntime. It returns nil when no error occurs, including
// when neither override applies.
1540+
func (task *Task) overrideContainerRuntime(container *apicontainer.Container, hostCfg *dockercontainer.HostConfig,
1541+
cfg *config.Config) *apierrors.HostConfigError {
1542+
if task.isGPUEnabled() && task.shouldRequireNvidiaRuntime(container) {
1543+
if task.NvidiaRuntime == "" {
1544+
return &apierrors.HostConfigError{Msg: "Runtime is not set for GPU containers"}
1545+
}
1546+
seelog.Debugf("Setting runtime as %s for container %s", task.NvidiaRuntime, container.Name)
1547+
hostCfg.Runtime = task.NvidiaRuntime
1548+
}
1549+
1550+
// This check runs after the NVIDIA one, so when both conditions hold the neuron runtime
// is the value left in hostCfg.Runtime.
// NOTE(review): cfg is dereferenced without a nil check — confirm callers always pass a non-nil config.
if cfg.InferentiaSupportEnabled && container.RequireNeuronRuntime() {
1551+
seelog.Debugf("Setting runtime as %s for container %s", neuronRuntime, container.Name)
1552+
hostCfg.Runtime = neuronRuntime
1553+
}
1554+
return nil
1555+
}
1556+
15401557
// Requires an *apicontainer.Container and returns the Resources for the HostConfig struct
15411558
func (task *Task) getDockerResources(container *apicontainer.Container) dockercontainer.Resources {
15421559
// Convert MB to B and set Memory

agent/api/task/task_linux_test.go

+4-2
Original file line numberDiff line numberDiff line change
@@ -422,7 +422,8 @@ func TestPlatformHostConfigOverrideErrorPath(t *testing.T) {
422422
},
423423
}
424424

425-
dockerHostConfig, err := task.DockerHostConfig(task.Containers[0], dockerMap(task), defaultDockerClientAPIVersion)
425+
dockerHostConfig, err := task.DockerHostConfig(task.Containers[0], dockerMap(task), defaultDockerClientAPIVersion,
426+
&config.Config{})
426427
assert.Error(t, err)
427428
assert.Empty(t, dockerHostConfig)
428429
}
@@ -464,7 +465,8 @@ func TestDockerHostConfigRawConfigMerging(t *testing.T) {
464465
},
465466
}
466467

467-
hostConfig, configErr := testTask.DockerHostConfig(testTask.Containers[0], dockerMap(testTask), minDockerClientAPIVersion)
468+
hostConfig, configErr := testTask.DockerHostConfig(testTask.Containers[0], dockerMap(testTask),
469+
minDockerClientAPIVersion, &config.Config{})
468470
assert.Nil(t, configErr)
469471

470472
expected := dockercontainer.HostConfig{

0 commit comments

Comments
 (0)