Skip to content

Commit

Permalink
feat: MPI operator installation code for distributed training use case (#362)
Browse files Browse the repository at this point in the history

* MPI operator code for distributed training

* Making MPI operator optional for users

* added type string to mpi operator variable version
  • Loading branch information
sanjeevrg89 authored Nov 7, 2023
1 parent 843f2d8 commit 0ecfd45
Show file tree
Hide file tree
Showing 2 changed files with 30 additions and 0 deletions.
18 changes: 18 additions & 0 deletions ai-ml/trainium-inferentia/addons.tf
Original file line number Diff line number Diff line change
Expand Up @@ -499,3 +499,21 @@ resource "aws_launch_template" "trn1_lt" {
}
}
}

#---------------------------------------------------------------
# MPI Operator for distributed training on Trainium
#---------------------------------------------------------------
# Downloads the upstream MPI Operator manifest for the release selected by
# var.mpi_operator_version. The raw response body is consumed downstream as
# multi-document YAML.
data "http" "mpi_operator_yaml" {
  url = "https://raw.githubusercontent.com/kubeflow/mpi-operator/${var.mpi_operator_version}/deploy/v2beta1/mpi-operator.yaml"

  lifecycle {
    # With the http provider versions that expose `response_body` (v3+), a
    # non-200 response is NOT an error on its own, so an unknown version tag
    # would otherwise pass an error page on as if it were the manifest.
    # The check is only enforced when the operator is enabled, so deployments
    # that leave it disabled plan exactly as they did before.
    postcondition {
      condition     = !var.enable_mpi_operator || self.status_code == 200
      error_message = "Failed to download the MPI Operator manifest for version \"${var.mpi_operator_version}\": the request did not return HTTP 200. Check that var.mpi_operator_version is an existing kubeflow/mpi-operator tag."
    }
  }
}

# Feeds the downloaded manifest text to the kubectl provider's document
# splitter; the resulting `manifests` attribute is iterated by the
# kubectl_manifest resource that installs the operator.
data "kubectl_file_documents" "mpi_operator_yaml" {
  content = data.http.mpi_operator_yaml.response_body
}

# Applies each document of the MPI Operator manifest to the cluster.
# Nothing is created unless var.enable_mpi_operator is true.
resource "kubectl_manifest" "mpi_operator" {
  # NOTE(review): module.eks is declared outside this view — confirm that
  # `eks_cluster_id` is the intended reference for ordering after the cluster.
  depends_on = [module.eks.eks_cluster_id]

  # Keep every manifest entry when the feature flag is on; the filter drops
  # all of them (yielding an empty collection) when it is off.
  for_each = {
    for doc_key, doc_body in data.kubectl_file_documents.mpi_operator_yaml.manifests :
    doc_key => doc_body
    if var.enable_mpi_operator
  }

  yaml_body = each.value
}

12 changes: 12 additions & 0 deletions ai-ml/trainium-inferentia/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -37,3 +37,15 @@ variable "enable_amazon_prometheus" {
type = bool
default = true
}

# Git ref of kubeflow/mpi-operator interpolated into the manifest download URL.
variable "mpi_operator_version" {
  description = "The version of the MPI Operator to install"
  type        = string
  default     = "v0.4.0"
}

# Opt-in switch: the MPI Operator manifests are only applied when this is true.
variable "enable_mpi_operator" {
  type        = bool
  default     = false
  description = "Flag to enable the MPI Operator deployment"
}

0 comments on commit 0ecfd45

Please sign in to comment.