Skip to content

Commit

Permalink
feat: Trainium blueprint upgrade (#622)
Browse files Browse the repository at this point in the history
  • Loading branch information
vara-bonthu authored Aug 30, 2024
1 parent 0b1b808 commit 6a92682
Show file tree
Hide file tree
Showing 15 changed files with 715 additions and 516 deletions.
118 changes: 69 additions & 49 deletions ai-ml/trainium-inferentia/addons.tf
Original file line number Diff line number Diff line change
Expand Up @@ -37,19 +37,25 @@ resource "kubernetes_storage_class_v1" "default_gp3" {
}

#---------------------------------------------------------------
# IRSA for EBS CSI Driver
# EKS Pod identiity association
#---------------------------------------------------------------
module "ebs_csi_driver_irsa" {
source = "terraform-aws-modules/iam/aws//modules/iam-role-for-service-accounts-eks"
version = "~> 5.20"
role_name_prefix = format("%s-%s-", local.name, "ebs-csi-driver")
attach_ebs_csi_policy = true
oidc_providers = {
main = {
provider_arn = module.eks.oidc_provider_arn
namespace_service_accounts = ["kube-system:ebs-csi-controller-sa"]

module "aws_ebs_csi_pod_identity" {
source = "terraform-aws-modules/eks-pod-identity/aws"
version = "~> 1.4.0"

name = "aws-ebs-csi"
attach_aws_ebs_csi_policy = true

# Pod Identity Associations
associations = {
ebs-csi-controller = {
namespace = "kube-system"
service_account = "ebs-csi-controller-sa"
cluster_name = module.eks.cluster_name
}
}

tags = local.tags
}

Expand All @@ -58,7 +64,7 @@ module "ebs_csi_driver_irsa" {
#---------------------------------------------------------------
module "eks_blueprints_addons" {
source = "aws-ia/eks-blueprints-addons/aws"
version = "~> 1.2"
version = "~> 1.16"

cluster_name = module.eks.cluster_name
cluster_endpoint = module.eks.cluster_endpoint
Expand All @@ -69,35 +75,16 @@ module "eks_blueprints_addons" {
# Amazon EKS Managed Add-ons
#---------------------------------------
eks_addons = {
aws-ebs-csi-driver = {
service_account_role_arn = module.ebs_csi_driver_irsa.iam_role_arn
}
coredns = {
preserve = true
}
kube-proxy = {
preserve = true
}
# VPC CNI uses worker node IAM role policies
vpc-cni = {
preserve = true
}
aws-ebs-csi-driver = {}
coredns = {}
eks-pod-identity-agent = {}
kube-proxy = {}
vpc-cni = {}
}

#---------------------------------------
# Kubernetes Add-ons
#---------------------------------------
#---------------------------------------------------------------
# CoreDNS Autoscaler helps to scale for large EKS Clusters
# Further tuning for CoreDNS is to leverage NodeLocal DNSCache -> https://kubernetes.io/docs/tasks/administer-cluster/nodelocaldns/
#---------------------------------------------------------------
enable_cluster_proportional_autoscaler = true
cluster_proportional_autoscaler = {
values = [templatefile("${path.module}/helm-values/coredns-autoscaler-values.yaml", {
target = "deployment/coredns"
})]
description = "Cluster Proportional Autoscaler for CoreDNS Service"
}

#---------------------------------------
# Metrics Server
Expand Down Expand Up @@ -198,16 +185,8 @@ module "eks_blueprints_addons" {
#---------------------------------------------------------------
enable_kube_prometheus_stack = true
kube_prometheus_stack = {
values = [
var.enable_amazon_prometheus ? templatefile("${path.module}/helm-values/kube-prometheus-amp-enable.yaml", {
storage_class_type = kubernetes_storage_class_v1.default_gp3.id
region = local.region
amp_sa = local.amp_ingest_service_account
amp_irsa = module.amp_ingest_irsa[0].iam_role_arn
amp_remotewrite_url = "https://aps-workspaces.${local.region}.amazonaws.com/workspaces/${aws_prometheus_workspace.amp[0].id}/api/v1/remote_write"
amp_url = "https://aps-workspaces.${local.region}.amazonaws.com/workspaces/${aws_prometheus_workspace.amp[0].id}"
}) : templatefile("${path.module}/helm-values/kube-prometheus.yaml", {
storage_class_type = kubernetes_storage_class_v1.default_gp3.id
values = [templatefile("${path.module}/helm-values/kube-prometheus.yaml", {
storage_class_type = kubernetes_storage_class_v1.default_gp3.id
})
]
chart_version = "48.1.1"
Expand Down Expand Up @@ -241,9 +220,12 @@ module "eks_blueprints_addons" {
}

tags = local.tags
}

# We are installing Karpenter resources with Helm Chart, see helm-values/

resource "aws_eks_access_entry" "karpenter_node_access_entry" {
cluster_name = module.eks.cluster_name
principal_arn = module.eks_blueprints_addons.karpenter.node_iam_role_arn
type = "EC2_LINUX"
}

#---------------------------------------------------------------
Expand All @@ -255,9 +237,14 @@ module "eks_data_addons" {

oidc_provider_arn = module.eks.oidc_provider_arn

enable_aws_neuron_device_plugin = true
enable_aws_neuron_device_plugin = true

enable_aws_efa_k8s_device_plugin = true

aws_efa_k8s_device_plugin_helm_config = {
version = "v0.5.3"
}

#---------------------------------------
# Volcano Scheduler for TorchX used in BERT-Large distributed training example
# Volcano is also a default scheduler for KubeRay Operator
Expand Down Expand Up @@ -339,7 +326,7 @@ module "eks_data_addons" {
values: ["amd64"]
- key: "karpenter.sh/capacity-type"
operator: In
values: ["spot", "on-demand"]
values: [ "spot", "on-demand"]
limits:
cpu: 1000
disruption:
Expand Down Expand Up @@ -470,3 +457,36 @@ resource "kubectl_manifest" "mpi_operator" {
yaml_body = each.value
depends_on = [module.eks.eks_cluster_id]
}

#---------------------------------------------------------------
# Neuron Scheduler deployment
# The YAML manifest contents for Neuron Scheduler will be replaced in future by Neuron Helm Chart
#---------------------------------------------------------------

data "http" "neuron_scheduler" {
url = "https://awsdocs-neuron.readthedocs-hosted.com/en/latest/_downloads/e739253083129abeaf6f6ad1db7ccb21/my-scheduler.yml"
}

data "kubectl_file_documents" "neuron_scheduler" {
content = data.http.neuron_scheduler.response_body
}

resource "kubectl_manifest" "neuron_scheduler" {
for_each = data.kubectl_file_documents.neuron_scheduler.manifests
yaml_body = each.value
depends_on = [module.eks.eks_cluster_id]
}

data "http" "k8s_neuron_scheduler_eks" {
url = "https://awsdocs-neuron.readthedocs-hosted.com/en/latest/_downloads/e518187532701b6660dcd70ea28c2562/k8s-neuron-scheduler-eks.yml"
}

data "kubectl_file_documents" "k8s_neuron_scheduler_eks" {
content = data.http.k8s_neuron_scheduler_eks.response_body
}

resource "kubectl_manifest" "k8s_neuron_scheduler_eks" {
for_each = data.kubectl_file_documents.k8s_neuron_scheduler_eks.manifests
yaml_body = each.value
depends_on = [module.eks.eks_cluster_id]
}
36 changes: 0 additions & 36 deletions ai-ml/trainium-inferentia/amp.tf

This file was deleted.

Loading

0 comments on commit 6a92682

Please sign in to comment.