From be5ade73f115adcbe7c04e8ec69dca7ec353f4cf Mon Sep 17 00:00:00 2001 From: Vara Bonthu Date: Thu, 1 Feb 2024 21:21:23 -0800 Subject: [PATCH] feat: Spark operator Karpenter upgrade (#425) Co-authored-by: Lucas Duarte <30901918+lusoal@users.noreply.github.com> Co-authored-by: Sanjeev Ganjihal --- .../karpenter-resources/Chart.yaml | 5 - .../helm-values/karpenter-resources/README.md | 71 ---- .../templates/node-class.yaml | 56 --- .../templates/node-pool.yaml | 36 -- .../karpenter-resources/values.yaml | 40 -- .../terraform/spark-k8s-operator/README.md | 5 +- .../terraform/spark-k8s-operator/addons.tf | 391 ++++++++++++++++-- .../examples/docker/Dockerfile | 40 ++ .../nvme-ephemeral-storage.yaml | 35 +- .../spark-compute-optimized-provisioner.yaml | 88 ---- ...graviton-memory-optimized-provisioner.yaml | 92 ----- .../spark-memory-optimized-provisioner.yaml | 102 ----- ...k-vertical-ebs-scaling-with-cpu-cores.yaml | 155 ------- .../terraform/spark-k8s-operator/main.tf | 8 +- .../amazon-emr-on-eks/emr-eks-karpenter.md | 64 +-- .../data-analytics/spark-operator-yunikorn.md | 333 ++++++++++++++- .../job-schedulers/self-managed-airflow.md | 86 +++- 17 files changed, 847 insertions(+), 760 deletions(-) delete mode 100644 ai-ml/trainium-inferentia/helm-values/karpenter-resources/Chart.yaml delete mode 100644 ai-ml/trainium-inferentia/helm-values/karpenter-resources/README.md delete mode 100644 ai-ml/trainium-inferentia/helm-values/karpenter-resources/templates/node-class.yaml delete mode 100644 ai-ml/trainium-inferentia/helm-values/karpenter-resources/templates/node-pool.yaml delete mode 100644 ai-ml/trainium-inferentia/helm-values/karpenter-resources/values.yaml create mode 100644 analytics/terraform/spark-k8s-operator/examples/docker/Dockerfile delete mode 100644 analytics/terraform/spark-k8s-operator/karpenter-provisioners/spark-compute-optimized-provisioner.yaml delete mode 100644 analytics/terraform/spark-k8s-operator/karpenter-provisioners/spark-graviton-memory-optimized-provisioner.yaml delete mode 100644 analytics/terraform/spark-k8s-operator/karpenter-provisioners/spark-memory-optimized-provisioner.yaml delete mode 100644 analytics/terraform/spark-k8s-operator/karpenter-provisioners/spark-vertical-ebs-scaling-with-cpu-cores.yaml diff --git a/ai-ml/trainium-inferentia/helm-values/karpenter-resources/Chart.yaml b/ai-ml/trainium-inferentia/helm-values/karpenter-resources/Chart.yaml deleted file mode 100644 index 0c3b8474a..000000000 --- a/ai-ml/trainium-inferentia/helm-values/karpenter-resources/Chart.yaml +++ /dev/null @@ -1,5 +0,0 @@ -apiVersion: v2 -name: karpenter-resources -description: Helm chart for configuring custom resources for Karpenter on the cluster -version: 0.0.1 -appVersion: 0.0.1 diff --git a/ai-ml/trainium-inferentia/helm-values/karpenter-resources/README.md b/ai-ml/trainium-inferentia/helm-values/karpenter-resources/README.md deleted file mode 100644 index e95f582c1..000000000 --- a/ai-ml/trainium-inferentia/helm-values/karpenter-resources/README.md +++ /dev/null @@ -1,71 +0,0 @@ -# Karpenter Resources Helm Chart - -## Overview - -This Helm chart is an abstraction layer designed for deploying various configurations of Karpenter nodes in a Kubernetes cluster managed by EKS. It integrates seamlessly with Terraform, allowing users to define different node pools and settings for their Kubernetes cluster. 
- -## Prerequisites - -- Helm 3.x or later installed -- Terraform installed -- Access to an AWS EKS cluster - -## Configuration - -The chart is configured to be used with Terraform. Here is an example of how you might define Helm releases for different Karpenter configurations in your Terraform files using EKS Blueprints add-ons: - -```hcl -module "eks_blueprints_addons" { - # ... other configurations ... - helm_releases = { - karpenter-resources-default = { - name = "default" - description = "A Helm chart for default node pool" - chart = "${path.module}/helm-values/karpenter-resources" - values = [ - <<-EOT - clusterName: ${module.eks.cluster_name} - ec2NodeClass: - karpenterRole: ${split("/", module.eks_blueprints_addons.karpenter.node_iam_role_arn)[1]} - nodePool: - labels: - - provisioner: default - - workload: rayhead - EOT - ] - } - } -} -``` - -## Testing the Chart with Helm Template - -To review the Kubernetes manifests that will be generated by the Helm chart based on your configuration, you can use the `helm template` command. This is especially useful for validating your Terraform configurations before applying them. - -1. **Generate the Manifests** - - Navigate to the directory where your Helm chart is located. - - ```sh - cd path/to/helm-chart - ``` - -2. **Run Helm Template** - - Use the `helm template` command with your custom values. For example: - - ```sh - helm template my-release-name . --values values.yaml - ``` - - Replace `my-release-name` with a name for your release, and `values.yaml` with the path to your custom values file. - - To test specific configurations defined in your Terraform file, you can create a temporary values file with the configuration snippet from your Terraform definition: - - ```sh - echo '' > temp-values.yaml - helm template my-release-name . --values temp-values.yaml - rm temp-values.yaml - ``` - - This will output the Kubernetes manifests to your terminal, allowing you to review them. 
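For reference, the `echo '' > temp-values.yaml` step above is meant to hold the values snippet from your Terraform definition. The following is a minimal sketch of that workflow, assuming a configuration similar to the Terraform example earlier in this README; the cluster name, Karpenter node role, subnet/security group tags, and labels are illustrative placeholders, and any field you omit falls back to the chart defaults in `values.yaml`.

```sh
# Sketch only: render the chart locally with a hand-written values file.
# "my-eks-cluster" and "my-karpenter-node-role" are placeholders; substitute
# the values from your own Terraform outputs.
cat > temp-values.yaml <<'EOF'
name: default
clusterName: my-eks-cluster
ec2NodeClass:
  karpenterRole: my-karpenter-node-role
  subnetSelectorTerms:
    tags:
      Name: "my-eks-cluster-private*"
  securityGroupSelectorTerms:
    tags:
      Name: my-eks-cluster-node
nodePool:
  labels:
    - provisioner: default
    - workload: rayhead
EOF

helm template my-release-name . --values temp-values.yaml
rm temp-values.yaml
```

Reviewing the rendered `EC2NodeClass` and `NodePool` manifests this way lets you catch selector or label mistakes before Terraform applies the Helm release.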
diff --git a/ai-ml/trainium-inferentia/helm-values/karpenter-resources/templates/node-class.yaml b/ai-ml/trainium-inferentia/helm-values/karpenter-resources/templates/node-class.yaml deleted file mode 100644 index 604e7f8e8..000000000 --- a/ai-ml/trainium-inferentia/helm-values/karpenter-resources/templates/node-class.yaml +++ /dev/null @@ -1,56 +0,0 @@ -{{- if .Values.ec2NodeClass.enabled }} -apiVersion: karpenter.k8s.aws/v1beta1 -kind: EC2NodeClass -metadata: - name: {{ .Values.name }} -spec: - {{- if .Values.ec2NodeClass.amiFamily }} - amiFamily: {{ .Values.ec2NodeClass.amiFamily }} - {{- else if .Values.ec2NodeClass.amiSelectorTerms }} - amiSelectorTerms: - {{- toYaml .Values.ec2NodeClass.amiSelectorTerms | nindent 4 }} - {{- end }} - subnetSelectorTerms: - {{- if .Values.ec2NodeClass.subnetSelectorTerms.tags }} - - tags: - {{- range $key, $value := .Values.ec2NodeClass.subnetSelectorTerms.tags }} - {{ $key }}: {{ $value | quote }} - {{- end }} - {{- end }} - {{- if .Values.ec2NodeClass.subnetSelectorTerms.id }} - - id: {{ .Values.ec2NodeClass.subnetSelectorTerms.id }} - {{- end }} - securityGroupSelectorTerms: - {{- if .Values.ec2NodeClass.securityGroupSelectorTerms.name }} - - name: {{ .Values.ec2NodeClass.securityGroupSelectorTerms.name }} - {{- end }} - {{- if .Values.ec2NodeClass.securityGroupSelectorTerms.id }} - - id: {{ .Values.ec2NodeClass.securityGroupSelectorTerms.id }} - {{- end }} - {{- if .Values.ec2NodeClass.securityGroupSelectorTerms.tags }} - - tags: - {{- range $key, $value := .Values.ec2NodeClass.securityGroupSelectorTerms.tags }} - {{ $key }}: {{ $value | quote }} - {{- end }} - {{- end }} - role: {{ .Values.ec2NodeClass.karpenterRole }} - tags: - Name: karpenter-{{ .Values.name }} - metadataOptions: - httpEndpoint: {{ .Values.ec2NodeClass.metadataOptions.httpEndpoint }} - httpProtocolIPv6: {{ .Values.ec2NodeClass.metadataOptions.httpProtocolIPv6 }} - httpPutResponseHopLimit: {{ .Values.ec2NodeClass.metadataOptions.httpPutResponseHopLimit }} - httpTokens: {{ .Values.ec2NodeClass.metadataOptions.httpTokens }} - blockDeviceMappings: - - deviceName: {{ default "/dev/xvda" .Values.ec2NodeClass.blockDevice.deviceName }} - ebs: - volumeSize: {{ .Values.ec2NodeClass.blockDevice.volumeSize }} - volumeType: {{ .Values.ec2NodeClass.blockDevice.volumeType }} - encrypted: {{ .Values.ec2NodeClass.blockDevice.encrypted }} - deleteOnTermination: {{ .Values.ec2NodeClass.blockDevice.deleteOnTermination }} - detailedMonitoring: {{ .Values.ec2NodeClass.detailedMonitoring }} - {{- if .Values.ec2NodeClass.userData }} - userData: | - {{- .Values.ec2NodeClass.userData | nindent 4 }} - {{- end }} -{{- end }} diff --git a/ai-ml/trainium-inferentia/helm-values/karpenter-resources/templates/node-pool.yaml b/ai-ml/trainium-inferentia/helm-values/karpenter-resources/templates/node-pool.yaml deleted file mode 100644 index 0ac17988f..000000000 --- a/ai-ml/trainium-inferentia/helm-values/karpenter-resources/templates/node-pool.yaml +++ /dev/null @@ -1,36 +0,0 @@ -{{- if .Values.nodePool.enabled }} -apiVersion: karpenter.sh/v1beta1 -kind: NodePool -metadata: - name: {{ .Values.name }} -spec: - template: - metadata: - labels: - NodePool: {{ .Values.name }} - NodeGroupType: {{ .Values.name }} - {{- with .Values.nodePool.labels }} - {{- range . }} - {{- toYaml . | nindent 8 }} - {{- end }} - {{- end }} - spec: - nodeClassRef: - name: {{ .Values.name }} - {{- with .Values.nodePool.taints }} - taints: - {{- toYaml . 
| nindent 8 }} - {{- end }} - {{- with .Values.nodePool.requirements }} - requirements: - {{- toYaml . | nindent 8 }} - {{- end }} - disruption: - consolidationPolicy: {{ .Values.nodePool.disruption.consolidationPolicy }} - consolidateAfter: {{ .Values.nodePool.disruption.consolidateAfter }} - expireAfter: {{ .Values.nodePool.disruption.expireAfter }} - limits: - cpu: {{ .Values.nodePool.limits.cpu }} - memory: {{ .Values.nodePool.limits.memory }} - weight: {{ .Values.nodePool.weight }} -{{- end }} diff --git a/ai-ml/trainium-inferentia/helm-values/karpenter-resources/values.yaml b/ai-ml/trainium-inferentia/helm-values/karpenter-resources/values.yaml deleted file mode 100644 index 456dce270..000000000 --- a/ai-ml/trainium-inferentia/helm-values/karpenter-resources/values.yaml +++ /dev/null @@ -1,40 +0,0 @@ -# Shared values -name: default -clusterName: test-cluster - -# EC2NodeClass specific values -ec2NodeClass: - enabled: true - amiFamily: AL2 - amiSelectorTerms: - subnetSelectorTerms: # tag or id see documentation, https://karpenter.sh/docs/concepts/nodeclasses/ - securityGroupSelectorTerms: # tag, name, id see documentation, https://karpenter.sh/docs/concepts/nodeclasses/ - karpenterRole: - metadataOptions: - httpEndpoint: enabled - httpProtocolIPv6: disabled - httpPutResponseHopLimit: 2 - httpTokens: required - blockDevice: - deviceName: /dev/xvda - volumeSize: 100Gi - volumeType: gp3 - encrypted: true - deleteOnTermination: true - detailedMonitoring: true - userData: - -# NodePool specific values -nodePool: - enabled: true - labels: - taints: - requirements: - disruption: - consolidationPolicy: WhenEmpty - consolidateAfter: 30s - expireAfter: 720h - limits: - cpu: "1000" - memory: 1000Gi - weight: 10 diff --git a/analytics/terraform/spark-k8s-operator/README.md b/analytics/terraform/spark-k8s-operator/README.md index 99d45aeaf..38ce271f2 100644 --- a/analytics/terraform/spark-k8s-operator/README.md +++ b/analytics/terraform/spark-k8s-operator/README.md @@ -19,7 +19,6 @@ Checkout the [documentation website](https://awslabs.github.io/data-on-eks/docs/ |------|---------| | [aws](#provider\_aws) | >= 3.72 | | [aws.ecr](#provider\_aws.ecr) | >= 3.72 | -| [kubectl](#provider\_kubectl) | >= 1.14 | | [kubernetes](#provider\_kubernetes) | >= 2.10 | | [random](#provider\_random) | 3.3.2 | @@ -31,7 +30,7 @@ Checkout the [documentation website](https://awslabs.github.io/data-on-eks/docs/ | [ebs\_csi\_driver\_irsa](#module\_ebs\_csi\_driver\_irsa) | terraform-aws-modules/iam/aws//modules/iam-role-for-service-accounts-eks | ~> 5.20 | | [eks](#module\_eks) | terraform-aws-modules/eks/aws | ~> 19.15 | | [eks\_blueprints\_addons](#module\_eks\_blueprints\_addons) | aws-ia/eks-blueprints-addons/aws | ~> 1.2 | -| [eks\_data\_addons](#module\_eks\_data\_addons) | aws-ia/eks-data-addons/aws | ~> 1.0 | +| [eks\_data\_addons](#module\_eks\_data\_addons) | aws-ia/eks-data-addons/aws | ~> 1.2.9 | | [s3\_bucket](#module\_s3\_bucket) | terraform-aws-modules/s3-bucket/aws | ~> 3.0 | | [spark\_team\_a\_irsa](#module\_spark\_team\_a\_irsa) | aws-ia/eks-blueprints-addon/aws | ~> 1.0 | | [vpc](#module\_vpc) | terraform-aws-modules/vpc/aws | ~> 5.0 | @@ -48,7 +47,6 @@ Checkout the [documentation website](https://awslabs.github.io/data-on-eks/docs/ | [aws_s3_object.this](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/s3_object) | resource | | [aws_secretsmanager_secret.grafana](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/secretsmanager_secret) | resource | | 
[aws_secretsmanager_secret_version.grafana](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/secretsmanager_secret_version) | resource | -| [kubectl_manifest.karpenter_provisioner](https://registry.terraform.io/providers/gavinbunney/kubectl/latest/docs/resources/manifest) | resource | | [kubernetes_cluster_role.spark_role](https://registry.terraform.io/providers/hashicorp/kubernetes/latest/docs/resources/cluster_role) | resource | | [kubernetes_cluster_role_binding.spark_role_binding](https://registry.terraform.io/providers/hashicorp/kubernetes/latest/docs/resources/cluster_role_binding) | resource | | [kubernetes_namespace_v1.spark_team_a](https://registry.terraform.io/providers/hashicorp/kubernetes/latest/docs/resources/namespace_v1) | resource | @@ -63,7 +61,6 @@ Checkout the [documentation website](https://awslabs.github.io/data-on-eks/docs/ | [aws_partition.current](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/partition) | data source | | [aws_region.current](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/region) | data source | | [aws_secretsmanager_secret_version.admin_password_version](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/secretsmanager_secret_version) | data source | -| [kubectl_path_documents.karpenter_provisioners](https://registry.terraform.io/providers/gavinbunney/kubectl/latest/docs/data-sources/path_documents) | data source | ## Inputs diff --git a/analytics/terraform/spark-k8s-operator/addons.tf b/analytics/terraform/spark-k8s-operator/addons.tf index 74ed81426..067c0b33f 100644 --- a/analytics/terraform/spark-k8s-operator/addons.tf +++ b/analytics/terraform/spark-k8s-operator/addons.tf @@ -84,15 +84,17 @@ module "eks_blueprints_addons" { #--------------------------------------- enable_karpenter = true karpenter_enable_spot_termination = true - karpenter = { - repository_username = data.aws_ecrpublic_authorization_token.token.user_name - repository_password = data.aws_ecrpublic_authorization_token.token.password - } karpenter_node = { iam_role_additional_policies = { AmazonSSMManagedInstanceCore = "arn:aws:iam::aws:policy/AmazonSSMManagedInstanceCore" } } + karpenter = { + chart_version = "v0.33.1" + repository_username = data.aws_ecrpublic_authorization_token.token.user_name + repository_password = data.aws_ecrpublic_authorization_token.token.password + } + #--------------------------------------- # CloudWatch metrics for EKS #--------------------------------------- @@ -171,10 +173,368 @@ module "eks_blueprints_addons" { #--------------------------------------------------------------- module "eks_data_addons" { source = "aws-ia/eks-data-addons/aws" - version = "~> 1.0" # ensure to update this to the latest/desired version + version = "~> 1.2.9" # ensure to update this to the latest/desired version oidc_provider_arn = module.eks.oidc_provider_arn + enable_karpenter_resources = true + + karpenter_resources_helm_config = { + spark-compute-optimized = { + values = [ + <<-EOT + name: spark-compute-optimized + clusterName: ${module.eks.cluster_name} + ec2NodeClass: + karpenterRole: ${split("/", module.eks_blueprints_addons.karpenter.node_iam_role_arn)[1]} + subnetSelectorTerms: + tags: + Name: "${module.eks.cluster_name}-private*" + securityGroupSelectorTerms: + tags: + Name: ${module.eks.cluster_name}-node + userData: | + MIME-Version: 1.0 + Content-Type: multipart/mixed; boundary="BOUNDARY" + + --BOUNDARY + Content-Type: text/x-shellscript; 
charset="us-ascii" + + cat <<-EOF > /etc/profile.d/bootstrap.sh + #!/bin/sh + + + # Configure the NVMe volumes in RAID0 configuration in the bootstrap.sh call. + # https://github.com/awslabs/amazon-eks-ami/blob/master/files/bootstrap.sh#L35 + # This will create a RAID volume and mount it at /mnt/k8s-disks/0 + # then mount that volume to /var/lib/kubelet, /var/lib/containerd, and /var/log/pods + # this allows the container daemons and pods to write to the RAID0 by default without needing PersistentVolumes + export LOCAL_DISKS='raid0' + EOF + + # Source extra environment variables in bootstrap script + sed -i '/^set -o errexit/a\\nsource /etc/profile.d/bootstrap.sh' /etc/eks/bootstrap.sh + + --BOUNDARY-- + + nodePool: + labels: + - type: karpenter + - NodeGroupType: SparkComputeOptimized + - multiArch: Spark + requirements: + - key: "karpenter.sh/capacity-type" + operator: In + values: ["spot", "on-demand"] + - key: "kubernetes.io/arch" + operator: In + values: ["amd64"] + - key: "karpenter.k8s.aws/instance-category" + operator: In + values: ["c"] + - key: "karpenter.k8s.aws/instance-family" + operator: In + values: ["c5d"] + - key: "karpenter.k8s.aws/instance-cpu" + operator: In + values: ["4", "8", "16", "36"] + - key: "karpenter.k8s.aws/instance-hypervisor" + operator: In + values: ["nitro"] + - key: "karpenter.k8s.aws/instance-generation" + operator: Gt + values: ["2"] + limits: + cpu: 20 # Change this to 1000 or more for production according to your needs + disruption: + consolidationPolicy: WhenEmpty + consolidateAfter: 30s + expireAfter: 720h + weight: 100 + EOT + ] + } + spark-graviton-memory-optimized = { + values = [ + <<-EOT + name: spark-graviton-memory-optimized + clusterName: ${module.eks.cluster_name} + ec2NodeClass: + karpenterRole: ${split("/", module.eks_blueprints_addons.karpenter.node_iam_role_arn)[1]} + subnetSelectorTerms: + tags: + Name: "${module.eks.cluster_name}-private*" + securityGroupSelectorTerms: + tags: + Name: ${module.eks.cluster_name}-node + userData: | + MIME-Version: 1.0 + Content-Type: multipart/mixed; boundary="BOUNDARY" + + --BOUNDARY + Content-Type: text/x-shellscript; charset="us-ascii" + + cat <<-EOF > /etc/profile.d/bootstrap.sh + #!/bin/sh + + + # Configure the NVMe volumes in RAID0 configuration in the bootstrap.sh call. 
+ # https://github.com/awslabs/amazon-eks-ami/blob/master/files/bootstrap.sh#L35 + # This will create a RAID volume and mount it at /mnt/k8s-disks/0 + # then mount that volume to /var/lib/kubelet, /var/lib/containerd, and /var/log/pods + # this allows the container daemons and pods to write to the RAID0 by default without needing PersistentVolumes + export LOCAL_DISKS='raid0' + EOF + + # Source extra environment variables in bootstrap script + sed -i '/^set -o errexit/a\\nsource /etc/profile.d/bootstrap.sh' /etc/eks/bootstrap.sh + + --BOUNDARY-- + + + nodePool: + labels: + - type: karpenter + - NodeGroupType: SparkGravitonMemoryOptimized + - multiArch: Spark + requirements: + - key: "karpenter.sh/capacity-type" + operator: In + values: ["spot", "on-demand"] + - key: "kubernetes.io/arch" + operator: In + values: ["arm64"] + - key: "karpenter.k8s.aws/instance-category" + operator: In + values: ["r"] + - key: "karpenter.k8s.aws/instance-family" + operator: In + values: ["r6gd"] + - key: "karpenter.k8s.aws/instance-cpu" + operator: In + values: ["4", "8", "16", "32"] + - key: "karpenter.k8s.aws/instance-hypervisor" + operator: In + values: ["nitro"] + - key: "karpenter.k8s.aws/instance-generation" + operator: Gt + values: ["2"] + limits: + cpu: 1000 + disruption: + consolidationPolicy: WhenEmpty + consolidateAfter: 30s + expireAfter: 720h + weight: 50 + EOT + ] + } + spark-memory-optimized = { + values = [ + <<-EOT + name: spark-memory-optimized + clusterName: ${module.eks.cluster_name} + ec2NodeClass: + karpenterRole: ${split("/", module.eks_blueprints_addons.karpenter.node_iam_role_arn)[1]} + subnetSelectorTerms: + tags: + Name: "${module.eks.cluster_name}-private*" + securityGroupSelectorTerms: + tags: + Name: ${module.eks.cluster_name}-node + userData: | + MIME-Version: 1.0 + Content-Type: multipart/mixed; boundary="BOUNDARY" + + --BOUNDARY + Content-Type: text/x-shellscript; charset="us-ascii" + + cat <<-EOF > /etc/profile.d/bootstrap.sh + #!/bin/sh + + + # Configure the NVMe volumes in RAID0 configuration in the bootstrap.sh call. 
+ # https://github.com/awslabs/amazon-eks-ami/blob/master/files/bootstrap.sh#L35 + # This will create a RAID volume and mount it at /mnt/k8s-disks/0 + # then mount that volume to /var/lib/kubelet, /var/lib/containerd, and /var/log/pods + # this allows the container daemons and pods to write to the RAID0 by default without needing PersistentVolumes + export LOCAL_DISKS='raid0' + EOF + + # Source extra environment variables in bootstrap script + sed -i '/^set -o errexit/a\\nsource /etc/profile.d/bootstrap.sh' /etc/eks/bootstrap.sh + + --BOUNDARY-- + + nodePool: + labels: + - type: karpenter + - NodeGroupType: SparkComputeOptimized + - multiArch: Spark + requirements: + - key: "karpenter.sh/capacity-type" + operator: In + values: ["spot", "on-demand"] + - key: "kubernetes.io/arch" + operator: In + values: ["amd64"] + - key: "karpenter.k8s.aws/instance-category" + operator: In + values: ["r"] + - key: "karpenter.k8s.aws/instance-family" + operator: In + values: ["r5d"] + - key: "karpenter.k8s.aws/instance-cpu" + operator: In + values: ["4", "8", "16", "32"] + - key: "karpenter.k8s.aws/instance-hypervisor" + operator: In + values: ["nitro"] + - key: "karpenter.k8s.aws/instance-generation" + operator: Gt + values: ["2"] + limits: + cpu: 1000 + disruption: + consolidationPolicy: WhenEmpty + consolidateAfter: 30s + expireAfter: 720h + weight: 100 + EOT + ] + } + spark-vertical-ebs-scale = { + values = [ + <<-EOT + name: spark-vertical-ebs-scale + clusterName: ${module.eks.cluster_name} + ec2NodeClass: + karpenterRole: ${split("/", module.eks_blueprints_addons.karpenter.node_iam_role_arn)[1]} + subnetSelectorTerms: + tags: + Name: "${module.eks.cluster_name}-private*" + securityGroupSelectorTerms: + tags: + Name: ${module.eks.cluster_name}-node + userData: | + MIME-Version: 1.0 + Content-Type: multipart/mixed; boundary="BOUNDARY" + + --BOUNDARY + Content-Type: text/x-shellscript; charset="us-ascii" + + #!/bin/bash + echo "Running a custom user data script" + set -ex + yum install mdadm -y + + IDX=1 + DEVICES=$(lsblk -o NAME,TYPE -dsn | awk '/disk/ {print $1}') + + DISK_ARRAY=() + + for DEV in $DEVICES + do + DISK_ARRAY+=("/dev/$${DEV}") + done + + DISK_COUNT=$${#DISK_ARRAY[@]} + + if [ $${DISK_COUNT} -eq 0 ]; then + echo "No SSD disks available. Creating new EBS volume according to number of cores available in the node." + yum install -y jq awscli + TOKEN=$(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 3600") + + # Get instance info + INSTANCE_ID=$(curl -s -H "X-aws-ec2-metadata-token: $TOKEN" http://169.254.169.254/latest/meta-data/instance-id) + AVAILABILITY_ZONE=$(curl -s -H "X-aws-ec2-metadata-token: $TOKEN" http://169.254.169.254/latest/meta-data/placement/availability-zone) + REGION=$(curl -s -H "X-aws-ec2-metadata-token: $TOKEN" http://169.254.169.254/latest/meta-data/placement/availability-zone | sed 's/[a-z]$//') + + # Get the number of cores available + CORES=$(nproc --all) + + # Define volume size based on the number of cores and EBS volume size per core + VOLUME_SIZE=$(expr $CORES \* 10) # 10GB per core. 
Change as desired + + # Create a volume + VOLUME_ID=$(aws ec2 create-volume --availability-zone $AVAILABILITY_ZONE --size $VOLUME_SIZE --volume-type gp3 --region $REGION --output text --query 'VolumeId') + + # Check whether the volume is available + while [ "$(aws ec2 describe-volumes --volume-ids $VOLUME_ID --region $REGION --query "Volumes[*].State" --output text)" != "available" ]; do + echo "Waiting for volume to become available" + sleep 5 + done + + # Attach the volume to the instance + aws ec2 attach-volume --volume-id $VOLUME_ID --instance-id $INSTANCE_ID --device /dev/xvdb --region $REGION + + # Update the state to delete the volume when the node is terminated + aws ec2 modify-instance-attribute --instance-id $INSTANCE_ID --block-device-mappings "[{\"DeviceName\": \"/dev/xvdb\",\"Ebs\":{\"DeleteOnTermination\":true}}]" --region $REGION + + # Wait for the volume to be attached + while [ "$(aws ec2 describe-volumes --volume-ids $VOLUME_ID --region $REGION --query "Volumes[*].Attachments[*].State" --output text)" != "attached" ]; do + echo "Waiting for volume to be attached" + sleep 5 + done + + # Format the volume + sudo mkfs -t ext4 /dev/xvdb # Improve this to get this value dynamically + # Create a mount point + sudo mkdir /mnt/k8s-disks # Change directory as you like + # Mount the volume + sudo mount /dev/xvdb /mnt/k8s-disks + # To mount this EBS volume on every system reboot, you need to add an entry in /etc/fstab + echo "/dev/xvdb /mnt/k8s-disks ext4 defaults,nofail 0 2" | sudo tee -a /etc/fstab + + # Adding permissions to the mount + /usr/bin/chown -hR +999:+1000 /mnt/k8s-disks + else + if [ $${DISK_COUNT} -eq 1 ]; then + TARGET_DEV=$${DISK_ARRAY[0]} + mkfs.xfs $${TARGET_DEV} + else + mdadm --create --verbose /dev/md0 --level=0 --raid-devices=$${DISK_COUNT} $${DISK_ARRAY[@]} + mkfs.xfs /dev/md0 + TARGET_DEV=/dev/md0 + fi + + mkdir -p /mnt/k8s-disks + echo $${TARGET_DEV} /mnt/k8s-disks xfs defaults,noatime 1 2 >> /etc/fstab + mount -a + /usr/bin/chown -hR +999:+1000 /mnt/k8s-disks + fi + + --BOUNDARY-- + + nodePool: + labels: + - type: karpenter + - provisioner: spark-vertical-ebs-scale + requirements: + - key: "karpenter.sh/capacity-type" + operator: In + values: ["spot", "on-demand"] + - key: "karpenter.k8s.aws/instance-family" + operator: In + values: ["r4", "r4", "r5", "r5d", "r5n", "r5dn", "r5b", "m4", "m5", "m5n", "m5zn", "m5dn", "m5d", "c4", "c5", "c5n", "c5d"] + - key: "kubernetes.io/arch" + operator: In + values: ["amd64"] + - key: "karpenter.k8s.aws/instance-generation" + operator: Gt + values: ["2"] + limits: + cpu: 1000 + disruption: + consolidationPolicy: WhenEmpty + consolidateAfter: 30s + expireAfter: 720h + weight: 100 + EOT + ] + } + } + #--------------------------------------------------------------- # Spark Operator Add-on #--------------------------------------------------------------- @@ -218,24 +578,9 @@ module "eks_data_addons" { } -#--------------------------------------- -# Karpenter Provisioners -#--------------------------------------- -data "kubectl_path_documents" "karpenter_provisioners" { - pattern = "${path.module}/karpenter-provisioners/spark-*.yaml" - vars = { - azs = local.region - eks_cluster_id = module.eks.cluster_name - } -} - -resource "kubectl_manifest" "karpenter_provisioner" { - for_each = toset(data.kubectl_path_documents.karpenter_provisioners.documents) - yaml_body = each.value - - depends_on = [module.eks_blueprints_addons] -} - +#--------------------------------------------------------------- +# S3 bucket for Spark Event Logs and 
Example Data +#--------------------------------------------------------------- #tfsec:ignore:* module "s3_bucket" { source = "terraform-aws-modules/s3-bucket/aws" diff --git a/analytics/terraform/spark-k8s-operator/examples/docker/Dockerfile b/analytics/terraform/spark-k8s-operator/examples/docker/Dockerfile new file mode 100644 index 000000000..23a59cba8 --- /dev/null +++ b/analytics/terraform/spark-k8s-operator/examples/docker/Dockerfile @@ -0,0 +1,40 @@ +#-------------------------------------------------------------------------------------------- +# Dockerfile for Apache Spark 3.3.1 with S3A Support on multi-arch platforms (AMD64 & ARM64) +#-------------------------------------------------------------------------------------------- +# Step1: Create a Private or Public ECR repo from AWS Console or CLI +# e.g., aws ecr-public create-repository --repository-name spark3.3.1-hadoop3.2-aws-java-sdk-bundle-1.12.647 --region us-east-1 +#--- +# Step2: Docker Login: +# aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/ +#--- +# Step3: Build multi arch image and push it to ECR: +# docker buildx build --platform linux/amd64,linux/arm64 -t public.ecr.aws//spark3.3.1-hadoop3.2-aws-java-sdk-bundle-1.12.647:latest --push . +#-------------------------------------------------------------------------------------------- + +# Use the official Apache Spark base image +FROM apache/spark:3.3.1 + +# Define Spark and Hadoop versions +ENV SPARK_VERSION=3.3.1 +ENV HADOOP_VERSION=3.3.1 +ENV AWS_SDK_VERSION=1.12.647 + +# Install wget, Python, and PySpark +USER root +RUN apt-get update && \ + apt-get install -y wget python3 python3-pip && \ + ln -s /usr/bin/python3 /usr/bin/python && \ + pip3 install pyspark==$SPARK_VERSION + + +# Add the AWS Java SDK and Hadoop-AWS package to enable S3A support +# These versions should be compatible with the Spark and Hadoop versions used +RUN cd /opt/spark/jars && \ + wget -q "https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk-bundle/${AWS_SDK_VERSION}/aws-java-sdk-bundle-${AWS_SDK_VERSION}.jar" && \ + wget -q "https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/${HADOOP_VERSION}/hadoop-aws-${HADOOP_VERSION}.jar" + +# Switch back to the non-root user +USER 1001 + +# Set the entry point for the container +ENTRYPOINT ["/opt/entrypoint.sh"] diff --git a/analytics/terraform/spark-k8s-operator/examples/karpenter/nvme-ephemeral-storage/nvme-ephemeral-storage.yaml b/analytics/terraform/spark-k8s-operator/examples/karpenter/nvme-ephemeral-storage/nvme-ephemeral-storage.yaml index 8159a7b08..7bd34351c 100644 --- a/analytics/terraform/spark-k8s-operator/examples/karpenter/nvme-ephemeral-storage/nvme-ephemeral-storage.yaml +++ b/analytics/terraform/spark-k8s-operator/examples/karpenter/nvme-ephemeral-storage/nvme-ephemeral-storage.yaml @@ -1,6 +1,6 @@ # Pre-requisite before running this job -# 1/ Open taxi-trip-execute.sh and update and -# 2/ Replace with your S3 bucket created by this blueprint(Check Terraform outputs) +# 1/ Open taxi-trip-execute.sh and update and +# 2/ Replace with your S3 bucket created by this blueprint(Check Terraform outputs) # 3/ execute taxi-trip-execute.sh --- @@ -28,12 +28,12 @@ spec: sparkVersion: "3.2.1" pythonVersion: "3" mode: cluster - image: "public.ecr.aws/r1l5w1y9/spark-operator:3.2.1-hadoop-3.3.1-java-11-scala-2.12-python-3.8-latest" + image: public.ecr.aws/data-on-eks/spark3.3.1-hadoop3.2-aws-java-sdk-bundle-1.12.647 imagePullPolicy: IfNotPresent - mainApplicationFile: 
"s3a:///taxi-trip/scripts/pyspark-taxi-trip.py" # MainFile is the path to a bundled JAR, Python, or R file of the application + mainApplicationFile: "s3a:///taxi-trip/scripts/pyspark-taxi-trip.py" # MainFile is the path to a bundled JAR, Python, or R file of the application arguments: - - "s3a:///taxi-trip/input/" - - "s3a:///taxi-trip/output/" + - "s3a:///taxi-trip/input/" + - "s3a:///taxi-trip/output/" hadoopConf: "fs.s3a.aws.credentials.provider": "com.amazonaws.auth.WebIdentityTokenCredentialsProvider" "fs.s3a.impl": "org.apache.hadoop.fs.s3a.S3AFileSystem" @@ -55,7 +55,7 @@ spec: # Spark Event logs "spark.eventLog.enabled": "true" - "spark.eventLog.dir": "s3a:///spark-event-logs" + "spark.eventLog.dir": "s3a:///spark-event-logs" "spark.eventLog.rolling.enabled": "true" "spark.eventLog.rolling.maxFileSize": "64m" # "spark.history.fs.eventLog.rolling.maxFilesToRetain": 100 @@ -87,31 +87,24 @@ spec: memoryOverhead: "4g" serviceAccount: spark-team-a labels: - version: 3.2.1 + version: 3.3.1 # the c5d instances that Karpenter will launch will have the NVMe storage preformatted and available to the pod # we do not need to leverage a hostPath mount or volume to leverage that storage. # ephemeral-storage requests and limits can be used to manage the storage utilization nodeSelector: - NodeGroupType: "SparkComputeOptimized" - tolerations: - - key: "spark-compute-optimized" - operator: "Exists" - effect: "NoSchedule" + multiArch: Spark + executor: - cores: 1 - coreLimit: "1200m" + cores: 3 + coreLimit: "3400m" instances: 4 memory: "4g" memoryOverhead: "4g" serviceAccount: spark-team-a labels: - version: 3.2.1 + version: 3.3.1 # the c5d instances that Karpenter will launch will have the NVMe storage preformatted and available to the pod # we do not need to leverage a hostPath mount or volume to leverage that storage. 
# ephemeral-storage requests and limits can be used to manage the storage utilization nodeSelector: - NodeGroupType: "SparkComputeOptimized" - tolerations: - - key: "spark-compute-optimized" - operator: "Exists" - effect: "NoSchedule" + multiArch: Spark diff --git a/analytics/terraform/spark-k8s-operator/karpenter-provisioners/spark-compute-optimized-provisioner.yaml b/analytics/terraform/spark-k8s-operator/karpenter-provisioners/spark-compute-optimized-provisioner.yaml deleted file mode 100644 index 3532a8a9e..000000000 --- a/analytics/terraform/spark-k8s-operator/karpenter-provisioners/spark-compute-optimized-provisioner.yaml +++ /dev/null @@ -1,88 +0,0 @@ -apiVersion: karpenter.sh/v1alpha5 -kind: Provisioner -metadata: - name: spark-compute-optimized - namespace: karpenter # Same namespace as Karpenter add-on installed -spec: - kubeletConfiguration: - containerRuntime: containerd - # podsPerCore: 2 - # maxPods: 20 - requirements: - - key: "topology.kubernetes.io/zone" - operator: In - values: [${azs}a] #Update the correct region and zones - - key: "karpenter.sh/capacity-type" - operator: In - values: ["spot", "on-demand"] - - key: "node.kubernetes.io/instance-type" #If not included, all instance types are considered - operator: In - values: ["c5d.xlarge","c5d.2xlarge","c5d.4xlarge","c5d.9xlarge"] # 1 NVMe disk - - key: "kubernetes.io/arch" - operator: In - values: ["amd64"] - limits: - resources: - cpu: 2000 - providerRef: - name: spark-compute-optimized - labels: - type: karpenter - provisioner: spark-compute-optimized - NodeGroupType: SparkComputeOptimized - taints: - - key: spark-compute-optimized - value: 'true' - effect: NoSchedule - ttlSecondsAfterEmpty: 120 # optional, but never scales down if not set - ---- -apiVersion: karpenter.k8s.aws/v1alpha1 -kind: AWSNodeTemplate -metadata: - name: spark-compute-optimized - namespace: karpenter -spec: - blockDeviceMappings: - - deviceName: /dev/xvda - ebs: - volumeSize: 100Gi - volumeType: gp3 - encrypted: true - deleteOnTermination: true - metadataOptions: - httpEndpoint: enabled - httpProtocolIPv6: disabled - httpPutResponseHopLimit: 2 - httpTokens: required - subnetSelector: - Name: "${eks_cluster_id}-private*" # Name of the Subnets to spin up the nodes - securityGroupSelector: # required, when not using launchTemplate - Name: "${eks_cluster_id}-node*" # name of the SecurityGroup to be used with Nodes - # instanceProfile: "" # optional, if already set in controller args - #RAID0 config example - userData: | - MIME-Version: 1.0 - Content-Type: multipart/mixed; boundary="BOUNDARY" - - --BOUNDARY - Content-Type: text/x-shellscript; charset="us-ascii" - - cat <<-EOF > /etc/profile.d/bootstrap.sh - #!/bin/sh - - - # Configure the NVMe volumes in RAID0 configuration in the bootstrap.sh call. 
- # https://github.com/awslabs/amazon-eks-ami/blob/master/files/bootstrap.sh#L35 - # This will create a RAID volume and mount it at /mnt/k8s-disks/0 - # then mount that volume to /var/lib/kubelet, /var/lib/containerd, and /var/log/pods - # this allows the container daemons and pods to write to the RAID0 by default without needing PersistentVolumes - export LOCAL_DISKS='raid0' - EOF - - # Source extra environment variables in bootstrap script - sed -i '/^set -o errexit/a\\nsource /etc/profile.d/bootstrap.sh' /etc/eks/bootstrap.sh - - --BOUNDARY-- - tags: - InstanceType: "spark-compute-optimized" # optional, add tags for your own use diff --git a/analytics/terraform/spark-k8s-operator/karpenter-provisioners/spark-graviton-memory-optimized-provisioner.yaml b/analytics/terraform/spark-k8s-operator/karpenter-provisioners/spark-graviton-memory-optimized-provisioner.yaml deleted file mode 100644 index 0d455aa7f..000000000 --- a/analytics/terraform/spark-k8s-operator/karpenter-provisioners/spark-graviton-memory-optimized-provisioner.yaml +++ /dev/null @@ -1,92 +0,0 @@ -apiVersion: karpenter.sh/v1alpha5 -kind: Provisioner -metadata: - name: spark-graviton-memory-optimized - namespace: karpenter -spec: - kubeletConfiguration: - containerRuntime: containerd -# podsPerCore: 2 -# maxPods: 20 - requirements: - - key: "topology.kubernetes.io/zone" - operator: In - values: [${azs}b] #Update the correct region and zone - - key: "karpenter.sh/capacity-type" - operator: In - values: ["spot", "on-demand"] - - key: "node.kubernetes.io/instance-type" #If not included, all instance types are considered - operator: In - values: ["r6gd.4xlarge", "r6gd.8xlarge"] # 2 NVMe disk - - key: "kubernetes.io/arch" - operator: In - values: ["arm64"] - limits: - resources: - cpu: 1000 - providerRef: # optional, recommended to use instead of `provider` - name: spark-graviton-memory-optimized - labels: - type: karpenter - provisioner: spark-graviton-memory-optimized - NodeGroupType: SparkGravitonMemoryOptimized - taints: - - key: spark-graviton-memory-optimized - value: 'true' - effect: NoSchedule - ttlSecondsAfterEmpty: 120 # optional, but never scales down if not set - ---- -apiVersion: karpenter.k8s.aws/v1alpha1 -kind: AWSNodeTemplate -metadata: - name: spark-graviton-memory-optimized - namespace: karpenter -spec: - blockDeviceMappings: - - deviceName: /dev/xvda - ebs: - volumeSize: 200Gi - volumeType: gp3 - encrypted: true - deleteOnTermination: true - metadataOptions: - httpEndpoint: enabled - httpProtocolIPv6: disabled - httpPutResponseHopLimit: 2 - httpTokens: required - subnetSelector: - Name: "${eks_cluster_id}-private*" # Name of the Subnets to spin up the nodes - securityGroupSelector: # required, when not using launchTemplate - Name: "${eks_cluster_id}-node*" # name of the SecurityGroup to be used with Nodes - # instanceProfile: "" # optional, if already set in controller args - #RAID0 config example - userData: | - MIME-Version: 1.0 - Content-Type: multipart/mixed; boundary="BOUNDARY" - - --BOUNDARY - Content-Type: text/x-shellscript; charset="us-ascii" - - cat <<-EOF > /etc/profile.d/bootstrap.sh - #!/bin/sh - - # Configure NVMe volumes in RAID0 configuration - # https://github.com/awslabs/amazon-eks-ami/blob/056e31f8c7477e893424abce468cb32bbcd1f079/files/bootstrap.sh#L35C121-L35C126 - # Mount will be: /mnt/k8s-disks - - # Configure the NVMe volumes in RAID0 configuration in the bootstrap.sh call. 
- # https://github.com/awslabs/amazon-eks-ami/blob/master/files/bootstrap.sh#L35 - # This will create a RAID volume and mount it at /mnt/k8s-disks/0 - # then mount that volume to /var/lib/kubelet, /var/lib/containerd, and /var/log/pods - # this allows the container daemons and pods to write to the RAID0 by default without needing PersistentVolumes - export LOCAL_DISKS='raid0' - EOF - - # Source extra environment variables in bootstrap script - sed -i '/^set -o errexit/a\\nsource /etc/profile.d/bootstrap.sh' /etc/eks/bootstrap.sh - - --BOUNDARY-- - - tags: - InstanceType: "spark-graviton-memory-optimized" # optional, add tags for your own use diff --git a/analytics/terraform/spark-k8s-operator/karpenter-provisioners/spark-memory-optimized-provisioner.yaml b/analytics/terraform/spark-k8s-operator/karpenter-provisioners/spark-memory-optimized-provisioner.yaml deleted file mode 100644 index 854a3e470..000000000 --- a/analytics/terraform/spark-k8s-operator/karpenter-provisioners/spark-memory-optimized-provisioner.yaml +++ /dev/null @@ -1,102 +0,0 @@ -apiVersion: karpenter.sh/v1alpha5 -kind: Provisioner -metadata: - name: spark-memory-optimized - namespace: karpenter -spec: - kubeletConfiguration: - containerRuntime: containerd -# podsPerCore: 2 -# maxPods: 20 - requirements: - - key: "topology.kubernetes.io/zone" - operator: In - values: [${azs}b] # Update the correct region and zone - - key: "karpenter.sh/capacity-type" - operator: In - values: ["spot", "on-demand"] - - key: "karpenter.k8s.aws/instance-family" - operator: In - values: ["r5d"] - - key: "karpenter.k8s.aws/instance-size" - operator: In - values: ["2xlarge", "4xlarge", "8xlarge", "12xlarge", "16xlarge", "24xlarge"] -# - key: "node.kubernetes.io/instance-type" #If not included, all instance types are considered -# operator: In -# values: ["r5d.4xlarge","r5d.8xlarge","r5d.12xlarge"] # 2 NVMe disk - - key: "kubernetes.io/arch" - operator: In - values: ["amd64"] - limits: - resources: - cpu: 1000 - providerRef: # optional, recommended to use instead of `provider` - name: spark-memory-optimized - labels: - type: karpenter - provisioner: spark-memory-optimized - NodeGroupType: SparkMemoryOptimized - taints: - - key: spark-memory-optimized - value: 'true' - effect: NoSchedule - # If omitted, the feature is disabled, nodes will never scale down due to low utilization - ttlSecondsAfterEmpty: 120 - # Enables consolidation which attempts to reduce cluster cost by both removing un-needed nodes and down-sizing those - # that can't be removed. Mutually exclusive with the ttlSecondsAfterEmpty parameter. -# consolidation: -# enabled: true - # If omitted, the feature is disabled and nodes will never expire. If set to less time than it requires for a node - # to become ready, the node may expire before any pods successfully start. 
-# ttlSecondsUntilExpired: 2592000 # 30 Days = 60 * 60 * 24 * 30 Seconds; - ---- -apiVersion: karpenter.k8s.aws/v1alpha1 -kind: AWSNodeTemplate -metadata: - name: spark-memory-optimized - namespace: karpenter -spec: - blockDeviceMappings: - - deviceName: /dev/xvda - ebs: - volumeSize: 100Gi - volumeType: gp3 - encrypted: true - deleteOnTermination: true - metadataOptions: - httpEndpoint: enabled - httpProtocolIPv6: disabled - httpPutResponseHopLimit: 2 - httpTokens: required - subnetSelector: - Name: "${eks_cluster_id}-private*" # Name of the Subnets to spin up the nodes - securityGroupSelector: # required, when not using launchTemplate - Name: "${eks_cluster_id}-node*" # name of the SecurityGroup to be used with Nodes - # instanceProfile: "" # optional, if already set in controller args - #RAID0 config example - userData: | - MIME-Version: 1.0 - Content-Type: multipart/mixed; boundary="BOUNDARY" - - --BOUNDARY - Content-Type: text/x-shellscript; charset="us-ascii" - - cat <<-EOF > /etc/profile.d/bootstrap.sh - #!/bin/sh - - # Configure the NVMe volumes in RAID0 configuration in the bootstrap.sh call. - # https://github.com/awslabs/amazon-eks-ami/blob/master/files/bootstrap.sh#L35 - # This will create a RAID volume and mount it at /mnt/k8s-disks/0 - # then mount that volume to /var/lib/kubelet, /var/lib/containerd, and /var/log/pods - # this allows the container daemons and pods to write to the RAID0 by default without needing PersistentVolumes - export LOCAL_DISKS='raid0' - EOF - - # Source extra environment variables in bootstrap script - sed -i '/^set -o errexit/a\\nsource /etc/profile.d/bootstrap.sh' /etc/eks/bootstrap.sh - - --BOUNDARY-- - - tags: - InstanceType: "spark-memory-optimized" # optional, add tags for your own use diff --git a/analytics/terraform/spark-k8s-operator/karpenter-provisioners/spark-vertical-ebs-scaling-with-cpu-cores.yaml b/analytics/terraform/spark-k8s-operator/karpenter-provisioners/spark-vertical-ebs-scaling-with-cpu-cores.yaml deleted file mode 100644 index 71a4f13a4..000000000 --- a/analytics/terraform/spark-k8s-operator/karpenter-provisioners/spark-vertical-ebs-scaling-with-cpu-cores.yaml +++ /dev/null @@ -1,155 +0,0 @@ -apiVersion: karpenter.sh/v1alpha5 -kind: Provisioner -metadata: - name: spark-vertical-ebs-scale - namespace: karpenter -spec: - kubeletConfiguration: - containerRuntime: containerd -# podsPerCore: 2 -# maxPods: 20 - requirements: - - key: "topology.kubernetes.io/zone" - operator: In - values: [${azs}b] #Update the correct region and zone - - key: "karpenter.sh/capacity-type" - operator: In - values: ["spot", "on-demand"] - - key: "karpenter.k8s.aws/instance-family" - operator: In - values: ["r4", "r4", "r5", "r5d", "r5n", "r5dn", "r5b", "m4", "m5", "m5n", "m5zn", "m5dn", "m5d", "c4", "c5", "c5n", "c5d"] - - key: "kubernetes.io/arch" - operator: In - values: ["amd64"] - - key: "karpenter.k8s.aws/instance-cpu" - operator: Gt - values: ["1"] - providerRef: # optional, recommended to use instead of `provider` - name: spark-vertical-ebs-scale - labels: - type: karpenter - provisioner: spark-vertical-ebs-scale - taints: - - key: spark-vertical-ebs-scale - value: 'true' - effect: NoSchedule - # If omitted, the feature is disabled, nodes will never scale down due to low utilization - ttlSecondsAfterEmpty: 120 - ---- -apiVersion: karpenter.k8s.aws/v1alpha1 -kind: AWSNodeTemplate -metadata: - name: spark-vertical-ebs-scale - namespace: karpenter -spec: - blockDeviceMappings: - - deviceName: /dev/xvda # root volume - ebs: - volumeSize: 40Gi - 
volumeType: gp3 - encrypted: true - deleteOnTermination: true - metadataOptions: - httpEndpoint: enabled - httpProtocolIPv6: disabled - httpPutResponseHopLimit: 2 - httpTokens: required - subnetSelector: - Name: "${eks_cluster_id}-private*" # Name of the Subnets to spin up the nodes - securityGroupSelector: # required, when not using launchTemplate - Name: "${eks_cluster_id}-node*" # name of the SecurityGroup to be used with Nodes - - # Script for creating EBS Volume for Spark workloads proportional to number of cores available on the nodes - userData: | - MIME-Version: 1.0 - Content-Type: multipart/mixed; boundary="BOUNDARY" - - --BOUNDARY - Content-Type: text/x-shellscript; charset="us-ascii" - - #!/bin/bash - echo "Running a custom user data script" - set -ex - yum install mdadm -y - - IDX=1 - DEVICES=$(lsblk -o NAME,TYPE -dsn | awk '/disk/ {print $1}') - - DISK_ARRAY=() - - for DEV in $DEVICES - do - DISK_ARRAY+=("/dev/$${DEV}") - done - - DISK_COUNT=$${#DISK_ARRAY[@]} - - if [ $${DISK_COUNT} -eq 0 ]; then - echo "No SSD disks available. Creating new EBS volume according to number of cores available in the node." - yum install -y jq awscli - TOKEN=$(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 3600") - - # Get instance info - INSTANCE_ID=$(curl -s -H "X-aws-ec2-metadata-token: $TOKEN" http://169.254.169.254/latest/meta-data/instance-id) - AVAILABILITY_ZONE=$(curl -s -H "X-aws-ec2-metadata-token: $TOKEN" http://169.254.169.254/latest/meta-data/placement/availability-zone) - REGION=$(curl -s -H "X-aws-ec2-metadata-token: $TOKEN" http://169.254.169.254/latest/meta-data/placement/availability-zone | sed 's/[a-z]$//') - - # Get the number of cores available - CORES=$(nproc --all) - - # Define volume size based on the number of cores and EBS volume size per core - VOLUME_SIZE=$(expr $CORES \* 10) # 10GB per core. 
Change as desired - - # Create a volume - VOLUME_ID=$(aws ec2 create-volume --availability-zone $AVAILABILITY_ZONE --size $VOLUME_SIZE --volume-type gp3 --region $REGION --output text --query 'VolumeId') - - # Check whether the volume is available - while [ "$(aws ec2 describe-volumes --volume-ids $VOLUME_ID --region $REGION --query "Volumes[*].State" --output text)" != "available" ]; do - echo "Waiting for volume to become available" - sleep 5 - done - - # Attach the volume to the instance - aws ec2 attach-volume --volume-id $VOLUME_ID --instance-id $INSTANCE_ID --device /dev/xvdb --region $REGION - - # Update the state to delete the volume when the node is terminated - aws ec2 modify-instance-attribute --instance-id $INSTANCE_ID --block-device-mappings "[{\"DeviceName\": \"/dev/xvdb\",\"Ebs\":{\"DeleteOnTermination\":true}}]" --region $REGION - - # Wait for the volume to be attached - while [ "$(aws ec2 describe-volumes --volume-ids $VOLUME_ID --region $REGION --query "Volumes[*].Attachments[*].State" --output text)" != "attached" ]; do - echo "Waiting for volume to be attached" - sleep 5 - done - - # Format the volume - sudo mkfs -t ext4 /dev/xvdb # Improve this to get this value dynamically - # Create a mount point - sudo mkdir /mnt/k8s-disks # Change directory as you like - # Mount the volume - sudo mount /dev/xvdb /mnt/k8s-disks - # To mount this EBS volume on every system reboot, you need to add an entry in /etc/fstab - echo "/dev/xvdb /mnt/k8s-disks ext4 defaults,nofail 0 2" | sudo tee -a /etc/fstab - - # Adding permissions to the mount - /usr/bin/chown -hR +999:+1000 /mnt/k8s-disks - else - if [ $${DISK_COUNT} -eq 1 ]; then - TARGET_DEV=$${DISK_ARRAY[0]} - mkfs.xfs $${TARGET_DEV} - else - mdadm --create --verbose /dev/md0 --level=0 --raid-devices=$${DISK_COUNT} $${DISK_ARRAY[@]} - mkfs.xfs /dev/md0 - TARGET_DEV=/dev/md0 - fi - - mkdir -p /mnt/k8s-disks - echo $${TARGET_DEV} /mnt/k8s-disks xfs defaults,noatime 1 2 >> /etc/fstab - mount -a - /usr/bin/chown -hR +999:+1000 /mnt/k8s-disks - fi - - --BOUNDARY-- - - tags: - InstanceType: "spark-vertical-ebs-scale" # optional, add tags for your own use diff --git a/analytics/terraform/spark-k8s-operator/main.tf b/analytics/terraform/spark-k8s-operator/main.tf index 64b8b8b52..f8046a360 100755 --- a/analytics/terraform/spark-k8s-operator/main.tf +++ b/analytics/terraform/spark-k8s-operator/main.tf @@ -156,9 +156,9 @@ module "eks" { substr(cidr_block, 0, 4) == "100." ? subnet_id : null]), 0) ] - min_size = 1 + min_size = 0 max_size = 20 - desired_size = 1 + desired_size = 0 instance_types = ["r5d.xlarge"] # r5d.xlarge 4vCPU - 32GB - 1 x 150 NVMe SSD - Up to 10Gbps - Up to 4,750 Mbps EBS Bandwidth @@ -190,9 +190,9 @@ module "eks" { substr(cidr_block, 0, 4) == "100." ? 
subnet_id : null]), 0) ] - min_size = 1 + min_size = 0 max_size = 12 - desired_size = 1 + desired_size = 0 instance_types = ["r5d.12xlarge", "r6id.12xlarge", "c5ad.12xlarge", "c5d.12xlarge", "c6id.12xlarge", "m5ad.12xlarge", "m5d.12xlarge", "m6id.12xlarge"] # 48cpu - 2 x 1425 NVMe SSD diff --git a/website/docs/blueprints/amazon-emr-on-eks/emr-eks-karpenter.md b/website/docs/blueprints/amazon-emr-on-eks/emr-eks-karpenter.md index 04eaf7773..0d8ce22c8 100644 --- a/website/docs/blueprints/amazon-emr-on-eks/emr-eks-karpenter.md +++ b/website/docs/blueprints/amazon-emr-on-eks/emr-eks-karpenter.md @@ -7,15 +7,15 @@ import TabItem from '@theme/TabItem'; import CollapsibleContent from '../../../src/components/CollapsibleContent'; import CodeBlock from '@theme/CodeBlock'; -import SparkComputeOptimizedProvisioner from '!!raw-loader!../../../../analytics/terraform/spark-k8s-operator/karpenter-provisioners/spark-compute-optimized-provisioner.yaml'; -import SparkMemoryOptimizedProvisioner from '!!raw-loader!../../../../analytics/terraform/spark-k8s-operator/karpenter-provisioners/spark-memory-optimized-provisioner.yaml'; -import SparkGravitonMemoryOptimizedProvisioner from '!!raw-loader!../../../../analytics/terraform/spark-k8s-operator/karpenter-provisioners/spark-graviton-memory-optimized-provisioner.yaml'; +import SparkComputeOptimizedNodepool from '!!raw-loader!../../../../analytics/terraform/emr-eks-karpenter/karpenter-provisioners/spark-compute-optimized-provisioner.yaml'; +import SparkMemoryOptimizedNodepool from '!!raw-loader!../../../../analytics/terraform/emr-eks-karpenter/karpenter-provisioners/spark-memory-optimized-provisioner.yaml'; +import SparkGravitonMemoryOptimizedNodepool from '!!raw-loader!../../../../analytics/terraform/emr-eks-karpenter/karpenter-provisioners/spark-graviton-memory-optimized-provisioner.yaml'; # EMR on EKS with [Karpenter](https://karpenter.sh/) ## Introduction -In this [pattern](https://github.com/awslabs/data-on-eks/tree/main/analytics/terraform/emr-eks-karpenter), you will deploy an EMR on EKS cluster and use [Karpenter](https://karpenter.sh/) provisioners for scaling Spark jobs. +In this [pattern](https://github.com/awslabs/data-on-eks/tree/main/analytics/terraform/emr-eks-karpenter), you will deploy an EMR on EKS cluster and use [Karpenter](https://karpenter.sh/) Nodepools for scaling Spark jobs. **Architecture** ![emr-eks-karpenter](img/emr-eks-karpenter.png) @@ -155,24 +155,24 @@ kubectl get pods --namespace=kube-system | grep cluster-autoscaler # Output sho ## Run Sample Spark job -The pattern shows how to run spark jobs in a multi-tenant EKS cluster. The examples showcases two data teams using namespaces `emr-data-team-a` and `emr-data-team-b` mapped to their EMR virtual clusters. You can use different Karpenter provisioners for each team so that they can submit jobs that are unique to their workload. Teams can also use different storage requirements to run their Spark jobs. For example, you can use compute optimized provisioner that has `taints` and specify `tolerations` using pod templates so that you can run spark on compute optimized EC2 instances. In terms of storage, you can decide whether to use [EC2 instance-store](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/InstanceStorage.html) or [EBS](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/AmazonEBS.html) or [FSx for lustre](https://docs.aws.amazon.com/fsx/latest/LustreGuide/what-is.html) volumes for data processing. 
The default storage that is used in these examples is EC2 instance store because of performance benefit
+The pattern shows how to run Spark jobs in a multi-tenant EKS cluster. The examples showcase two data teams using namespaces `emr-data-team-a` and `emr-data-team-b` mapped to their EMR virtual clusters. You can use different Karpenter Nodepools for each team so that they can submit jobs that are unique to their workload. Teams can also use different storage requirements to run their Spark jobs. For example, you can use a compute optimized Nodepool that has `taints` and specify `tolerations` using pod templates so that you can run Spark on compute optimized EC2 instances. In terms of storage, you can decide whether to use [EC2 instance-store](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/InstanceStorage.html), [EBS](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/AmazonEBS.html), or [FSx for Lustre](https://docs.aws.amazon.com/fsx/latest/LustreGuide/what-is.html) volumes for data processing. The default storage used in these examples is the EC2 instance store because of its performance benefits.

-- `spark-compute-optimized` provisioner to run spark jobs on `c5d` instances.
-- `spark-memory-optimized` provisioner to run spark jobs on `r5d` instances.
-- `spark-graviton-memory-optimized` provisioner to run spark jobs on `r6gd` Graviton instances(`ARM64`).
+- `spark-compute-optimized` Nodepool to run Spark jobs on `c5d` instances.
+- `spark-memory-optimized` Nodepool to run Spark jobs on `r5d` instances.
+- `spark-graviton-memory-optimized` Nodepool to run Spark jobs on `r6gd` Graviton instances (`ARM64`).

-In this tutorial, you will use Karpenter provisioner that uses compute optimized instances. This template leverages the Karpenter AWSNodeTemplates.
+In this tutorial, you will use a Karpenter Nodepool that uses compute optimized instances. This template leverages the Karpenter AWSNodeTemplates.
- To view Karpenter provisioner for compute optimized instances, Click to toggle content! + To view Karpenter Nodepool for compute optimized instances, Click to toggle content! -{SparkComputeOptimizedProvisioner} +{SparkComputeOptimizedNodepool}
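Before submitting jobs, you can confirm that the scaling configuration shown above actually landed in the cluster. Depending on the Karpenter version deployed by the blueprint, it is exposed either as a legacy `Provisioner` (v1alpha5) or as a `NodePool` (v1beta1); the resource name below assumes it matches the taint key used in this example, so adjust it to your deployment.

```bash
# Check for a v1alpha5 Provisioner first, then fall back to a v1beta1 NodePool.
kubectl get provisioner spark-compute-optimized 2>/dev/null \
  || kubectl get nodepool spark-compute-optimized

# Inspect the full spec (requirements, limits, labels, taints).
kubectl describe provisioner spark-compute-optimized 2>/dev/null \
  || kubectl describe nodepool spark-compute-optimized
```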
-To run Spark Jobs that can use this provisioner, you need to submit your jobs by adding `tolerations` to your pod templates +To run Spark Jobs that can use this Nodepool, you need to submit your jobs by adding `tolerations` to your pod templates For example, @@ -184,7 +184,7 @@ spec: effect: "NoSchedule" ``` -**Execute the sample PySpark Job to trigger compute optimized Karpenter provisioner** +**Execute the sample PySpark Job to trigger compute optimized Karpenter Nodepool** The following script requires four input parameters `virtual_cluster_id`, `job_execution_role_arn`, `cloudwatch_log_group_name` & `S3_Bucket` to store PySpark scripts, Pod templates and Input data. You can get these values `terraform apply` output values or by running `terraform output`. For `S3_BUCKET`, Either create a new S3 bucket or use an existing S3 bucket. @@ -205,7 +205,7 @@ Enter the CloudWatch Log Group name: /emr-on-eks-logs/emr-eks-karpenter/emr-data Enter the S3 Bucket for storing PySpark Scripts, Pod Templates and Input data. For e.g., s3://: s3://example-bucket ``` -Karpenter may take between 1 and 2 minutes to spin up a new compute node as specified in the provisioner templates before running the Spark Jobs. +Karpenter may take between 1 and 2 minutes to spin up a new compute node as specified in the Nodepool templates before running the Spark Jobs. Nodes will be drained with once the job is completed **Verify the job execution** @@ -218,15 +218,15 @@ kubectl get pods --namespace=emr-data-team-a -w -In this tutorial, you will use Karpenter provisioner that uses memory optimized instances. This template uses the AWS Node template with Userdata. +In this tutorial, you will use Karpenter Nodepool that uses memory optimized instances. This template uses the AWS Node template with Userdata.
- To view Karpenter provisioner for memory optimized instances, Click to toggle content! + To view Karpenter Nodepool for memory optimized instances, Click to toggle content! -{SparkMemoryOptimizedProvisioner} +{SparkMemoryOptimizedNodepool}
-To run Spark Jobs that can use this provisioner, you need to submit your jobs by adding `tolerations` to your pod templates +To run Spark Jobs that can use this Nodepool, you need to submit your jobs by adding `tolerations` to your pod templates For example, @@ -237,7 +237,7 @@ spec: operator: "Exists" effect: "NoSchedule" ``` -**Execute the sample PySpark Job to trigger memory optimized Karpenter provisioner** +**Execute the sample PySpark Job to trigger memory optimized Karpenter Nodepool** The following script requires four input parameters `virtual_cluster_id`, `job_execution_role_arn`, `cloudwatch_log_group_name` & `S3_Bucket` to store PySpark scripts, Pod templates and Input data. You can get these values `terraform apply` output values or by running `terraform output`. For `S3_BUCKET`, Either create a new S3 bucket or use an existing S3 bucket. @@ -258,7 +258,7 @@ Enter the CloudWatch Log Group name: /emr-on-eks-logs/emr-eks-karpenter/emr-data Enter the S3 Bucket for storing PySpark Scripts, Pod Templates and Input data. For e.g., s3://: s3://example-bucket ``` -Karpenter may take between 1 and 2 minutes to spin up a new compute node as specified in the provisioner templates before running the Spark Jobs. +Karpenter may take between 1 and 2 minutes to spin up a new compute node as specified in the Nodepool templates before running the Spark Jobs. Nodes will be drained with once the job is completed **Verify the job execution** @@ -270,15 +270,15 @@ kubectl get pods --namespace=emr-data-team-a -w -In this tutorial, you will use Karpenter provisioner that uses Graviton memory optimized instances. This template uses the AWS Node template with Userdata. +In this tutorial, you will use Karpenter Nodepool that uses Graviton memory optimized instances. This template uses the AWS Node template with Userdata.
- To view Karpenter provisioner for Graviton memory optimized instances, Click to toggle content! + To view Karpenter Nodepool for Graviton memory optimized instances, Click to toggle content! -{SparkGravitonMemoryOptimizedProvisioner} +{SparkGravitonMemoryOptimizedNodepool}
-To run Spark Jobs that can use this provisioner, you need to submit your jobs by adding `tolerations` to your pod templates +To run Spark Jobs that can use this Nodepool, you need to submit your jobs by adding `tolerations` to your pod templates For example, @@ -289,7 +289,7 @@ spec: operator: "Exists" effect: "NoSchedule" ``` -**Execute the sample PySpark Job to trigger Graviton memory optimized Karpenter provisioner** +**Execute the sample PySpark Job to trigger Graviton memory optimized Karpenter Nodepool** The following script requires four input parameters `virtual_cluster_id`, `job_execution_role_arn`, `cloudwatch_log_group_name` & `S3_Bucket` to store PySpark scripts, Pod templates and Input data. You can get these values `terraform apply` output values or by running `terraform output`. For `S3_BUCKET`, Either create a new S3 bucket or use an existing S3 bucket. @@ -310,7 +310,7 @@ Enter the CloudWatch Log Group name: /emr-on-eks-logs/emr-eks-karpenter/emr-data Enter the S3 Bucket for storing PySpark Scripts, Pod Templates and Input data. For e.g., s3://: s3://example-bucket ``` -Karpenter may take between 1 and 2 minutes to spin up a new compute node as specified in the provisioner templates before running the Spark Jobs. +Karpenter may take between 1 and 2 minutes to spin up a new compute node as specified in the Nodepool templates before running the Spark Jobs. Nodes will be drained with once the job is completed **Verify the job execution** @@ -322,9 +322,9 @@ kubectl get pods --namespace=emr-data-team-a -w
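
If you also want to confirm that Karpenter launched Graviton (`arm64`) capacity for this job, one quick check is to list the nodes with their architecture and capacity-type labels as extra columns. This is a sketch that assumes the standard `kubernetes.io/arch` node label and Karpenter's `karpenter.sh/capacity-type` label, which the Nodepool requirements in this pattern are built on.

```bash
# Show each node's CPU architecture (amd64/arm64) and capacity type (spot/on-demand)
kubectl get nodes -L kubernetes.io/arch,karpenter.sh/capacity-type
```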
-### Execute the sample PySpark job that uses EBS volumes and compute optimized Karpenter provisioner
+### Execute the sample PySpark job that uses EBS volumes and compute optimized Karpenter Nodepool

-This pattern uses EBS volumes for data processing and compute optimized provisioner. You can modify the provisioner by changing nodeselector in driver and executor pod templates. In order to change provisioners, simply update your pod templates to desired provisioner
+This pattern uses EBS volumes for data processing and the compute optimized Nodepool. You can change the Nodepool by updating the `nodeSelector` in the driver and executor pod templates. To switch Nodepools, simply point your pod templates at the desired Nodepool:
 ```yaml
   nodeSelector:
     NodeGroupType: "SparkComputeOptimized"
@@ -362,7 +362,7 @@ In this example, you will learn how to deploy, configure and use FSx for Lustre



-**Execute Spark Job by using `FSx for Lustre` with statically provisioned volume and compute optimized Karpenter provisioner.**
+**Execute Spark Job by using `FSx for Lustre` with statically provisioned volume and compute optimized Karpenter Nodepool.**

 Fsx for Lustre Terraform module is disabled by default. Follow the [customizing add-ons](#customizing-add-ons) steps before running Spark jobs.

@@ -381,7 +381,7 @@ cd analytics/terraform/emr-eks-karpenter/examples/fsx-for-lustre/fsx-static-pvc-
 ./fsx-static-spark.sh
 ```

-Karpetner may take between 1 and 2 minutes to spin up a new compute node as specified in the provisioner templates before running the Spark Jobs.
+Karpenter may take between 1 and 2 minutes to spin up a new compute node as specified in the Nodepool templates before running the Spark Jobs.
 Nodes will be drained with once the job is completed

 **Verify the job execution events**

@@ -400,7 +400,7 @@ kubectl exec -ti taxidata-exec-1 -c spark-kubernetes-executor -n emr-data-team-a



-**Execute Spark Job by using `FSx for Lustre` with dynamically provisioned volume and compute optimized Karpenter provisioner.**
+**Execute Spark Job by using `FSx for Lustre` with dynamically provisioned volume and compute optimized Karpenter Nodepool.**

 Fsx for Lustre Terraform module is disabled by default. Follow the [customizing add-ons](#customizing-add-ons) steps before running Spark jobs.

@@ -420,7 +420,7 @@ cd analytics/terraform/emr-eks-karpenter/examples/fsx-for-lustre/fsx-dynamic-pvc
 ./fsx-dynamic-spark.sh
 ```

-Karpetner may take between 1 and 2 minutes to spin up a new compute node as specified in the provisioner templates before running the Spark Jobs.
+Karpenter may take between 1 and 2 minutes to spin up a new compute node as specified in the Nodepool templates before running the Spark Jobs.
Nodes will be drained with once the job is completed **Verify the job execution events** diff --git a/website/docs/blueprints/data-analytics/spark-operator-yunikorn.md b/website/docs/blueprints/data-analytics/spark-operator-yunikorn.md index e18c5ef27..c00bd8b41 100644 --- a/website/docs/blueprints/data-analytics/spark-operator-yunikorn.md +++ b/website/docs/blueprints/data-analytics/spark-operator-yunikorn.md @@ -7,8 +7,6 @@ import TabItem from '@theme/TabItem'; import CollapsibleContent from '../../../src/components/CollapsibleContent'; import CodeBlock from '@theme/CodeBlock'; -import SparkMemoryOptimizedProvisioner from '!!raw-loader!../../../../analytics/terraform/spark-k8s-operator/karpenter-provisioners/spark-memory-optimized-provisioner.yaml'; -import SparkGravitonMemoryOptimizedProvisioner from '!!raw-loader!../../../../analytics/terraform/spark-k8s-operator/karpenter-provisioners/spark-graviton-memory-optimized-provisioner.yaml'; # Spark Operator with YuniKorn @@ -19,46 +17,175 @@ The EKS Cluster design for the Data on EKS blueprint is optimized for running Sp Spark workloads with Karpenter}> -The first option presented leverages Karpenter as the autoscaler, eliminating the need for Managed Node Groups and Cluster Autoscaler. In this design, Karpenter and its provisioner are responsible for creating both On-Demand and Spot instances, dynamically selecting instance types based on user demands. Karpenter offers improved performance compared to Cluster Autoscaler, with more efficient node scaling and faster response times. Karpenter's key features include its ability to scale from zero, optimizing resource utilization and reducing costs when there is no demand for resources. Additionally, Karpenter supports multiple provisioners, allowing for greater flexibility in defining the required infrastructure for different workload types, such as compute, memory, and GPU-intensive tasks. Furthermore, Karpenter integrates seamlessly with Kubernetes, providing automatic, real-time adjustments to the cluster size based on observed workloads and scaling events. This enables a more efficient and cost-effective EKS cluster design that adapts to the ever-changing demands of Spark applications and other workloads. +The first option presented leverages Karpenter as the autoscaler, eliminating the need for Managed Node Groups and Cluster Autoscaler. In this design, Karpenter and its Nodepools are responsible for creating both On-Demand and Spot instances, dynamically selecting instance types based on user demands. Karpenter offers improved performance compared to Cluster Autoscaler, with more efficient node scaling and faster response times. Karpenter's key features include its ability to scale from zero, optimizing resource utilization and reducing costs when there is no demand for resources. Additionally, Karpenter supports multiple Nodepools, allowing for greater flexibility in defining the required infrastructure for different workload types, such as compute, memory, and GPU-intensive tasks. Furthermore, Karpenter integrates seamlessly with Kubernetes, providing automatic, real-time adjustments to the cluster size based on observed workloads and scaling events. This enables a more efficient and cost-effective EKS cluster design that adapts to the ever-changing demands of Spark applications and other workloads. ![img.png](img/eks-spark-operator-karpenter.png) -In this tutorial, you will use Karpenter provisioner that uses memory optimized instances. 
This template uses the AWS Node template with Userdata.
+In this tutorial, you will use a Karpenter Nodepool that uses memory optimized instances. This template uses the Karpenter EC2NodeClass with custom Userdata.
- To view Karpenter provisioner for memory optimized instances, Click to toggle content! - -{SparkMemoryOptimizedProvisioner} -
- -To run Spark Jobs that can use this provisioner, you need to submit your jobs by adding `tolerations` to your pod templates - -For example, + To view Karpenter Nodepool for memory optimized instances, Click to toggle content! ```yaml -spec: - tolerations: - - key: "spark-memory-optimized" - operator: "Exists" - effect: "NoSchedule" + name: spark-memory-optimized + clusterName: ${module.eks.cluster_name} + ec2NodeClass: + karpenterRole: ${split("/", module.eks_blueprints_addons.karpenter.node_iam_role_arn)[1]} + subnetSelectorTerms: + tags: + Name: "${module.eks.cluster_name}-private*" + securityGroupSelectorTerms: + tags: + Name: ${module.eks.cluster_name}-node + userData: | + MIME-Version: 1.0 + Content-Type: multipart/mixed; boundary="BOUNDARY" + + --BOUNDARY + Content-Type: text/x-shellscript; charset="us-ascii" + + cat <<-EOF > /etc/profile.d/bootstrap.sh + #!/bin/sh + + + # Configure the NVMe volumes in RAID0 configuration in the bootstrap.sh call. + # https://github.com/awslabs/amazon-eks-ami/blob/master/files/bootstrap.sh#L35 + # This will create a RAID volume and mount it at /mnt/k8s-disks/0 + # then mount that volume to /var/lib/kubelet, /var/lib/containerd, and /var/log/pods + # this allows the container daemons and pods to write to the RAID0 by default without needing PersistentVolumes + export LOCAL_DISKS='raid0' + EOF + + # Source extra environment variables in bootstrap script + sed -i '/^set -o errexit/a\\nsource /etc/profile.d/bootstrap.sh' /etc/eks/bootstrap.sh + + --BOUNDARY-- + + nodePool: + labels: + - type: karpenter + - NodeGroupType: SparkComputeOptimized + - multiArch: Spark + requirements: + - key: "karpenter.sh/capacity-type" + operator: In + values: ["spot", "on-demand"] + - key: "kubernetes.io/arch" + operator: In + values: ["amd64"] + - key: "karpenter.k8s.aws/instance-category" + operator: In + values: ["r"] + - key: "karpenter.k8s.aws/instance-family" + operator: In + values: ["r5d"] + - key: "karpenter.k8s.aws/instance-cpu" + operator: In + values: ["4", "8", "16", "32"] + - key: "karpenter.k8s.aws/instance-hypervisor" + operator: In + values: ["nitro"] + - key: "karpenter.k8s.aws/instance-generation" + operator: Gt + values: ["2"] + limits: + cpu: 1000 + disruption: + consolidationPolicy: WhenEmpty + consolidateAfter: 30s + expireAfter: 720h + weight: 100 ``` + +
-In this yaml, you will use Karpenter provisioner that uses Graviton memory optimized instances. This template uses the AWS Node template with Userdata.
+In this YAML, you will use a Karpenter Nodepool that uses Graviton memory optimized instances. This template uses the Karpenter EC2NodeClass with custom Userdata.
- To view Karpenter provisioner for Graviton memory optimized instances, Click to toggle content! + To view Karpenter Nodepool for Graviton memory optimized instances, Click to toggle content! + +```yaml + name: spark-graviton-memory-optimized + clusterName: ${module.eks.cluster_name} + ec2NodeClass: + karpenterRole: ${split("/", module.eks_blueprints_addons.karpenter.node_iam_role_arn)[1]} + subnetSelectorTerms: + tags: + Name: "${module.eks.cluster_name}-private*" + securityGroupSelectorTerms: + tags: + Name: ${module.eks.cluster_name}-node + userData: | + MIME-Version: 1.0 + Content-Type: multipart/mixed; boundary="BOUNDARY" + + --BOUNDARY + Content-Type: text/x-shellscript; charset="us-ascii" + + cat <<-EOF > /etc/profile.d/bootstrap.sh + #!/bin/sh + + + # Configure the NVMe volumes in RAID0 configuration in the bootstrap.sh call. + # https://github.com/awslabs/amazon-eks-ami/blob/master/files/bootstrap.sh#L35 + # This will create a RAID volume and mount it at /mnt/k8s-disks/0 + # then mount that volume to /var/lib/kubelet, /var/lib/containerd, and /var/log/pods + # this allows the container daemons and pods to write to the RAID0 by default without needing PersistentVolumes + export LOCAL_DISKS='raid0' + EOF + + # Source extra environment variables in bootstrap script + sed -i '/^set -o errexit/a\\nsource /etc/profile.d/bootstrap.sh' /etc/eks/bootstrap.sh + + --BOUNDARY-- + + + nodePool: + labels: + - type: karpenter + - NodeGroupType: SparkGravitonMemoryOptimized + - multiArch: Spark + requirements: + - key: "karpenter.sh/capacity-type" + operator: In + values: ["spot", "on-demand"] + - key: "kubernetes.io/arch" + operator: In + values: ["arm64"] + - key: "karpenter.k8s.aws/instance-category" + operator: In + values: ["r"] + - key: "karpenter.k8s.aws/instance-family" + operator: In + values: ["r6gd"] + - key: "karpenter.k8s.aws/instance-cpu" + operator: In + values: ["4", "8", "16", "32"] + - key: "karpenter.k8s.aws/instance-hypervisor" + operator: In + values: ["nitro"] + - key: "karpenter.k8s.aws/instance-generation" + operator: Gt + values: ["2"] + limits: + cpu: 1000 + disruption: + consolidationPolicy: WhenEmpty + consolidateAfter: 30s + expireAfter: 720h + weight: 50 +``` -{SparkGravitonMemoryOptimizedProvisioner}
-To run Spark Jobs that can use this provisioner, you need to submit your jobs by adding `tolerations` to your pod templates +To run Spark Jobs that can use this Nodepool, you need to submit your jobs by adding `tolerations` to your pod templates For example, @@ -161,7 +288,7 @@ You should see the new nodes triggered by the karpenter and the YuniKorn will sc kubectl get pods -n spark-team-a -w ``` -You can try the following examples to leverage multiple Karpenter provisioners, EBS as Dynamic PVC instead of SSD and YuniKorn Gang Scheduling. +You can try the following examples to leverage multiple Karpenter Nodepools, EBS as Dynamic PVC instead of SSD and YuniKorn Gang Scheduling. ## NVMe Ephemeral SSD disk for Spark shuffle storage @@ -319,6 +446,168 @@ Step2: Execute Benchmark test ```
+Karpenter Nodepool weights with Graviton and Intel}> + +### Using Karpenter Nodepool weights for running Spark Jobs on both AWS Graviton and Intel EC2 Instances + +Customers often seek to leverage AWS Graviton instances for running Spark jobs due to their cost savings and performance improvements over traditional Intel instances. However, a common challenge is the availability of Graviton instances in specific regions or availability zones, especially during times of high demand. To address this, a fallback strategy to equivalent Intel instances is desirable. + +#### Solution +**Step 1: Create a Multi-Architecture Spark Docker Image** +First, ensure that your Spark job can run on both AWS Graviton (ARM architecture) and Intel (AMD architecture) instances by creating a multi-architecture Docker image. You can find a sample [Dockerfile](../../../../analytics/terraform/spark-k8s-operator/examples/docker/Dockerfile) and instructions for building and pushing this image to Amazon Elastic Container Registry (ECR) here. + +**Step 2: Deploy Two Karpenter Nodepools with weights** +Deploy two separate Karpenter Nodepools: one configured for Graviton instances and the other for Intel instances. + +Graviton Nodepool (ARM): Set the weight of the Graviton Nodepool to `100`. This prioritizes Graviton instances for your Spark workloads. + +Intel Nodepool (AMD): Set the weight of the Intel Nodepool to `50`. This ensures that Karpenter will fall back to the Intel Nodepool when Graviton instances are either unavailable or reach their maximum CPU capacity. + +```yaml + # spark-compute-optimized + name: spark-compute-optimized + clusterName: ${module.eks.cluster_name} + ec2NodeClass: + karpenterRole: ${split("/", module.eks_blueprints_addons.karpenter.node_iam_role_arn)[1]} + subnetSelectorTerms: + tags: + Name: "${module.eks.cluster_name}-private*" + securityGroupSelectorTerms: + tags: + Name: ${module.eks.cluster_name}-node + userData: | + MIME-Version: 1.0 + Content-Type: multipart/mixed; boundary="BOUNDARY" + + --BOUNDARY + Content-Type: text/x-shellscript; charset="us-ascii" + + cat <<-EOF > /etc/profile.d/bootstrap.sh + #!/bin/sh + + + # Configure the NVMe volumes in RAID0 configuration in the bootstrap.sh call. 
+ # https://github.com/awslabs/amazon-eks-ami/blob/master/files/bootstrap.sh#L35 + # This will create a RAID volume and mount it at /mnt/k8s-disks/0 + # then mount that volume to /var/lib/kubelet, /var/lib/containerd, and /var/log/pods + # this allows the container daemons and pods to write to the RAID0 by default without needing PersistentVolumes + export LOCAL_DISKS='raid0' + EOF + + # Source extra environment variables in bootstrap script + sed -i '/^set -o errexit/a\\nsource /etc/profile.d/bootstrap.sh' /etc/eks/bootstrap.sh + + --BOUNDARY-- + + nodePool: + labels: + - type: karpenter + - NodeGroupType: SparkComputeOptimized + - multiArch: Spark + requirements: + - key: "karpenter.sh/capacity-type" + operator: In + values: ["spot", "on-demand"] + - key: "kubernetes.io/arch" + operator: In + values: ["amd64"] + - key: "karpenter.k8s.aws/instance-category" + operator: In + values: ["c"] + - key: "karpenter.k8s.aws/instance-family" + operator: In + values: ["c5d"] + - key: "karpenter.k8s.aws/instance-cpu" + operator: In + values: ["4", "8", "16", "36"] + - key: "karpenter.k8s.aws/instance-hypervisor" + operator: In + values: ["nitro"] + - key: "karpenter.k8s.aws/instance-generation" + operator: Gt + values: ["2"] + limits: + cpu: 20 # Change this to 1000 or more for production according to your needs + disruption: + consolidationPolicy: WhenEmpty + consolidateAfter: 30s + expireAfter: 720h + weight: 100 + + # spark-graviton-memory-optimized Nodepool + + name: spark-graviton-memory-optimized + clusterName: ${module.eks.cluster_name} + ec2NodeClass: + karpenterRole: ${split("/", module.eks_blueprints_addons.karpenter.node_iam_role_arn)[1]} + subnetSelectorTerms: + tags: + Name: "${module.eks.cluster_name}-private*" + securityGroupSelectorTerms: + tags: + Name: ${module.eks.cluster_name}-node + userData: | + MIME-Version: 1.0 + Content-Type: multipart/mixed; boundary="BOUNDARY" + + --BOUNDARY + Content-Type: text/x-shellscript; charset="us-ascii" + + cat <<-EOF > /etc/profile.d/bootstrap.sh + #!/bin/sh + + + # Configure the NVMe volumes in RAID0 configuration in the bootstrap.sh call. 
+ # https://github.com/awslabs/amazon-eks-ami/blob/master/files/bootstrap.sh#L35 + # This will create a RAID volume and mount it at /mnt/k8s-disks/0 + # then mount that volume to /var/lib/kubelet, /var/lib/containerd, and /var/log/pods + # this allows the container daemons and pods to write to the RAID0 by default without needing PersistentVolumes + export LOCAL_DISKS='raid0' + EOF + + # Source extra environment variables in bootstrap script + sed -i '/^set -o errexit/a\\nsource /etc/profile.d/bootstrap.sh' /etc/eks/bootstrap.sh + + --BOUNDARY-- + nodePool: + labels: + - type: karpenter + - NodeGroupType: SparkGravitonMemoryOptimized + - multiArch: Spark + requirements: + - key: "karpenter.sh/capacity-type" + operator: In + values: ["spot", "on-demand"] + - key: "kubernetes.io/arch" + operator: In + values: ["arm64"] + - key: "karpenter.k8s.aws/instance-category" + operator: In + values: ["r"] + - key: "karpenter.k8s.aws/instance-family" + operator: In + values: ["r6gd"] + - key: "karpenter.k8s.aws/instance-cpu" + operator: In + values: ["4", "8", "16", "32"] + - key: "karpenter.k8s.aws/instance-hypervisor" + operator: In + values: ["nitro"] + - key: "karpenter.k8s.aws/instance-generation" + operator: Gt + values: ["2"] + limits: + cpu: 1000 + disruption: + consolidationPolicy: WhenEmpty + consolidateAfter: 30s + expireAfter: 720h + weight: 50 +``` + + + + Cleanup}> This script will cleanup the environment using `-target` option to ensure all the resources are deleted in correct order. diff --git a/website/docs/blueprints/job-schedulers/self-managed-airflow.md b/website/docs/blueprints/job-schedulers/self-managed-airflow.md index b81f18ebd..cebd3b359 100644 --- a/website/docs/blueprints/job-schedulers/self-managed-airflow.md +++ b/website/docs/blueprints/job-schedulers/self-managed-airflow.md @@ -8,7 +8,6 @@ import TabItem from '@theme/TabItem'; import CollapsibleContent from '../../../src/components/CollapsibleContent'; import CodeBlock from '@theme/CodeBlock'; -import SparkComputeOptimizedProvisioner from '!!raw-loader!../../../../analytics/terraform/spark-k8s-operator/karpenter-provisioners/spark-compute-optimized-provisioner.yaml'; # Self-managed Apache Airflow deployment on Amazon EKS @@ -185,22 +184,91 @@ Login with Admin user and password and create new users for Admin and Viewer rol ![img.png](img/airflow-k8sspark-example.png) -This option presents leverages Karpenter as the autoscaler, eliminating the need for Managed Node Groups and Cluster Autoscaler. In this design, Karpenter and its provisioner are responsible for creating both On-Demand and Spot instances, dynamically selecting instance types based on user demands. Karpenter offers improved performance compared to Cluster Autoscaler, with more efficient node scaling and faster response times. Karpenter's key features include its ability to scale from zero, optimizing resource utilization and reducing costs when there is no demand for resources. Additionally, Karpenter supports multiple provisioners, allowing for greater flexibility in defining the required infrastructure for different workload types, such as compute, memory, and GPU-intensive tasks. Furthermore, Karpenter integrates seamlessly with Kubernetes, providing automatic, real-time adjustments to the cluster size based on observed workloads and scaling events. This enables a more efficient and cost-effective EKS cluster design that adapts to the ever-changing demands of Spark applications and other workloads. 
+This option leverages Karpenter as the autoscaler, eliminating the need for Managed Node Groups and Cluster Autoscaler. In this design, Karpenter and its Nodepools are responsible for creating both On-Demand and Spot instances, dynamically selecting instance types based on user demands. Karpenter offers improved performance compared to Cluster Autoscaler, with more efficient node scaling and faster response times. Karpenter's key features include its ability to scale from zero, optimizing resource utilization and reducing costs when there is no demand for resources. Additionally, Karpenter supports multiple Nodepools, allowing for greater flexibility in defining the required infrastructure for different workload types, such as compute, memory, and GPU-intensive tasks. Furthermore, Karpenter integrates seamlessly with Kubernetes, providing automatic, real-time adjustments to the cluster size based on observed workloads and scaling events. This enables a more efficient and cost-effective EKS cluster design that adapts to the ever-changing demands of Spark applications and other workloads.

-In this tutorial, you will use Karpenter provisioner that uses memory optimized instances. This template uses the AWS Node template with Userdata.
+In this tutorial, you will use a Karpenter Nodepool that uses compute optimized instances. This template uses the Karpenter EC2NodeClass with custom Userdata.
- To view Karpenter provisioner for memory optimized instances, Click to toggle content! + To view Karpenter Nodepool for memory optimized instances, Click to toggle content! -{SparkComputeOptimizedProvisioner} +```yaml + name: spark-compute-optimized + clusterName: ${module.eks.cluster_name} + ec2NodeClass: + karpenterRole: ${split("/", module.eks_blueprints_addons.karpenter.node_iam_role_arn)[1]} + subnetSelectorTerms: + tags: + Name: "${module.eks.cluster_name}-private*" + securityGroupSelectorTerms: + tags: + Name: ${module.eks.cluster_name}-node + userData: | + MIME-Version: 1.0 + Content-Type: multipart/mixed; boundary="BOUNDARY" + + --BOUNDARY + Content-Type: text/x-shellscript; charset="us-ascii" + + cat <<-EOF > /etc/profile.d/bootstrap.sh + #!/bin/sh + + + # Configure the NVMe volumes in RAID0 configuration in the bootstrap.sh call. + # https://github.com/awslabs/amazon-eks-ami/blob/master/files/bootstrap.sh#L35 + # This will create a RAID volume and mount it at /mnt/k8s-disks/0 + # then mount that volume to /var/lib/kubelet, /var/lib/containerd, and /var/log/pods + # this allows the container daemons and pods to write to the RAID0 by default without needing PersistentVolumes + export LOCAL_DISKS='raid0' + EOF + + # Source extra environment variables in bootstrap script + sed -i '/^set -o errexit/a\\nsource /etc/profile.d/bootstrap.sh' /etc/eks/bootstrap.sh + + --BOUNDARY-- + + nodePool: + labels: + - type: karpenter + - NodeGroupType: SparkComputeOptimized + - multiArch: Spark + requirements: + - key: "karpenter.sh/capacity-type" + operator: In + values: ["spot", "on-demand"] + - key: "kubernetes.io/arch" + operator: In + values: ["amd64"] + - key: "karpenter.k8s.aws/instance-category" + operator: In + values: ["c"] + - key: "karpenter.k8s.aws/instance-family" + operator: In + values: ["c5d"] + - key: "karpenter.k8s.aws/instance-cpu" + operator: In + values: ["4", "8", "16", "36"] + - key: "karpenter.k8s.aws/instance-hypervisor" + operator: In + values: ["nitro"] + - key: "karpenter.k8s.aws/instance-generation" + operator: Gt + values: ["2"] + limits: + cpu: 20 # Change this to 1000 or more for production according to your needs + disruption: + consolidationPolicy: WhenEmpty + consolidateAfter: 30s + expireAfter: 720h + weight: 100 +```
-To run Spark Jobs that can use this provisioner, you need to submit your jobs by adding `tolerations` to your Spark Application manifest. Additionally, to ensure Spark Driver pods run only on `On-Demand` nodes and Spark Executors run only on `Spot` nodes, add the `karpenter.sh/capacity-type` node selectors. +To run Spark Jobs that can use this Nodepool, you need to submit your jobs by adding `tolerations` to your Spark Application manifest. Additionally, to ensure Spark Driver pods run only on `On-Demand` nodes and Spark Executors run only on `Spot` nodes, add the `karpenter.sh/capacity-type` node selectors. For example, ```yaml - # Using Karpenter provisioner nodeSelectors and tolerations + # Using Karpenter Nodepool nodeSelectors and tolerations nodeSelector: NodeGroupType: "SparkComputeOptimized" karpenter.sh/capacity-type: "on-demand" @@ -346,7 +414,7 @@ spec: labels: version: 3.1.1 serviceAccount: spark-team-a - # Using Karpenter provisioner nodeSelectors and tolerations + # Using Karpenter Nodepool nodeSelectors and tolerations nodeSelector: NodeGroupType: "SparkComputeOptimized" karpenter.sh/capacity-type: "on-demand" @@ -361,7 +429,7 @@ spec: serviceAccount: spark-team-a labels: version: 3.1.1 - # Using Karpenter provisioner nodeSelectors and tolerations + # Using Karpenter Nodepool nodeSelectors and tolerations nodeSelector: NodeGroupType: "SparkComputeOptimized" karpenter.sh/capacity-type: "spot"