From abbf1f609ea4476821ae3cfd36fbfd99ae304e82 Mon Sep 17 00:00:00 2001 From: George Jahad Date: Fri, 31 Jan 2025 13:36:56 -0800 Subject: [PATCH 01/13] working --- conf/spark-pi-driver.yaml | 101 ++++++++++++++++++ conf/spark-pi-executor.yaml | 96 +++++++++++++++++ ....apache.spark.deploy.SparkSubmitOperation} | 0 ...he.spark.scheduler.ExternalClusterManager} | 2 +- .../src/main/dockerfiles/spark/entrypoint.sh | 28 +++++ 5 files changed, 226 insertions(+), 1 deletion(-) create mode 100644 conf/spark-pi-driver.yaml create mode 100644 conf/spark-pi-executor.yaml rename resource-managers/armada/core/src/main/resources/META-INF/services/{io.armadaproject.spark.deploy.SparkSubmitOperation => org.apache.spark.deploy.SparkSubmitOperation} (100%) rename resource-managers/armada/core/src/main/resources/META-INF/services/{io.armadaproject.spark.scheduler.ExternalClusterManager => org.apache.spark.scheduler.ExternalClusterManager} (92%) diff --git a/conf/spark-pi-driver.yaml b/conf/spark-pi-driver.yaml new file mode 100644 index 0000000000000..2e1ca082a6430 --- /dev/null +++ b/conf/spark-pi-driver.yaml @@ -0,0 +1,101 @@ + queue: test + jobSetId: job-set-1 + jobs: + - namespace: default + k8sService: + metadata: + name: driverService + spec: + type: "ClusterIP" + clusterIP: None + ports: + - port: 7078 + - port: 7079 + podSpec: + terminationGracePeriodSeconds: 0 + restartPolicy: Never + containers: + - args: + - driver + - --conf + - "spark.driver.extraJavaOptions=-agentlib:jdwp=transport=dt_socket,server=y,suspend=n,address=0.0.0.0:5005" + - --conf + - "spark.driver.port=7078" + - --conf + - spark.master=armada://service-0:38819" + - --class + - org.apache.spark.examples.SparkPi + - local:///opt/spark/examples/jars/spark-examples.jar + - "100" + image: spark:testing10 + env: + - name: SPARK_USER + value: gbj + - name: SPARK_APPLICATION_ID + value: spark-245bb63032344654a54269df9f70f527 + - name: IP_ADDR + valueFrom: + fieldRef: + apiVersion: v1 + fieldPath: status.podIP + - name: SPARK_DRIVER_BIND_ADDRESS + valueFrom: + fieldRef: + apiVersion: v1 + fieldPath: status.podIP + - name: SPARK_CONF_DIR + value: /opt/spark/conf + imagePullPolicy: IfNotPresent + name: spark-kubernetes-driver + ports: + - containerPort: 7078 + name: driver-rpc-port + protocol: TCP + - containerPort: 7079 + name: blockmanager + protocol: TCP + - containerPort: 4040 + name: spark-ui + protocol: TCP + resources: + limits: + cpu: "1" + memory: 1408Mi + requests: + cpu: "1" + memory: 1408Mi + terminationMessagePath: /dev/termination-log + terminationMessagePolicy: File + dnsPolicy: ClusterFirst + enableServiceLinks: true + hostname: spark-pi-09ce2f94a9d2b56c-driver + name: spark-pi-09ce2f94a9d2b56c-driver + nodeName: armada-worker + restartPolicy: Never + schedulerName: default-scheduler + securityContext: {} + terminationGracePeriodSeconds: 30 + tolerations: + - effect: NoExecute + key: node.kubernetes.io/not-ready + operator: Exists + tolerationSeconds: 300 + - effect: NoExecute + key: node.kubernetes.io/unreachable + operator: Exists + tolerationSeconds: 300 + volumes: + - name: kube-api-access-k9xlp + projected: + defaultMode: 420 + sources: + - serviceAccountToken: + expirationSeconds: 3607 + path: token + - downwardAPI: + items: + - fieldRef: + apiVersion: v1 + fieldPath: metadata.namespace + path: namespace + \ No newline at end of file diff --git a/conf/spark-pi-executor.yaml b/conf/spark-pi-executor.yaml new file mode 100644 index 0000000000000..b9f01a083f96b --- /dev/null +++ b/conf/spark-pi-executor.yaml @@ -0,0 +1,96 @@ + queue: test + jobSetId: job-set-1 + jobs: + - namespace: default + podSpec: + terminationGracePeriodSeconds: 0 + restartPolicy: Never + containers: + - args: + - executor + env: + - name: SPARK_USER + value: gbj + - name: SPARK_DRIVER_URL + value: "spark://CoarseGrainedScheduler@10.244.1.47:7078" + - name: SPARK_EXECUTOR_POD_IP + valueFrom: + fieldRef: + apiVersion: v1 + fieldPath: status.podIP + - name: SPARK_EXECUTOR_POD_NAME + value: "executor_pod" + - name: SPARK_EXECUTOR_CORES + value: "1" + - name: SPARK_EXECUTOR_MEMORY + value: 1024m + - name: SPARK_APPLICATION_ID + value: spark-0b0907d65ec240618163930cc6671aaa + - name: SPARK_CONF_DIR + value: /opt/spark/conf + - name: SPARK_EXECUTOR_ID + value: "1" + - name: SPARK_RESOURCE_PROFILE_ID + value: "0" + - name: SPARK_JAVA_OPT_0 + value: -Djava.net.preferIPv6Addresses=false + - name: SPARK_JAVA_OPT_20 + value: -agentlib:jdwp=transport=dt_socket,server=y,suspend=n,address=5005 + - name: SPARK_JAVA_OPT_6 + value: --add-opens=java.base/java.io=ALL-UNNAMED + - name: SPARK_JAVA_OPT_1 + value: -XX:+IgnoreUnrecognizedVMOptions + - name: SPARK_JAVA_OPT_21 + value: -Dspark.driver.port=7078 + - name: SPARK_JAVA_OPT_10 + value: --add-opens=java.base/java.util.concurrent=ALL-UNNAMED + - name: SPARK_JAVA_OPT_9 + value: --add-opens=java.base/java.util=ALL-UNNAMED + - name: SPARK_JAVA_OPT_8 + value: --add-opens=java.base/java.nio=ALL-UNNAMED + - name: SPARK_JAVA_OPT_12 + value: --add-opens=java.base/jdk.internal.ref=ALL-UNNAMED + - name: SPARK_JAVA_OPT_15 + value: --add-opens=java.base/sun.security.action=ALL-UNNAMED + - name: SPARK_JAVA_OPT_3 + value: --add-opens=java.base/java.lang=ALL-UNNAMED + - name: SPARK_JAVA_OPT_19 + value: -Dio.netty.tryReflectionSetAccessible=true + - name: SPARK_JAVA_OPT_13 + value: --add-opens=java.base/sun.nio.ch=ALL-UNNAMED + - name: SPARK_JAVA_OPT_4 + value: --add-opens=java.base/java.lang.invoke=ALL-UNNAMED + - name: SPARK_JAVA_OPT_18 + value: -Djdk.reflect.useDirectMethodHandle=false + - name: SPARK_JAVA_OPT_7 + value: --add-opens=java.base/java.net=ALL-UNNAMED + - name: SPARK_JAVA_OPT_17 + value: --add-opens=java.security.jgss/sun.security.krb5=ALL-UNNAMED + - name: SPARK_JAVA_OPT_22 + value: -Dspark.driver.blockManager.port=7079 + - name: SPARK_JAVA_OPT_11 + value: --add-opens=java.base/java.util.concurrent.atomic=ALL-UNNAMED + - name: SPARK_JAVA_OPT_2 + value: --add-modules=jdk.incubator.vector + - name: SPARK_JAVA_OPT_16 + value: --add-opens=java.base/sun.util.calendar=ALL-UNNAMED + - name: SPARK_JAVA_OPT_14 + value: --add-opens=java.base/sun.nio.cs=ALL-UNNAMED + - name: SPARK_JAVA_OPT_5 + value: --add-opens=java.base/java.lang.reflect=ALL-UNNAMED + image: spark:testing10 + imagePullPolicy: IfNotPresent + name: spark-kubernetes-executor + ports: + - containerPort: 7079 + name: blockmanager + protocol: TCP + resources: + limits: + cpu: "1" + memory: 1408Mi + requests: + cpu: "1" + memory: 1408Mi + terminationMessagePath: /dev/termination-log + terminationMessagePolicy: File diff --git a/resource-managers/armada/core/src/main/resources/META-INF/services/io.armadaproject.spark.deploy.SparkSubmitOperation b/resource-managers/armada/core/src/main/resources/META-INF/services/org.apache.spark.deploy.SparkSubmitOperation similarity index 100% rename from resource-managers/armada/core/src/main/resources/META-INF/services/io.armadaproject.spark.deploy.SparkSubmitOperation rename to resource-managers/armada/core/src/main/resources/META-INF/services/org.apache.spark.deploy.SparkSubmitOperation diff --git a/resource-managers/armada/core/src/main/resources/META-INF/services/io.armadaproject.spark.scheduler.ExternalClusterManager b/resource-managers/armada/core/src/main/resources/META-INF/services/org.apache.spark.scheduler.ExternalClusterManager similarity index 92% rename from resource-managers/armada/core/src/main/resources/META-INF/services/io.armadaproject.spark.scheduler.ExternalClusterManager rename to resource-managers/armada/core/src/main/resources/META-INF/services/org.apache.spark.scheduler.ExternalClusterManager index 72cb48ec46478..cdf3b501b3eb9 100644 --- a/resource-managers/armada/core/src/main/resources/META-INF/services/io.armadaproject.spark.scheduler.ExternalClusterManager +++ b/resource-managers/armada/core/src/main/resources/META-INF/services/org.apache.spark.scheduler.ExternalClusterManager @@ -15,4 +15,4 @@ # limitations under the License. # -org.apache.spark.scheduler.cluster.k8s.KubernetesClusterManager +org.apache.spark.scheduler.cluster.armada.ArmadaClusterManager \ No newline at end of file diff --git a/resource-managers/kubernetes/docker/src/main/dockerfiles/spark/entrypoint.sh b/resource-managers/kubernetes/docker/src/main/dockerfiles/spark/entrypoint.sh index f9561b9aa4ed5..c3452d97ada21 100755 --- a/resource-managers/kubernetes/docker/src/main/dockerfiles/spark/entrypoint.sh +++ b/resource-managers/kubernetes/docker/src/main/dockerfiles/spark/entrypoint.sh @@ -78,6 +78,33 @@ fi # SPARK-43540: add current working directory into executor classpath SPARK_CLASSPATH="$SPARK_CLASSPATH:$PWD" +replace_regex() { + local input_string="$1" + local regex="=ARMADA_IP_ADDR" + local replacement="=$IP_ADDR" + + # Use sed to perform the replacement + echo "$input_string" | sed -E "s/${regex}/${replacement}/g" +} + +process_element() { + echo $(replace_regex "$1") +} + +# Function to process all arguments and return results in an array +process_all() { + local input_array=("$@") # Convert $@ into an array + local output_array=() # Create an empty array for results + + for element in "${input_array[@]}"; do + result=$(process_element "$element") # Call processing function + output_array+=("$result") # Store result in array + done + + echo "${output_array[@]}" # Return as space-separated string +} + + case "$1" in driver) shift 1 @@ -85,6 +112,7 @@ case "$1" in "$SPARK_HOME/bin/spark-submit" --conf "spark.driver.bindAddress=$SPARK_DRIVER_BIND_ADDRESS" --conf "spark.executorEnv.SPARK_DRIVER_POD_IP=$SPARK_DRIVER_BIND_ADDRESS" + --conf "spark.driver.host=$SPARK_DRIVER_BIND_ADDRESS" --deploy-mode client "$@" ) From 9ad6d65903542ef9774c43154d37937dee46c488 Mon Sep 17 00:00:00 2001 From: George Jahad Date: Fri, 31 Jan 2025 14:18:40 -0800 Subject: [PATCH 02/13] cleanup --- conf/spark-pi-driver.yaml | 101 ---------------------- examples/spark-pi-driver.yaml | 49 +++++++++++ {conf => examples}/spark-pi-executor.yaml | 0 3 files changed, 49 insertions(+), 101 deletions(-) delete mode 100644 conf/spark-pi-driver.yaml create mode 100644 examples/spark-pi-driver.yaml rename {conf => examples}/spark-pi-executor.yaml (100%) diff --git a/conf/spark-pi-driver.yaml b/conf/spark-pi-driver.yaml deleted file mode 100644 index 2e1ca082a6430..0000000000000 --- a/conf/spark-pi-driver.yaml +++ /dev/null @@ -1,101 +0,0 @@ - queue: test - jobSetId: job-set-1 - jobs: - - namespace: default - k8sService: - metadata: - name: driverService - spec: - type: "ClusterIP" - clusterIP: None - ports: - - port: 7078 - - port: 7079 - podSpec: - terminationGracePeriodSeconds: 0 - restartPolicy: Never - containers: - - args: - - driver - - --conf - - "spark.driver.extraJavaOptions=-agentlib:jdwp=transport=dt_socket,server=y,suspend=n,address=0.0.0.0:5005" - - --conf - - "spark.driver.port=7078" - - --conf - - spark.master=armada://service-0:38819" - - --class - - org.apache.spark.examples.SparkPi - - local:///opt/spark/examples/jars/spark-examples.jar - - "100" - image: spark:testing10 - env: - - name: SPARK_USER - value: gbj - - name: SPARK_APPLICATION_ID - value: spark-245bb63032344654a54269df9f70f527 - - name: IP_ADDR - valueFrom: - fieldRef: - apiVersion: v1 - fieldPath: status.podIP - - name: SPARK_DRIVER_BIND_ADDRESS - valueFrom: - fieldRef: - apiVersion: v1 - fieldPath: status.podIP - - name: SPARK_CONF_DIR - value: /opt/spark/conf - imagePullPolicy: IfNotPresent - name: spark-kubernetes-driver - ports: - - containerPort: 7078 - name: driver-rpc-port - protocol: TCP - - containerPort: 7079 - name: blockmanager - protocol: TCP - - containerPort: 4040 - name: spark-ui - protocol: TCP - resources: - limits: - cpu: "1" - memory: 1408Mi - requests: - cpu: "1" - memory: 1408Mi - terminationMessagePath: /dev/termination-log - terminationMessagePolicy: File - dnsPolicy: ClusterFirst - enableServiceLinks: true - hostname: spark-pi-09ce2f94a9d2b56c-driver - name: spark-pi-09ce2f94a9d2b56c-driver - nodeName: armada-worker - restartPolicy: Never - schedulerName: default-scheduler - securityContext: {} - terminationGracePeriodSeconds: 30 - tolerations: - - effect: NoExecute - key: node.kubernetes.io/not-ready - operator: Exists - tolerationSeconds: 300 - - effect: NoExecute - key: node.kubernetes.io/unreachable - operator: Exists - tolerationSeconds: 300 - volumes: - - name: kube-api-access-k9xlp - projected: - defaultMode: 420 - sources: - - serviceAccountToken: - expirationSeconds: 3607 - path: token - - downwardAPI: - items: - - fieldRef: - apiVersion: v1 - fieldPath: metadata.namespace - path: namespace - \ No newline at end of file diff --git a/examples/spark-pi-driver.yaml b/examples/spark-pi-driver.yaml new file mode 100644 index 0000000000000..ea8ef56ac581b --- /dev/null +++ b/examples/spark-pi-driver.yaml @@ -0,0 +1,49 @@ + queue: test + jobSetId: job-set-1 + jobs: + - namespace: default + priority: 0 + podSpec: + terminationGracePeriodSeconds: 0 + restartPolicy: Never + containers: + - name: spark-driver + image: spark:testing + env: + - name: SPARK_DRIVER_BIND_ADDRESS + valueFrom: + fieldRef: + apiVersion: v1 + fieldPath: status.podIP + command: + - /opt/entrypoint.sh + args: + - driver + - --verbose + - --class + - org.apache.spark.examples.SparkPi + - --conf + - "spark.driver.port=7078" + - --conf + - spark.master=armada://service-0:38819" + - --conf + - "spark.driver.extraJavaOptions=-agentlib:jdwp=transport=dt_socket,server=y,suspend=n,address=0.0.0.0:5005" + - local:///opt/spark/examples/jars/spark-examples.jar + - "100" + ports: + - containerPort: 7078 + name: driver-rpc-port + protocol: TCP + - containerPort: 7079 + name: blockmanager + protocol: TCP + - containerPort: 4040 + name: spark-ui + protocol: TCP + resources: + limits: + cpu: "1" + memory: 1408Mi + requests: + cpu: "1" + memory: 1408Mi diff --git a/conf/spark-pi-executor.yaml b/examples/spark-pi-executor.yaml similarity index 100% rename from conf/spark-pi-executor.yaml rename to examples/spark-pi-executor.yaml From 22ffae9d9f6cbe741235bb63b62f5ace018c53ec Mon Sep 17 00:00:00 2001 From: George Jahad Date: Fri, 31 Jan 2025 14:24:23 -0800 Subject: [PATCH 03/13] cleanup --- examples/spark-pi-driver.yaml | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/examples/spark-pi-driver.yaml b/examples/spark-pi-driver.yaml index ea8ef56ac581b..6ea6034e3e583 100644 --- a/examples/spark-pi-driver.yaml +++ b/examples/spark-pi-driver.yaml @@ -22,14 +22,21 @@ - --verbose - --class - org.apache.spark.examples.SparkPi + - --master + - armada://192.168.1.167:50051 - --conf - "spark.driver.port=7078" - --conf - - spark.master=armada://service-0:38819" - - --conf - "spark.driver.extraJavaOptions=-agentlib:jdwp=transport=dt_socket,server=y,suspend=n,address=0.0.0.0:5005" - local:///opt/spark/examples/jars/spark-examples.jar - "100" + resources: + limits: + memory: 1Gi + cpu: 1 + requests: + memory: 1Gi + cpu: 1 ports: - containerPort: 7078 name: driver-rpc-port @@ -40,10 +47,3 @@ - containerPort: 4040 name: spark-ui protocol: TCP - resources: - limits: - cpu: "1" - memory: 1408Mi - requests: - cpu: "1" - memory: 1408Mi From 12a9ba4df5908e6c5dbcea21441ea1f1667af070 Mon Sep 17 00:00:00 2001 From: George Jahad Date: Fri, 31 Jan 2025 14:27:27 -0800 Subject: [PATCH 04/13] cleanup --- examples/spark-driver-job.yaml | 63 +++++++++++++------------- examples/spark-executor-job.yaml | 76 ++++++++++++++++---------------- examples/spark-pi-driver.yaml | 16 +++---- 3 files changed, 78 insertions(+), 77 deletions(-) diff --git a/examples/spark-driver-job.yaml b/examples/spark-driver-job.yaml index 66a8a3b9d1c1e..ece7ff2bcbd95 100644 --- a/examples/spark-driver-job.yaml +++ b/examples/spark-driver-job.yaml @@ -1,31 +1,32 @@ - queue: test - jobSetId: job-set-1 - jobs: - - namespace: default - priority: 0 - podSpec: - terminationGracePeriodSeconds: 0 - restartPolicy: Never - containers: - - name: spark-driver - image: spark:testing - env: - - name: SPARK_DRIVER_BIND_ADDRESS - value: "0.0.0.0:1234" - command: - - /opt/entrypoint.sh - args: - - driver - - --verbose - - --class - - org.apache.spark.examples.LocalPi - - --master - - armada://192.168.1.167:50051 - - submit - resources: - limits: - memory: 1Gi - cpu: 1 - requests: - memory: 1Gi - cpu: 1 + queue: test + jobSetId: job-set-1 + jobs: + - namespace: default + priority: 0 + podSpec: + terminationGracePeriodSeconds: 0 + restartPolicy: Never + containers: + - name: spark-driver + image: spark:testing + env: + - name: SPARK_DRIVER_BIND_ADDRESS + value: "0.0.0.0:1234" + command: + - /opt/entrypoint.sh + args: + - driver + - --verbose + - --class + - org.apache.spark.examples.LocalPi + - --master + - armada://192.168.1.167:50051 + - submit + resources: + limits: + memory: 1Gi + cpu: 1 + requests: + memory: 1Gi + cpu: 1 + \ No newline at end of file diff --git a/examples/spark-executor-job.yaml b/examples/spark-executor-job.yaml index f243c3e54fb57..de14ab9cbb985 100644 --- a/examples/spark-executor-job.yaml +++ b/examples/spark-executor-job.yaml @@ -1,39 +1,39 @@ - queue: test - jobSetId: job-set-1 - jobs: - - namespace: default - priority: 0 - podSpec: - terminationGracePeriodSeconds: 0 - restartPolicy: Never - containers: - - name: spark-executor - image: spark:testing - env: - - name: SPARK_EXECUTOR_MEMORY - value: "512m" - - name: SPARK_DRIVER_URL - value: "spark://localhost:1337" - - name: SPARK_EXECUTOR_ID - value: "1" - - name: SPARK_EXECUTOR_CORES - value: "1" - - name: SPARK_APPLICATION_ID - value: "test_spark_app_id" - - name: SPARK_EXECUTOR_POD_IP - value: "localhost" - - name: SPARK_RESOURCE_PROFILE_ID - value: "1" - - name: SPARK_EXECUTOR_POD_NAME - value: "test-pod-name" - command: - - /opt/entrypoint.sh - args: - - executor - resources: - limits: - memory: 1Gi - cpu: 1 - requests: - memory: 1Gi + queue: test + jobSetId: job-set-1 + jobs: + - namespace: default + priority: 0 + podSpec: + terminationGracePeriodSeconds: 0 + restartPolicy: Never + containers: + - name: spark-executor + image: spark:testing + env: + - name: SPARK_EXECUTOR_MEMORY + value: "512m" + - name: SPARK_DRIVER_URL + value: "spark://localhost:1337" + - name: SPARK_EXECUTOR_ID + value: "1" + - name: SPARK_EXECUTOR_CORES + value: "1" + - name: SPARK_APPLICATION_ID + value: "test_spark_app_id" + - name: SPARK_EXECUTOR_POD_IP + value: "localhost" + - name: SPARK_RESOURCE_PROFILE_ID + value: "1" + - name: SPARK_EXECUTOR_POD_NAME + value: "test-pod-name" + command: + - /opt/entrypoint.sh + args: + - executor + resources: + limits: + memory: 1Gi + cpu: 1 + requests: + memory: 1Gi cpu: 1 diff --git a/examples/spark-pi-driver.yaml b/examples/spark-pi-driver.yaml index 6ea6034e3e583..7d7772b907543 100644 --- a/examples/spark-pi-driver.yaml +++ b/examples/spark-pi-driver.yaml @@ -1,12 +1,12 @@ - queue: test - jobSetId: job-set-1 - jobs: - - namespace: default + queue: test + jobSetId: job-set-1 + jobs: + - namespace: default priority: 0 - podSpec: - terminationGracePeriodSeconds: 0 - restartPolicy: Never - containers: + podSpec: + terminationGracePeriodSeconds: 0 + restartPolicy: Never + containers: - name: spark-driver image: spark:testing env: From 9624c0684f0900b672e83590dd6d092e91ce8069 Mon Sep 17 00:00:00 2001 From: George Jahad Date: Fri, 31 Jan 2025 14:53:41 -0800 Subject: [PATCH 05/13] run script --- examples/runSparkPi.sh | 13 +++++++++++++ examples/spark-pi-executor.yaml | 2 +- 2 files changed, 14 insertions(+), 1 deletion(-) create mode 100755 examples/runSparkPi.sh diff --git a/examples/runSparkPi.sh b/examples/runSparkPi.sh new file mode 100755 index 0000000000000..18fb51373ff7f --- /dev/null +++ b/examples/runSparkPi.sh @@ -0,0 +1,13 @@ +#!/bin/bash +set -e +armadactl submit examples/spark-pi-driver.yaml >& /tmp/jobid.txt +export JOB_ID=`cat /tmp/jobid.txt | awk '{print $5}'` +cat /tmp/jobid.txt +echo waiting for SparkPi driver to start: `date` +sleep 20 +export IP_ADDR=`kubectl get pod "armada-$JOB_ID-0" -o jsonpath='{.status.podIP}'` +echo driver has ip address: $IP_ADDR +envsubst < examples/spark-pi-executor.yaml > /tmp/ex.yaml +echo starting executor +armadactl submit /tmp/ex.yaml +echo SparkPi executor started \ No newline at end of file diff --git a/examples/spark-pi-executor.yaml b/examples/spark-pi-executor.yaml index b9f01a083f96b..29facf6dbe75d 100644 --- a/examples/spark-pi-executor.yaml +++ b/examples/spark-pi-executor.yaml @@ -12,7 +12,7 @@ - name: SPARK_USER value: gbj - name: SPARK_DRIVER_URL - value: "spark://CoarseGrainedScheduler@10.244.1.47:7078" + value: "spark://CoarseGrainedScheduler@${IP_ADDR}:7078" - name: SPARK_EXECUTOR_POD_IP valueFrom: fieldRef: From f3dd4046ebfdc120d7c0734870571742e8d627eb Mon Sep 17 00:00:00 2001 From: George Jahad Date: Fri, 31 Jan 2025 15:14:57 -0800 Subject: [PATCH 06/13] cleanup --- examples/runSparkPi.sh | 3 +- examples/spark-pi-executor.yaml | 74 ++++++++++++++++----------------- 2 files changed, 37 insertions(+), 40 deletions(-) diff --git a/examples/runSparkPi.sh b/examples/runSparkPi.sh index 18fb51373ff7f..06b61133d5962 100755 --- a/examples/runSparkPi.sh +++ b/examples/runSparkPi.sh @@ -1,9 +1,10 @@ #!/bin/bash set -e +echo starting SparkPi driver armadactl submit examples/spark-pi-driver.yaml >& /tmp/jobid.txt export JOB_ID=`cat /tmp/jobid.txt | awk '{print $5}'` cat /tmp/jobid.txt -echo waiting for SparkPi driver to start: `date` +echo waiting for SparkPi driver to start: sleep 20 export IP_ADDR=`kubectl get pod "armada-$JOB_ID-0" -o jsonpath='{.status.podIP}'` echo driver has ip address: $IP_ADDR diff --git a/examples/spark-pi-executor.yaml b/examples/spark-pi-executor.yaml index 29facf6dbe75d..749633f291f5f 100644 --- a/examples/spark-pi-executor.yaml +++ b/examples/spark-pi-executor.yaml @@ -1,37 +1,34 @@ - queue: test - jobSetId: job-set-1 - jobs: - - namespace: default - podSpec: - terminationGracePeriodSeconds: 0 - restartPolicy: Never - containers: - - args: - - executor - env: - - name: SPARK_USER - value: gbj + queue: test + jobSetId: job-set-1 + jobs: + - namespace: default + priority: 0 + podSpec: + terminationGracePeriodSeconds: 0 + restartPolicy: Never + containers: + - name: spark-executor + image: spark:testing + env: + - name: SPARK_EXECUTOR_MEMORY + value: "512m" - name: SPARK_DRIVER_URL value: "spark://CoarseGrainedScheduler@${IP_ADDR}:7078" + - name: SPARK_EXECUTOR_ID + value: "1" + - name: SPARK_EXECUTOR_CORES + value: "1" + - name: SPARK_APPLICATION_ID + value: "test_spark_app_id" - name: SPARK_EXECUTOR_POD_IP valueFrom: fieldRef: apiVersion: v1 fieldPath: status.podIP - - name: SPARK_EXECUTOR_POD_NAME - value: "executor_pod" - - name: SPARK_EXECUTOR_CORES - value: "1" - - name: SPARK_EXECUTOR_MEMORY - value: 1024m - - name: SPARK_APPLICATION_ID - value: spark-0b0907d65ec240618163930cc6671aaa - - name: SPARK_CONF_DIR - value: /opt/spark/conf - - name: SPARK_EXECUTOR_ID - value: "1" - - name: SPARK_RESOURCE_PROFILE_ID - value: "0" + - name: SPARK_RESOURCE_PROFILE_ID + value: "0" + - name: SPARK_EXECUTOR_POD_NAME + value: "test-pod-name" - name: SPARK_JAVA_OPT_0 value: -Djava.net.preferIPv6Addresses=false - name: SPARK_JAVA_OPT_20 @@ -78,19 +75,18 @@ value: --add-opens=java.base/sun.nio.cs=ALL-UNNAMED - name: SPARK_JAVA_OPT_5 value: --add-opens=java.base/java.lang.reflect=ALL-UNNAMED - image: spark:testing10 - imagePullPolicy: IfNotPresent - name: spark-kubernetes-executor + command: + - /opt/entrypoint.sh + args: + - executor + resources: + limits: + memory: 1Gi + cpu: 1 + requests: + memory: 1Gi + cpu: 1 ports: - containerPort: 7079 name: blockmanager protocol: TCP - resources: - limits: - cpu: "1" - memory: 1408Mi - requests: - cpu: "1" - memory: 1408Mi - terminationMessagePath: /dev/termination-log - terminationMessagePolicy: File From f8257a3b6f12a9b29f646e8eea11c75453d12d79 Mon Sep 17 00:00:00 2001 From: George Jahad Date: Fri, 31 Jan 2025 15:24:31 -0800 Subject: [PATCH 07/13] cleanup --- examples/spark-pi-executor.yaml | 50 +++------------------------------ 1 file changed, 4 insertions(+), 46 deletions(-) diff --git a/examples/spark-pi-executor.yaml b/examples/spark-pi-executor.yaml index 749633f291f5f..2e0cbe3381a53 100644 --- a/examples/spark-pi-executor.yaml +++ b/examples/spark-pi-executor.yaml @@ -30,51 +30,13 @@ - name: SPARK_EXECUTOR_POD_NAME value: "test-pod-name" - name: SPARK_JAVA_OPT_0 - value: -Djava.net.preferIPv6Addresses=false - - name: SPARK_JAVA_OPT_20 - value: -agentlib:jdwp=transport=dt_socket,server=y,suspend=n,address=5005 - - name: SPARK_JAVA_OPT_6 - value: --add-opens=java.base/java.io=ALL-UNNAMED - - name: SPARK_JAVA_OPT_1 - value: -XX:+IgnoreUnrecognizedVMOptions - - name: SPARK_JAVA_OPT_21 value: -Dspark.driver.port=7078 - - name: SPARK_JAVA_OPT_10 - value: --add-opens=java.base/java.util.concurrent=ALL-UNNAMED - - name: SPARK_JAVA_OPT_9 - value: --add-opens=java.base/java.util=ALL-UNNAMED - - name: SPARK_JAVA_OPT_8 - value: --add-opens=java.base/java.nio=ALL-UNNAMED - - name: SPARK_JAVA_OPT_12 - value: --add-opens=java.base/jdk.internal.ref=ALL-UNNAMED - - name: SPARK_JAVA_OPT_15 - value: --add-opens=java.base/sun.security.action=ALL-UNNAMED - - name: SPARK_JAVA_OPT_3 - value: --add-opens=java.base/java.lang=ALL-UNNAMED - - name: SPARK_JAVA_OPT_19 - value: -Dio.netty.tryReflectionSetAccessible=true - - name: SPARK_JAVA_OPT_13 - value: --add-opens=java.base/sun.nio.ch=ALL-UNNAMED - - name: SPARK_JAVA_OPT_4 - value: --add-opens=java.base/java.lang.invoke=ALL-UNNAMED - - name: SPARK_JAVA_OPT_18 - value: -Djdk.reflect.useDirectMethodHandle=false - - name: SPARK_JAVA_OPT_7 - value: --add-opens=java.base/java.net=ALL-UNNAMED - - name: SPARK_JAVA_OPT_17 - value: --add-opens=java.security.jgss/sun.security.krb5=ALL-UNNAMED - - name: SPARK_JAVA_OPT_22 + - name: SPARK_JAVA_OPT_1 value: -Dspark.driver.blockManager.port=7079 - - name: SPARK_JAVA_OPT_11 - value: --add-opens=java.base/java.util.concurrent.atomic=ALL-UNNAMED - name: SPARK_JAVA_OPT_2 - value: --add-modules=jdk.incubator.vector - - name: SPARK_JAVA_OPT_16 - value: --add-opens=java.base/sun.util.calendar=ALL-UNNAMED - - name: SPARK_JAVA_OPT_14 - value: --add-opens=java.base/sun.nio.cs=ALL-UNNAMED - - name: SPARK_JAVA_OPT_5 - value: --add-opens=java.base/java.lang.reflect=ALL-UNNAMED + value: -Dio.netty.tryReflectionSetAccessible=true + - name: SPARK_JAVA_OPT_3 + value: -agentlib:jdwp=transport=dt_socket,server=y,suspend=n,address=5005 command: - /opt/entrypoint.sh args: @@ -86,7 +48,3 @@ requests: memory: 1Gi cpu: 1 - ports: - - containerPort: 7079 - name: blockmanager - protocol: TCP From 36ee2ec89444f43f8384de8b643f7ffe8d7609b4 Mon Sep 17 00:00:00 2001 From: George Jahad Date: Fri, 31 Jan 2025 15:29:56 -0800 Subject: [PATCH 08/13] cleanup --- examples/spark-pi-executor.yaml | 6 ------ 1 file changed, 6 deletions(-) diff --git a/examples/spark-pi-executor.yaml b/examples/spark-pi-executor.yaml index 2e0cbe3381a53..995bd1900b725 100644 --- a/examples/spark-pi-executor.yaml +++ b/examples/spark-pi-executor.yaml @@ -30,12 +30,6 @@ - name: SPARK_EXECUTOR_POD_NAME value: "test-pod-name" - name: SPARK_JAVA_OPT_0 - value: -Dspark.driver.port=7078 - - name: SPARK_JAVA_OPT_1 - value: -Dspark.driver.blockManager.port=7079 - - name: SPARK_JAVA_OPT_2 - value: -Dio.netty.tryReflectionSetAccessible=true - - name: SPARK_JAVA_OPT_3 value: -agentlib:jdwp=transport=dt_socket,server=y,suspend=n,address=5005 command: - /opt/entrypoint.sh From 3834812c3bf93fb836a8e205a36ff9b229eaeed9 Mon Sep 17 00:00:00 2001 From: George Jahad Date: Fri, 31 Jan 2025 15:33:17 -0800 Subject: [PATCH 09/13] cleanup entrypoint.sh --- .../src/main/dockerfiles/spark/entrypoint.sh | 27 ------------------- 1 file changed, 27 deletions(-) diff --git a/resource-managers/kubernetes/docker/src/main/dockerfiles/spark/entrypoint.sh b/resource-managers/kubernetes/docker/src/main/dockerfiles/spark/entrypoint.sh index c3452d97ada21..09b7c52b5b889 100755 --- a/resource-managers/kubernetes/docker/src/main/dockerfiles/spark/entrypoint.sh +++ b/resource-managers/kubernetes/docker/src/main/dockerfiles/spark/entrypoint.sh @@ -78,33 +78,6 @@ fi # SPARK-43540: add current working directory into executor classpath SPARK_CLASSPATH="$SPARK_CLASSPATH:$PWD" -replace_regex() { - local input_string="$1" - local regex="=ARMADA_IP_ADDR" - local replacement="=$IP_ADDR" - - # Use sed to perform the replacement - echo "$input_string" | sed -E "s/${regex}/${replacement}/g" -} - -process_element() { - echo $(replace_regex "$1") -} - -# Function to process all arguments and return results in an array -process_all() { - local input_array=("$@") # Convert $@ into an array - local output_array=() # Create an empty array for results - - for element in "${input_array[@]}"; do - result=$(process_element "$element") # Call processing function - output_array+=("$result") # Store result in array - done - - echo "${output_array[@]}" # Return as space-separated string -} - - case "$1" in driver) shift 1 From 66f009b819697e9efb20cb13c893386d455021a3 Mon Sep 17 00:00:00 2001 From: George Jahad Date: Fri, 31 Jan 2025 15:48:49 -0800 Subject: [PATCH 10/13] restored old yaml files --- examples/spark-driver-job.yaml | 63 +++++++++++++------------- examples/spark-executor-job.yaml | 76 ++++++++++++++++---------------- 2 files changed, 69 insertions(+), 70 deletions(-) diff --git a/examples/spark-driver-job.yaml b/examples/spark-driver-job.yaml index ece7ff2bcbd95..66a8a3b9d1c1e 100644 --- a/examples/spark-driver-job.yaml +++ b/examples/spark-driver-job.yaml @@ -1,32 +1,31 @@ - queue: test - jobSetId: job-set-1 - jobs: - - namespace: default - priority: 0 - podSpec: - terminationGracePeriodSeconds: 0 - restartPolicy: Never - containers: - - name: spark-driver - image: spark:testing - env: - - name: SPARK_DRIVER_BIND_ADDRESS - value: "0.0.0.0:1234" - command: - - /opt/entrypoint.sh - args: - - driver - - --verbose - - --class - - org.apache.spark.examples.LocalPi - - --master - - armada://192.168.1.167:50051 - - submit - resources: - limits: - memory: 1Gi - cpu: 1 - requests: - memory: 1Gi - cpu: 1 - \ No newline at end of file + queue: test + jobSetId: job-set-1 + jobs: + - namespace: default + priority: 0 + podSpec: + terminationGracePeriodSeconds: 0 + restartPolicy: Never + containers: + - name: spark-driver + image: spark:testing + env: + - name: SPARK_DRIVER_BIND_ADDRESS + value: "0.0.0.0:1234" + command: + - /opt/entrypoint.sh + args: + - driver + - --verbose + - --class + - org.apache.spark.examples.LocalPi + - --master + - armada://192.168.1.167:50051 + - submit + resources: + limits: + memory: 1Gi + cpu: 1 + requests: + memory: 1Gi + cpu: 1 diff --git a/examples/spark-executor-job.yaml b/examples/spark-executor-job.yaml index de14ab9cbb985..f243c3e54fb57 100644 --- a/examples/spark-executor-job.yaml +++ b/examples/spark-executor-job.yaml @@ -1,39 +1,39 @@ - queue: test - jobSetId: job-set-1 - jobs: - - namespace: default - priority: 0 - podSpec: - terminationGracePeriodSeconds: 0 - restartPolicy: Never - containers: - - name: spark-executor - image: spark:testing - env: - - name: SPARK_EXECUTOR_MEMORY - value: "512m" - - name: SPARK_DRIVER_URL - value: "spark://localhost:1337" - - name: SPARK_EXECUTOR_ID - value: "1" - - name: SPARK_EXECUTOR_CORES - value: "1" - - name: SPARK_APPLICATION_ID - value: "test_spark_app_id" - - name: SPARK_EXECUTOR_POD_IP - value: "localhost" - - name: SPARK_RESOURCE_PROFILE_ID - value: "1" - - name: SPARK_EXECUTOR_POD_NAME - value: "test-pod-name" - command: - - /opt/entrypoint.sh - args: - - executor - resources: - limits: - memory: 1Gi - cpu: 1 - requests: - memory: 1Gi + queue: test + jobSetId: job-set-1 + jobs: + - namespace: default + priority: 0 + podSpec: + terminationGracePeriodSeconds: 0 + restartPolicy: Never + containers: + - name: spark-executor + image: spark:testing + env: + - name: SPARK_EXECUTOR_MEMORY + value: "512m" + - name: SPARK_DRIVER_URL + value: "spark://localhost:1337" + - name: SPARK_EXECUTOR_ID + value: "1" + - name: SPARK_EXECUTOR_CORES + value: "1" + - name: SPARK_APPLICATION_ID + value: "test_spark_app_id" + - name: SPARK_EXECUTOR_POD_IP + value: "localhost" + - name: SPARK_RESOURCE_PROFILE_ID + value: "1" + - name: SPARK_EXECUTOR_POD_NAME + value: "test-pod-name" + command: + - /opt/entrypoint.sh + args: + - executor + resources: + limits: + memory: 1Gi + cpu: 1 + requests: + memory: 1Gi cpu: 1 From 6623dd2015d8391073c9963d89f73d9339a3990b Mon Sep 17 00:00:00 2001 From: George Jahad Date: Fri, 31 Jan 2025 15:55:53 -0800 Subject: [PATCH 11/13] cleanup --- examples/runSparkPi.sh | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/examples/runSparkPi.sh b/examples/runSparkPi.sh index 06b61133d5962..0e8c19b3223ee 100755 --- a/examples/runSparkPi.sh +++ b/examples/runSparkPi.sh @@ -1,14 +1,29 @@ #!/bin/bash + +# Start up the driver, get it's ip address, then start the executor with it set -e + + +echo echo starting SparkPi driver armadactl submit examples/spark-pi-driver.yaml >& /tmp/jobid.txt export JOB_ID=`cat /tmp/jobid.txt | awk '{print $5}'` cat /tmp/jobid.txt -echo waiting for SparkPi driver to start: +echo + + +echo waiting for SparkPi driver to start sleep 20 + +echo +echo SparkPi driver ip addr: export IP_ADDR=`kubectl get pod "armada-$JOB_ID-0" -o jsonpath='{.status.podIP}'` -echo driver has ip address: $IP_ADDR +echo $IP_ADDR +echo + +echo passing drivers ip addr to executor and starting it envsubst < examples/spark-pi-executor.yaml > /tmp/ex.yaml -echo starting executor armadactl submit /tmp/ex.yaml -echo SparkPi executor started \ No newline at end of file +echo + +echo SparkPi driver/executor started \ No newline at end of file From 5f9caa040fa37153cd4d1a18b6865872eda18be7 Mon Sep 17 00:00:00 2001 From: George Jahad Date: Fri, 31 Jan 2025 16:47:45 -0800 Subject: [PATCH 12/13] cleanup --- examples/runSparkPi.sh | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/examples/runSparkPi.sh b/examples/runSparkPi.sh index 0e8c19b3223ee..4bf9feecbb4f8 100755 --- a/examples/runSparkPi.sh +++ b/examples/runSparkPi.sh @@ -3,11 +3,10 @@ # Start up the driver, get it's ip address, then start the executor with it set -e - echo echo starting SparkPi driver armadactl submit examples/spark-pi-driver.yaml >& /tmp/jobid.txt -export JOB_ID=`cat /tmp/jobid.txt | awk '{print $5}'` +JOB_ID=`cat /tmp/jobid.txt | awk '{print $5}'` cat /tmp/jobid.txt echo @@ -17,12 +16,12 @@ sleep 20 echo echo SparkPi driver ip addr: -export IP_ADDR=`kubectl get pod "armada-$JOB_ID-0" -o jsonpath='{.status.podIP}'` +IP_ADDR=`kubectl get pod "armada-$JOB_ID-0" -o jsonpath='{.status.podIP}'` echo $IP_ADDR echo echo passing drivers ip addr to executor and starting it -envsubst < examples/spark-pi-executor.yaml > /tmp/ex.yaml +IP_ADDR=$IP_ADDR envsubst < examples/spark-pi-executor.yaml > /tmp/ex.yaml armadactl submit /tmp/ex.yaml echo From c4f9c64cb99755e1f4da05d9b810a6776c78818e Mon Sep 17 00:00:00 2001 From: George Jahad Date: Fri, 31 Jan 2025 16:56:33 -0800 Subject: [PATCH 13/13] cleanup --- examples/runSparkPi.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/runSparkPi.sh b/examples/runSparkPi.sh index 4bf9feecbb4f8..bef732a0fb319 100755 --- a/examples/runSparkPi.sh +++ b/examples/runSparkPi.sh @@ -5,7 +5,7 @@ set -e echo echo starting SparkPi driver -armadactl submit examples/spark-pi-driver.yaml >& /tmp/jobid.txt +armadactl submit examples/spark-pi-driver.yaml > /tmp/jobid.txt JOB_ID=`cat /tmp/jobid.txt | awk '{print $5}'` cat /tmp/jobid.txt echo