stackabletech · razvan · Jan 2, 2023 · Jan 2, 2023 · Jan 3, 2023 · Jan 3, 2023
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -4,11 +4,19 @@ All notable changes to this project will be documented in this file.
 
 ## [Unreleased]
 
+### Added
+
+- Create and manage history servers ([#187])
+
+[#187]: https://github.com/stackabletech/spark-k8s-operator/pull/187
+
 ### Changed
 
 - Updated stackable image versions ([#176])
 - `operator-rs` `0.22.0` → `0.27.1` ([#178])
+- `operator-rs` `0.27.1` → `0.30.2` ([#187])
 - Don't run init container as root and avoid chmod and chowning ([#183])
+- [BREAKING] Implement fix for S3 reference inconsistency as described in the issue #162 ([#187])
 
 [#176]: https://github.com/stackabletech/spark-k8s-operator/pull/176
 [#178]: https://github.com/stackabletech/spark-k8s-operator/pull/178
@@ -43,7 +51,6 @@ All notable changes to this project will be documented in this file.
 - Update RBAC properties for OpenShift compatibility ([#126]).
 
 [#112]: https://github.com/stackabletech/spark-k8s-operator/pull/112
-[#114]: https://github.com/stackabletech/spark-k8s-operator/pull/114
 [#126]: https://github.com/stackabletech/spark-k8s-operator/pull/126
 
 ## [0.4.0] - 2022-08-03

diff --git a/Cargo.lock b/Cargo.lock
diff --git a/deploy/helm/spark-k8s-operator/crds/crds.yaml b/deploy/helm/spark-k8s-operator/crds/crds.yaml
diff --git a/deploy/helm/spark-k8s-operator/templates/roles.yaml b/deploy/helm/spark-k8s-operator/templates/roles.yaml
@@ -84,6 +84,7 @@ rules:
       - spark.stackable.tech
     resources:
       - sparkapplications
+      - sparkhistoryservers
     verbs:
       - get
       - list

diff --git a/deploy/helm/spark-k8s-operator/templates/spark-clusterrole.yaml b/deploy/helm/spark-k8s-operator/templates/spark-clusterrole.yaml
@@ -52,6 +52,7 @@ rules:
       - ""
     resources:
       - configmaps
+      - persistentvolumeclaims
       - pods
       - secrets
       - serviceaccounts

diff --git a/docs/modules/ROOT/examples/example-history-app.yaml b/docs/modules/ROOT/examples/example-history-app.yaml
@@ -0,0 +1,34 @@
+---
+apiVersion: spark.stackable.tech/v1alpha1
+kind: SparkApplication
+metadata:
+  name: spark-pi-s3-1
+spec:
+  version: "1.0"
+  sparkImage: docker.stackable.tech/stackable/spark-k8s:3.3.0-stackable0.3.0
+  sparkImagePullPolicy: IfNotPresent
+  mode: cluster
+  mainClass: org.apache.spark.examples.SparkPi
+  mainApplicationFile: s3a://my-bucket/spark-examples_2.12-3.3.0.jar
+  s3connection: # <1>
+    inline:
+      host: test-minio
+      port: 9000
+      accessStyle: Path
+      credentials:
+        secretClass: s3-credentials-class # <2>
+  logFileDirectory: # <3>
+    s3:
+      prefix: eventlogs/ # <4>
+      bucket:
+        inline:
+          bucketName: spark-logs # <5>
+          connection:
+            inline:
+              host: test-minio
+              port: 9000
+              accessStyle: Path
+              credentials:
+                secretClass: history-credentials-class # <6>
+  executor:
+    instances: 1
diff --git a/docs/modules/ROOT/examples/example-history-server.yaml b/docs/modules/ROOT/examples/example-history-server.yaml
@@ -0,0 +1,29 @@
+---
+apiVersion: spark.stackable.tech/v1alpha1
+kind: SparkHistoryServer
+metadata:
+  name: spark-history
+spec:
+  image:
+    productVersion: 3.3.0
+    stackableVersion: 0.3.0
+  logFileDirectory:  # <1>
+    s3:
+      prefix: eventlogs/  # <2>
+      bucket:  # <3>
+        inline:
+          bucketName: spark-logs
+          connection:
+            inline:
+              host: test-minio
+              port: 9000
+              accessStyle: Path
+              credentials:
+                secretClass: history-credentials-class
+  sparkConf:  # <4>
+  nodes:
+    roleGroups:
+      cleaner:
+        replicas: 1  # <5>
+        config:
+          cleaner: true  # <6>
diff --git a/docs/modules/ROOT/examples/example-sparkapp-s3-private.yaml b/docs/modules/ROOT/examples/example-sparkapp-s3-private.yaml
@@ -9,16 +9,13 @@ spec:
   mode: cluster
   mainApplicationFile: s3a://my-bucket/spark-examples_2.12-3.3.0.jar # <1>
   mainClass: org.apache.spark.examples.SparkPi # <2>
-  s3bucket:  # <3>
+  s3connection: # <3>
     inline:
-      bucketName: my-bucket
-      connection:
-        inline:
-          host: test-minio
-          port: 9000
-          accessStyle: Path
-          credentials: # <4>
-            secretClass: s3-credentials-class
+      host: test-minio
+      port: 9000
+      accessStyle: Path
+      credentials: # <4>
+        secretClass: s3-credentials-class
   sparkConf: # <5>
     spark.hadoop.fs.s3a.aws.credentials.provider: "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider" # <6>
     spark.driver.extraClassPath: "/dependencies/jars/hadoop-aws-3.2.0.jar:/dependencies/jars/aws-java-sdk-bundle-1.11.375.jar"

diff --git a/docs/modules/ROOT/images/history-server-ui.png b/docs/modules/ROOT/images/history-server-ui.png
diff --git a/docs/modules/ROOT/nav.adoc b/docs/modules/ROOT/nav.adoc
@@ -2,3 +2,4 @@
 * xref:usage.adoc[]
 * xref:job_dependencies.adoc[]
 * xref:rbac.adoc[]
+* xref:history_server.adoc[]
diff --git a/docs/modules/ROOT/pages/history_server.adoc b/docs/modules/ROOT/pages/history_server.adoc
@@ -0,0 +1,66 @@
+= Spark History Server
+
+== Overview
+
+The Stackable Spark-on-Kubernetes operator runs Apache Spark workloads in a Kubernetes cluster, whereby driver- and executor-pods are created for the duration of the job and then terminated. One or more Spark History Server instances can be deployed independently of `SparkApplication` jobs and used as an endpoint for Spark logging, so that job information can be viewed once the job pods are no longer available.
+
+== Deployment
+
+The example below demonstrates how to set up the history server running in one Pod with scheduled cleanups of the event logs. The event logs are loaded from an S3 bucket named `spark-logs` and the folder `eventlogs/`. The credentials for this bucket are provided by the secret class `history-credentials-class`. For more details on how the Stackable Data Platform manages S3 resources see the xref:home:concepts:s3.adoc[S3 resources] page.
+
+
+[source,yaml]
+----
+include::example$example-history-server.yaml[]
+----
+
+<1> The location of the event logs. Must be an S3 bucket. Future implementations might add support for other shared filesystems such as HDFS.
+<2> Folder within the S3 bucket where the log files are located. This folder is required and must exist before setting up the history server.
+<3> The S3 bucket definition, here provided in-line.
+<4> Additional history server configuration properties can be provided here as a map. For possible properties see: https://spark.apache.org/docs/latest/monitoring.html#spark-history-server-configuration-options
+<5> This deployment has only one Pod. Multiple history servers, all reading the same event logs, can be started by increasing the replica count.
+<6> This history server will automatically clean up old log files by using default properties. You can change any of these by using the `sparkConf` map.
+
+NOTE: Only one role group can have scheduled cleanups enabled (`cleaner: true`) and this role group cannot have more than 1 replica.
+
+The secret with S3 credentials must contain at least the following two keys:
+
+* `accessKey` - the access key of a user with read and write access to the event log bucket.
+* `secretKey` - the secret key of a user with read and write access to the event log bucket.
+
+Any other entries of the Secret are ignored by the operator.
+
+== Application configuration
+
+
+The example below demonstrates how to configure Spark applications to write log events to an S3 bucket.
+
+[source,yaml]
+----
+include::example$example-history-app.yaml[]
+----
+
+<1> Location of the data that is being processed by the application.
+<2> Credentials used to access the data above.
+<3> Instruct the operator to configure the application with logging enabled.
+<4> Folder to store logs. This must match the prefix used by the history server.
+<5> Bucket to store logs. This must match the bucket used by the history server.
+<6> Credentials used to write event logs. These can, of course, differ from the credentials used to process data.
+
+
+
+== History Web UI
+
+To access the history server web UI, use one of the `NodePort` services created by the operator. For the example above, the operator created two services as shown:
+
+[source,bash]
+----
+$ kubectl get svc
+NAME                         TYPE        CLUSTER-IP      EXTERNAL-IP   PORT(S)             AGE
+spark-history-node           NodePort    10.96.222.233   <none>        18080:30136/TCP     52m
+spark-history-node-cleaner   NodePort    10.96.203.43    <none>        18080:32585/TCP     52m
+----
+
+By setting up port forwarding on 18080 the UI can be opened by pointing your browser to `http://localhost:18080`:
+
+image::history-server-ui.png[History Server Console]
diff --git a/docs/modules/ROOT/pages/usage.adoc b/docs/modules/ROOT/pages/usage.adoc
@@ -92,57 +92,50 @@ include::example$example-sparkapp-configmap.yaml[]
 
 You can specify S3 connection details directly inside the `SparkApplication` specification or by referring to an external `S3Bucket` custom resource.
 
-To specify S3 connection details directly as part of the `SparkApplication` resource you add an inline bucket configuration as shown below.
+To specify S3 connection details directly as part of the `SparkApplication` resource you add an inline connection configuration as shown below.
 
 [source,yaml]
 ----
-s3bucket:  # <1>
+s3connection: # <1>
   inline:
-    bucketName: my-bucket # <2>
-    connection:
-      inline:
-        host: test-minio # <3>
-        port: 9000 # <4>
-        accessStyle: Path
-        credentials:
-          secretClass: s3-credentials-class  # <5>
+    host: test-minio # <2>
+    port: 9000 # <3>
+    accessStyle: Path
+    credentials:
+      secretClass: s3-credentials-class  # <4>
 ----
-<1> Entry point for the bucket configuration.
-<2> Bucket name.
-<3> Bucket host.
-<4> Optional bucket port.
-<5> Name of the `Secret` object expected to contain the following keys: `ACCESS_KEY_ID` and `SECRET_ACCESS_KEY`
+<1> Entry point for the S3 connection configuration.
+<2> Connection host.
+<3> Optional connection port.
+<4> Name of the `Secret` object expected to contain the following keys: `ACCESS_KEY_ID` and `SECRET_ACCESS_KEY`
 
-It is also possible to configure the bucket connection details as a separate Kubernetes resource and only refer to that object from the `SparkApplication` like this:
+It is also possible to configure the connection details as a separate Kubernetes resource and only refer to that object from the `SparkApplication` like this:
 
 [source,yaml]
 ----
-s3bucket:
-  reference: my-bucket-resource # <1>
+s3connection:
+  reference: s3-connection-resource # <1>
 ----
-<1> Name of the bucket resource with connection details.
+<1> Name of the connection resource with connection details.
 
-The resource named `my-bucket-resource` is then defined as shown below:
+The resource named `s3-connection-resource` is then defined as shown below:
 
 [source,yaml]
 ----
 ---
 apiVersion: s3.stackable.tech/v1alpha1
-kind: S3Bucket
+kind: S3Connection
 metadata:
-  name: my-bucket-resource
+  name: s3-connection-resource
 spec:
-  bucketName: my-bucket-name
-  connection:
-    inline:
-      host: test-minio
-      port: 9000
-      accessStyle: Path
-      credentials:
-        secretClass: minio-credentials-class
+  host: test-minio
+  port: 9000
+  accessStyle: Path
+  credentials:
+    secretClass: minio-credentials-class
 ----
 
-This has the advantage that bucket configuration can be shared across `SparkApplication`s and reduces the cost of updating these details.
+This has the advantage that one connection configuration can be shared across `SparkApplication`s and reduces the cost of updating these details.
 
 == Resource Requests
 
@@ -228,8 +221,8 @@ Below are listed the CRD fields that can be defined by the user:
 |`spec.args`
 |Arguments passed directly to the job artifact
 
-|`spec.s3bucket`
-|S3 bucket and connection specification. See the <<S3 bucket specification>> for more details.
+|`spec.s3connection`
+|S3 connection specification. See the <<S3 bucket specification>> for more details.
 
 |`spec.sparkConf`
 |A map of key/value strings that will be passed directly to `spark-submit`

diff --git a/examples/ny-tlc-report-external-dependencies.yaml b/examples/ny-tlc-report-external-dependencies.yaml
@@ -16,14 +16,11 @@ spec:
   deps:
     requirements:
       - tabulate==0.8.9
-  s3bucket:
+  s3connection:
     inline:
-      bucketName: my-bucket
-      connection:
-        inline:
-          host: test-minio
-          port: 9000
-          accessStyle: Path
+      host: test-minio
+      port: 9000
+      accessStyle: Path
   sparkConf:
     spark.hadoop.fs.s3a.aws.credentials.provider: "org.apache.hadoop.fs.s3a.AnonymousAWSCredentialsProvider"
     spark.driver.extraClassPath: "/dependencies/jars/*"

diff --git a/examples/ny-tlc-report-image.yaml b/examples/ny-tlc-report-image.yaml
@@ -17,14 +17,11 @@ spec:
   deps:
     requirements:
       - tabulate==0.8.9
-  s3bucket:
+  s3connection:
     inline:
-      bucketName: my-bucket
-      connection:
-        inline:
-          host: test-minio
-          port: 9000
-          accessStyle: Path
+      host: test-minio
+      port: 9000
+      accessStyle: Path
   sparkConf:
     spark.hadoop.fs.s3a.aws.credentials.provider: "org.apache.hadoop.fs.s3a.AnonymousAWSCredentialsProvider"
   executor:

diff --git a/examples/ny-tlc-report.yaml b/examples/ny-tlc-report.yaml
@@ -23,14 +23,11 @@ spec:
         name: cm-job-arguments
   args:
     - "--input /arguments/job-args.txt"
-  s3bucket:
+  s3connection:
     inline:
-      bucketName: my-bucket
-      connection:
-        inline:
-          host: test-minio
-          port: 9000
-          accessStyle: Path
+      host: test-minio
+      port: 9000
+      accessStyle: Path
   sparkConf:
     spark.hadoop.fs.s3a.aws.credentials.provider: "org.apache.hadoop.fs.s3a.AnonymousAWSCredentialsProvider"
   driver:

diff --git a/rust/crd/Cargo.toml b/rust/crd/Cargo.toml
@@ -9,10 +9,10 @@ version = "0.7.0-nightly"
 publish = false
 
 [dependencies]
-stackable-operator = { git = "https://github.com/stackabletech/operator-rs.git", tag="0.27.1" }
+stackable-operator = { git = "https://github.com/stackabletech/operator-rs.git", tag="0.30.2" }
 
 semver = "1.0"
-serde = { version = "1.0", features = ["derive"] }
+serde = "1.0"
 serde_json = "1.0"
 serde_yaml = "0.8"
 snafu = "0.7"