---
post_title: Integration with HDFS and S3
nav_title: HDFS
menu_order: 20
enterprise: 'no'
---
# HDFS
If you plan to read and write from HDFS using Spark, there are two Hadoop configuration files that should be included on Spark's classpath: `hdfs-site.xml`, which provides default behaviors for the HDFS client, and `core-site.xml`, which sets the default filesystem name. You can specify the location of these files at install time or for each job.
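For reference, once these files are on Spark's classpath the job itself does not need to spell out the namenode address. A minimal Scala sketch, in which the application name and the `/data/...` paths are illustrative assumptions:

```scala
import org.apache.spark.sql.SparkSession

object HdfsExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("hdfs-example").getOrCreate()

    // core-site.xml supplies fs.defaultFS, so bare paths resolve to the HDFS cluster.
    val lines = spark.read.textFile("/data/input.txt")

    // Write the non-empty lines back to HDFS.
    lines.filter(_.nonEmpty).write.text("/data/output")

    spark.stop()
  }
}
```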
## Spark Installation
Within the Spark service configuration, set `hdfs.config-url` to a URL that serves your `hdfs-site.xml` and `core-site.xml`. Use the following example, where `http://mydomain.com/hdfs-config/hdfs-site.xml` and `http://mydomain.com/hdfs-config/core-site.xml` are valid URLs:
```json
{
  "hdfs": {
    "config-url": "http://mydomain.com/hdfs-config"
  }
}
```
This can also be done through the UI. If you are using the default installation of HDFS from Mesosphere, this is probably `http://api.hdfs.marathon.l4lb.thisdcos.directory/v1/endpoints`.
## Adding HDFS configuration files per-job
To add the configuration files manually for a job, use `--conf spark.mesos.uris=<location_of_hdfs-site.xml>,<location_of_core-site.xml>`. This downloads the files to the sandbox of the Spark driver application, and DC/OS Spark automatically loads them into the correct location. **Note:** It is important that these files are named `hdfs-site.xml` and `core-site.xml`.
### Spark Checkpointing
To use Spark with checkpointing, make sure you follow the instructions [here](https://spark.apache.org/docs/latest/streaming-programming-guide.html#checkpointing) and use an HDFS directory as the checkpointing directory. For example:
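A minimal Scala sketch, assuming an HDFS checkpoint path such as `hdfs://hdfs/checkpoint` and an illustrative socket source:

```scala
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}

object CheckpointedApp {
  // Illustrative checkpoint directory on HDFS.
  val checkpointDirectory = "hdfs://hdfs/checkpoint"

  // Builds a fresh StreamingContext and registers the checkpoint directory.
  def createContext(): StreamingContext = {
    val conf = new SparkConf().setAppName("checkpointed-app")
    val ssc = new StreamingContext(conf, Seconds(10))
    ssc.checkpoint(checkpointDirectory)

    // Placeholder computation so the context has an output operation.
    val lines = ssc.socketTextStream("localhost", 9999)
    lines.count().print()

    ssc
  }

  def main(args: Array[String]): Unit = {
    // Recover from existing checkpoint data if present; otherwise build a new context.
    val ssc = StreamingContext.getOrCreate(checkpointDirectory, createContext _)
    ssc.start()
    ssc.awaitTermination()
  }
}
```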
That HDFS directory will be created automatically, and the Spark Streaming application will continue working from checkpointed data even in the presence of application restarts or failures.
# S3
You can read/write files to S3 using environment-based secrets to pass your AWS credentials. Your credentials must first be uploaded to the DC/OS secret store: