@@ -223,4 +223,13 @@ package object config {
" bigger files.")
.longConf
.createWithDefault(4 * 1024 * 1024)

private[spark] val SECRET_REDACTION_PATTERN =
ConfigBuilder("spark.redaction.regex")
.doc("Regex to decide which Spark configuration properties and environment variables in " +
"driver and executor environments contain sensitive information. When this regex matches " +
"a property , its value is redacted from the environment UI and various logs like YARN " +
Contributor: nit: no space before comma.

Author: Fixed.

"and event logs")
Contributor: nit: missing period.

Author: Fixed.

.stringConf
.createWithDefault("(?i)secret|password")
}
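
As a quick illustration of how the default pattern behaves (the keys below are made up for the example; this is a sketch, not code from the PR):

```scala
// Illustrative only: how the default "(?i)secret|password" pattern classifies keys.
val redactionPattern = "(?i)secret|password".r

Seq(
  "spark.executorEnv.HADOOP_CREDSTORE_PASSWORD", // matches "password", case-insensitively
  "spark.my.sECreT",                             // matches "secret", case-insensitively
  "spark.app.name"                               // no match; value left as-is
).foreach { key =>
  val sensitive = redactionPattern.findFirstIn(key).isDefined
  println(s"$key -> ${if (sensitive) "redact" else "keep"}")
}
```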

@@ -153,7 +153,9 @@ private[spark] class EventLoggingListener(

override def onTaskEnd(event: SparkListenerTaskEnd): Unit = logEvent(event)

override def onEnvironmentUpdate(event: SparkListenerEnvironmentUpdate): Unit = logEvent(event)
override def onEnvironmentUpdate(event: SparkListenerEnvironmentUpdate): Unit = {
logEvent(redactEvent(event))
}

// Events that trigger a flush
override def onStageCompleted(event: SparkListenerStageCompleted): Unit = {
@@ -231,6 +233,19 @@
}
}


Contributor: nit: don't add this blank line.

Author: Fixed.

private def redactEvent(event: SparkListenerEnvironmentUpdate): SparkListenerEnvironmentUpdate = {
Contributor: Any reason why you chose this instead of redacting at the source of SparkListenerEnvironmentUpdate?

Author: Good question. I thought about this quite a bit when making this change. I should have posted that decision in the PR description; apologies for not providing that context. The way I see it, there are three places where redaction can be done:

  1. Right at the source of SparkListenerEnvironmentUpdate (here, in SparkContext.scala).
  2. In JsonProtocol.scala, when converting the event to JSON.
  3. In EventLoggingListener.scala (where it is right now), when being persisted to disk.

A user could write a custom listener that listens to environment updates and does something useful with them, and I didn't want to impose redaction on them. They could be using it to create a clone of their environment, for example, and may need the same sensitive properties. So I ruled out 1.

JsonProtocol seemed like a generic utility for converting events to JSON. While I could be selective about redacting only events of the SparkListenerEnvironmentUpdate type, I didn't want to assume that everyone translating the environment update to JSON should only see redacted configuration. That made me rule out 2.

I decided it was best to redact "closest to disk", which is why I put the redaction code where I did, in EventLoggingListener. Hope that makes sense; happy to hear your thoughts if you think otherwise.

Contributor: The downside is that with this choice you have more complicated tests, and you have to do redaction at every place where this information might be written (which at the moment is just two: the event logger and the UI).

JsonProtocol is not really a choice because it doesn't cover the UI.

> A user could write a custom listener that listened to the environment updates and did something useful with them. And, I didn't want to impose redaction on them.

That's the only argument I can buy, and said user has different ways to get that information.

Author:

> That's the only argument I can buy, and said user has different ways to get that information.

True, sparkConf.getAll is always available.

I still think it's better to put the redaction closer to the "sinks" for now. The good thing is that if we see the number of "sinks" increasing, with everyone wanting redaction, we can move redaction further upstream. For now, two sinks seem manageable, and it's hard to guess whether future sinks will want redaction or not. So, unless you strongly object, I'd like to keep the redaction closer to the "sinks".

// "Spark Properties" entry will always exist because the map is always populated with it.
val props = event
.environmentDetails
.get("Spark Properties")
.get
Contributor: Instead of using map.get(key).get, just use map(key).

Author: Fixed.

val redactedProps = Utils.redact(sparkConf, props)
val redactedEnvironmentDetails = event.environmentDetails +
("Spark Properties" -> redactedProps)
SparkListenerEnvironmentUpdate(redactedEnvironmentDetails)
}

}
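
For illustration, here is roughly what redactEvent looks like once the map(key) suggestion above is applied (a sketch against the class members shown in this diff, not necessarily the final merged code):

```scala
// Sketch only: redactEvent after the review feedback.
private def redactEvent(
    event: SparkListenerEnvironmentUpdate): SparkListenerEnvironmentUpdate = {
  // "Spark Properties" always exists because the map is always populated with it,
  // so environmentDetails(key) is safe and avoids the map.get(key).get pattern.
  val redactedProps = Utils.redact(sparkConf, event.environmentDetails("Spark Properties"))
  SparkListenerEnvironmentUpdate(
    event.environmentDetails + ("Spark Properties" -> redactedProps))
}
```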

private[spark] object EventLoggingListener extends Logging {

@@ -22,21 +22,17 @@ import javax.servlet.http.HttpServletRequest
import scala.xml.Node

import org.apache.spark.ui.{UIUtils, WebUIPage}
import org.apache.spark.util.Utils

private[ui] class EnvironmentPage(parent: EnvironmentTab) extends WebUIPage("") {
private val listener = parent.listener

private def removePass(kv: (String, String)): (String, String) = {
if (kv._1.toLowerCase.contains("password") || kv._1.toLowerCase.contains("secret")) {
(kv._1, "******")
} else kv
}

def render(request: HttpServletRequest): Seq[Node] = {
val runtimeInformationTable = UIUtils.listingTable(
propertyHeader, jvmRow, listener.jvmInformation, fixedWidth = true)
val sparkPropertiesTable = UIUtils.listingTable(
propertyHeader, propertyRow, listener.sparkProperties.map(removePass), fixedWidth = true)
val sparkPropertiesTable = UIUtils.listingTable(propertyHeader, propertyRow,
Utils.redact(parent.conf, listener.sparkProperties), fixedWidth = true)

val systemPropertiesTable = UIUtils.listingTable(
propertyHeader, propertyRow, listener.systemProperties, fixedWidth = true)
val classpathEntriesTable = UIUtils.listingTable(

@@ -23,6 +23,7 @@ import org.apache.spark.ui._

private[ui] class EnvironmentTab(parent: SparkUI) extends SparkUITab(parent, "environment") {
val listener = parent.environmentListener
val conf = parent.conf
attachPage(new EnvironmentPage(this))
}

15 changes: 14 additions & 1 deletion core/src/main/scala/org/apache/spark/util/Utils.scala
@@ -55,7 +55,7 @@ import org.slf4j.Logger
import org.apache.spark._
import org.apache.spark.deploy.SparkHadoopUtil
import org.apache.spark.internal.Logging
import org.apache.spark.internal.config.{DYN_ALLOCATION_INITIAL_EXECUTORS, DYN_ALLOCATION_MIN_EXECUTORS, EXECUTOR_INSTANCES}
import org.apache.spark.internal.config._
import org.apache.spark.network.util.JavaUtils
import org.apache.spark.serializer.{DeserializationStream, SerializationStream, SerializerInstance}
import org.apache.spark.util.logging.RollingFileAppender
@@ -2555,6 +2555,19 @@ private[spark] object Utils extends Logging {
sparkJars.map(_.split(",")).map(_.filter(_.nonEmpty)).toSeq.flatten
}
}

private[util] val REDACTION_REPLACEMENT_TEXT = "*********(redacted)"

def redact(conf: SparkConf, kvs: Seq[(String, String)]): Seq[(String, String)] = {
val redactionPattern = conf.get(SECRET_REDACTION_PATTERN).r
Contributor: This is very expensive. How about a version that takes a list of tuples and redacts them?

Author: What part do you think is expensive? Going through all the configuration properties and matching them with the regex? If so, I agree. However, that has to be done somewhere. All the callers of this function have a SparkConf that they want stuff redacted from, so if this function accepts a list of tuples, they have to run the regex check to find that list first before sending it to redact(). So, overall, unless I am missing something, I don't think we can avoid the expense.

Contributor: Compiling the regex once for every item in the list being redacted, instead of doing it once for the whole list.

Author: Ah, good point. Let me fix this.

kvs.map { kv =>
if (redactionPattern.findFirstIn(kv._1).isDefined) {
(kv._1, REDACTION_REPLACEMENT_TEXT)
}
Contributor: nit: indented too far.

else kv
Contributor: Style here is wrong. Better to use the Option API:

regex.findFirstIn(...).map(...).getOrElse(...)

Author: That's much better, thanks.

}
}

}
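
Putting both suggestions together (compile the pattern once for the whole list, use the Option API instead of if/else), the revised helper might look like this sketch:

```scala
// Sketch only: redact() after the review feedback, not necessarily the merged version.
def redact(conf: SparkConf, kvs: Seq[(String, String)]): Seq[(String, String)] = {
  // Compile the regex once per call rather than once per entry.
  val redactionPattern = conf.get(SECRET_REDACTION_PATTERN).r
  kvs.map { case (key, value) =>
    redactionPattern.findFirstIn(key)
      .map(_ => (key, REDACTION_REPLACEMENT_TEXT))
      .getOrElse((key, value))
  }
}
```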

private[util] object CallerContext extends Logging {

@@ -95,6 +95,30 @@ class EventLoggingListenerSuite extends SparkFunSuite with LocalSparkContext wit
}
}

test("Event logging with password redaction") {
val secretPassword = "secret_password"
val conf = getLoggingConf(testDirPath, None).set("spark.executorEnv.HADOOP_CREDSTORE_PASSWORD",
secretPassword)
sc = new SparkContext("local-cluster[2,2,1024]", "test", conf)
Contributor: This is a pretty expensive way of testing this. Why not just call the redaction method and make sure it's doing the right thing?

Author: Yeah, I wanted a little more than just a "unit" test. This was broader and checked for actual redaction taking place in the event logs, so I have it here.

You have a valid point though: if you think this is too expensive, the method in UtilsSuite.scala does a pretty good job of "unit testing" redact(), so I'd rather take this out completely. Thoughts?

Contributor: Both are not doing the exact same thing. There's logic in the EventLoggingListener code that also needs to be tested. But you can have a more targeted unit test instead of running a distributed Spark application.

Author: Ok, I have updated this test to be less bloated. Thanks!

assert(sc.eventLogger.isDefined)
val eventLogger = sc.eventLogger.get

sc.parallelize(1 to 10000).count()
sc.stop()

val logData = EventLoggingListener.openEventLog(new Path(eventLogger.logPath), fileSystem)
val eventLog = Source.fromInputStream(logData).mkString
// Make sure nothing secret shows up anywhere
assert(!eventLog.contains(secretPassword), s"Secret password ($secretPassword) not redacted " +
s"from event logs:\n $eventLog")
val expected = """"spark.executorEnv.HADOOP_CREDSTORE_PASSWORD":"*********(redacted)""""
Contributor (vanzin), Nov 22, 2016: This is pretty hacky. It makes assumptions about how things are formatted in the event log.

The previous assert should be enough (ignoring my previous comment about changing this test).

// Make sure every occurrence of the property is accompanied by a redaction text.
val regex = """"spark.executorEnv.HADOOP_CREDSTORE_PASSWORD":"([^"]*)"""".r
val matches = regex.findAllIn(eventLog)
assert(matches.nonEmpty)
matches.foreach{ matched => assert(matched.equals(expected)) }
}
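
For reference, a more targeted test along the lines suggested above might look like the sketch below; it assumes redactEvent is widened from private to private[spark] so the suite can call it directly.

```scala
// Sketch only: exercises EventLoggingListener's redaction logic without
// launching a distributed application. Assumes redactEvent is private[spark].
test("redaction of sensitive properties in environment update events") {
  val conf = getLoggingConf(testDirPath, None)
    .set("spark.executorEnv.HADOOP_CREDSTORE_PASSWORD", "secret_password")
  val listener = new EventLoggingListener("test", None, testDirPath.toUri, conf)
  val event = SparkListenerEnvironmentUpdate(Map(
    "Spark Properties" ->
      Seq("spark.executorEnv.HADOOP_CREDSTORE_PASSWORD" -> "secret_password")))
  val redactedProps = listener.redactEvent(event).environmentDetails("Spark Properties").toMap
  assert(redactedProps("spark.executorEnv.HADOOP_CREDSTORE_PASSWORD") !== "secret_password")
}
```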

test("Log overwriting") {
val logUri = EventLoggingListener.getLogPath(testDir.toURI, "test", None)
val logPath = new URI(logUri).getPath
24 changes: 24 additions & 0 deletions core/src/test/scala/org/apache/spark/util/UtilsSuite.scala
@@ -974,4 +974,28 @@ class UtilsSuite extends SparkFunSuite with ResetSystemProperties with Logging {

assert(pValue > threshold)
}

test("redact sensitive information") {
val sparkConf = new SparkConf

// Set some secret keys
val secretKeys = Seq("" +
Contributor: What is "" + accomplishing here?

Author: Me trying to format things in my editor led to this, apologies. Fixed it.

"spark.executorEnv.HADOOP_CREDSTORE_PASSWORD",
"spark.my.password",
"spark.my.sECreT")
secretKeys.foreach { key =>
sparkConf.set(key, "secret_password")
}
// Set a non-secret key
sparkConf.set("spark.regular.property", "not_a_secret")

// Redact sensitive information
val redactedConf = Utils.redact(sparkConf, sparkConf.getAll).toMap

// Assert that secret information got redacted while the regular property remained the same
secretKeys.foreach { key =>
assert(redactedConf.get(key).get == Utils.REDACTION_REPLACEMENT_TEXT)
Contributor: assert(conf(key) === ...)

Author: Fixed.

}
assert(redactedConf.get("spark.regular.property").get == "not_a_secret")
Contributor: Same as above.

Author: Fixed.

}
}
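
After the two comments above, the assertions presumably end up using ScalaTest's === and Map's apply, roughly:

```scala
// Sketch only: the assertions after the review feedback.
secretKeys.foreach { key =>
  assert(redactedConf(key) === Utils.REDACTION_REPLACEMENT_TEXT)
}
assert(redactedConf("spark.regular.property") === "not_a_secret")
```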
9 changes: 9 additions & 0 deletions docs/configuration.md
@@ -356,6 +356,15 @@ Apart from these, the following properties are also available, and may be useful
process. The user can specify multiple of these to set multiple environment variables.
</td>
</tr>
<tr>
<td><code>spark.redaction.regex</code></td>
<td>(?i)secret|password</td>
<td>
Regex to decide which Spark configuration properties and environment variables in driver and
executor environments contain sensitive information. When this regex matches a property, its
value is redacted from the environment UI and various logs like YARN and event logs.
</td>
</tr>
<tr>
<td><code>spark.python.profile</code></td>
<td>false</td>
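As a usage note, a deployment that also treats, say, tokens as sensitive could widen the pattern; the extra keyword here is illustrative:

```scala
// Illustrative only: extending the redaction pattern beyond the default.
val conf = new SparkConf()
  .set("spark.redaction.regex", "(?i)secret|password|token")
```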
@@ -36,7 +36,6 @@ import org.apache.hadoop.yarn.ipc.YarnRPC
import org.apache.hadoop.yarn.util.{ConverterUtils, Records}

import org.apache.spark.{SecurityManager, SparkConf, SparkException}
import org.apache.spark.deploy.yarn.config._
import org.apache.spark.internal.Logging
import org.apache.spark.internal.config._
import org.apache.spark.launcher.YarnCommandBuilderUtils
@@ -75,7 +74,7 @@ private[yarn] class ExecutorRunnable(
|===============================================================================
|YARN executor launch context:
| env:
|${env.map { case (k, v) => s" $k -> $v\n" }.mkString}
|${Utils.redact(sparkConf, env.toSeq).map { case (k, v) => s" $k -> $v\n" }.mkString}
| command:
| ${commands.mkString(" \\ \n ")}
|