
Commit fe82049

Merge pull request #159 from markhamstra/temp
Merged Apache branch-1.6
2 parents: 0e93593 + 041565b

File tree: 42 files changed (+640 / -208 lines)


R/pkg/R/DataFrame.R

Lines changed: 21 additions & 1 deletion
@@ -303,8 +303,28 @@ setMethod("colnames",
 #' @rdname columns
 #' @name colnames<-
 setMethod("colnames<-",
-          signature(x = "DataFrame", value = "character"),
+          signature(x = "DataFrame"),
           function(x, value) {
+
+            # Check parameter integrity
+            if (class(value) != "character") {
+              stop("Invalid column names.")
+            }
+
+            if (length(value) != ncol(x)) {
+              stop(
+                "Column names must have the same length as the number of columns in the dataset.")
+            }
+
+            if (any(is.na(value))) {
+              stop("Column names cannot be NA.")
+            }
+
+            # Check if the column names have . in it
+            if (any(regexec(".", value, fixed=TRUE)[[1]][1] != -1)) {
+              stop("Colum names cannot contain the '.' symbol.")
+            }
+
             sdf <- callJMethod(x@sdf, "toDF", as.list(value))
             dataFrame(sdf)
           })
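
Editorial note: the SparkR wrapper above only validates `value` and then dispatches to the JVM-side rename via `callJMethod(x@sdf, "toDF", as.list(value))`. A minimal Scala sketch of the equivalent check-then-rename on the JVM side, assuming an existing `sqlContext`; the DataFrame `df` and the new names are made up for illustration:

    import org.apache.spark.sql.DataFrame

    // Hypothetical two-column DataFrame used only for illustration.
    val df: DataFrame = sqlContext.createDataFrame(Seq((1, "a"), (2, "b"))).toDF("old1", "old2")

    def renameAll(df: DataFrame, newNames: Seq[String]): DataFrame = {
      // Mirrors the checks the R wrapper performs before calling toDF.
      require(newNames.length == df.columns.length,
        "Column names must have the same length as the number of columns in the dataset.")
      require(newNames.forall(n => n != null && !n.contains(".")),
        "Column names cannot be NA or contain the '.' symbol.")
      df.toDF(newNames: _*)
    }

    renameAll(df, Seq("col1", "col2")).printSchema()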

R/pkg/inst/tests/testthat/test_sparkSQL.R

Lines changed: 11 additions & 0 deletions
@@ -692,6 +692,17 @@ test_that("names() colnames() set the column names", {
   colnames(df) <- c("col3", "col4")
   expect_equal(names(df)[1], "col3")
 
+  expect_error(colnames(df) <- c("sepal.length", "sepal_width"),
+               "Colum names cannot contain the '.' symbol.")
+  expect_error(colnames(df) <- c(1, 2), "Invalid column names.")
+  expect_error(colnames(df) <- c("a"),
+               "Column names must have the same length as the number of columns in the dataset.")
+  expect_error(colnames(df) <- c("1", NA), "Column names cannot be NA.")
+
+  # Note: if this test is broken, remove check for "." character on colnames<- method
+  irisDF <- suppressWarnings(createDataFrame(sqlContext, iris))
+  expect_equal(names(irisDF)[1], "Sepal_Length")
+
   # Test base::colnames base::names
   m2 <- cbind(1, 1:4)
   expect_equal(colnames(m2, do.NULL = FALSE), c("col1", "col2"))

core/src/main/scala/org/apache/spark/MapOutputTracker.scala

Lines changed: 29 additions & 23 deletions
@@ -384,8 +384,6 @@ private[spark] class MapOutputTrackerMaster(conf: SparkConf)
    * @param numReducers total number of reducers in the shuffle
    * @param fractionThreshold fraction of total map output size that a location must have
    *                          for it to be considered large.
-   *
-   * This method is not thread-safe.
    */
   def getLocationsWithLargestOutputs(
       shuffleId: Int,
@@ -394,28 +392,36 @@ private[spark] class MapOutputTrackerMaster(conf: SparkConf)
       fractionThreshold: Double)
     : Option[Array[BlockManagerId]] = {
 
-    if (mapStatuses.contains(shuffleId)) {
-      val statuses = mapStatuses(shuffleId)
-      if (statuses.nonEmpty) {
-        // HashMap to add up sizes of all blocks at the same location
-        val locs = new HashMap[BlockManagerId, Long]
-        var totalOutputSize = 0L
-        var mapIdx = 0
-        while (mapIdx < statuses.length) {
-          val status = statuses(mapIdx)
-          val blockSize = status.getSizeForBlock(reducerId)
-          if (blockSize > 0) {
-            locs(status.location) = locs.getOrElse(status.location, 0L) + blockSize
-            totalOutputSize += blockSize
+    val statuses = mapStatuses.get(shuffleId).orNull
+    if (statuses != null) {
+      statuses.synchronized {
+        if (statuses.nonEmpty) {
+          // HashMap to add up sizes of all blocks at the same location
+          val locs = new HashMap[BlockManagerId, Long]
+          var totalOutputSize = 0L
+          var mapIdx = 0
+          while (mapIdx < statuses.length) {
+            val status = statuses(mapIdx)
+            // status may be null here if we are called between registerShuffle, which creates an
+            // array with null entries for each output, and registerMapOutputs, which populates it
+            // with valid status entries. This is possible if one thread schedules a job which
+            // depends on an RDD which is currently being computed by another thread.
+            if (status != null) {
+              val blockSize = status.getSizeForBlock(reducerId)
+              if (blockSize > 0) {
+                locs(status.location) = locs.getOrElse(status.location, 0L) + blockSize
+                totalOutputSize += blockSize
+              }
+            }
+            mapIdx = mapIdx + 1
+          }
+          val topLocs = locs.filter { case (loc, size) =>
+            size.toDouble / totalOutputSize >= fractionThreshold
+          }
+          // Return if we have any locations which satisfy the required threshold
+          if (topLocs.nonEmpty) {
+            return Some(topLocs.keys.toArray)
           }
-          mapIdx = mapIdx + 1
-        }
-        val topLocs = locs.filter { case (loc, size) =>
-          size.toDouble / totalOutputSize >= fractionThreshold
-        }
-        // Return if we have any locations which satisfy the required threshold
-        if (topLocs.nonEmpty) {
-          return Some(topLocs.map(_._1).toArray)
         }
       }
     }
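
Editorial note: the substance of this change is that the statuses array is now fetched once, iterated under `statuses.synchronized`, and individual entries are allowed to be null while `registerMapOutputs` is still filling them in. A reduced sketch of that aggregation pattern, with a made-up `Status(location, size)` case class standing in for `MapStatus`/`BlockManagerId`:

    import scala.collection.mutable.HashMap

    // Hypothetical stand-in for a MapStatus keyed by a string location.
    case class Status(location: String, size: Long)

    def largestOutputs(statuses: Array[Status], fractionThreshold: Double): Option[Array[String]] = {
      if (statuses != null) {
        // Guard against concurrent updates to the shared array.
        statuses.synchronized {
          val locs = new HashMap[String, Long]
          var total = 0L
          statuses.foreach { status =>
            // Entries can still be null between registerShuffle and registerMapOutputs.
            if (status != null && status.size > 0) {
              locs(status.location) = locs.getOrElse(status.location, 0L) + status.size
              total += status.size
            }
          }
          val top = locs.filter { case (_, size) => size.toDouble / total >= fractionThreshold }
          if (top.nonEmpty) return Some(top.keys.toArray)
        }
      }
      None
    }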

core/src/main/scala/org/apache/spark/TaskContext.scala

Lines changed: 26 additions & 2 deletions
@@ -23,7 +23,7 @@ import org.apache.spark.annotation.DeveloperApi
 import org.apache.spark.executor.TaskMetrics
 import org.apache.spark.memory.TaskMemoryManager
 import org.apache.spark.metrics.source.Source
-import org.apache.spark.util.TaskCompletionListener
+import org.apache.spark.util.{TaskCompletionListener, TaskFailureListener}
 
 
 object TaskContext {
@@ -108,15 +108,39 @@ abstract class TaskContext extends Serializable {
    * Adds a (Java friendly) listener to be executed on task completion.
    * This will be called in all situation - success, failure, or cancellation.
    * An example use is for HadoopRDD to register a callback to close the input stream.
+   *
+   * Exceptions thrown by the listener will result in failure of the task.
    */
   def addTaskCompletionListener(listener: TaskCompletionListener): TaskContext
 
   /**
    * Adds a listener in the form of a Scala closure to be executed on task completion.
    * This will be called in all situations - success, failure, or cancellation.
    * An example use is for HadoopRDD to register a callback to close the input stream.
+   *
+   * Exceptions thrown by the listener will result in failure of the task.
+   */
+  def addTaskCompletionListener(f: (TaskContext) => Unit): TaskContext = {
+    addTaskCompletionListener(new TaskCompletionListener {
+      override def onTaskCompletion(context: TaskContext): Unit = f(context)
+    })
+  }
+
+  /**
+   * Adds a listener to be executed on task failure.
+   * Operations defined here must be idempotent, as `onTaskFailure` can be called multiple times.
    */
-  def addTaskCompletionListener(f: (TaskContext) => Unit): TaskContext
+  def addTaskFailureListener(listener: TaskFailureListener): TaskContext
+
+  /**
+   * Adds a listener to be executed on task failure.
+   * Operations defined here must be idempotent, as `onTaskFailure` can be called multiple times.
+   */
+  def addTaskFailureListener(f: (TaskContext, Throwable) => Unit): TaskContext = {
+    addTaskFailureListener(new TaskFailureListener {
+      override def onTaskFailure(context: TaskContext, error: Throwable): Unit = f(context, error)
+    })
+  }
 
  /**
   * Adds a callback function to be executed on task completion. An example use
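
Editorial note: the new failure-listener hooks mirror the existing completion hooks. Completion listeners fire on success, failure, or cancellation; failure listeners fire only when the task throws, and must be idempotent. A sketch of how partition code might register both, assuming a local SparkContext and a purely illustrative side resource:

    import org.apache.spark.{SparkConf, SparkContext, TaskContext}

    val sc = new SparkContext(new SparkConf().setAppName("listener-sketch").setMaster("local[2]"))

    sc.parallelize(1 to 4, 2).mapPartitions { iter =>
      val resource = new java.io.ByteArrayOutputStream()  // stand-in for a real stream
      val context = TaskContext.get()
      // Runs in every case: success, failure, or cancellation.
      context.addTaskCompletionListener { _ => resource.close() }
      // Runs only when the task fails; may be invoked more than once, so keep it idempotent.
      context.addTaskFailureListener { (ctx, error) =>
        System.err.println(s"partition ${ctx.partitionId()} failed: ${error.getMessage}")
      }
      iter
    }.count()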

core/src/main/scala/org/apache/spark/TaskContextImpl.scala

Lines changed: 32 additions & 7 deletions
@@ -23,7 +23,7 @@ import org.apache.spark.executor.TaskMetrics
 import org.apache.spark.memory.TaskMemoryManager
 import org.apache.spark.metrics.MetricsSystem
 import org.apache.spark.metrics.source.Source
-import org.apache.spark.util.{TaskCompletionListener, TaskCompletionListenerException}
+import org.apache.spark.util._
 
 private[spark] class TaskContextImpl(
     val stageId: Int,
@@ -41,24 +41,28 @@ private[spark] class TaskContextImpl(
   // For backwards-compatibility; this method is now deprecated as of 1.3.0.
   override def attemptId(): Long = taskAttemptId
 
-  // List of callback functions to execute when the task completes.
+  /** List of callback functions to execute when the task completes. */
   @transient private val onCompleteCallbacks = new ArrayBuffer[TaskCompletionListener]
 
+  /** List of callback functions to execute when the task fails. */
+  @transient private val onFailureCallbacks = new ArrayBuffer[TaskFailureListener]
+
   // Whether the corresponding task has been killed.
   @volatile private var interrupted: Boolean = false
 
   // Whether the task has completed.
   @volatile private var completed: Boolean = false
 
+  // Whether the task has failed.
+  @volatile private var failed: Boolean = false
+
   override def addTaskCompletionListener(listener: TaskCompletionListener): this.type = {
     onCompleteCallbacks += listener
     this
   }
 
-  override def addTaskCompletionListener(f: TaskContext => Unit): this.type = {
-    onCompleteCallbacks += new TaskCompletionListener {
-      override def onTaskCompletion(context: TaskContext): Unit = f(context)
-    }
+  override def addTaskFailureListener(listener: TaskFailureListener): this.type = {
+    onFailureCallbacks += listener
     this
   }
 
@@ -69,7 +73,28 @@ private[spark] class TaskContextImpl(
     }
   }
 
-  /** Marks the task as completed and triggers the listeners. */
+  /** Marks the task as failed and triggers the failure listeners. */
+  private[spark] def markTaskFailed(error: Throwable): Unit = {
+    // failure callbacks should only be called once
+    if (failed) return
+    failed = true
+    val errorMsgs = new ArrayBuffer[String](2)
+    // Process failure callbacks in the reverse order of registration
+    onFailureCallbacks.reverse.foreach { listener =>
+      try {
+        listener.onTaskFailure(this, error)
+      } catch {
+        case e: Throwable =>
+          errorMsgs += e.getMessage
+          logError("Error in TaskFailureListener", e)
+      }
+    }
+    if (errorMsgs.nonEmpty) {
+      throw new TaskCompletionListenerException(errorMsgs, Option(error))
+    }
+  }
+
+  /** Marks the task as completed and triggers the completion listeners. */
   private[spark] def markTaskCompleted(): Unit = {
     completed = true
     val errorMsgs = new ArrayBuffer[String](2)
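
Editorial note: markTaskFailed runs the registered TaskFailureListeners in reverse registration order, collects any exceptions they throw, and rethrows them wrapped in a TaskCompletionListenerException. For reference, a sketch of a listener that could be registered through this path, assuming TaskFailureListener exposes exactly the single onTaskFailure(context, error) method used in this commit:

    import org.apache.spark.TaskContext
    import org.apache.spark.util.TaskFailureListener

    // Illustrative listener that just reports which partition failed and why.
    class LoggingFailureListener extends TaskFailureListener {
      override def onTaskFailure(context: TaskContext, error: Throwable): Unit = {
        // Keep this idempotent: the contract allows onTaskFailure to run more than once.
        System.err.println(s"Task for partition ${context.partitionId()} failed: ${error.getMessage}")
      }
    }

    // Registered from task code via: TaskContext.get().addTaskFailureListener(new LoggingFailureListener)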

core/src/main/scala/org/apache/spark/deploy/client/AppClient.scala

Lines changed: 8 additions & 10 deletions
@@ -126,16 +126,14 @@ private[spark] class AppClient(
     registerMasterFutures.set(tryRegisterAllMasters())
     registrationRetryTimer.set(registrationRetryThread.scheduleAtFixedRate(new Runnable {
       override def run(): Unit = {
-        Utils.tryOrExit {
-          if (registered.get) {
-            registerMasterFutures.get.foreach(_.cancel(true))
-            registerMasterThreadPool.shutdownNow()
-          } else if (nthRetry >= REGISTRATION_RETRIES) {
-            markDead("All masters are unresponsive! Giving up.")
-          } else {
-            registerMasterFutures.get.foreach(_.cancel(true))
-            registerWithMaster(nthRetry + 1)
-          }
+        if (registered.get) {
+          registerMasterFutures.get.foreach(_.cancel(true))
+          registerMasterThreadPool.shutdownNow()
+        } else if (nthRetry >= REGISTRATION_RETRIES) {
+          markDead("All masters are unresponsive! Giving up.")
+        } else {
+          registerMasterFutures.get.foreach(_.cancel(true))
+          registerWithMaster(nthRetry + 1)
         }
       }
     }, REGISTRATION_TIMEOUT_SECONDS, REGISTRATION_TIMEOUT_SECONDS, TimeUnit.SECONDS))
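
Editorial note: this change only removes the Utils.tryOrExit wrapper around the retry body, so the body now runs directly inside the scheduled task; the fixed-rate scheduling itself is unchanged. A generic sketch of that schedule-and-retry shape with plain java.util.concurrent types and made-up names:

    import java.util.concurrent.{Executors, TimeUnit}
    import java.util.concurrent.atomic.{AtomicBoolean, AtomicInteger}

    val registered = new AtomicBoolean(false)   // flipped elsewhere once a master answers
    val attempts = new AtomicInteger(0)
    val maxRetries = 3
    val scheduler = Executors.newSingleThreadScheduledExecutor()

    scheduler.scheduleAtFixedRate(new Runnable {
      override def run(): Unit = {
        if (registered.get) {
          scheduler.shutdownNow()               // registration succeeded, stop retrying
        } else if (attempts.incrementAndGet() > maxRetries) {
          System.err.println("All masters are unresponsive! Giving up.")
          scheduler.shutdownNow()
        } else {
          // cancel outstanding attempts and register again here
        }
      }
    }, 20, 20, TimeUnit.SECONDS)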

core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala

Lines changed: 2 additions & 2 deletions
@@ -1108,7 +1108,7 @@ class PairRDDFunctions[K, V](self: RDD[(K, V)])
       val writer = format.getRecordWriter(hadoopContext).asInstanceOf[NewRecordWriter[K, V]]
       require(writer != null, "Unable to obtain RecordWriter")
       var recordsWritten = 0L
-      Utils.tryWithSafeFinally {
+      Utils.tryWithSafeFinallyAndFailureCallbacks {
         while (iter.hasNext) {
           val pair = iter.next()
           writer.write(pair._1, pair._2)
@@ -1194,7 +1194,7 @@ class PairRDDFunctions[K, V](self: RDD[(K, V)])
       writer.open()
       var recordsWritten = 0L
 
-      Utils.tryWithSafeFinally {
+      Utils.tryWithSafeFinallyAndFailureCallbacks {
         while (iter.hasNext) {
           val record = iter.next()
           writer.write(record._1.asInstanceOf[AnyRef], record._2.asInstanceOf[AnyRef])

core/src/main/scala/org/apache/spark/scheduler/Task.scala

Lines changed: 5 additions & 0 deletions
@@ -87,7 +87,12 @@ private[spark] abstract class Task[T](
     }
     try {
       (runTask(context), context.collectAccumulators())
+    } catch { case e: Throwable =>
+      // Catch all errors; run task failure callbacks, and rethrow the exception.
+      context.markTaskFailed(e)
+      throw e
     } finally {
+      // Call the task completion callbacks.
       context.markTaskCompleted()
       try {
         Utils.tryLogNonFatalError {
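
Editorial note: the ordering in run() matters here. Failure callbacks fire first, from the catch block, the exception is rethrown so the scheduler still observes it, and completion callbacks then fire from the finally block in every case. A stripped-down sketch of that control flow with hypothetical callback stubs:

    // Hypothetical wrapper showing the catch/finally ordering used above.
    def runWithCallbacks[T](body: => T)(onFailure: Throwable => Unit)(onCompletion: () => Unit): T = {
      try {
        body
      } catch { case e: Throwable =>
        onFailure(e)   // failure callbacks run first, while the error is in hand
        throw e        // rethrow so the caller still sees the failure
      } finally {
        onCompletion() // completion callbacks run on success, failure, or cancellation
      }
    }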

core/src/main/scala/org/apache/spark/util/TaskCompletionListenerException.scala

Lines changed: 0 additions & 34 deletions
This file was deleted.

core/src/main/scala/org/apache/spark/util/Utils.scala

Lines changed: 42 additions & 3 deletions
@@ -1244,7 +1244,6 @@ private[spark] object Utils extends Logging {
    * exception from the original `out.write` call.
    */
   def tryWithSafeFinally[T](block: => T)(finallyBlock: => Unit): T = {
-    // It would be nice to find a method on Try that did this
     var originalThrowable: Throwable = null
     try {
       block
@@ -1270,6 +1269,44 @@ private[spark] object Utils extends Logging {
     }
   }
 
+  /**
+   * Execute a block of code, call the failure callbacks before finally block if there is any
+   * exceptions happen. But if exceptions happen in the finally block, do not suppress the original
+   * exception.
+   *
+   * This is primarily an issue with `finally { out.close() }` blocks, where
+   * close needs to be called to clean up `out`, but if an exception happened
+   * in `out.write`, it's likely `out` may be corrupted and `out.close` will
+   * fail as well. This would then suppress the original/likely more meaningful
+   * exception from the original `out.write` call.
+   */
+  def tryWithSafeFinallyAndFailureCallbacks[T](block: => T)(finallyBlock: => Unit): T = {
+    var originalThrowable: Throwable = null
+    try {
+      block
+    } catch {
+      case t: Throwable =>
+        // Purposefully not using NonFatal, because even fatal exceptions
+        // we don't want to have our finallyBlock suppress
+        originalThrowable = t
+        TaskContext.get().asInstanceOf[TaskContextImpl].markTaskFailed(t)
+        throw originalThrowable
+    } finally {
+      try {
+        finallyBlock
+      } catch {
+        case t: Throwable =>
+          if (originalThrowable != null) {
+            originalThrowable.addSuppressed(t)
+            logWarning(s"Suppressing exception in finally: " + t.getMessage, t)
+            throw originalThrowable
+          } else {
+            throw t
+          }
+      }
+    }
+  }
+
   /** Default filtering function for finding call sites using `getCallSite`. */
   private def sparkInternalExclusionFunction(className: String): Boolean = {
     // A regular expression to match classes of the internal Spark API's
@@ -1991,8 +2028,10 @@ private[spark] object Utils extends Logging {
     } catch {
       case e: Exception if isBindCollision(e) =>
         if (offset >= maxRetries) {
-          val exceptionMessage =
-            s"${e.getMessage}: Service$serviceString failed after $maxRetries retries!"
+          val exceptionMessage = s"${e.getMessage}: Service$serviceString failed after " +
+            s"$maxRetries retries! Consider explicitly setting the appropriate port for the " +
+            s"service$serviceString (for example spark.ui.port for SparkUI) to an available " +
+            "port or increasing spark.port.maxRetries."
           val exception = new BindException(exceptionMessage)
           // restore original stack trace
           exception.setStackTrace(e.getStackTrace)
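
Editorial note: the new helper keeps the suppression behaviour of tryWithSafeFinally (a cleanup exception is attached to the original with addSuppressed rather than replacing it) and additionally calls markTaskFailed on the current TaskContext before the finally block runs. Since Utils is Spark-internal and TaskContext.get() is only set inside a running task, the call shape below is only a sketch; the writer and records are made up:

    // Hypothetical writer and records, used only to show the two-argument-list call shape.
    // Must run inside a task, where TaskContext.get() is non-null.
    val writer = new java.io.PrintWriter("/tmp/part-00000")
    val records = Seq("a", "b", "c")

    Utils.tryWithSafeFinallyAndFailureCallbacks {
      records.foreach(r => writer.println(r))  // a failure here triggers the task's failure callbacks first
    } {
      writer.close()                           // always runs; its exceptions never mask the original one
    }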
