Commit f23e15b

Author: Andrew Or (committed)
Merge branch 'master' of github.com:apache/spark into viz-emphasize-rdd
2 parents: 565801f + 7740996

137 files changed, +35960 -573 lines changed


R/pkg/NAMESPACE

Lines changed: 1 addition & 0 deletions
@@ -13,6 +13,7 @@ exportMethods("cache",
1313
"collect",
1414
"columns",
1515
"count",
16+
"describe",
1617
"distinct",
1718
"dtypes",
1819
"except",

R/pkg/R/DataFrame.R

Lines changed: 37 additions & 0 deletions
@@ -1276,3 +1276,40 @@ setMethod("saveAsTable",
12761276
callJMethod(df@sdf, "saveAsTable", tableName, source, jmode, options)
12771277
})
12781278

1279+
#' describe
1280+
#'
1281+
#' Computes statistics for numeric columns.
1282+
#' If no columns are given, this function computes statistics for all numerical columns.
1283+
#'
1284+
#' @param x A DataFrame to be computed.
1285+
#' @param col A string of name
1286+
#' @param ... Additional expressions
1287+
#' @return A DataFrame
1288+
#' @rdname describe
1289+
#' @export
1290+
#' @examples
1291+
#'\dontrun{
1292+
#' sc <- sparkR.init()
1293+
#' sqlCtx <- sparkRSQL.init(sc)
1294+
#' path <- "path/to/file.json"
1295+
#' df <- jsonFile(sqlCtx, path)
1296+
#' describe(df)
1297+
#' describe(df, "col1")
1298+
#' describe(df, "col1", "col2")
1299+
#' }
1300+
setMethod("describe",
1301+
signature(x = "DataFrame", col = "character"),
1302+
function(x, col, ...) {
1303+
colList <- list(col, ...)
1304+
sdf <- callJMethod(x@sdf, "describe", listToSeq(colList))
1305+
dataFrame(sdf)
1306+
})
1307+
1308+
#' @rdname describe
1309+
setMethod("describe",
1310+
signature(x = "DataFrame"),
1311+
function(x) {
1312+
colList <- as.list(c(columns(x)))
1313+
sdf <- callJMethod(x@sdf, "describe", listToSeq(colList))
1314+
dataFrame(sdf)
1315+
})
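For orientation, here is a minimal sketch (not part of this diff) of the JVM-side call the wrapper above reaches through callJMethod: x@sdf holds the Java/Scala DataFrame handle, so describe(df, "col1") ends up invoking DataFrame.describe, which returns a new DataFrame with a "summary" column (count, mean, stddev, min, max). The master URL, app name, JSON path, and column names below are placeholder assumptions.

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.SQLContext

object DescribeSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setMaster("local[2]").setAppName("describe-sketch"))
    val sqlCtx = new SQLContext(sc)

    // Same flow as the roxygen example above: load JSON, then summarize numeric columns.
    val df = sqlCtx.jsonFile("path/to/file.json")  // placeholder path, as in the R example
    df.describe().show()                           // statistics for all numeric columns
    df.describe("col1", "col2").show()             // statistics for the named columns only

    sc.stop()
  }
}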

R/pkg/R/generics.R

Lines changed: 4 additions & 0 deletions
@@ -384,6 +384,10 @@ setGeneric("value", function(bcast) { standardGeneric("value") })
384384
#' @export
385385
setGeneric("columns", function(x) {standardGeneric("columns") })
386386

387+
#' @rdname describe
388+
#' @export
389+
setGeneric("describe", function(x, col, ...) { standardGeneric("describe") })
390+
387391
#' @rdname schema
388392
#' @export
389393
setGeneric("dtypes", function(x) { standardGeneric("dtypes") })

R/pkg/inst/tests/test_sparkSQL.R

Lines changed: 11 additions & 0 deletions
@@ -705,5 +705,16 @@ test_that("parquetFile works with multiple input paths", {
705705
expect_true(count(parquetDF) == count(df)*2)
706706
})
707707

708+
test_that("describe() on a DataFrame", {
709+
df <- jsonFile(sqlCtx, jsonPath)
710+
stats <- describe(df, "age")
711+
expect_true(collect(stats)[1, "summary"] == "count")
712+
expect_true(collect(stats)[2, "age"] == 24.5)
713+
expect_true(collect(stats)[3, "age"] == 5.5)
714+
stats <- describe(df)
715+
expect_true(collect(stats)[4, "name"] == "Andy")
716+
expect_true(collect(stats)[5, "age"] == 30.0)
717+
})
718+
708719
unlink(parquetPath)
709720
unlink(jsonPath)

core/src/main/resources/org/apache/spark/ui/static/spark-dag-viz.js

Lines changed: 1 addition & 3 deletions
@@ -143,9 +143,7 @@ function renderDagViz(forJob) {
   resizeSvg(svg);
 }
 
-/*
- * Render the RDD DAG visualization on the stage page.
- */
+/* Render the RDD DAG visualization on the stage page. */
 function renderDagVizForStage(svgContainer) {
   var metadata = metadataContainer().select(".stage-metadata");
   var dot = metadata.select(".dot-file").text();

core/src/main/scala/org/apache/spark/SparkContext.scala

Lines changed: 5 additions & 3 deletions
@@ -407,15 +407,17 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli
 
     if (master == "yarn-client") System.setProperty("SPARK_YARN_MODE", "true")
 
+    // "_jobProgressListener" should be set up before creating SparkEnv because when creating
+    // "SparkEnv", some messages will be posted to "listenerBus" and we should not miss them.
+    _jobProgressListener = new JobProgressListener(_conf)
+    listenerBus.addListener(jobProgressListener)
+
     // Create the Spark execution environment (cache, map output tracker, etc)
     _env = createSparkEnv(_conf, isLocal, listenerBus)
     SparkEnv.set(_env)
 
     _metadataCleaner = new MetadataCleaner(MetadataCleanerType.SPARK_CONTEXT, this.cleanup, _conf)
 
-    _jobProgressListener = new JobProgressListener(_conf)
-    listenerBus.addListener(jobProgressListener)
-
     _statusTracker = new SparkStatusTracker(this)
 
     _progressBar =
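The reordering above exists because a listener only sees events posted after it has been registered on the bus. A toy sketch (not Spark code; ToyBus, ListenerOrderSketch, and the event strings are made up for illustration) of the failure mode the added comment describes:

import scala.collection.mutable.ArrayBuffer

object ListenerOrderSketch {
  // A drastically simplified stand-in for Spark's listener bus.
  class ToyBus {
    private val listeners = ArrayBuffer.empty[String => Unit]
    def addListener(l: String => Unit): Unit = listeners += l
    // Events are delivered only to listeners that are already registered.
    def post(event: String): Unit = listeners.foreach(_.apply(event))
  }

  def main(args: Array[String]): Unit = {
    val bus = new ToyBus
    bus.post("posted while creating SparkEnv")   // silently missed: no listener yet
    bus.addListener(e => println(s"JobProgressListener saw: $e"))
    bus.post("posted after registration")        // observed
  }
}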
Lines changed: 41 additions & 0 deletions
@@ -0,0 +1,41 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.annotation;
+
+import java.lang.annotation.ElementType;
+import java.lang.annotation.Retention;
+import java.lang.annotation.RetentionPolicy;
+import java.lang.annotation.Target;
+
+/**
+ * A class that is considered private to the internals of Spark -- there is a high-likelihood
+ * they will be changed in future versions of Spark.
+ *
+ * This should be used only when the standard Scala / Java means of protecting classes are
+ * insufficient.  In particular, Java has no equivalent of private[spark], so we use this annotation
+ * in its place.
+ *
+ * NOTE: If there exists a Scaladoc comment that immediately precedes this annotation, the first
+ * line of the comment must be ":: Private ::" with no trailing blank line.  This is because
+ * of the known issue that Scaladoc displays only either the annotation or the comment, whichever
+ * comes first.
+ */
+@Retention(RetentionPolicy.RUNTIME)
+@Target({ElementType.TYPE, ElementType.FIELD, ElementType.METHOD, ElementType.PARAMETER,
+  ElementType.CONSTRUCTOR, ElementType.LOCAL_VARIABLE, ElementType.PACKAGE})
+public @interface Private {}
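An illustrative sketch (not part of the commit) of how the annotation and the ":: Private ::" Scaladoc convention it documents would be used together; the package name and the InternalHelper class are made up for the example.

package org.apache.spark.example  // hypothetical package

import org.apache.spark.annotation.Private

/**
 * :: Private ::
 * Internal to Spark; may change or be removed in any release without notice.
 */
@Private
class InternalHelper {
  def doWork(): Unit = ()
}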

core/src/main/scala/org/apache/spark/serializer/KryoSerializer.scala

Lines changed: 7 additions & 0 deletions
@@ -125,6 +125,13 @@ class KryoSerializer(conf: SparkConf)
   override def newInstance(): SerializerInstance = {
     new KryoSerializerInstance(this)
   }
+
+  private[spark] override lazy val supportsRelocationOfSerializedObjects: Boolean = {
+    // If auto-reset is disabled, then Kryo may store references to duplicate occurrences of objects
+    // in the stream rather than writing those objects' serialized bytes, breaking relocation. See
+    // https://groups.google.com/d/msg/kryo-users/6ZUSyfjjtdo/FhGG1KHDXPgJ for more details.
+    newInstance().asInstanceOf[KryoSerializerInstance].getAutoReset()
+  }
 }
 
 private[spark]
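A hedged sketch (not from the diff) of the Kryo behaviour the new comment points at, using the Kryo library directly: with auto-reset disabled and reference tracking on, a repeated object can be written as a back reference into earlier bytes, so per-record byte ranges are no longer self-contained and cannot be relocated. The buffer size and payload below are arbitrary choices for the example.

import com.esotericsoftware.kryo.Kryo
import com.esotericsoftware.kryo.io.Output

object AutoResetSketch {
  def main(args: Array[String]): Unit = {
    val kryo = new Kryo()
    kryo.setReferences(true)    // reference tracking is what produces back references
    kryo.setAutoReset(false)    // keep reference state across top-level writes

    val out = new Output(1 << 16)
    val payload = java.util.Arrays.asList("a", "b", "c")

    kryo.writeClassAndObject(out, payload)
    val firstLen = out.position()
    kryo.writeClassAndObject(out, payload)       // same instance written again
    val secondLen = out.position() - firstLen

    // With auto-reset disabled the second record is typically just a reference into
    // the first one, so its bytes cannot stand alone or be reordered in the stream.
    println(s"first record: $firstLen bytes, second record: $secondLen bytes")
  }
}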

core/src/main/scala/org/apache/spark/serializer/Serializer.scala

Lines changed: 34 additions & 1 deletion
@@ -23,7 +23,7 @@ import java.nio.ByteBuffer
 import scala.reflect.ClassTag
 
 import org.apache.spark.{SparkConf, SparkEnv}
-import org.apache.spark.annotation.DeveloperApi
+import org.apache.spark.annotation.{DeveloperApi, Private}
 import org.apache.spark.util.{Utils, ByteBufferInputStream, NextIterator}
 
 /**
@@ -63,6 +63,39 @@ abstract class Serializer {
 
   /** Creates a new [[SerializerInstance]]. */
   def newInstance(): SerializerInstance
+
+  /**
+   * :: Private ::
+   * Returns true if this serializer supports relocation of its serialized objects and false
+   * otherwise. This should return true if and only if reordering the bytes of serialized objects
+   * in serialization stream output is equivalent to having re-ordered those elements prior to
+   * serializing them. More specifically, the following should hold if a serializer supports
+   * relocation:
+   *
+   * {{{
+   * serOut.open()
+   * position = 0
+   * serOut.write(obj1)
+   * serOut.flush()
+   * position = # of bytes written to stream so far
+   * obj1Bytes = output[0:position-1]
+   * serOut.write(obj2)
+   * serOut.flush()
+   * position2 = # of bytes written to stream so far
+   * obj2Bytes = output[position:position2-1]
+   * serIn.open([obj2bytes] concatenate [obj1bytes]) should return (obj2, obj1)
+   * }}}
+   *
+   * In general, this property should hold for serializers that are stateless and that do not
+   * write special metadata at the beginning or end of the serialization stream.
+   *
+   * This API is private to Spark; this method should not be overridden in third-party subclasses
+   * or called in user code and is subject to removal in future Spark releases.
+   *
+   * See SPARK-7311 for more details.
+   */
+  @Private
+  private[spark] def supportsRelocationOfSerializedObjects: Boolean = false
 }
 
 
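A small sketch (not in the diff) that exercises the property described in the Scaladoc above against Spark's stream API. JavaSerializer is used here only because it is easy to construct; since it writes a stream header, the swapped read is expected to fail, which is consistent with its supportsRelocationOfSerializedObjects staying at the default of false. The object RelocationSketch and its variable names are made up for illustration.

import java.io.{ByteArrayInputStream, ByteArrayOutputStream}
import scala.util.Try
import org.apache.spark.SparkConf
import org.apache.spark.serializer.JavaSerializer

object RelocationSketch {
  def main(args: Array[String]): Unit = {
    val ser = new JavaSerializer(new SparkConf()).newInstance()

    // Write obj1 then obj2, recording where the first record's bytes end.
    val bytes = new ByteArrayOutputStream()
    val serOut = ser.serializeStream(bytes)
    serOut.writeObject("obj1")
    serOut.flush()
    val position = bytes.size()
    serOut.writeObject("obj2")
    serOut.close()

    val all = bytes.toByteArray
    val obj1Bytes = all.slice(0, position)
    val obj2Bytes = all.slice(position, all.length)

    // Relocation would require that reading [obj2Bytes ++ obj1Bytes] yields (obj2, obj1).
    val swapped = Try {
      val serIn = ser.deserializeStream(new ByteArrayInputStream(obj2Bytes ++ obj1Bytes))
      (serIn.readObject[String](), serIn.readObject[String]())
    }
    println(swapped)  // Success(("obj2", "obj1")) only for a relocation-friendly serializer
  }
}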

core/src/main/scala/org/apache/spark/util/collection/ExternalSorter.scala

Lines changed: 1 addition & 2 deletions
@@ -131,8 +131,7 @@ private[spark] class ExternalSorter[K, V, C](
   private val kvChunkSize = conf.getInt("spark.shuffle.sort.kvChunkSize", 1 << 22) // 4 MB
   private val useSerializedPairBuffer =
     !ordering.isDefined && conf.getBoolean("spark.shuffle.sort.serializeMapOutputs", true) &&
-      ser.isInstanceOf[KryoSerializer] &&
-      serInstance.asInstanceOf[KryoSerializerInstance].getAutoReset
+      ser.supportsRelocationOfSerializedObjects
 
   // Data structures to store in-memory objects before we spill. Depending on whether we have an
   // Aggregator set, we either put objects into an AppendOnlyMap where we combine them, or we
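For orientation, a hedged sketch (not from the diff) of the settings under which the serialized-buffer path above can now be taken: no key ordering in play, spark.shuffle.sort.serializeMapOutputs left at its default of true, and a serializer that reports relocation support, which after this commit means Kryo with auto-reset enabled. The object name is made up; the configuration keys come from the diff above.

import org.apache.spark.SparkConf

object SortShuffleConfSketch {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
      .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
      // Default is already true; shown here only to surface the flag checked above.
      .set("spark.shuffle.sort.serializeMapOutputs", "true")
    println(conf.get("spark.serializer"))
  }
}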
