[SQL] JavaDoc update for various DataFrame functions.

rxin · rxin · commit 322e7e7f6899 · 2015-05-06T08:50:56.000-07:00
Author: Reynold Xin <rxin@databricks.com> Closes #5935 from rxin/df-doc1 and squashes the following commits: aaeaadb [Reynold Xin] [SQL] JavaDoc update for various DataFrame functions.
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/Column.scala b/sql/core/src/main/scala/org/apache/spark/sql/Column.scala
@@ -84,14 +84,14 @@ class Column(protected[sql] val expr: Expression) extends Logging {
 
   /**
    * Inversion of boolean expression, i.e. NOT.
-   * {{
+   * {{{
    *   // Scala: select rows that are not active (isActive === false)
    *   df.filter( !df("isActive") )
    *
    *   // Java:
    *   import static org.apache.spark.sql.functions.*;
    *   df.filter( not(df.col("isActive")) );
-   * }}
+   * }}}
    *
    * @group expr_ops
    */
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala
@@ -681,11 +681,11 @@ class DataFrame private[sql](
 
   /**
    * (Scala-specific) Aggregates on the entire [[DataFrame]] without groups.
-   * {{
+   * {{{
    *   // df.agg(...) is a shorthand for df.groupBy().agg(...)
    *   df.agg("age" -> "max", "salary" -> "avg")
    *   df.groupBy().agg("age" -> "max", "salary" -> "avg")
-   * }}
+   * }}}
    * @group dfops
    */
   def agg(aggExpr: (String, String), aggExprs: (String, String)*): DataFrame = {
@@ -694,33 +694,33 @@ class DataFrame private[sql](
 
   /**
    * (Scala-specific) Aggregates on the entire [[DataFrame]] without groups.
-   * {{
+   * {{{
    *   // df.agg(...) is a shorthand for df.groupBy().agg(...)
    *   df.agg(Map("age" -> "max", "salary" -> "avg"))
    *   df.groupBy().agg(Map("age" -> "max", "salary" -> "avg"))
-   * }}
+   * }}}
    * @group dfops
    */
   def agg(exprs: Map[String, String]): DataFrame = groupBy().agg(exprs)
 
   /**
    * (Java-specific) Aggregates on the entire [[DataFrame]] without groups.
-   * {{
+   * {{{
    *   // df.agg(...) is a shorthand for df.groupBy().agg(...)
    *   df.agg(Map("age" -> "max", "salary" -> "avg"))
    *   df.groupBy().agg(Map("age" -> "max", "salary" -> "avg"))
-   * }}
+   * }}}
    * @group dfops
    */
   def agg(exprs: java.util.Map[String, String]): DataFrame = groupBy().agg(exprs)
 
   /**
    * Aggregates on the entire [[DataFrame]] without groups.
-   * {{
+   * {{{
    *   // df.agg(...) is a shorthand for df.groupBy().agg(...)
    *   df.agg(max($"age"), avg($"salary"))
    *   df.groupBy().agg(max($"age"), avg($"salary"))
-   * }}
+   * }}}
    * @group dfops
    */
   @scala.annotation.varargs
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala
@@ -37,7 +37,7 @@ final class DataFrameStatFunctions private[sql](df: DataFrame) {
     StatFunctions.calculateCov(df, Seq(col1, col2))
   }
 
-  /*
+  /**
    * Calculates the correlation of two columns of a DataFrame. Currently only supports the Pearson
    * Correlation Coefficient. For Spearman Correlation, consider using RDD methods found in 
    * MLlib's Statistics.
@@ -75,7 +75,7 @@ final class DataFrameStatFunctions private[sql](df: DataFrame) {
    *             each row.
    * @param col2 The name of the second column. Distinct items will make the column names
    *             of the DataFrame.
-   * @return A Local DataFrame containing the table
+   * @return A DataFrame containing for the contingency table.
    */
   def crosstab(col1: String, col2: String): DataFrame = {
     StatFunctions.crossTabulate(df, col1, col2)
@@ -110,14 +110,25 @@ final class DataFrameStatFunctions private[sql](df: DataFrame) {
   }
 
   /**
-   * Python friendly implementation for `freqItems`
+   * (Scala-specific) Finding frequent items for columns, possibly with false positives. Using the
+   * frequent element count algorithm described in
+   * [[http://dx.doi.org/10.1145/762471.762473, proposed by Karp, Schenker, and Papadimitriou]].
+   *
+   * @param cols the names of the columns to search frequent items in.
+   * @return A Local DataFrame with the Array of frequent items for each column.
    */
   def freqItems(cols: Seq[String], support: Double): DataFrame = {
     FrequentItems.singlePassFreqItems(df, cols, support)
   }
 
   /**
-   * Python friendly implementation for `freqItems` with a default `support` of 1%.
+   * (Scala-specific) Finding frequent items for columns, possibly with false positives. Using the
+   * frequent element count algorithm described in
+   * [[http://dx.doi.org/10.1145/762471.762473, proposed by Karp, Schenker, and Papadimitriou]].
+   * Uses a `default` support of 1%.
+   *
+   * @param cols the names of the columns to search frequent items in.
+   * @return A Local DataFrame with the Array of frequent items for each column.
    */
   def freqItems(cols: Seq[String]): DataFrame = {
     FrequentItems.singlePassFreqItems(df, cols, 0.01)
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
@@ -22,7 +22,7 @@ import scala.reflect.runtime.universe.{TypeTag, typeTag}
 
 import org.apache.spark.annotation.Experimental
 import org.apache.spark.sql.catalyst.ScalaReflection
-import org.apache.spark.sql.catalyst.analysis.{UnresolvedAttribute, UnresolvedFunction, Star}
+import org.apache.spark.sql.catalyst.analysis.{UnresolvedFunction, Star}
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.expressions.mathfuncs._
 import org.apache.spark.sql.types._
@@ -86,21 +86,21 @@ object functions {
 
   /**
    * Returns a sort expression based on ascending order of the column.
-   * {{
+   * {{{
    *   // Sort by dept in ascending order, and then age in descending order.
    *   df.sort(asc("dept"), desc("age"))
-   * }}
+   * }}}
    *
    * @group sort_funcs
    */
   def asc(columnName: String): Column = Column(columnName).asc
 
   /**
    * Returns a sort expression based on the descending order of the column.
-   * {{
+   * {{{
    *   // Sort by dept in ascending order, and then age in descending order.
    *   df.sort(asc("dept"), desc("age"))
-   * }}
+   * }}}
    *
    * @group sort_funcs
    */
@@ -353,13 +353,13 @@ object functions {
 
   /**
    * Inversion of boolean expression, i.e. NOT.
-   * {{
+   * {{{
    *   // Scala: select rows that are not active (isActive === false)
    *   df.filter( !df("isActive") )
    *
    *   // Java:
    *   df.filter( not(df.col("isActive")) );
-   * }}
+   * }}}
    *
    * @group normal_funcs
    */