
Commit ccdf21f

jomach authored and gatorsmile committed
[SPARK-20055][DOCS] Added documentation for loading csv files into DataFrames
## What changes were proposed in this pull request?

Added documentation for loading CSV files into DataFrames.

## How was this patch tested?

./dev/run-tests

Author: Jorge Machado <[email protected]>

Closes #19429 from jomach/master.
1 parent 645e108 commit ccdf21f
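
In short, the SQL programming guide gains a "To load a CSV file" section backed by runnable snippets in Scala, Java, Python, and R, plus a sample people.csv. A minimal sketch of the pattern the new examples document (the session setup and surrounding object are illustrative, not part of the commit):

import org.apache.spark.sql.SparkSession

object CsvLoadSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("CsvLoadSketch").getOrCreate()
    // Read a semicolon-delimited CSV with a header row, letting Spark infer types.
    val people = spark.read.format("csv")
      .option("sep", ";")
      .option("inferSchema", "true")
      .option("header", "true")
      .load("examples/src/main/resources/people.csv")
    people.select("name", "age").show()
    spark.stop()
  }
}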

File tree (6 files changed: +56 −5 lines)

docs/sql-programming-guide.md
examples/src/main/java/org/apache/spark/examples/sql/JavaSQLDataSourceExample.java
examples/src/main/python/sql/datasource.py
examples/src/main/r/RSparkSQLExample.R
examples/src/main/resources/people.csv
examples/src/main/scala/org/apache/spark/examples/sql/SQLDataSourceExample.scala

docs/sql-programming-guide.md

Lines changed: 27 additions & 5 deletions
@@ -461,6 +461,8 @@ name (i.e., `org.apache.spark.sql.parquet`), but for built-in sources you can al
 names (`json`, `parquet`, `jdbc`, `orc`, `libsvm`, `csv`, `text`). DataFrames loaded from any data
 source type can be converted into other types using this syntax.
 
+To load a JSON file you can use:
+
 <div class="codetabs">
 <div data-lang="scala" markdown="1">
 {% include_example manual_load_options scala/org/apache/spark/examples/sql/SQLDataSourceExample.scala %}
@@ -479,6 +481,26 @@ source type can be converted into other types using this syntax.
 </div>
 </div>
 
+To load a CSV file you can use:
+
+<div class="codetabs">
+<div data-lang="scala" markdown="1">
+{% include_example manual_load_options_csv scala/org/apache/spark/examples/sql/SQLDataSourceExample.scala %}
+</div>
+
+<div data-lang="java" markdown="1">
+{% include_example manual_load_options_csv java/org/apache/spark/examples/sql/JavaSQLDataSourceExample.java %}
+</div>
+
+<div data-lang="python" markdown="1">
+{% include_example manual_load_options_csv python/sql/datasource.py %}
+</div>
+
+<div data-lang="r" markdown="1">
+{% include_example manual_load_options_csv r/RSparkSQLExample.R %}
+
+</div>
+</div>
 ### Run SQL on files directly
 
 Instead of using read API to load a file into DataFrame and query it, you can also query that
@@ -573,7 +595,7 @@ Note that partition information is not gathered by default when creating externa
 
 ### Bucketing, Sorting and Partitioning
 
-For file-based data source, it is also possible to bucket and sort or partition the output.
+For file-based data source, it is also possible to bucket and sort or partition the output.
 Bucketing and sorting are applicable only to persistent tables:
 
 <div class="codetabs">
@@ -598,7 +620,7 @@ CREATE TABLE users_bucketed_by_name(
 name STRING,
 favorite_color STRING,
 favorite_numbers array<integer>
-) USING parquet
+) USING parquet
 CLUSTERED BY(name) INTO 42 BUCKETS;
 
 {% endhighlight %}
@@ -629,7 +651,7 @@ while partitioning can be used with both `save` and `saveAsTable` when using the
 {% highlight sql %}
 
 CREATE TABLE users_by_favorite_color(
-name STRING,
+name STRING,
 favorite_color STRING,
 favorite_numbers array<integer>
 ) USING csv PARTITIONED BY(favorite_color);
@@ -664,7 +686,7 @@ CREATE TABLE users_bucketed_and_partitioned(
 name STRING,
 favorite_color STRING,
 favorite_numbers array<integer>
-) USING parquet
+) USING parquet
 PARTITIONED BY (favorite_color)
 CLUSTERED BY(name) SORTED BY (favorite_numbers) INTO 42 BUCKETS;
 
@@ -675,7 +697,7 @@ CLUSTERED BY(name) SORTED BY (favorite_numbers) INTO 42 BUCKETS;
 </div>
 
 `partitionBy` creates a directory structure as described in the [Partition Discovery](#partition-discovery) section.
-Thus, it has limited applicability to columns with high cardinality. In contrast
+Thus, it has limited applicability to columns with high cardinality. In contrast
 `bucketBy` distributes
 data across a fixed number of buckets and can be used when a number of unique values is unbounded.
 
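
The closing paragraph above contrasts `partitionBy` (one directory per distinct value, so low-cardinality columns only) with `bucketBy` (a fixed bucket count, usable when distinct values are unbounded). A hedged Scala sketch of the two write paths, assuming a `usersDF` with the columns from the CREATE TABLE examples:

// partitionBy: one output directory per distinct favorite_color value.
usersDF.write
  .partitionBy("favorite_color")
  .format("parquet")
  .save("users_by_favorite_color.parquet")

// bucketBy: hashes name into a fixed 42 buckets; per the guide text, bucketing
// applies only to persistent tables, hence saveAsTable rather than save.
usersDF.write
  .bucketBy(42, "name")
  .saveAsTable("users_bucketed_by_name")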

examples/src/main/java/org/apache/spark/examples/sql/JavaSQLDataSourceExample.java

Lines changed: 7 additions & 0 deletions
@@ -116,6 +116,13 @@ private static void runBasicDataSourceExample(SparkSession spark) {
       spark.read().format("json").load("examples/src/main/resources/people.json");
     peopleDF.select("name", "age").write().format("parquet").save("namesAndAges.parquet");
     // $example off:manual_load_options$
+    // $example on:manual_load_options_csv$
+    Dataset<Row> peopleDFCsv = spark.read().format("csv")
+      .option("sep", ";")
+      .option("inferSchema", "true")
+      .option("header", "true")
+      .load("examples/src/main/resources/people.csv");
+    // $example off:manual_load_options_csv$
     // $example on:direct_sql$
     Dataset<Row> sqlDF =
       spark.sql("SELECT * FROM parquet.`examples/src/main/resources/users.parquet`");

examples/src/main/python/sql/datasource.py

Lines changed: 5 additions & 0 deletions
@@ -53,6 +53,11 @@ def basic_datasource_example(spark):
     df.select("name", "age").write.save("namesAndAges.parquet", format="parquet")
     # $example off:manual_load_options$
 
+    # $example on:manual_load_options_csv$
+    df = spark.read.load("examples/src/main/resources/people.csv",
+                         format="csv", sep=";", inferSchema="true", header="true")
+    # $example off:manual_load_options_csv$
+
     # $example on:write_sorting_and_bucketing$
     df.write.bucketBy(42, "name").sortBy("age").saveAsTable("people_bucketed")
     # $example off:write_sorting_and_bucketing$
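
In the Python form, the keyword arguments after `format` are passed straight through as data source options; it is the same option set the Scala and Java examples spell out with `.option(...)`. For completeness, the Scala reader also accepts them as a single map via `options` (a sketch):

// Same options as the Python keyword arguments, supplied in one map.
val peopleDFCsv = spark.read
  .options(Map("sep" -> ";", "inferSchema" -> "true", "header" -> "true"))
  .format("csv")
  .load("examples/src/main/resources/people.csv")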

examples/src/main/r/RSparkSQLExample.R

Lines changed: 6 additions & 0 deletions
@@ -113,6 +113,12 @@ write.df(namesAndAges, "namesAndAges.parquet", "parquet")
 # $example off:manual_load_options$
 
 
+# $example on:manual_load_options_csv$
+df <- read.df("examples/src/main/resources/people.csv", "csv", sep = ";", inferSchema = TRUE, header = TRUE)
+namesAndAges <- select(df, "name", "age")
+# $example off:manual_load_options_csv$
+
+
 # $example on:direct_sql$
 df <- sql("SELECT * FROM parquet.`examples/src/main/resources/users.parquet`")
 # $example off:direct_sql$
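
The `sep`, `inferSchema`, and `header` arguments in the R snippet matter: with CSV defaults (comma delimiter, no header) there would be no `name` or `age` column for `select(df, "name", "age")` to resolve. A hedged Scala sketch of that failure mode:

// With defaults, each ;-delimited line parses as one field, so the frame has
// a single string column named _c0 and selecting "name" fails analysis.
val misparsed = spark.read.format("csv")
  .load("examples/src/main/resources/people.csv")
misparsed.printSchema()
// root
//  |-- _c0: string (nullable = true)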
examples/src/main/resources/people.csv

Lines changed: 3 additions & 0 deletions

@@ -0,0 +1,3 @@
+name;age;job
+Jorge;30;Developer
+Bob;32;Developer
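
Given this header and data, the options used throughout the examples (`sep=";"`, `header=true`, `inferSchema=true`) should yield a three-column frame, with `age` inferred as an integer from the values 30 and 32. A sketch of checking that in Scala:

val people = spark.read.format("csv")
  .option("sep", ";")
  .option("header", "true")
  .option("inferSchema", "true")
  .load("examples/src/main/resources/people.csv")
people.printSchema()
// Expected, assuming inference picks integer for 30/32:
// root
//  |-- name: string (nullable = true)
//  |-- age: integer (nullable = true)
//  |-- job: string (nullable = true)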

examples/src/main/scala/org/apache/spark/examples/sql/SQLDataSourceExample.scala

Lines changed: 8 additions & 0 deletions
@@ -49,6 +49,14 @@ object SQLDataSourceExample {
     val peopleDF = spark.read.format("json").load("examples/src/main/resources/people.json")
     peopleDF.select("name", "age").write.format("parquet").save("namesAndAges.parquet")
     // $example off:manual_load_options$
+    // $example on:manual_load_options_csv$
+    val peopleDFCsv = spark.read.format("csv")
+      .option("sep", ";")
+      .option("inferSchema", "true")
+      .option("header", "true")
+      .load("examples/src/main/resources/people.csv")
+    // $example off:manual_load_options_csv$
+
     // $example on:direct_sql$
    val sqlDF = spark.sql("SELECT * FROM parquet.`examples/src/main/resources/users.parquet`")
     // $example off:direct_sql$
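
To round the Scala example off the same way the `manual_load_options` block above it does for JSON, the loaded CSV frame could be projected and written back out in another format (output path is illustrative, not from the commit):

// Project two columns of the CSV-backed frame and persist them as Parquet.
peopleDFCsv.select("name", "age")
  .write.format("parquet")
  .save("namesAndAgesFromCsv.parquet")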
