apache · Emaasit · Jun 5, 2015 · Jun 5, 2015 · Jun 5, 2015 · Jun 5, 2015
diff --git a/examples/src/main/r/0-getting-started.R b/examples/src/main/r/0-getting-started.R
@@ -0,0 +1,23 @@
+#
+# Author:   Daniel Emaasit (@emaasit)
+# Purpose: This script shows how to install SparkR onto your workstation/PC
+#          and initialize a spark context and a SparkSQL context
+# Date:    06/05/2015
+#
+
+
+# Install the released SparkR package
+install.packages("SparkR")
+
+## OR install the dev version from GitHub (devtools provides install_github)
+install.packages("devtools")
+devtools::install_github("amplab-extras/SparkR-pkg", subdir = "pkg")
+
+# Attach SparkR so its functions are available in this session
+library(SparkR)
+
+## Initialize a SparkContext running locally; kept in `sc` for later calls
+sc <- sparkR.init(master = "local", appName = "MyApp")
+
+## Initialize the SQLContext from `sc` (note the lower-case "s": sparkRSQL)
+sqlCtx <- sparkRSQL.init(sc)
diff --git a/examples/src/main/r/1-data.R b/examples/src/main/r/1-data.R
@@ -0,0 +1,30 @@
+#
+# Author:   Daniel Emaasit (@emaasit)
+# Purpose: This script shows how to create Spark DataFrames 
+# Date:    06/05/2015
+#
+
+# For this example, we shall use the "flights" dataset
+# The data can be downloaded from: https://s3-us-west-2.amazonaws.com/sparkr-data/flights.csv 
+# The dataset consists of every flight departing Houston in 2011.
+# The data set is made up of 227,496 rows x 14 columns. 
+
+source("0-getting-started.R")
+
+# Create an R data frame and then convert it to a SparkR DataFrame -------
+
+## Create a plain R data frame (data.table = FALSE: no data.table returned)
+install.packages("data.table") #We want to use the fread() function to read the dataset
+library(data.table)
+
+flights_df <- fread("flights.csv", data.table = FALSE)
+flights_df$date <- as.Date(flights_df$date)
+
+## Convert the local data frame into a SparkR DataFrame
+flightsDF <- createDataFrame(sqlCtx, flights_df)
+
+## Print the schema of this Spark DataFrame
+printSchema(flightsDF)
+
+## Cache the DataFrame
+cache(flightsDF)
diff --git a/examples/src/main/r/2-data-manipulation.R b/examples/src/main/r/2-data-manipulation.R
@@ -0,0 +1,51 @@
+#
+# Author:   Daniel Emaasit (@emaasit)
+# Purpose: This script shows how to explore and manipulate Spark DataFrames 
+# Date:    06/05/2015
+#
+
+source("1-data.R")
+
+
+# Install and attach magrittr for the %>% pipe operator
+install.packages("magrittr")
+library(magrittr)
+
+# Print the first 6 rows of the DataFrame
+showDF(flightsDF, numRows = 6) ## Or
+head(flightsDF)
+
+# Show the column names in the DataFrame
+columns(flightsDF)
+
+# Show the number of rows in the DataFrame
+count(flightsDF)
+
+# Show summary statistics for numeric columns
+describe(flightsDF)
+
+# Keep only the destination and cancellation columns
+destDF <- flightsDF %>% select("dest", "cancelled")
+
+# The same selection written as a SQL query:
+# the flights DataFrame must first be registered as a temporary table
+registerTempTable(flightsDF, "flightsTable")
+destDF <- sql(sqlCtx, "SELECT dest, cancelled FROM flightsTable")
+
+# Pull the Spark result back into a local R data frame
+dest_df <- destDF %>% collect()
+
+# Display the local data frame
+print(dest_df)
+
+# Filter flights whose destination is JFK (JFK is a string value: quote it)
+jfkDF <- filter(flightsDF, "dest = 'JFK'") ##OR
+jfkDF <- filter(flightsDF, flightsDF$dest == "JFK")
+
+# Group the flights by date and then find the average daily delay
+# agg() takes column = "function" pairs; the result is a new DataFrame
+dailyDelayDF <- groupBy(flightsDF, "date") %>%
+  agg(dep_delay = "avg", arr_delay = "avg")
+
+# Stop the SparkContext now
+sparkR.stop()