114 commits
0ebb014
show create table DDL -- hive metastore table
xwu0226 Apr 2, 2016
6d060be
update upon review
xwu0226 Apr 2, 2016
2799672
ignoring sqlContext temp table and considering datasource table ddl
xwu0226 Apr 2, 2016
98c020a
fix scala style issue
xwu0226 Apr 4, 2016
efd8898
fix scala style issue in testcase
xwu0226 Apr 4, 2016
b370630
fix testcase for test failure
xwu0226 Apr 5, 2016
8cb7a72
continue the database ddl generation
xwu0226 Apr 6, 2016
8b67d22
support datasource ddl
xwu0226 Apr 8, 2016
9ab863f
scala style fix
xwu0226 Apr 8, 2016
a40273c
merge the code committed by CREATE TABLE native support
xwu0226 Apr 13, 2016
d214a3b
rework show create ddl based on new native supported create table DDL…
xwu0226 Apr 14, 2016
1680ea0
Merge branch 'show_create_table_1' into show_create_table_2
xwu0226 Apr 14, 2016
ff9ae61
[SPARK-14601][DOC] Minor doc/usage changes related to removal of Spar…
markgrover Apr 15, 2016
fa8373c
remove spaces
xwu0226 Apr 15, 2016
b5c60bc
[SPARK-14447][SQL] Speed up TungstenAggregate w/ keys using Vectorize…
sameeragarwal Apr 15, 2016
297ba3f
[SPARK-14275][SQL] Reimplement TypedAggregateExpression to Declarativ…
cloud-fan Apr 15, 2016
b961323
[SPARK-14374][ML][PYSPARK] PySpark ml GBTClassifier, Regressor suppor…
yanboliang Apr 15, 2016
a9324a0
Closes #12407
rxin Apr 15, 2016
96534aa
[SPARK-14549][ML] Copy the Vector and Matrix classes from mllib to ml…
Apr 15, 2016
e249232
[SPARK-14370][MLLIB] removed duplicate generation of ids in OnlineLDA…
Apr 15, 2016
06b9d62
[SPARK-14633] Use more readable format to show memory bytes in Error …
peterableda Apr 15, 2016
83af297
[SPARK-13925][ML][SPARKR] Expose R-like summary statistics in SparkR:…
yanboliang Apr 15, 2016
5095b6c
update upon review - use visitTableIdentifier
xwu0226 Apr 15, 2016
d6ae7d4
[SPARK-14665][ML][PYTHON] Fixed bug with StopWordsRemover default sto…
jkbradley Apr 15, 2016
129f2f4
[SPARK-14104][PYSPARK][ML] All Python param setters should use the `_…
sethah Apr 15, 2016
90b46e0
[SPARK-7861][ML] PySpark OneVsRest
yinxusen Apr 15, 2016
8028a28
[SPARK-14628][CORE] Simplify task metrics by always tracking read/wri…
rxin Apr 15, 2016
4df6518
[SPARK-14620][SQL] Use/benchmark a better hash in VectorizedHashMap
sameeragarwal Apr 15, 2016
b2dfa84
[SPARK-14668][SQL] Move CurrentDatabase to Catalyst
yhuai Apr 16, 2016
f4be094
[SPARK-14677][SQL] Make the max number of iterations configurable for…
rxin Apr 16, 2016
1285446
[SPARK-13363][SQL] support Aggregator in RelationalGroupedDataset
cloud-fan Apr 16, 2016
527c780
Revert "[SPARK-13363][SQL] support Aggregator in RelationalGroupedDat…
rxin Apr 16, 2016
9f678e9
[MINOR] Remove inappropriate type notation and extra anonymous closur…
HyukjinKwon Apr 16, 2016
36da5e3
[SPARK-14605][ML][PYTHON] Changed Python to use unicode UIDs for spar…
jkbradley Apr 16, 2016
7319fcc
[SPARK-14677][SQL] follow up: make max iter num config internal
rxin Apr 16, 2016
3f49afe
[SPARK-14683][DOCUMENTATION] Configure external links in ScalaDoc
Atry Apr 16, 2016
5cefecc
[SPARK-14647][SQL] Group SQLContext/HiveContext state into SharedState
Apr 16, 2016
3394b12
[SPARK-14672][SQL] Move HiveContext analyze logic to AnalyzeTable
Apr 16, 2016
af1f4da
[SPARK-13904][SCHEDULER] Add support for pluggable cluster manager
Apr 17, 2016
8a87f7d
Mark ExternalClusterManager as private[spark].
rxin Apr 17, 2016
699a4df
[SPARK-14632] randomSplit method fails on dataframes with maps in schema
sbcd90 Apr 17, 2016
7de06a6
Revert "[SPARK-14647][SQL] Group SQLContext/HiveContext state into Sh…
Apr 18, 2016
2f1d032
[SPARK-13363][SQL] support Aggregator in RelationalGroupedDataset
cloud-fan Apr 18, 2016
1a39664
[SPARK-14696][SQL] Add implicit encoders for boxed primitive types
rxin Apr 18, 2016
d6fb485
[SPARK-14423][YARN] Avoid same name files added to distributed cache …
jerryshao Apr 18, 2016
432d139
[SPARK-14614] [SQL] Add `bround` function
dongjoon-hyun Apr 18, 2016
775cf17
[SPARK-14473][SQL] Define analysis rules to catch operations not supp…
tdas Apr 18, 2016
b64482f
[SPARK-14306][ML][PYSPARK] PySpark ml.classification OneVsRest suppor…
yinxusen Apr 18, 2016
d280d1d
[SPARK-14580][SPARK-14655][SQL] Hive IfCoercion should preserve predi…
dongjoon-hyun Apr 18, 2016
3d66a2c
[SPARK-14564][ML][MLLIB][PYSPARK] Python Word2Vec missing setWindowSi…
jasoncl Apr 18, 2016
e4ae974
[HOTFIX] Fix Scala 2.10 compilation break.
rxin Apr 18, 2016
28ee157
[SPARK-14647][SQL] Group SQLContext/HiveContext state into SharedState
Apr 18, 2016
f31a62d
[SPARK-14440][PYSPARK] Remove pipeline specific reader and writer
yinxusen Apr 18, 2016
8c62edb
[SPARK-14299][EXAMPLES] Remove duplications for scala.examples.ml
yinxusen Apr 18, 2016
6fc1e72
[MINOR] Revert removing explicit typing (changed in some examples and…
HyukjinKwon Apr 18, 2016
8bd8121
[SPARK-14710][SQL] Rename gen/genCode to genCode/doGenCode to better …
sameeragarwal Apr 18, 2016
f1a1197
[SPARK-14674][SQL] Move HiveContext.hiveconf to HiveSessionState
Apr 18, 2016
68450c8
[SPARK-14504][SQL] Enable Oracle docker tests
lresende Apr 18, 2016
6ff0435
[SPARK-14713][TESTS] Fix the flaky test NettyBlockTransferServiceSuite
zsxwing Apr 18, 2016
6027340
[SPARK-14628][CORE][FOLLLOW-UP] Always tracking read/write metrics
cloud-fan Apr 18, 2016
9bfb35d
[SPARK-14515][DOC] Add python example for ChiSqSelector
zhengruifeng Apr 19, 2016
d29e429
[SPARK-14714][ML][PYTHON] Fixed issues with non-kwarg typeConverter a…
jkbradley Apr 19, 2016
2b151b6
[SPARK-14711][BUILD] Examples jar not a part of distribution.
markgrover Apr 19, 2016
4b3d129
[SPARK-13227] Risky apply() in OpenHashMap
CodingCat Apr 19, 2016
5e92583
[SPARK-14667] Remove HashShuffleManager
rxin Apr 19, 2016
ed2de02
[SPARK-14719] WriteAheadLogBasedBlockHandler should ignore BlockManag…
JoshRosen Apr 19, 2016
4eae1db
[SPARK-14718][SQL] Avoid mutating ExprCode in doGenCode
sameeragarwal Apr 19, 2016
6f88006
[SPARK-14722][SQL] Rename upstreams() -> inputRDDs() in WholeStageCod…
sameeragarwal Apr 19, 2016
d4b94ea
[SPARK-14595][SQL] add input metrics for FileScanRDD
cloud-fan Apr 19, 2016
74fe235
[SPARK-14398][SQL] Audit non-reserved keyword list in ANTLR4 parser
Apr 19, 2016
3d46d79
[SPARK-14577][SQL] Add spark.sql.codegen.maxCaseBranches config option
dongjoon-hyun Apr 19, 2016
5e360c9
[SPARK-13681][SPARK-14458][SPARK-14566][SQL] Add back once removed Co…
liancheng Apr 19, 2016
9ee95b6
[SPARK-14491] [SQL] refactor object operator framework to make it eas…
cloud-fan Apr 19, 2016
e896336
[SPARK-13904] Add exit code parameter to exitExecutor()
tedyu Apr 19, 2016
d9620e7
[SPARK-12457] Fixed the Wrong Description and Missing Example in Coll…
gatorsmile Apr 19, 2016
947b902
[SPARK-14676] Wrap and re-throw Await.result exceptions in order to c…
JoshRosen Apr 19, 2016
5cb2e33
[SPARK-14675][SQL] ClassFormatError when use Seq as Aggregator buffer…
cloud-fan Apr 19, 2016
0b8369d
[SPARK-14656][CORE] Fix Benchmark.getPorcessorName() always return "U…
kiszk Apr 19, 2016
3c91afe
[SPARK-14042][CORE] Add custom coalescer support
nezihyigitbasi Apr 19, 2016
da88592
[SPARK-4226] [SQL] Support IN/EXISTS Subqueries
hvanhovell Apr 19, 2016
008a8bb
[SPARK-14733] Allow custom timing control in microbenchmarks
ericl Apr 19, 2016
ecd877e
[SPARK-12224][SPARKR] R support for JDBC source
felixcheung Apr 19, 2016
a685e65
Revert "[SPARK-14719] WriteAheadLogBasedBlockHandler should ignore Bl…
JoshRosen Apr 19, 2016
3664142
[SPARK-14717] [PYTHON] Scala, Python APIs for Dataset.unpersist diffe…
felixcheung Apr 20, 2016
10f273d
[SPARK-14407][SQL] Hides HadoopFsRelation related data source API int…
liancheng Apr 20, 2016
3ae25f2
[SPARK-13929] Use Scala reflection for UDTs
joan38 Apr 20, 2016
4514aeb
[SPARK-14705][YARN] support Multiple FileSystem for YARN STAGING DIR
lianhuiwang Apr 20, 2016
8eedf0b
[SPARK-13905][SPARKR] Change signature of as.data.frame() to be consi…
Apr 20, 2016
78b3810
[SPARK-13419] [SQL] Update SubquerySuite to use checkAnswer for valid…
lresende Apr 20, 2016
85d759c
[SPARK-14704][CORE] create accumulators in TaskMetrics
cloud-fan Apr 20, 2016
856bc46
[SPARK-14600] [SQL] Push predicates through Expand
cloud-fan Apr 20, 2016
6f1ec1f
[MINOR] [SQL] Re-enable `explode()` and `json_tuple()` testcases in E…
dongjoon-hyun Apr 20, 2016
14869ae
[SPARK-14639] [PYTHON] [R] Add `bround` function in Python/R.
dongjoon-hyun Apr 20, 2016
7abe9a6
[SPARK-9013][SQL] generate MutableProjection directly instead of retu…
cloud-fan Apr 20, 2016
a345111
[SPARK-14679][UI] Fix UI DAG visualization OOM.
rdblue Apr 20, 2016
17db4bf
[SPARK-14687][CORE][SQL][MLLIB] Call path.getFileSystem(conf) instead…
lw-lin Apr 20, 2016
ed9d803
[SPARK-14635][ML] Documentation and Examples for TF-IDF only refer to…
hhbyyh Apr 20, 2016
8342778
[SPARK-8171][WEB UI] Javascript based infinite scrolling for the log …
ajbozarth Apr 20, 2016
80bf48f
[SPARK-14555] First cut of Python API for Structured Streaming
brkyvz Apr 20, 2016
b4e76a9
[SPARK-14742][DOCS] Redirect spark-ec2 doc to new location
srowen Apr 20, 2016
90cbc82
[SPARK-14725][CORE] Remove HttpServer class
jerryshao Apr 20, 2016
08f84d7
[MINOR][ML][PYSPARK] Fix omissive param setters which should use _set…
yanboliang Apr 20, 2016
15f226c
generate dataframe API create table for some datasource tables
xwu0226 Apr 20, 2016
601867a
synch up with master branch
xwu0226 Apr 20, 2016
acc7e59
[SPARK-14478][ML][MLLIB][DOC] Doc that StandardScaler uses the correc…
jkbradley Apr 20, 2016
cb8ea9e
[SPARK-14741][SQL] Fixed error in reading json file stream inside a p…
tdas Apr 20, 2016
8fc267a
[SPARK-14720][SPARK-13643] Move Hive-specific methods into HiveSessio…
Apr 20, 2016
296c384
[MINOR][ML][PYSPARK] Fix omissive params which should use TypeConverter
yanboliang Apr 20, 2016
7bc9485
[SPARK-14678][SQL] Add a file sink log to support versioning and comp…
zsxwing Apr 20, 2016
e7791c4
[SPARK-13842] [PYSPARK] pyspark.sql.types.StructType accessor enhance…
shea-parkes Apr 20, 2016
fd82681
[SPARK-14749][SQL, TESTS] PlannerSuite failed when it run individually
sbcd90 Apr 20, 2016
687f7ac
update upon review
xwu0226 Apr 20, 2016
bf3512b
synch up with latest change
xwu0226 Apr 20, 2016
ca44d67
synch up again
xwu0226 Apr 21, 2016
6 changes: 5 additions & 1 deletion R/pkg/NAMESPACE
@@ -101,6 +101,7 @@ exportMethods("arrange",
"withColumn",
"withColumnRenamed",
"write.df",
"write.jdbc",
"write.json",
"write.parquet",
"write.text")
@@ -125,6 +126,7 @@ exportMethods("%in%",
"between",
"bin",
"bitwiseNOT",
"bround",
"cast",
"cbrt",
"ceil",
@@ -284,6 +286,7 @@ export("as.DataFrame",
"loadDF",
"parquetFile",
"read.df",
"read.jdbc",
"read.json",
"read.parquet",
"read.text",
@@ -292,7 +295,8 @@ export("as.DataFrame",
"tableToDF",
"tableNames",
"tables",
"uncacheTable")
"uncacheTable",
"print.summary.GeneralizedLinearRegressionModel")

export("structField",
"structField.jobj",
47 changes: 40 additions & 7 deletions R/pkg/R/DataFrame.R
@@ -2296,12 +2296,8 @@ setMethod("fillna",
#' }
setMethod("as.data.frame",
signature(x = "DataFrame"),
function(x, ...) {
# Check if additional parameters have been passed
if (length(list(...)) > 0) {
stop(paste("Unused argument(s): ", paste(list(...), collapse = ", ")))
}
collect(x)
function(x, row.names = NULL, optional = FALSE, ...) {
as.data.frame(collect(x), row.names, optional, ...)
})

#' The specified DataFrame is attached to the R search path. This means that
@@ -2363,7 +2359,7 @@ setMethod("with",
#' @examples \dontrun{
#' # Create a DataFrame from the Iris dataset
#' irisDF <- createDataFrame(sqlContext, iris)
#'
#'
#' # Show the structure of the DataFrame
#' str(irisDF)
#' }
@@ -2468,3 +2464,40 @@ setMethod("drop",
function(x) {
base::drop(x)
})

#' Saves the content of the DataFrame to an external database table via JDBC
#'
#' Additional JDBC database connection properties can be set (...)
#'
#' Also, mode is used to specify the behavior of the save operation when
#' data already exists in the data source. There are four modes: \cr
#' append: Contents of this DataFrame are expected to be appended to existing data. \cr
#' overwrite: Existing data is expected to be overwritten by the contents of this DataFrame. \cr
#' error: An exception is expected to be thrown. \cr
#' ignore: The save operation is expected to not save the contents of the DataFrame
#' and to not change the existing data. \cr
#'
#' @param x A SparkSQL DataFrame
#' @param url JDBC database url of the form `jdbc:subprotocol:subname`
#' @param tableName The name of the table in the external database
#' @param mode One of 'append', 'overwrite', 'error', 'ignore' save mode (it is 'error' by default)
#' @family DataFrame functions
#' @rdname write.jdbc
#' @name write.jdbc
#' @export
#' @examples
#'\dontrun{
#' sc <- sparkR.init()
#' sqlContext <- sparkRSQL.init(sc)
#' jdbcUrl <- "jdbc:mysql://localhost:3306/databasename"
#' write.jdbc(df, jdbcUrl, "table", user = "username", password = "password")
#' }
setMethod("write.jdbc",
signature(x = "DataFrame", url = "character", tableName = "character"),
function(x, url, tableName, mode = "error", ...){
jmode <- convertToJSaveMode(mode)
jprops <- varargsToJProperties(...)
write <- callJMethod(x@sdf, "write")
write <- callJMethod(write, "mode", jmode)
invisible(callJMethod(write, "jdbc", url, tableName, jprops))
})
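The new write.jdbc() method funnels everything through the JVM DataFrameWriter: mode is converted with convertToJSaveMode() and the named varargs become a java.util.Properties via varargsToJProperties() (added in R/pkg/R/utils.R below). A minimal usage sketch, assuming a reachable MySQL instance; the URL, table name, and credentials are placeholders:

# Assumes: library(SparkR); sc <- sparkR.init(); sqlContext <- sparkRSQL.init(sc)
jdbcUrl <- "jdbc:mysql://localhost:3306/databasename"   # placeholder URL
df <- createDataFrame(sqlContext, iris)
# "overwrite" is mapped to SaveMode.Overwrite by convertToJSaveMode();
# user/password are passed through as JDBC connection properties.
write.jdbc(df, jdbcUrl, "iris_copy", mode = "overwrite",
           user = "username", password = "password")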
58 changes: 58 additions & 0 deletions R/pkg/R/SQLContext.R
@@ -583,3 +583,61 @@ createExternalTable <- function(sqlContext, tableName, path = NULL, source = NUL
sdf <- callJMethod(sqlContext, "createExternalTable", tableName, source, options)
dataFrame(sdf)
}

#' Create a DataFrame representing the database table accessible via JDBC URL
#'
#' Additional JDBC database connection properties can be set (...)
#'
#' Only one of partitionColumn or predicates should be set. Partitions of the table will be
#' retrieved in parallel based on the `numPartitions` or by the predicates.
#'
#' Don't create too many partitions in parallel on a large cluster; otherwise Spark might crash
#' your external database systems.
#'
#' @param sqlContext SQLContext to use
#' @param url JDBC database url of the form `jdbc:subprotocol:subname`
#' @param tableName the name of the table in the external database
#' @param partitionColumn the name of a column of integral type that will be used for partitioning
#' @param lowerBound the minimum value of `partitionColumn` used to decide partition stride
#' @param upperBound the maximum value of `partitionColumn` used to decide partition stride
#' @param numPartitions the number of partitions. This, along with `lowerBound` (inclusive),
#' `upperBound` (exclusive), form partition strides for generated WHERE
#' clause expressions used to split the column `partitionColumn` evenly.
#' This defaults to SparkContext.defaultParallelism when unset.
#' @param predicates a list of conditions in the where clause; each one defines one partition
#' @return DataFrame
#' @rdname read.jdbc
#' @name read.jdbc
#' @export
#' @examples
#'\dontrun{
#' sc <- sparkR.init()
#' sqlContext <- sparkRSQL.init(sc)
#' jdbcUrl <- "jdbc:mysql://localhost:3306/databasename"
#' df <- read.jdbc(sqlContext, jdbcUrl, "table", predicates = list("field<=123"), user = "username")
#' df2 <- read.jdbc(sqlContext, jdbcUrl, "table2", partitionColumn = "index", lowerBound = 0,
#' upperBound = 10000, user = "username", password = "password")
#' }

read.jdbc <- function(sqlContext, url, tableName,
partitionColumn = NULL, lowerBound = NULL, upperBound = NULL,
numPartitions = 0L, predicates = list(), ...) {
jprops <- varargsToJProperties(...)

read <- callJMethod(sqlContext, "read")
if (!is.null(partitionColumn)) {
if (is.null(numPartitions) || numPartitions == 0) {
sc <- callJMethod(sqlContext, "sparkContext")
numPartitions <- callJMethod(sc, "defaultParallelism")
} else {
numPartitions <- numToInt(numPartitions)
}
sdf <- callJMethod(read, "jdbc", url, tableName, as.character(partitionColumn),
numToInt(lowerBound), numToInt(upperBound), numPartitions, jprops)
} else if (length(predicates) > 0) {
sdf <- callJMethod(read, "jdbc", url, tableName, as.list(as.character(predicates)), jprops)
} else {
sdf <- callJMethod(read, "jdbc", url, tableName, jprops)
}
dataFrame(sdf)
}
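read.jdbc() picks one of three DataFrameReader.jdbc() overloads depending on whether a partitioning column, a list of predicates, or neither is supplied; a sketch of the three call shapes, reusing the placeholder URL and credentials from the write.jdbc example above:

# 1. Column-partitioned read: strides over an integral column; numPartitions
#    falls back to SparkContext.defaultParallelism when left at 0.
df1 <- read.jdbc(sqlContext, jdbcUrl, "table", partitionColumn = "index",
                 lowerBound = 0, upperBound = 10000,
                 user = "username", password = "password")
# 2. Predicate-partitioned read: one partition per WHERE-clause condition.
df2 <- read.jdbc(sqlContext, jdbcUrl, "table",
                 predicates = list("field <= 123", "field > 123"),
                 user = "username", password = "password")
# 3. Plain read: a single partition-less call with only connection properties.
df3 <- read.jdbc(sqlContext, jdbcUrl, "table",
                 user = "username", password = "password")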
22 changes: 21 additions & 1 deletion R/pkg/R/functions.R
@@ -994,7 +994,7 @@ setMethod("rint",

#' round
#'
#' Returns the value of the column `e` rounded to 0 decimal places.
#' Returns the value of the column `e` rounded to 0 decimal places using HALF_UP rounding mode.
#'
#' @rdname round
#' @name round
@@ -1008,6 +1008,26 @@ setMethod("round",
column(jc)
})

#' bround
#'
#' Returns the value of the column `e` rounded to `scale` decimal places using HALF_EVEN rounding
#' mode if `scale` >= 0 or at integral part when `scale` < 0.
#' Also known as Gaussian rounding or bankers' rounding that rounds to the nearest even number.
#' bround(2.5, 0) = 2, bround(3.5, 0) = 4.
#'
#' @rdname bround
#' @name bround
#' @family math_funcs
#' @export
#' @examples \dontrun{bround(df$c, 0)}
setMethod("bround",
signature(x = "Column"),
function(x, scale = 0) {
jc <- callJStatic("org.apache.spark.sql.functions", "bround", x@jc, as.integer(scale))
column(jc)
})


#' rtrim
#'
#' Trim the spaces from right end for the specified string value.
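The contrast between round() (HALF_UP) and the new bround() (HALF_EVEN, i.e. banker's rounding) only shows up on the .5 boundary; a quick sketch, assuming an initialized sqlContext:

df <- createDataFrame(sqlContext, data.frame(x = c(2.5, 3.5)))
collect(select(df, round(df$x)))       # HALF_UP:   3, 4
collect(select(df, bround(df$x, 0)))   # HALF_EVEN: 2, 4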
15 changes: 14 additions & 1 deletion R/pkg/R/generics.R
@@ -397,7 +397,10 @@ setGeneric("arrange", function(x, col, ...) { standardGeneric("arrange") })

#' @rdname as.data.frame
#' @export
setGeneric("as.data.frame")
setGeneric("as.data.frame",
function(x, row.names = NULL, optional = FALSE, ...) {
standardGeneric("as.data.frame")
})

#' @rdname attach
#' @export
@@ -577,6 +580,12 @@ setGeneric("saveDF", function(df, path, source = NULL, mode = "error", ...) {
standardGeneric("saveDF")
})

#' @rdname write.jdbc
#' @export
setGeneric("write.jdbc", function(x, url, tableName, mode = "error", ...) {
standardGeneric("write.jdbc")
})

#' @rdname write.json
#' @export
setGeneric("write.json", function(x, path) { standardGeneric("write.json") })
@@ -751,6 +760,10 @@ setGeneric("bin", function(x) { standardGeneric("bin") })
#' @export
setGeneric("bitwiseNOT", function(x) { standardGeneric("bitwiseNOT") })

#' @rdname bround
#' @export
setGeneric("bround", function(x, ...) { standardGeneric("bround") })

#' @rdname cbrt
#' @export
setGeneric("cbrt", function(x) { standardGeneric("cbrt") })
49 changes: 46 additions & 3 deletions R/pkg/R/mllib.R
@@ -101,12 +101,55 @@ setMethod("summary", signature(object = "GeneralizedLinearRegressionModel"),
jobj <- object@jobj
features <- callJMethod(jobj, "rFeatures")
coefficients <- callJMethod(jobj, "rCoefficients")
coefficients <- as.matrix(unlist(coefficients))
colnames(coefficients) <- c("Estimate")
deviance.resid <- callJMethod(jobj, "rDevianceResiduals")
dispersion <- callJMethod(jobj, "rDispersion")
null.deviance <- callJMethod(jobj, "rNullDeviance")
deviance <- callJMethod(jobj, "rDeviance")
df.null <- callJMethod(jobj, "rResidualDegreeOfFreedomNull")
df.residual <- callJMethod(jobj, "rResidualDegreeOfFreedom")
aic <- callJMethod(jobj, "rAic")
iter <- callJMethod(jobj, "rNumIterations")
family <- callJMethod(jobj, "rFamily")

deviance.resid <- dataFrame(deviance.resid)
coefficients <- matrix(coefficients, ncol = 4)
colnames(coefficients) <- c("Estimate", "Std. Error", "t value", "Pr(>|t|)")
rownames(coefficients) <- unlist(features)
return(list(coefficients = coefficients))
ans <- list(deviance.resid = deviance.resid, coefficients = coefficients,
dispersion = dispersion, null.deviance = null.deviance,
deviance = deviance, df.null = df.null, df.residual = df.residual,
aic = aic, iter = iter, family = family)
class(ans) <- "summary.GeneralizedLinearRegressionModel"
return(ans)
})

#' Print the summary of GeneralizedLinearRegressionModel
#'
#' @rdname print
#' @name print.summary.GeneralizedLinearRegressionModel
#' @export
print.summary.GeneralizedLinearRegressionModel <- function(x, ...) {
x$deviance.resid <- setNames(unlist(approxQuantile(x$deviance.resid, "devianceResiduals",
c(0.0, 0.25, 0.5, 0.75, 1.0), 0.01)), c("Min", "1Q", "Median", "3Q", "Max"))
x$deviance.resid <- zapsmall(x$deviance.resid, 5L)
cat("\nDeviance Residuals: \n")
cat("(Note: These are approximate quantiles with relative error <= 0.01)\n")
print.default(x$deviance.resid, digits = 5L, na.print = "", print.gap = 2L)

cat("\nCoefficients:\n")
print.default(x$coefficients, digits = 5L, na.print = "", print.gap = 2L)

cat("\n(Dispersion parameter for ", x$family, " family taken to be ", format(x$dispersion),
")\n\n", apply(cbind(paste(format(c("Null", "Residual"), justify = "right"), "deviance:"),
format(unlist(x[c("null.deviance", "deviance")]), digits = 5L),
" on", format(unlist(x[c("df.null", "df.residual")])), " degrees of freedom\n"),
1L, paste, collapse = " "), sep = "")
cat("AIC: ", format(x$aic, digits = 4L), "\n\n",
"Number of Fisher Scoring iterations: ", x$iter, "\n", sep = "")
cat("\n")
invisible(x)
}

#' Make predictions from a generalized linear model
#'
#' Makes predictions from a generalized linear model produced by glm(), similarly to R's predict().
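With the expanded summary() and the new print method, a SparkR GLM now reports R-style fit statistics; a minimal sketch, assuming an initialized sqlContext:

training <- suppressWarnings(createDataFrame(sqlContext, iris))
model <- glm(Sepal_Width ~ Sepal_Length + Species, data = training)
s <- summary(model)    # list of class "summary.GeneralizedLinearRegressionModel"
s$coefficients         # Estimate, Std. Error, t value, Pr(>|t|) per feature
s$aic                  # scalar fit statistics (also deviance, dispersion, ...)
print(s)               # R-like layout via print.summary.GeneralizedLinearRegressionModel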
11 changes: 11 additions & 0 deletions R/pkg/R/utils.R
@@ -650,3 +650,14 @@ convertToJSaveMode <- function(mode) {
jmode <- callJStatic("org.apache.spark.sql.api.r.SQLUtils", "saveMode", mode)
jmode
}

varargsToJProperties <- function(...) {
pairs <- list(...)
props <- newJObject("java.util.Properties")
if (length(pairs) > 0) {
lapply(ls(pairs), function(k) {
callJMethod(props, "setProperty", as.character(k), as.character(pairs[[k]]))
})
}
props
}
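varargsToJProperties() is the internal glue that turns the named varargs accepted by read.jdbc()/write.jdbc() into a java.util.Properties object on the JVM; a sketch of the intended mapping (private helper, shown here only for illustration):

props <- varargsToJProperties(user = "username", password = "password")
callJMethod(props, "getProperty", "user")       # "username"
varargsToJProperties()                          # empty Properties when called with no args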
2 changes: 1 addition & 1 deletion R/pkg/inst/tests/testthat/test_context.R
@@ -26,7 +26,7 @@ test_that("Check masked functions", {
maskedBySparkR <- masked[funcSparkROrEmpty]
namesOfMasked <- c("describe", "cov", "filter", "lag", "na.omit", "predict", "sd", "var",
"colnames", "colnames<-", "intersect", "rank", "rbind", "sample", "subset",
"summary", "transform", "drop", "window")
"summary", "transform", "drop", "window", "as.data.frame")
expect_equal(length(maskedBySparkR), length(namesOfMasked))
expect_equal(sort(maskedBySparkR), sort(namesOfMasked))
# above are those reported as masked when `library(SparkR)`
49 changes: 49 additions & 0 deletions R/pkg/inst/tests/testthat/test_mllib.R
@@ -77,6 +77,55 @@ test_that("glm and predict", {
expect_equal(length(predict(lm(y ~ x))), 15)
})

test_that("glm summary", {
# gaussian family
training <- suppressWarnings(createDataFrame(sqlContext, iris))
stats <- summary(glm(Sepal_Width ~ Sepal_Length + Species, data = training))

rStats <- summary(glm(Sepal.Width ~ Sepal.Length + Species, data = iris))

coefs <- unlist(stats$coefficients)
rCoefs <- unlist(rStats$coefficients)
expect_true(all(abs(rCoefs - coefs) < 1e-4))
expect_true(all(
rownames(stats$coefficients) ==
c("(Intercept)", "Sepal_Length", "Species_versicolor", "Species_virginica")))
expect_equal(stats$dispersion, rStats$dispersion)
expect_equal(stats$null.deviance, rStats$null.deviance)
expect_equal(stats$deviance, rStats$deviance)
expect_equal(stats$df.null, rStats$df.null)
expect_equal(stats$df.residual, rStats$df.residual)
expect_equal(stats$aic, rStats$aic)

# binomial family
df <- suppressWarnings(createDataFrame(sqlContext, iris))
training <- df[df$Species %in% c("versicolor", "virginica"), ]
stats <- summary(glm(Species ~ Sepal_Length + Sepal_Width, data = training,
family = binomial(link = "logit")))

rTraining <- iris[iris$Species %in% c("versicolor", "virginica"), ]
rStats <- summary(glm(Species ~ Sepal.Length + Sepal.Width, data = rTraining,
family = binomial(link = "logit")))

coefs <- unlist(stats$coefficients)
rCoefs <- unlist(rStats$coefficients)
expect_true(all(abs(rCoefs - coefs) < 1e-4))
expect_true(all(
rownames(stats$coefficients) ==
c("(Intercept)", "Sepal_Length", "Sepal_Width")))
expect_equal(stats$dispersion, rStats$dispersion)
expect_equal(stats$null.deviance, rStats$null.deviance)
expect_equal(stats$deviance, rStats$deviance)
expect_equal(stats$df.null, rStats$df.null)
expect_equal(stats$df.residual, rStats$df.residual)
expect_equal(stats$aic, rStats$aic)

# Test summary works on base GLM models
baseModel <- stats::glm(Sepal.Width ~ Sepal.Length + Species, data = iris)
baseSummary <- summary(baseModel)
expect_true(abs(baseSummary$deviance - 12.19313) < 1e-4)
})

test_that("kmeans", {
newIris <- iris
newIris$Species <- NULL
8 changes: 8 additions & 0 deletions R/pkg/inst/tests/testthat/test_sparkSQL.R
@@ -1087,6 +1087,11 @@ test_that("column functions", {
expect_equal(collect(select(df, last(df$age, TRUE)))[[1]], 19)
expect_equal(collect(select(df, last("age")))[[1]], 19)
expect_equal(collect(select(df, last("age", TRUE)))[[1]], 19)

# Test bround()
df <- createDataFrame(sqlContext, data.frame(x = c(2.5, 3.5)))
expect_equal(collect(select(df, bround(df$x, 0)))[[1]][1], 2)
expect_equal(collect(select(df, bround(df$x, 0)))[[1]][2], 4)
})

test_that("column binary mathfunctions", {
@@ -1863,6 +1868,9 @@ test_that("Method as.data.frame as a synonym for collect()", {
expect_equal(as.data.frame(irisDF), collect(irisDF))
irisDF2 <- irisDF[irisDF$Species == "setosa", ]
expect_equal(as.data.frame(irisDF2), collect(irisDF2))

# Make sure as.data.frame in the R base package is not covered
expect_that(as.data.frame(c(1, 2)), not(throws_error()))
})

test_that("attach() on a DataFrame", {