Changes from all commits
111 commits
0ebb014
show create table DDL -- hive metastore table
xwu0226 Apr 2, 2016
6d060be
update upon review
xwu0226 Apr 2, 2016
2799672
ignoring sqlContext temp table and considering datasource table ddl
xwu0226 Apr 2, 2016
98c020a
fix scala style issue
xwu0226 Apr 4, 2016
efd8898
fix scala style issue in testcase
xwu0226 Apr 4, 2016
b370630
fix testcase for test failure
xwu0226 Apr 5, 2016
8cb7a72
continue the database ddl generation
xwu0226 Apr 6, 2016
8b67d22
support datasource ddl
xwu0226 Apr 8, 2016
9ab863f
scala style fix
xwu0226 Apr 8, 2016
906eef4
[SPARK-11416][BUILD] Update to Chill 0.8.0 & Kryo 3.0.3
JoshRosen Apr 8, 2016
4d7c359
[SPARK-14437][CORE] Use the address that NettyBlockTransferService li…
zsxwing Apr 9, 2016
813e96e
[SPARK-14454] Better exception handling while marking tasks as failed
sameeragarwal Apr 9, 2016
d7af736
[SPARK-14498][ML][PYTHON][SQL] Many cleanups to ML and ML-related docs
jkbradley Apr 9, 2016
2f0b882
[SPARK-14482][SQL] Change default Parquet codec from gzip to snappy
rxin Apr 9, 2016
520dde4
[SPARK-14451][SQL] Move encoder definition into Aggregator interface
rxin Apr 9, 2016
90c0a04
[SPARK-14419] [SQL] Improve HashedRelation for key fit within Long
Apr 9, 2016
a9b8b65
[SPARK-14392][ML] CountVectorizer Estimator should include binary tog…
wangmiao1981 Apr 9, 2016
10a9578
[SPARK-14496][SQL] fix some javadoc typos
Apr 9, 2016
1598d11
[SPARK-14462][ML][MLLIB] add the mllib-local build to maven pom
Apr 9, 2016
adb9d73
[SPARK-14339][DOC] Add python examples for DCT,MinMaxScaler,MaxAbsScaler
zhengruifeng Apr 9, 2016
f7ec854
Revert "[SPARK-14419] [SQL] Improve HashedRelation for key fit within…
davies Apr 9, 2016
cd2fed7
[SPARK-14335][SQL] Describe function command returns wrong output
yongtang Apr 9, 2016
415446c
Revert "[SPARK-14462][ML][MLLIB] add the mllib-local build to maven pom"
mengxr Apr 9, 2016
9be5558
[SPARK-14481][SQL] Issue Exceptions for All Unsupported Options durin…
gatorsmile Apr 9, 2016
dfce966
[SPARK-14362][SPARK-14406][SQL] DDL Native Support: Drop View and Dro…
gatorsmile Apr 10, 2016
5cb5eda
[SPARK-14419] [SQL] Improve HashedRelation for key fit within Long
Apr 10, 2016
5989c85
[SPARK-14217] [SQL] Fix bug if parquet data has columns that use dict…
nongli Apr 10, 2016
00288ea
[SPARK-13687][PYTHON] Cleanup PySpark parallelize temporary files
holdenk Apr 10, 2016
72e66bb
[SPARK-14301][EXAMPLES] Java examples code merge and clean up.
yongtang Apr 10, 2016
aea30a1
[SPARK-14465][BUILD] Checkstyle should check all Java files
dongjoon-hyun Apr 10, 2016
3fb09af
[SPARK-14506][SQL] HiveClientImpl's toHiveTable misses a table proper…
yhuai Apr 10, 2016
2c95e4e
[SPARK-14455][STREAMING] Fix NPE in allocatedExecutors when calling i…
jerryshao Apr 10, 2016
22014e6
[SPARK-14357][CORE] Properly handle the root cause being a commit den…
jasonmoore2k Apr 10, 2016
f434458
[SPARK-14497][ML] Use top instead of sortBy() to get top N frequent w…
lionelfeng Apr 10, 2016
b5c7856
Update KMeansExample.scala
oluies Apr 10, 2016
a7ce473
[SPARK-14415][SQL] All functions should show usages by command `DESC …
dongjoon-hyun Apr 10, 2016
fbf8d00
[SPARK-14419] [MINOR] coding style cleanup
Apr 11, 2016
9f838bd
[SPARK-14362][SPARK-14406][SQL][FOLLOW-UP] DDL Native Support: Drop V…
gatorsmile Apr 11, 2016
1a0cca1
[MINOR][DOCS] Fix wrong data types in JSON Datasets example.
dongjoon-hyun Apr 11, 2016
e82d95b
[SPARK-14372][SQL] Dataset.randomSplit() needs a Java version
rekhajoshm Apr 11, 2016
1c751fc
[SPARK-14500] [ML] Accept Dataset[_] instead of DataFrame in MLlib APIs
mengxr Apr 11, 2016
643b4e2
[SPARK-14510][MLLIB] Add args-checking for LDA and StreamingKMeans
zhengruifeng Apr 11, 2016
efaf7d1
[SPARK-14462][ML][MLLIB] Add the mllib-local build to maven pom
Apr 11, 2016
652c470
[SPARK-14528] [SQL] Fix same result of Union
Apr 11, 2016
5de2619
[SPARK-14502] [SQL] Add optimization for Binary Comparison Simplifica…
dongjoon-hyun Apr 11, 2016
2dacc81
[SPARK-14494][SQL] Fix the race conditions in MemoryStream and Memory…
zsxwing Apr 11, 2016
89a41c5
[SPARK-13600][MLLIB] Use approxQuantile from DataFrame stats in Quant…
Apr 11, 2016
3f0f408
[SPARK-14298][ML][MLLIB] Add unit test for EM LDA disable checkpointing
yanboliang Apr 11, 2016
94de630
[SPARK-10521][SQL] Utilize Docker for test DB2 JDBC Dialect support
lresende Apr 11, 2016
6f27027
[SPARK-14475] Propagate user-defined context from driver to executors
ericl Apr 12, 2016
26d7af9
[SPARK-14520][SQL] Use correct return type in VectorizedParquetInputF…
viirya Apr 12, 2016
e9e1adc
[MINOR][ML] Fixed MLlib build warnings
jkbradley Apr 12, 2016
83fb964
[SPARK-14132][SPARK-14133][SQL] Alter table partition DDLs
Apr 12, 2016
2d81ba5
[SPARK-14362][SPARK-14406][SQL][FOLLOW-UP] DDL Native Support: Drop V…
gatorsmile Apr 12, 2016
52a8011
[SPARK-14554][SQL] disable whole stage codegen if there are too many …
cloud-fan Apr 12, 2016
678b96e
[SPARK-14535][SQL] Remove buildInternalScan from FileFormat
cloud-fan Apr 12, 2016
b0f5497
[SPARK-14508][BUILD] Add a new ScalaStyle Rule `OmitBracesInCase`
dongjoon-hyun Apr 12, 2016
124cbfb
[SPARK-14488][SPARK-14493][SQL] "CREATE TEMPORARY TABLE ... USING ...…
liancheng Apr 12, 2016
da60b34
[SPARK-3724][ML] RandomForest: More options for feature subset size.
yongtang Apr 12, 2016
6bf6921
[SPARK-14474][SQL] Move FileSource offset log into checkpointLocation
zsxwing Apr 12, 2016
75e05a5
[SPARK-12566][SPARK-14324][ML] GLM model family, link function suppor…
yanboliang Apr 12, 2016
101663f
[SPARK-13322][ML] AFTSurvivalRegression supports feature standardization
yanboliang Apr 12, 2016
7f024c4
[SPARK-13597][PYSPARK][ML] Python API for GeneralizedLinearRegression
vectorijk Apr 12, 2016
1995c2e
[SPARK-14563][ML] use a random table name instead of __THIS__ in SQLT…
mengxr Apr 12, 2016
111a624
[SPARK-14147][ML][SPARKR] SparkR predict should not output feature co…
yanboliang Apr 12, 2016
852bbc6
[SPARK-14556][SQL] Code clean-ups for package o.a.s.sql.execution.str…
lw-lin Apr 12, 2016
85e68b4
[SPARK-14562] [SQL] improve constraints propagation in Union
Apr 12, 2016
bcd2076
[SPARK-14414][SQL] improve the error message class hierarchy
Apr 12, 2016
3e53de4
[SPARK-14513][CORE] Fix threads left behind after stopping SparkContext
chtyim Apr 12, 2016
1ef5f8c
[SPARK-14544] [SQL] improve performance of SQL UI tab
Apr 12, 2016
c439d88
[SPARK-14547] Avoid DNS resolution for reusing connections
rxin Apr 12, 2016
d187e7d
[SPARK-14363] Fix executor OOM due to memory leak in the Sorter
Apr 12, 2016
372baf0
[SPARK-14578] [SQL] Fix codegen for CreateExternalRow with nested wid…
Apr 13, 2016
768b3d6
[SPARK-14579][SQL] Fix a race condition in StreamExecution.processAll…
zsxwing Apr 13, 2016
587cd55
[MINOR][SQL] Remove some unused imports in datasources.
HyukjinKwon Apr 13, 2016
a5f8c9b
[SPARK-14554][SQL][FOLLOW-UP] use checkDataset to check the result
cloud-fan Apr 13, 2016
23f93f5
[SPARK-13992][CORE][PYSPARK][FOLLOWUP] Update OFF_HEAP semantics for …
lw-lin Apr 13, 2016
dd11e40
[SPARK-14537][CORE] Make TaskSchedulerImpl waiting fail if context is…
drcrallen Apr 13, 2016
323e739
Revert "[SPARK-14154][MLLIB] Simplify the implementation for Kolmogor…
mengxr Apr 13, 2016
1018a1c
[SPARK-14568][ML] Instrumentation framework for logistic regression
thunterdb Apr 13, 2016
7d2ed8c
[SPARK-14388][SQL] Implement CREATE TABLE
Apr 13, 2016
f9d578e
[SPARK-13783][ML] Model export/import for spark.ml: GBTs
yanboliang Apr 13, 2016
dbbe149
[SPARK-14581] [SQL] push predicatese through more logical plans
Apr 13, 2016
b0adb9f
[SPARK-10386][MLLIB] PrefixSpanModel supports save/load
yanboliang Apr 13, 2016
0d17593
[SPARK-14461][ML] GLM training summaries should provide solver
yanboliang Apr 13, 2016
a91aaf5
[SPARK-14375][ML] Unit test for spark.ml KMeansSummary
yanboliang Apr 13, 2016
fcdd692
[SPARK-14509][DOC] Add python CountVectorizerExample
zhengruifeng Apr 13, 2016
781df49
[SPARK-13089][ML] [Doc] spark.ml Naive Bayes user guide and examples
hhbyyh Apr 13, 2016
fc3cd2f
[SPARK-14472][PYSPARK][ML] Cleanup ML JavaWrapper and related class h…
BryanCutler Apr 13, 2016
a40273c
merge the code committed by CREATE TABLE native support
xwu0226 Apr 13, 2016
62b7f30
[SPARK-14607] [SPARK-14484] [SQL] fix case-insensitive predicates in …
Apr 14, 2016
b481940
[SPARK-14596][SQL] Remove not used SqlNewHadoopRDD and some more unus…
HyukjinKwon Apr 14, 2016
478af2f
[SPARK-14573][PYSPARK][BUILD] Fix PyDoc Makefile & highlighting issues
holdenk Apr 14, 2016
6fc3dc8
[MINOR][SQL] Remove extra anonymous closure within functional transfo…
HyukjinKwon Apr 14, 2016
3cf3db1
[SPARK-14518][SQL] Support Comment in CREATE VIEW
gatorsmile Apr 14, 2016
f83ba45
[SPARK-14572][DOC] Update config docs to allow -Xms in extraJavaOptions
dhruve Apr 14, 2016
0d22092
[SPARK-14125][SQL] Native DDL Support: Alter View
gatorsmile Apr 14, 2016
de2ad52
[SPARK-14625] TaskUIData and ExecutorUIData shouldn't be case classes
rxin Apr 14, 2016
3e27940
[SPARK-14630][BUILD][CORE][SQL][STREAMING] Code style: public abstrac…
lw-lin Apr 14, 2016
9fa43a3
[SPARK-14612][ML] Consolidate the version of dependencies in mllib an…
srowen Apr 14, 2016
dac40b6
[SPARK-14619] Track internal accumulators (metrics) by stage attempt
rxin Apr 14, 2016
a46f98d
[SPARK-14617] Remove deprecated APIs in TaskMetrics
rxin Apr 14, 2016
1d04c86
[SPARK-14558][CORE] In ClosureCleaner, clean the outer pointer if it'…
cloud-fan Apr 14, 2016
c971aee
[SPARK-14499][SQL][TEST] Drop Partition Does Not Delete Data of Exter…
gatorsmile Apr 14, 2016
28efdd3
[SPARK-14592][SQL] Native support for CREATE TABLE LIKE DDL command
viirya Apr 14, 2016
c5172f8
[SPARK-13967][PYSPARK][ML] Added binary Param to Python CountVectorizer
BryanCutler Apr 14, 2016
bf65c87
[SPARK-14618][ML][DOC] Updated RegressionEvaluator.metricName param doc
jkbradley Apr 14, 2016
bc748b7
[SPARK-14238][ML][MLLIB][PYSPARK] Add binary toggle Param to PySpark …
yongtang Apr 14, 2016
d7e124e
[SPARK-14545][SQL] Improve `LikeSimplification` by adding `a%b` rule
dongjoon-hyun Apr 14, 2016
d214a3b
rework show create ddl based on new native supported create table DDL…
xwu0226 Apr 14, 2016
1680ea0
Merge branch 'show_create_table_1' into show_create_table_2
xwu0226 Apr 14, 2016
5 changes: 2 additions & 3 deletions LICENSE
@@ -257,9 +257,8 @@ The text of each license is also included at licenses/LICENSE-[project].txt.
(BSD-style) scalacheck (org.scalacheck:scalacheck_2.11:1.10.0 - http://www.scalacheck.org)
(BSD-style) spire (org.spire-math:spire_2.11:0.7.1 - http://spire-math.org)
(BSD-style) spire-macros (org.spire-math:spire-macros_2.11:0.7.1 - http://spire-math.org)
(New BSD License) Kryo (com.esotericsoftware.kryo:kryo:2.21 - http://code.google.com/p/kryo/)
(New BSD License) MinLog (com.esotericsoftware.minlog:minlog:1.2 - http://code.google.com/p/minlog/)
(New BSD License) ReflectASM (com.esotericsoftware.reflectasm:reflectasm:1.07 - http://code.google.com/p/reflectasm/)
(New BSD License) Kryo (com.esotericsoftware:kryo:3.0.3 - https://github.com/EsotericSoftware/kryo)
(New BSD License) MinLog (com.esotericsoftware:minlog:1.3.0 - https://github.com/EsotericSoftware/minlog)
(New BSD license) Protocol Buffer Java API (com.google.protobuf:protobuf-java:2.5.0 - http://code.google.com/p/protobuf)
(New BSD license) Protocol Buffer Java API (org.spark-project.protobuf:protobuf-java:2.4.1-shaded - http://code.google.com/p/protobuf)
(The BSD License) Fortran to Java ARPACK (net.sourceforge.f2j:arpack_combined_all:0.1 - http://f2j.sourceforge.net)
139 changes: 61 additions & 78 deletions R/pkg/R/mllib.R
@@ -17,10 +17,10 @@

# mllib.R: Provides methods for MLlib integration

#' @title S4 class that represents a PipelineModel
#' @param model A Java object reference to the backing Scala PipelineModel
#' @title S4 class that represents a generalized linear model
#' @param jobj a Java object reference to the backing Scala GeneralizedLinearRegressionWrapper
#' @export
setClass("PipelineModel", representation(model = "jobj"))
setClass("GeneralizedLinearRegressionModel", representation(jobj = "jobj"))

#' @title S4 class that represents a NaiveBayesModel
#' @param jobj a Java object reference to the backing Scala NaiveBayesWrapper
@@ -39,21 +39,18 @@ setClass("KMeansModel", representation(jobj = "jobj"))

#' Fits a generalized linear model
#'
#' Fits a generalized linear model, similarly to R's glm(). Also see the glmnet package.
#' Fits a generalized linear model, similarly to R's glm().
#'
#' @param formula A symbolic description of the model to be fitted. Currently only a few formula
#' operators are supported, including '~', '.', ':', '+', and '-'.
#' @param data DataFrame for training
#' @param family Error distribution. "gaussian" -> linear regression, "binomial" -> logistic reg.
#' @param lambda Regularization parameter
#' @param alpha Elastic-net mixing parameter (see glmnet's documentation for details)
#' @param standardize Whether to standardize features before training
#' @param solver The solver algorithm used for optimization, this can be "l-bfgs", "normal" and
#' "auto". "l-bfgs" denotes Limited-memory BFGS which is a limited-memory
#' quasi-Newton optimization method. "normal" denotes using Normal Equation as an
#' analytical solution to the linear regression problem. The default value is "auto"
#' which means that the solver algorithm is selected automatically.
#' @return a fitted MLlib model
#' @param data DataFrame for training.
#' @param family A description of the error distribution and link function to be used in the model.
#' This can be a character string naming a family function, a family function or
#' the result of a call to a family function. Refer to R's family documentation at
#' \url{https://stat.ethz.ch/R-manual/R-devel/library/stats/html/family.html}.
#' @param epsilon Positive convergence tolerance of iterations.
#' @param maxit Integer giving the maximal number of IRLS iterations.
#' @return a fitted generalized linear model
#' @rdname glm
#' @export
#' @examples
@@ -64,36 +61,70 @@ setClass("KMeansModel", representation(jobj = "jobj"))
#' df <- createDataFrame(sqlContext, iris)
#' model <- glm(Sepal_Length ~ Sepal_Width, df, family="gaussian")
#' summary(model)
#'}
#' }
setMethod("glm", signature(formula = "formula", family = "ANY", data = "DataFrame"),
function(formula, family = c("gaussian", "binomial"), data, lambda = 0, alpha = 0,
standardize = TRUE, solver = "auto") {
family <- match.arg(family)
function(formula, family = gaussian, data, epsilon = 1e-06, maxit = 25) {
if (is.character(family)) {
family <- get(family, mode = "function", envir = parent.frame())
}
if (is.function(family)) {
family <- family()
}
if (is.null(family$family)) {
print(family)
stop("'family' not recognized")
}

formula <- paste(deparse(formula), collapse = "")
model <- callJStatic("org.apache.spark.ml.api.r.SparkRWrappers",
"fitRModelFormula", formula, data@sdf, family, lambda,
alpha, standardize, solver)
return(new("PipelineModel", model = model))

jobj <- callJStatic("org.apache.spark.ml.r.GeneralizedLinearRegressionWrapper",
"fit", formula, data@sdf, family$family, family$link,
epsilon, as.integer(maxit))
return(new("GeneralizedLinearRegressionModel", jobj = jobj))
})

#' Make predictions from a model
#' Get the summary of a generalized linear model
#'
#' Makes predictions from a model produced by glm(), similarly to R's predict().
#' Returns the summary of a model produced by glm(), similarly to R's summary().
#'
#' @param object A fitted MLlib model
#' @param object A fitted generalized linear model
#' @return a list with component 'coefficients', the model's estimated coefficients and intercept
#' @rdname summary
#' @export
#' @examples
#' \dontrun{
#' model <- glm(y ~ x, trainingData)
#' summary(model)
#' }
setMethod("summary", signature(object = "GeneralizedLinearRegressionModel"),
function(object, ...) {
jobj <- object@jobj
features <- callJMethod(jobj, "rFeatures")
coefficients <- callJMethod(jobj, "rCoefficients")
coefficients <- as.matrix(unlist(coefficients))
colnames(coefficients) <- c("Estimate")
rownames(coefficients) <- unlist(features)
return(list(coefficients = coefficients))
})

#' Make predictions from a generalized linear model
#'
#' Makes predictions from a generalized linear model produced by glm(), similarly to R's predict().
#'
#' @param object A fitted generalized linear model
#' @param newData DataFrame for testing
#' @return DataFrame containing predicted values
#' @return DataFrame containing predicted labels in a column named "prediction"
#' @rdname predict
#' @export
#' @examples
#' \dontrun{
#' model <- glm(y ~ x, trainingData)
#' predicted <- predict(model, testData)
#' showDF(predicted)
#'}
setMethod("predict", signature(object = "PipelineModel"),
#' }
setMethod("predict", signature(object = "GeneralizedLinearRegressionModel"),
function(object, newData) {
return(dataFrame(callJMethod(object@model, "transform", newData@sdf)))
return(dataFrame(callJMethod(object@jobj, "transform", newData@sdf)))
})

#' Make predictions from a naive Bayes model
@@ -116,54 +147,6 @@ setMethod("predict", signature(object = "NaiveBayesModel"),
return(dataFrame(callJMethod(object@jobj, "transform", newData@sdf)))
})

#' Get the summary of a model
#'
#' Returns the summary of a model produced by glm(), similarly to R's summary().
#'
#' @param object A fitted MLlib model
#' @return a list with 'devianceResiduals' and 'coefficients' components for gaussian family
#' or a list with 'coefficients' component for binomial family. \cr
#' For gaussian family: the 'devianceResiduals' gives the min/max deviance residuals
#' of the estimation, the 'coefficients' gives the estimated coefficients and their
#' estimated standard errors, t values and p-values. (It only available when model
#' fitted by normal solver.) \cr
#' For binomial family: the 'coefficients' gives the estimated coefficients.
#' See summary.glm for more information. \cr
#' @rdname summary
#' @export
#' @examples
#' \dontrun{
#' model <- glm(y ~ x, trainingData)
#' summary(model)
#'}
setMethod("summary", signature(object = "PipelineModel"),
function(object, ...) {
modelName <- callJStatic("org.apache.spark.ml.api.r.SparkRWrappers",
"getModelName", object@model)
features <- callJStatic("org.apache.spark.ml.api.r.SparkRWrappers",
"getModelFeatures", object@model)
coefficients <- callJStatic("org.apache.spark.ml.api.r.SparkRWrappers",
"getModelCoefficients", object@model)
if (modelName == "LinearRegressionModel") {
devianceResiduals <- callJStatic("org.apache.spark.ml.api.r.SparkRWrappers",
"getModelDevianceResiduals", object@model)
devianceResiduals <- matrix(devianceResiduals, nrow = 1)
colnames(devianceResiduals) <- c("Min", "Max")
rownames(devianceResiduals) <- rep("", times = 1)
coefficients <- matrix(coefficients, ncol = 4)
colnames(coefficients) <- c("Estimate", "Std. Error", "t value", "Pr(>|t|)")
rownames(coefficients) <- unlist(features)
return(list(devianceResiduals = devianceResiduals, coefficients = coefficients))
} else if (modelName == "LogisticRegressionModel") {
coefficients <- as.matrix(unlist(coefficients))
colnames(coefficients) <- c("Estimate")
rownames(coefficients) <- unlist(features)
return(list(coefficients = coefficients))
} else {
stop(paste("Unsupported model", modelName, sep = " "))
}
})

#' Get the summary of a naive Bayes model
#'
#' Returns the summary of a naive Bayes model produced by naiveBayes(), similarly to R's summary().
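Taken together, these mllib.R changes swap the generic PipelineModel wrapper for a dedicated GeneralizedLinearRegressionModel and align glm() with R's own signature. Below is a minimal usage sketch of the new API as defined in this diff; the sqlContext setup is assumed from the test files that follow.

# A minimal sketch of the reworked SparkR glm() workflow (assumes a
# SparkR session where sc and sqlContext are already initialized,
# as in the tests below).
df <- suppressWarnings(createDataFrame(sqlContext, iris))

# 'family' now takes an R family object (or its name); epsilon and maxit
# replace the old lambda/alpha/standardize/solver arguments.
model <- glm(Sepal_Width ~ Sepal_Length + Species, data = df,
             family = gaussian, epsilon = 1e-06, maxit = 25)

# summary() returns a list whose "coefficients" matrix has a single
# "Estimate" column and is row-named by feature.
summary(model)$coefficients

# predict() returns a DataFrame with the predictions in a column
# named "prediction".
showDF(select(predict(model, df), "prediction"))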
95 changes: 29 additions & 66 deletions R/pkg/inst/tests/testthat/test_mllib.R
@@ -25,20 +25,21 @@ sc <- sparkR.init()

sqlContext <- sparkRSQL.init(sc)

test_that("glm and predict", {
test_that("formula of glm", {
training <- suppressWarnings(createDataFrame(sqlContext, iris))
test <- select(training, "Sepal_Length")
model <- glm(Sepal_Width ~ Sepal_Length, training, family = "gaussian")
prediction <- predict(model, test)
expect_equal(typeof(take(select(prediction, "prediction"), 1)$prediction), "double")
# dot minus and intercept vs native glm
model <- glm(Sepal_Width ~ . - Species + 0, data = training)
vals <- collect(select(predict(model, training), "prediction"))
rVals <- predict(glm(Sepal.Width ~ . - Species + 0, data = iris), iris)
expect_true(all(abs(rVals - vals) < 1e-6), rVals - vals)

# Test stats::predict is working
x <- rnorm(15)
y <- x + rnorm(15)
expect_equal(length(predict(lm(y ~ x))), 15)
})
# feature interaction vs native glm
model <- glm(Sepal_Width ~ Species:Sepal_Length, data = training)
vals <- collect(select(predict(model, training), "prediction"))
rVals <- predict(glm(Sepal.Width ~ Species:Sepal.Length, data = iris), iris)
expect_true(all(abs(rVals - vals) < 1e-6), rVals - vals)

test_that("glm should work with long formula", {
# glm should work with long formula
training <- suppressWarnings(createDataFrame(sqlContext, iris))
training$LongLongLongLongLongName <- training$Sepal_Width
training$VeryLongLongLongLonLongName <- training$Sepal_Length
@@ -50,68 +51,30 @@ test_that("glm should work with long formula", {
expect_true(all(abs(rVals - vals) < 1e-6), rVals - vals)
})

test_that("predictions match with native glm", {
test_that("glm and predict", {
training <- suppressWarnings(createDataFrame(sqlContext, iris))
# gaussian family
model <- glm(Sepal_Width ~ Sepal_Length + Species, data = training)
vals <- collect(select(predict(model, training), "prediction"))
prediction <- predict(model, training)
expect_equal(typeof(take(select(prediction, "prediction"), 1)$prediction), "double")
vals <- collect(select(prediction, "prediction"))
rVals <- predict(glm(Sepal.Width ~ Sepal.Length + Species, data = iris), iris)
expect_true(all(abs(rVals - vals) < 1e-6), rVals - vals)
})

test_that("dot minus and intercept vs native glm", {
training <- suppressWarnings(createDataFrame(sqlContext, iris))
model <- glm(Sepal_Width ~ . - Species + 0, data = training)
vals <- collect(select(predict(model, training), "prediction"))
rVals <- predict(glm(Sepal.Width ~ . - Species + 0, data = iris), iris)
expect_true(all(abs(rVals - vals) < 1e-6), rVals - vals)
})

test_that("feature interaction vs native glm", {
training <- suppressWarnings(createDataFrame(sqlContext, iris))
model <- glm(Sepal_Width ~ Species:Sepal_Length, data = training)
vals <- collect(select(predict(model, training), "prediction"))
rVals <- predict(glm(Sepal.Width ~ Species:Sepal.Length, data = iris), iris)
# poisson family
model <- glm(Sepal_Width ~ Sepal_Length + Species, data = training,
family = poisson(link = identity))
prediction <- predict(model, training)
expect_equal(typeof(take(select(prediction, "prediction"), 1)$prediction), "double")
vals <- collect(select(prediction, "prediction"))
rVals <- suppressWarnings(predict(glm(Sepal.Width ~ Sepal.Length + Species,
data = iris, family = poisson(link = identity)), iris))
expect_true(all(abs(rVals - vals) < 1e-6), rVals - vals)
})

test_that("summary coefficients match with native glm", {
training <- suppressWarnings(createDataFrame(sqlContext, iris))
stats <- summary(glm(Sepal_Width ~ Sepal_Length + Species, data = training, solver = "normal"))
coefs <- unlist(stats$coefficients)
devianceResiduals <- unlist(stats$devianceResiduals)

rStats <- summary(glm(Sepal.Width ~ Sepal.Length + Species, data = iris))
rCoefs <- unlist(rStats$coefficients)
rDevianceResiduals <- c(-0.95096, 0.72918)

expect_true(all(abs(rCoefs - coefs) < 1e-5))
expect_true(all(abs(rDevianceResiduals - devianceResiduals) < 1e-5))
expect_true(all(
rownames(stats$coefficients) ==
c("(Intercept)", "Sepal_Length", "Species_versicolor", "Species_virginica")))
})

test_that("summary coefficients match with native glm of family 'binomial'", {
df <- suppressWarnings(createDataFrame(sqlContext, iris))
training <- filter(df, df$Species != "setosa")
stats <- summary(glm(Species ~ Sepal_Length + Sepal_Width, data = training,
family = "binomial"))
coefs <- as.vector(stats$coefficients[, 1])

rTraining <- iris[iris$Species %in% c("versicolor", "virginica"), ]
rCoefs <- as.vector(coef(glm(Species ~ Sepal.Length + Sepal.Width, data = rTraining,
family = binomial(link = "logit"))))

expect_true(all(abs(rCoefs - coefs) < 1e-4))
expect_true(all(
rownames(stats$coefficients) ==
c("(Intercept)", "Sepal_Length", "Sepal_Width")))
})

test_that("summary works on base GLM models", {
baseModel <- stats::glm(Sepal.Width ~ Sepal.Length + Species, data = iris)
baseSummary <- summary(baseModel)
expect_true(abs(baseSummary$deviance - 12.19313) < 1e-4)
# Test stats::predict is working
x <- rnorm(15)
y <- x + rnorm(15)
expect_equal(length(predict(lm(y ~ x))), 15)
})

test_that("kmeans", {
2 changes: 1 addition & 1 deletion R/pkg/inst/tests/testthat/test_sparkSQL.R
@@ -1853,7 +1853,7 @@ test_that("approxQuantile() on a DataFrame", {

test_that("SQL error message is returned from JVM", {
retError <- tryCatch(sql(sqlContext, "select * from blah"), error = function(e) e)
expect_equal(grepl("Table not found", retError), TRUE)
expect_equal(grepl("Table or View not found", retError), TRUE)
expect_equal(grepl("blah", retError), TRUE)
})

common/network-common/src/main/java/org/apache/spark/network/client/TransportClientFactory.java
@@ -123,16 +123,15 @@ public TransportClientFactory(
public TransportClient createClient(String remoteHost, int remotePort) throws IOException {
// Get connection from the connection pool first.
// If it is not found or not active, create a new one.
long preResolveHost = System.nanoTime();
final InetSocketAddress address = new InetSocketAddress(remoteHost, remotePort);
long hostResolveTimeMs = (System.nanoTime() - preResolveHost) / 1000000;
logger.info("Spent {} ms to resolve {}", hostResolveTimeMs, address);
// Use unresolved address here to avoid DNS resolution each time we create a client.
final InetSocketAddress unresolvedAddress =
InetSocketAddress.createUnresolved(remoteHost, remotePort);

// Create the ClientPool if we don't have it yet.
ClientPool clientPool = connectionPool.get(address);
ClientPool clientPool = connectionPool.get(unresolvedAddress);
if (clientPool == null) {
connectionPool.putIfAbsent(address, new ClientPool(numConnectionsPerPeer));
clientPool = connectionPool.get(address);
connectionPool.putIfAbsent(unresolvedAddress, new ClientPool(numConnectionsPerPeer));
clientPool = connectionPool.get(unresolvedAddress);
}

int clientIndex = rand.nextInt(numConnectionsPerPeer);
@@ -149,25 +148,35 @@ public TransportClient createClient(String remoteHost, int remotePort) throws IOException {
}

if (cachedClient.isActive()) {
logger.trace("Returning cached connection to {}: {}", address, cachedClient);
logger.trace("Returning cached connection to {}: {}",
cachedClient.getSocketAddress(), cachedClient);
return cachedClient;
}
}

// If we reach here, we don't have an existing connection open. Let's create a new one.
// Multiple threads might race here to create new connections. Keep only one of them active.
final long preResolveHost = System.nanoTime();
final InetSocketAddress resolvedAddress = new InetSocketAddress(remoteHost, remotePort);
final long hostResolveTimeMs = (System.nanoTime() - preResolveHost) / 1000000;
if (hostResolveTimeMs > 2000) {
logger.warn("DNS resolution for {} took {} ms", resolvedAddress, hostResolveTimeMs);
} else {
logger.trace("DNS resolution for {} took {} ms", resolvedAddress, hostResolveTimeMs);
}

synchronized (clientPool.locks[clientIndex]) {
cachedClient = clientPool.clients[clientIndex];

if (cachedClient != null) {
if (cachedClient.isActive()) {
logger.trace("Returning cached connection to {}: {}", address, cachedClient);
logger.trace("Returning cached connection to {}: {}", resolvedAddress, cachedClient);
return cachedClient;
} else {
logger.info("Found inactive connection to {}, creating a new one.", address);
logger.info("Found inactive connection to {}, creating a new one.", resolvedAddress);
}
}
clientPool.clients[clientIndex] = createClient(address);
clientPool.clients[clientIndex] = createClient(resolvedAddress);
return clientPool.clients[clientIndex];
}
}
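The net effect of this change is that connection-pool lookups no longer trigger DNS: the pool is keyed by an unresolved host:port, and resolution is deferred (and timed) until a new connection actually has to be opened. Below is a simplified, self-contained sketch of that pattern; the class and its members are hypothetical stand-ins, not Spark's API.

import java.net.InetSocketAddress;
import java.util.concurrent.ConcurrentHashMap;

// Hypothetical sketch of the pooling pattern adopted above: the cache is
// keyed by an *unresolved* host:port, so reusing a pooled connection never
// touches DNS; the name is resolved only on the slow path that opens a new
// connection, and unusually slow resolutions are flagged.
public class UnresolvedKeyPool {
  private final ConcurrentHashMap<InetSocketAddress, Object> pool =
      new ConcurrentHashMap<>();

  public Object getOrCreate(String host, int port) {
    // Fast path: createUnresolved() skips DNS entirely.
    InetSocketAddress key = InetSocketAddress.createUnresolved(host, port);
    Object cached = pool.get(key);
    if (cached != null) {
      return cached;
    }
    // Slow path: pay the DNS cost once, and time it as the diff does.
    long start = System.nanoTime();
    InetSocketAddress resolved = new InetSocketAddress(host, port); // DNS here
    long resolveMs = (System.nanoTime() - start) / 1_000_000;
    if (resolveMs > 2000) {
      System.err.println("DNS resolution for " + resolved
          + " took " + resolveMs + " ms");
    }
    Object connection = new Object(); // stand-in for a real transport channel
    Object prev = pool.putIfAbsent(key, connection);
    return prev != null ? prev : connection;
  }
}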
1 change: 0 additions & 1 deletion core/pom.xml
@@ -192,7 +192,6 @@
<dependency>
<groupId>org.json4s</groupId>
<artifactId>json4s-jackson_${scala.binary.version}</artifactId>
<version>3.2.10</version>
</dependency>
<dependency>
<groupId>com.sun.jersey</groupId>