From 83892808fde633525c4366b368033ba80c81eaa8 Mon Sep 17 00:00:00 2001
From: Holden Karau
Date: Thu, 5 May 2016 13:24:14 -0700
Subject: [PATCH 01/23] Mark a number of algorithms and models experimental
 that are marked that way in scala and update the docs for logistic
 regression threshold

---
 python/pyspark/ml/classification.py | 22 ++++++++++++++++++++--
 1 file changed, 20 insertions(+), 2 deletions(-)

diff --git a/python/pyspark/ml/classification.py b/python/pyspark/ml/classification.py
index f03296333446..645ebdf722f4 100644
--- a/python/pyspark/ml/classification.py
+++ b/python/pyspark/ml/classification.py
@@ -96,7 +96,8 @@ class LogisticRegression(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredicti
     threshold = Param(Params._dummy(), "threshold",
                       "Threshold in binary classification prediction, in range [0, 1]." +
-                      " If threshold and thresholds are both set, they must match.",
+                      " If threshold and thresholds are both set, they must match." +
+                      "e.g. threshold must be equal to [1-p, p].",
                       typeConverter=TypeConverters.toFloat)
 
     @keyword_only
@@ -154,7 +155,9 @@ def setThreshold(self, value):
     @since("1.4.0")
     def getThreshold(self):
         """
-        Gets the value of threshold or its default value.
+        Gets the value of threshold or attempt to convert thresholds to threshold if set, or default
+        value if neither are set.
+        This conversion is equivalent to: {{{1 / (1 + thresholds(0) / thresholds(1))}}}.
         """
         self._checkThresholdConsistency()
         if self.isSet(self.thresholds):
@@ -616,6 +619,8 @@ class RandomForestClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPred
                              RandomForestParams, TreeClassifierParams, HasCheckpointInterval,
                              JavaMLWritable, JavaMLReadable):
     """
+    .. note:: Experimental
+
     `http://en.wikipedia.org/wiki/Random_forest Random Forest`
     learning algorithm for classification.
     It supports both binary and multiclass labels, as well as both continuous and categorical
@@ -708,6 +713,7 @@ def _create_model(self, java_model):
 class RandomForestClassificationModel(TreeEnsembleModels, JavaMLWritable, JavaMLReadable):
     """
+    .. note:: Experimental
     Model fitted by RandomForestClassifier.
 
     .. versionadded:: 1.4.0
@@ -862,6 +868,8 @@ class NaiveBayes(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol, H
                  HasProbabilityCol, HasRawPredictionCol, JavaMLWritable, JavaMLReadable):
     """
+    .. note:: Experimental
+
     Naive Bayes Classifiers.
     It supports both Multinomial and Bernoulli NB. Multinomial NB
     (`http://nlp.stanford.edu/IR-book/html/htmledition/naive-bayes-text-classification-1.html`)
@@ -980,6 +988,8 @@ def getModelType(self):
 class NaiveBayesModel(JavaModel, JavaMLWritable, JavaMLReadable):
     """
+    .. note:: Experimental
+
     Model fitted by NaiveBayes.
 
     .. versionadded:: 1.5.0
@@ -1006,6 +1016,8 @@ def theta(self):
 class MultilayerPerceptronClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol,
                                      HasMaxIter, HasTol, HasSeed, JavaMLWritable, JavaMLReadable):
     """
+    .. note:: Experimental
+
     Classifier trainer based on the Multilayer Perceptron.
     Each layer has sigmoid activation function, output layer has softmax.
     Number of inputs has to be equal to the size of feature vectors.
@@ -1120,6 +1132,8 @@ def getBlockSize(self):
 class MultilayerPerceptronClassificationModel(JavaModel, JavaMLWritable, JavaMLReadable):
     """
+    .. note:: Experimental
+
     Model fitted by MultilayerPerceptronClassifier.
 
     .. versionadded:: 1.6.0
@@ -1169,6 +1183,8 @@ def getClassifier(self):
 @inherit_doc
 class OneVsRest(Estimator, OneVsRestParams, MLReadable, MLWritable):
     """
+    .. note:: Experimental
+
     Reduction of Multiclass Classification to Binary Classification.
     Performs reduction using one against all strategy.
     For a multiclass classification with k classes, train k models (one per class).
@@ -1323,6 +1339,8 @@ class OneVsRestModel(Model, OneVsRestParams, MLReadable, MLWritable):
     """
+    .. note:: Experimental
+
     Model fitted by OneVsRest.
     This stores the models resulting from training k binary classifiers: one for each class.
     Each example is scored against all k models, and the model with the highest score

From 1fa57e5ded4c8e47ac87cbc783184ca71f9ab699 Mon Sep 17 00:00:00 2001
From: Holden Karau
Date: Thu, 5 May 2016 13:30:20 -0700
Subject: [PATCH 02/23] Add the rest

---
 python/pyspark/ml/classification.py | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/python/pyspark/ml/classification.py b/python/pyspark/ml/classification.py
index 645ebdf722f4..f9d5009be59c 100644
--- a/python/pyspark/ml/classification.py
+++ b/python/pyspark/ml/classification.py
@@ -49,6 +49,8 @@ class LogisticRegression(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredicti
                          HasElasticNetParam, HasFitIntercept, HasStandardization, HasThresholds,
                          HasWeightCol, JavaMLWritable, JavaMLReadable):
     """
+    .. note:: Experimental
+
     Logistic regression.
     Currently, this class only supports binary classification.
 
@@ -211,6 +213,8 @@ def _checkThresholdConsistency(self):
 class LogisticRegressionModel(JavaModel, JavaMLWritable, JavaMLReadable):
     """
+    .. note:: Experimental
+
     Model fitted by LogisticRegression.
 
     .. versionadded:: 1.3.0
@@ -492,6 +496,8 @@ class DecisionTreeClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPred
                              TreeClassifierParams, HasCheckpointInterval, HasSeed, JavaMLWritable,
                              JavaMLReadable):
     """
+    .. note:: Experimental
+
     `http://en.wikipedia.org/wiki/Decision_tree_learning Decision tree`
     learning algorithm for classification.
     It supports both binary and multiclass labels, as well as both continuous and categorical
@@ -586,6 +592,8 @@ def _create_model(self, java_model):
 @inherit_doc
 class DecisionTreeClassificationModel(DecisionTreeModel, JavaMLWritable, JavaMLReadable):
     """
+    .. note:: Experimental
+
     Model fitted by DecisionTreeClassifier.
 
     .. versionadded:: 1.4.0
@@ -714,6 +722,7 @@ def _create_model(self, java_model):
 class RandomForestClassificationModel(TreeEnsembleModels, JavaMLWritable, JavaMLReadable):
     """
     .. note:: Experimental
+
     Model fitted by RandomForestClassifier.
 
     .. versionadded:: 1.4.0
@@ -740,6 +749,8 @@ class GBTClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol
                     GBTParams, HasCheckpointInterval, HasStepSize, HasSeed, JavaMLWritable,
                     JavaMLReadable):
     """
+    .. note:: Experimental
+
     `http://en.wikipedia.org/wiki/Gradient_boosting Gradient-Boosted Trees (GBTs)`
     learning algorithm for classification.
     It supports binary labels, as well as both continuous and categorical features.
@@ -843,6 +854,8 @@ def getLossType(self):
 class GBTClassificationModel(TreeEnsembleModels, JavaMLWritable, JavaMLReadable):
     """
+    .. note:: Experimental
+
     Model fitted by GBTClassifier.
 
     .. versionadded:: 1.4.0

From b1ce81779ef93e5a83ab68a30f1ae5a91e0f38ec Mon Sep 17 00:00:00 2001
From: Holden Karau
Date: Thu, 5 May 2016 14:51:53 -0700
Subject: [PATCH 03/23] Use mathjax for formula in PyDoc

---
 python/docs/conf.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/python/docs/conf.py b/python/docs/conf.py
index d35bf73c3051..50fb3175a7dc 100644
--- a/python/docs/conf.py
+++ b/python/docs/conf.py
@@ -32,6 +32,7 @@
     'sphinx.ext.autodoc',
     'sphinx.ext.viewcode',
     'epytext',
+    'sphinx.ext.mathjax',
 ]
 
 # Add any paths that contain templates here, relative to this directory.

From 8125c8c6a79cf55a74894a7d2e4efb68a331fcfe Mon Sep 17 00:00:00 2001
From: Holden Karau
Date: Thu, 5 May 2016 14:52:18 -0700
Subject: [PATCH 04/23] Switch to math highlighting and update logistic
 regression get doc since it doesn't throw an error

---
 python/pyspark/ml/classification.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/python/pyspark/ml/classification.py b/python/pyspark/ml/classification.py
index f9d5009be59c..bbbcc6de7c18 100644
--- a/python/pyspark/ml/classification.py
+++ b/python/pyspark/ml/classification.py
@@ -159,7 +159,7 @@ def getThreshold(self):
         """
         Gets the value of threshold or attempt to convert thresholds to threshold if set, or default
         value if neither are set.
-        This conversion is equivalent to: {{{1 / (1 + thresholds(0) / thresholds(1))}}}.
+        This conversion is equivalent to: :math:`\\frac{1}{1 + \\frac{thresholds(0)}{thresholds(1)}}`.
         """
         self._checkThresholdConsistency()
         if self.isSet(self.thresholds):
@@ -188,7 +188,7 @@ def getThresholds(self):
         If :py:attr:`thresholds` is set, return its value.
         Otherwise, if :py:attr:`threshold` is set, return the equivalent thresholds for
         binary classification: (1-threshold, threshold).
-        If neither are set, throw an error.
+        If neither are set, return the default value.
         """
         self._checkThresholdConsistency()
         if not self.isSet(self.thresholds) and self.isSet(self.threshold):

From c72fa4679dd2fa56e8f590b1b93beaf8c939b523 Mon Sep 17 00:00:00 2001
From: Holden Karau
Date: Thu, 5 May 2016 14:54:30 -0700
Subject: [PATCH 05/23] Long line fix

---
 python/pyspark/ml/classification.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/python/pyspark/ml/classification.py b/python/pyspark/ml/classification.py
index bbbcc6de7c18..14b1aa518f27 100644
--- a/python/pyspark/ml/classification.py
+++ b/python/pyspark/ml/classification.py
@@ -159,7 +159,8 @@ def getThreshold(self):
         """
         Gets the value of threshold or attempt to convert thresholds to threshold if set, or default
         value if neither are set.
-        This conversion is equivalent to: :math:`\\frac{1}{1 + \\frac{thresholds(0)}{thresholds(1)}}`.
+        This conversion is equivalent to:
+        :math:`\\frac{1}{1 + \\frac{thresholds(0)}{thresholds(1)}}`.
""" self._checkThresholdConsistency() if self.isSet(self.thresholds): From 3fd1dce92e123d89273490d7ad0e1d716efcb124 Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Thu, 5 May 2016 16:40:17 -0700 Subject: [PATCH 06/23] Start adding the missing params to mutli-layer perceptron, also investigate how to handle the weights param being none --- .../MultilayerPerceptronClassifier.scala | 6 +- python/pyspark/ml/classification.py | 81 +++++++++++++++++-- python/pyspark/ml/wrapper.py | 5 +- 3 files changed, 81 insertions(+), 11 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/MultilayerPerceptronClassifier.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/MultilayerPerceptronClassifier.scala index 72cf55f6bb99..3906aa542f5a 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/MultilayerPerceptronClassifier.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/MultilayerPerceptronClassifier.scala @@ -75,8 +75,8 @@ private[ml] trait MultilayerPerceptronParams extends PredictorParams * @group expertParam */ final val solver: Param[String] = new Param[String](this, "solver", - " Allows setting the solver: minibatch gradient descent (gd) or l-bfgs. " + - " l-bfgs is the default one.", + "Allows setting the solver: minibatch gradient descent (gd) or l-bfgs. " + + "(Default l-bfgs)", ParamValidators.inArray[String](Array("gd", "l-bfgs"))) /** @group getParam */ @@ -88,7 +88,7 @@ private[ml] trait MultilayerPerceptronParams extends PredictorParams * @group expertParam */ final val weights: Param[Vector] = new Param[Vector](this, "weights", - " Sets the weights of the model ") + "Sets the weights of the model") /** @group getParam */ final def getWeights: Vector = $(weights) diff --git a/python/pyspark/ml/classification.py b/python/pyspark/ml/classification.py index 14b1aa518f27..f377e8b66429 100644 --- a/python/pyspark/ml/classification.py +++ b/python/pyspark/ml/classification.py @@ -1028,7 +1028,8 @@ def theta(self): @inherit_doc class MultilayerPerceptronClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol, - HasMaxIter, HasTol, HasSeed, JavaMLWritable, JavaMLReadable): + HasMaxIter, HasTol, HasSeed, HasStepSize, JavaMLWritable, + JavaMLReadable): """ .. note:: Experimental @@ -1065,6 +1066,8 @@ class MultilayerPerceptronClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol, >>> mlp2 = MultilayerPerceptronClassifier.load(mlp_path) >>> mlp2.getBlockSize() 1 + >>> mlp2.getStepSize() + 0.03 >>> model_path = temp_path + "/mlp_model" >>> model.save(model_path) >>> model2 = MultilayerPerceptronClassificationModel.load(model_path) @@ -1072,6 +1075,12 @@ class MultilayerPerceptronClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol, True >>> model.weights == model2.weights True + >>> mlp2.setWeights([ + ... 2, 5, 1, -7, -5, -10, 0, 0.6, -1, 2, -2, 1, 2, -7, -1, -2, 2, 1, -1, 9, -9, 3, -3, -3, + ... 3.0, 0, -1]) + >>> model3 = mlp2.fit(df) + >>> model3.weights[0] + 2 .. versionadded:: 1.6.0 """ @@ -1085,28 +1094,38 @@ class MultilayerPerceptronClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol, "remaining data in a partition then it is adjusted to the size of this " + "data. Recommended size is between 10 and 1000, default is 128.", typeConverter=TypeConverters.toInt) + solver = Param(Params._dummy(), "solver", "Allows setting the solver: minibatch gradient " + + "descent (gd) or l-bfgs. 
+                   "descent (gd) or l-bfgs. (Default l-bfgs)",
+                   typeConverter=TypeConverters.toString)
+    weights = Param(Params._dummy(), "weights", "Sets the weights of the model",
+                    typeConverter=TypeConverters.toVector)
 
     @keyword_only
     def __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction",
-                 maxIter=100, tol=1e-4, seed=None, layers=None, blockSize=128):
+                 maxIter=100, tol=1e-4, seed=None, layers=None, blockSize=128, stepSize=0.03,
+                 solver="l-bfgs", weights=None):
         """
         __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction", \
-                 maxIter=100, tol=1e-4, seed=None, layers=None, blockSize=128)
+                 maxIter=100, tol=1e-4, seed=None, layers=None, blockSize=128, stepSize=0.03,
+                 solver="l-bfgs", weights=None)
         """
         super(MultilayerPerceptronClassifier, self).__init__()
         self._java_obj = self._new_java_obj(
             "org.apache.spark.ml.classification.MultilayerPerceptronClassifier", self.uid)
-        self._setDefault(maxIter=100, tol=1E-4, blockSize=128)
+        self._setDefault(maxIter=100, tol=1E-4, blockSize=128, stepSize=0.03, solver="l-bfgs",
+                         weights=None)
         kwargs = self.__init__._input_kwargs
         self.setParams(**kwargs)
 
     @keyword_only
     @since("1.6.0")
     def setParams(self, featuresCol="features", labelCol="label", predictionCol="prediction",
-                  maxIter=100, tol=1e-4, seed=None, layers=None, blockSize=128):
+                  maxIter=100, tol=1e-4, seed=None, layers=None, blockSize=128, stepSize=0.03,
+                  solver="l-bfgs", weights=None):
         """
         setParams(self, featuresCol="features", labelCol="label", predictionCol="prediction", \
-                  maxIter=100, tol=1e-4, seed=None, layers=None, blockSize=128)
+                  maxIter=100, tol=1e-4, seed=None, layers=None, blockSize=128, stepSize=0.03,
+                  solver="l-bfgs", weights=None)
         Sets params for MultilayerPerceptronClassifier.
         """
         kwargs = self.setParams._input_kwargs
@@ -1143,6 +1162,56 @@ def getBlockSize(self):
         """
         return self.getOrDefault(self.blockSize)
 
+    @since("2.0.0")
+    def setStepSize(self, value):
+        """
+        Sets the value of :py:attr:`stepSize`.
+        """
+        return self._set(stepSize=value)
+
+    @since("2.0.0")
+    def getStepSize(self):
+        """
+        Gets the value of stepSize or its default value.
+        """
+        return self.getOrDefault(self.stepSize)
+
+    @since("2.0.0")
+    def setSolver(self, value):
+        """
+        Sets the value of :py:attr:`solver`.
+        """
+        return self._set(solver=value)
+
+    @since("2.0.0")
+    def getSolver(self):
+        """
+        Gets the value of solver or its default value.
+        """
+        return self.getOrDefault(self.solver)
+
+    @property
+    @since("2.0.0")
+    def getOptimizer(self):
+        """
+        Gets the optimizer used.
+        """
+        return self.getSolver()
+
+    @since("2.0.0")
+    def setWeights(self, value):
+        """
+        Sets the value of :py:attr:`weights`.
+        """
+        return self._set(weights=value)
+
+    @since("2.0.0")
+    def getWeights(self):
+        """
+        Gets the value of weights or its default value.
+ """ + return self.getOrDefault(self.weights) + class MultilayerPerceptronClassificationModel(JavaModel, JavaMLWritable, JavaMLReadable): """ diff --git a/python/pyspark/ml/wrapper.py b/python/pyspark/ml/wrapper.py index fef0040faf86..cc9a99bfb0e1 100644 --- a/python/pyspark/ml/wrapper.py +++ b/python/pyspark/ml/wrapper.py @@ -88,8 +88,9 @@ def _transfer_params_to_java(self): paramMap = self.extractParamMap() for param in self.params: if param in paramMap: - pair = self._make_java_param_pair(param, paramMap[param]) - self._java_obj.set(pair) + if paramMap[param] is not None: + pair = self._make_java_param_pair(param, paramMap[param]) + self._java_obj.set(pair) def _transfer_param_map_to_java(self, pyParamMap): """ From c7caa43b9da3655b5c28b5dd3b4e9e954f735945 Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Thu, 5 May 2016 16:49:09 -0700 Subject: [PATCH 07/23] Or wait we just don't need to support None --- python/pyspark/ml/classification.py | 3 +-- python/pyspark/ml/wrapper.py | 5 ++--- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/python/pyspark/ml/classification.py b/python/pyspark/ml/classification.py index f377e8b66429..344e27949431 100644 --- a/python/pyspark/ml/classification.py +++ b/python/pyspark/ml/classification.py @@ -1112,8 +1112,7 @@ def __init__(self, featuresCol="features", labelCol="label", predictionCol="pred super(MultilayerPerceptronClassifier, self).__init__() self._java_obj = self._new_java_obj( "org.apache.spark.ml.classification.MultilayerPerceptronClassifier", self.uid) - self._setDefault(maxIter=100, tol=1E-4, blockSize=128, stepSize=0.03, solver="l-bfgs", - weights=None) + self._setDefault(maxIter=100, tol=1E-4, blockSize=128, stepSize=0.03, solver="l-bfgs") kwargs = self.__init__._input_kwargs self.setParams(**kwargs) diff --git a/python/pyspark/ml/wrapper.py b/python/pyspark/ml/wrapper.py index cc9a99bfb0e1..fef0040faf86 100644 --- a/python/pyspark/ml/wrapper.py +++ b/python/pyspark/ml/wrapper.py @@ -88,9 +88,8 @@ def _transfer_params_to_java(self): paramMap = self.extractParamMap() for param in self.params: if param in paramMap: - if paramMap[param] is not None: - pair = self._make_java_param_pair(param, paramMap[param]) - self._java_obj.set(pair) + pair = self._make_java_param_pair(param, paramMap[param]) + self._java_obj.set(pair) def _transfer_param_map_to_java(self, pyParamMap): """ From 4776221984d36e7beee8b8c5da70c1a1b9010815 Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Thu, 5 May 2016 17:14:36 -0700 Subject: [PATCH 08/23] Update the doc string for weights param and add doctest that verifys layers stay same but weights change --- .../MultilayerPerceptronClassifier.scala | 4 ++-- python/pyspark/ml/classification.py | 12 +++++++----- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/MultilayerPerceptronClassifier.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/MultilayerPerceptronClassifier.scala index 3906aa542f5a..af046a095d6a 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/MultilayerPerceptronClassifier.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/MultilayerPerceptronClassifier.scala @@ -88,7 +88,7 @@ private[ml] trait MultilayerPerceptronParams extends PredictorParams * @group expertParam */ final val weights: Param[Vector] = new Param[Vector](this, "weights", - "Sets the weights of the model") + "Weights (either initial if before training or actual on model)") /** @group getParam */ final def 
   final def getWeights: Vector = $(weights)
@@ -181,7 +181,7 @@ class MultilayerPerceptronClassifier @Since("1.5.0") (
   def setSeed(value: Long): this.type = set(seed, value)
 
   /**
-   * Sets the model weights.
+   * Sets the initial weights used for the optimizer.
    *
    * @group expertParam
    */
diff --git a/python/pyspark/ml/classification.py b/python/pyspark/ml/classification.py
index 344e27949431..938d5b236062 100644
--- a/python/pyspark/ml/classification.py
+++ b/python/pyspark/ml/classification.py
@@ -1075,12 +1075,14 @@ class MultilayerPerceptronClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol,
     True
     >>> model.weights == model2.weights
     True
-    >>> mlp2.setWeights([
+    >>> mlp2 = mlp2.setWeights([
     ... 2, 5, 1, -7, -5, -10, 0, 0.6, -1, 2, -2, 1, 2, -7, -1, -2, 2, 1, -1, 9, -9, 3, -3, -3,
     ... 3.0, 0, -1])
     >>> model3 = mlp2.fit(df)
-    >>> model3.weights[0]
-    2
+    >>> model3.weights != model2.weights
+    True
+    >>> model3.layers == model.layers
+    True
 
     .. versionadded:: 1.6.0
     """
@@ -1097,8 +1099,8 @@ class MultilayerPerceptronClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol,
     solver = Param(Params._dummy(), "solver", "Allows setting the solver: minibatch gradient " +
                    "descent (gd) or l-bfgs. (Default l-bfgs)",
                    typeConverter=TypeConverters.toString)
-    weights = Param(Params._dummy(), "weights", "Sets the weights of the model",
-                    typeConverter=TypeConverters.toVector)
+    weights = Param(Params._dummy(), "weights", "Weights (either initial if before training or " +
+                    "actual on model)", typeConverter=TypeConverters.toVector)

From 2397004c8bb6d9482422276188149b274ef5411a Mon Sep 17 00:00:00 2001
From: Holden Karau
Date: Tue, 10 May 2016 10:38:46 -0700
Subject: [PATCH 09/23] mini fix

---
 python/pyspark/ml/classification.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/pyspark/ml/classification.py b/python/pyspark/ml/classification.py
index 30935cefa975..88afbc1c02ea 100644
--- a/python/pyspark/ml/classification.py
+++ b/python/pyspark/ml/classification.py
@@ -1120,7 +1120,7 @@ def __init__(self, featuresCol="features", labelCol="label", predictionCol="pred
                  solver="l-bfgs", weights=None):
         """
         __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction", \
-                 maxIter=100, tol=1e-4, seed=None, layers=None, blockSize=128, stepSize=0.03,
+                 maxIter=100, tol=1e-4, seed=None, layers=None, blockSize=128, stepSize=0.03, \
                  solver="l-bfgs", weights=None)
         """
         super(MultilayerPerceptronClassifier, self).__init__()

From a73913b3ea72b067ab9e7f19bdc5821145b003b4 Mon Sep 17 00:00:00 2001
From: Holden Karau
Date: Tue, 10 May 2016 10:40:04 -0700
Subject: [PATCH 10/23] more pydoc fix

---
 python/pyspark/ml/classification.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/pyspark/ml/classification.py b/python/pyspark/ml/classification.py
index 88afbc1c02ea..3cd8bb58cfd6 100644
--- a/python/pyspark/ml/classification.py
+++ b/python/pyspark/ml/classification.py
@@ -1137,7 +1137,7 @@ def setParams(self, featuresCol="features", labelCol="label", predictionCol="pre
                   solver="l-bfgs", weights=None):
         """
         setParams(self, featuresCol="features", labelCol="label", predictionCol="prediction", \
-                  maxIter=100, tol=1e-4, seed=None, layers=None, blockSize=128, stepSize=0.03,
+                  maxIter=100, tol=1e-4, seed=None, layers=None, blockSize=128, stepSize=0.03, \
                   solver="l-bfgs", weights=None)
         Sets params for MultilayerPerceptronClassifier.
""" From 9e38ddf6088e0ce7342327b8f1ed83560c8b5a63 Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Tue, 10 May 2016 11:45:22 -0700 Subject: [PATCH 11/23] Remove flaky doctet component --- python/pyspark/ml/classification.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/python/pyspark/ml/classification.py b/python/pyspark/ml/classification.py index 3cd8bb58cfd6..2d26b578a5f7 100644 --- a/python/pyspark/ml/classification.py +++ b/python/pyspark/ml/classification.py @@ -1078,8 +1078,6 @@ class MultilayerPerceptronClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol, >>> mlp2 = MultilayerPerceptronClassifier.load(mlp_path) >>> mlp2.getBlockSize() 1 - >>> mlp2.getStepSize() - 0.03 >>> model_path = temp_path + "/mlp_model" >>> model.save(model_path) >>> model2 = MultilayerPerceptronClassificationModel.load(model_path) From f4df8f087575b6a994cfa4cdaf3b6f0c7e612884 Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Tue, 10 May 2016 12:42:12 -0700 Subject: [PATCH 12/23] Add a : as requested --- .../ml/classification/MultilayerPerceptronClassifier.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/MultilayerPerceptronClassifier.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/MultilayerPerceptronClassifier.scala index af046a095d6a..39496e0586a3 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/MultilayerPerceptronClassifier.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/MultilayerPerceptronClassifier.scala @@ -76,7 +76,7 @@ private[ml] trait MultilayerPerceptronParams extends PredictorParams */ final val solver: Param[String] = new Param[String](this, "solver", "Allows setting the solver: minibatch gradient descent (gd) or l-bfgs. " + - "(Default l-bfgs)", + "(Default: l-bfgs)", ParamValidators.inArray[String](Array("gd", "l-bfgs"))) /** @group getParam */ From 2eec9472d879d7136e5d5c0c2931e6b81f1e88a8 Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Thu, 19 May 2016 13:42:46 -0700 Subject: [PATCH 13/23] Back out some unrelated changes that are in a seperate PR anyways --- python/pyspark/ml/classification.py | 80 +++-------------------------- 1 file changed, 6 insertions(+), 74 deletions(-) diff --git a/python/pyspark/ml/classification.py b/python/pyspark/ml/classification.py index 66940773b796..3d8261b5d8d5 100644 --- a/python/pyspark/ml/classification.py +++ b/python/pyspark/ml/classification.py @@ -1045,8 +1045,7 @@ def theta(self): @inherit_doc class MultilayerPerceptronClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol, - HasMaxIter, HasTol, HasSeed, HasStepSize, JavaMLWritable, - JavaMLReadable): + HasMaxIter, HasTol, HasSeed, JavaMLWritable, JavaMLReadable): """ .. note:: Experimental @@ -1090,14 +1089,6 @@ class MultilayerPerceptronClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol, True >>> model.weights == model2.weights True - >>> mlp2 = mlp2.setWeights([ - ... 2, 5, 1, -7, -5, -10, 0, 0.6, -1, 2, -2, 1, 2, -7, -1, -2, 2, 1, -1, 9, -9, 3, -3, -3, - ... 3.0, 0, -1]) - >>> model3 = mlp2.fit(df) - >>> model3.weights != model2.weights - True - >>> model3.layers == model.layers - True .. versionadded:: 1.6.0 """ @@ -1111,37 +1102,28 @@ class MultilayerPerceptronClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol, "remaining data in a partition then it is adjusted to the size of this " + "data. 
                       "data. Recommended size is between 10 and 1000, default is 128.",
                       typeConverter=TypeConverters.toInt)
-    solver = Param(Params._dummy(), "solver", "Allows setting the solver: minibatch gradient " +
-                   "descent (gd) or l-bfgs. (Default l-bfgs)",
-                   typeConverter=TypeConverters.toString)
-    weights = Param(Params._dummy(), "weights", "Weights (either initial if before training or " +
-                    "actual on model)", typeConverter=TypeConverters.toVector)
 
     @keyword_only
     def __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction",
-                 maxIter=100, tol=1e-4, seed=None, layers=None, blockSize=128, stepSize=0.03,
-                 solver="l-bfgs", weights=None):
+                 maxIter=100, tol=1e-4, seed=None, layers=None, blockSize=128):
         """
         __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction", \
-                 maxIter=100, tol=1e-4, seed=None, layers=None, blockSize=128, stepSize=0.03, \
-                 solver="l-bfgs", weights=None)
+                 maxIter=100, tol=1e-4, seed=None, layers=None, blockSize=128)
         """
         super(MultilayerPerceptronClassifier, self).__init__()
         self._java_obj = self._new_java_obj(
             "org.apache.spark.ml.classification.MultilayerPerceptronClassifier", self.uid)
-        self._setDefault(maxIter=100, tol=1E-4, blockSize=128, stepSize=0.03, solver="l-bfgs")
+        self._setDefault(maxIter=100, tol=1E-4, blockSize=128)
         kwargs = self.__init__._input_kwargs
         self.setParams(**kwargs)
 
     @keyword_only
     @since("1.6.0")
     def setParams(self, featuresCol="features", labelCol="label", predictionCol="prediction",
-                  maxIter=100, tol=1e-4, seed=None, layers=None, blockSize=128, stepSize=0.03,
-                  solver="l-bfgs", weights=None):
+                  maxIter=100, tol=1e-4, seed=None, layers=None, blockSize=128):
         """
         setParams(self, featuresCol="features", labelCol="label", predictionCol="prediction", \
-                  maxIter=100, tol=1e-4, seed=None, layers=None, blockSize=128, stepSize=0.03, \
-                  solver="l-bfgs", weights=None)
+                  maxIter=100, tol=1e-4, seed=None, layers=None, blockSize=128)
         Sets params for MultilayerPerceptronClassifier.
         """
         kwargs = self.setParams._input_kwargs
@@ -1178,56 +1160,6 @@ def getBlockSize(self):
         """
         return self.getOrDefault(self.blockSize)
 
-    @since("2.0.0")
-    def setStepSize(self, value):
-        """
-        Sets the value of :py:attr:`stepSize`.
-        """
-        return self._set(stepSize=value)
-
-    @since("2.0.0")
-    def getStepSize(self):
-        """
-        Gets the value of stepSize or its default value.
-        """
-        return self.getOrDefault(self.stepSize)
-
-    @since("2.0.0")
-    def setSolver(self, value):
-        """
-        Sets the value of :py:attr:`solver`.
-        """
-        return self._set(solver=value)
-
-    @since("2.0.0")
-    def getSolver(self):
-        """
-        Gets the value of solver or its default value.
-        """
-        return self.getOrDefault(self.solver)
-
-    @property
-    @since("2.0.0")
-    def getOptimizer(self):
-        """
-        Gets the optimizer used.
-        """
-        return self.getSolver()
-
-    @since("2.0.0")
-    def setWeights(self, value):
-        """
-        Sets the value of :py:attr:`weights`.
-        """
-        return self._set(weights=value)
-
-    @since("2.0.0")
-    def getWeights(self):
-        """
-        Gets the value of weights or its default value.
- """ - return self.getOrDefault(self.weights) - class MultilayerPerceptronClassificationModel(JavaModel, JavaMLWritable, JavaMLReadable): """ From 4111b2d01c33fac3c2537fe9430ce05063530a48 Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Thu, 26 May 2016 12:29:46 -0700 Subject: [PATCH 14/23] Update scaladoc and PyDoc to both have the correct chain for getThreshold (thresholds -> threshold -> threshold default value) --- .../spark/ml/classification/LogisticRegression.scala | 5 ++--- python/pyspark/ml/classification.py | 11 ++++++----- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala index 0ab4459bdb9d..69c57133689a 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala @@ -72,10 +72,9 @@ private[classification] trait LogisticRegressionParams extends ProbabilisticClas /** * Get threshold for binary classification. * - * If [[threshold]] is set, returns that value. - * Otherwise, if [[thresholds]] is set with length 2 (i.e., binary classification), + * If [[thresholds]] is set with length 2 (i.e., binary classification), * this returns the equivalent threshold: {{{1 / (1 + thresholds(0) / thresholds(1))}}}. - * Otherwise, returns [[threshold]] default value. + * Otherwise, returns [[threshold]] if set, or its default value if unset. * * @group getParam * @throws IllegalArgumentException if [[thresholds]] is set to an array of length other than 2. diff --git a/python/pyspark/ml/classification.py b/python/pyspark/ml/classification.py index 3d8261b5d8d5..ec44f12d2c82 100644 --- a/python/pyspark/ml/classification.py +++ b/python/pyspark/ml/classification.py @@ -99,7 +99,7 @@ class LogisticRegression(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredicti threshold = Param(Params._dummy(), "threshold", "Threshold in binary classification prediction, in range [0, 1]." + " If threshold and thresholds are both set, they must match." + - "e.g. threshold must be equal to [1-p, p].", + "e.g. if threshold is p, then thresholds must be equal to [1-p, p].", typeConverter=TypeConverters.toFloat) @keyword_only @@ -157,10 +157,11 @@ def setThreshold(self, value): @since("1.4.0") def getThreshold(self): """ - Gets the value of threshold or attempt to convert thresholds to threshold if set, or default - value if neither are set. - This conversion is equivalent to: - :math:`\\frac{1}{1 + \\frac{thresholds(0)}{thresholds(1)}}`. + Get threshold for binary classification. + + If :py:attr:`thresholds is set with length 2 (i.e., binary classification), + this returns the equivalent threshold: :math:`\\frac{1}{1 + \\frac{thresholds(0)}{thresholds(1)}}`. + Otherwise, returns :py:attr:`threshold` if set or its default value. 
""" self._checkThresholdConsistency() if self.isSet(self.thresholds): From 53ab7906d57f3ce2c954a5b54ff1ba5e97ebd00a Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Thu, 26 May 2016 12:46:19 -0700 Subject: [PATCH 15/23] pep8 --- python/pyspark/ml/classification.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/pyspark/ml/classification.py b/python/pyspark/ml/classification.py index ec44f12d2c82..3e0e03bd2aa5 100644 --- a/python/pyspark/ml/classification.py +++ b/python/pyspark/ml/classification.py @@ -160,7 +160,8 @@ def getThreshold(self): Get threshold for binary classification. If :py:attr:`thresholds is set with length 2 (i.e., binary classification), - this returns the equivalent threshold: :math:`\\frac{1}{1 + \\frac{thresholds(0)}{thresholds(1)}}`. + this returns the equivalent threshold: + :math:`\\frac{1}{1 + \\frac{thresholds(0)}{thresholds(1)}}`. Otherwise, returns :py:attr:`threshold` if set or its default value. """ self._checkThresholdConsistency() From a7aadec43efbb523545db8806b189872727b786f Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Mon, 6 Jun 2016 13:24:52 -0700 Subject: [PATCH 16/23] Revert doc change --- python/pyspark/ml/classification.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/pyspark/ml/classification.py b/python/pyspark/ml/classification.py index 2b6951f0cde0..47e1b5c9afa8 100644 --- a/python/pyspark/ml/classification.py +++ b/python/pyspark/ml/classification.py @@ -191,7 +191,7 @@ def getThresholds(self): If :py:attr:`thresholds` is set, return its value. Otherwise, if :py:attr:`threshold` is set, return the equivalent thresholds for binary classification: (1-threshold, threshold). - If neither are set, return the default value. + If neither are set, throw an error. """ self._checkThresholdConsistency() if not self.isSet(self.thresholds) and self.isSet(self.threshold): From e4061f4f4c8531c97f31989e1723e44c6170e673 Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Mon, 6 Jun 2016 13:27:06 -0700 Subject: [PATCH 17/23] minor fix --- python/pyspark/ml/classification.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/pyspark/ml/classification.py b/python/pyspark/ml/classification.py index 47e1b5c9afa8..ebc945368d61 100644 --- a/python/pyspark/ml/classification.py +++ b/python/pyspark/ml/classification.py @@ -159,10 +159,10 @@ def getThreshold(self): """ Get threshold for binary classification. - If :py:attr:`thresholds is set with length 2 (i.e., binary classification), + If :py:attr:`thresholds` is set with length 2 (i.e., binary classification), this returns the equivalent threshold: :math:`\\frac{1}{1 + \\frac{thresholds(0)}{thresholds(1)}}`. - Otherwise, returns :py:attr:`threshold` if set or its default value. + Otherwise, returns :py:attr:`threshold` if set or its default value if unset. """ self._checkThresholdConsistency() if self.isSet(self.thresholds): From 398161285321c976e850f18b486e512efbe0d24e Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Mon, 13 Jun 2016 19:21:48 -0700 Subject: [PATCH 18/23] oook lets try 86ing mathjax but... welll w/e --- python/docs/conf.py | 1 - 1 file changed, 1 deletion(-) diff --git a/python/docs/conf.py b/python/docs/conf.py index 50fb3175a7dc..d35bf73c3051 100644 --- a/python/docs/conf.py +++ b/python/docs/conf.py @@ -32,7 +32,6 @@ 'sphinx.ext.autodoc', 'sphinx.ext.viewcode', 'epytext', - 'sphinx.ext.mathjax', ] # Add any paths that contain templates here, relative to this directory. 
From 3d13c6c662cf18ed0dc390fb355b83c7989370f9 Mon Sep 17 00:00:00 2001
From: Holden Karau
Date: Mon, 13 Jun 2016 21:27:48 -0700
Subject: [PATCH 19/23] reenable mathjax

---
 python/docs/conf.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/python/docs/conf.py b/python/docs/conf.py
index d35bf73c3051..50fb3175a7dc 100644
--- a/python/docs/conf.py
+++ b/python/docs/conf.py
@@ -32,6 +32,7 @@
     'sphinx.ext.autodoc',
     'sphinx.ext.viewcode',
     'epytext',
+    'sphinx.ext.mathjax',
 ]
 
 # Add any paths that contain templates here, relative to this directory.

From 2be8cdf15d6431c8d2ef0d7a6c5d136e2b71410e Mon Sep 17 00:00:00 2001
From: Holden Karau
Date: Mon, 13 Jun 2016 21:27:56 -0700
Subject: [PATCH 20/23] Revert "[SPARK-15745][SQL] Use classloader's
 getResource() for reading resource files in HiveTests" as it was causing
 Jenkins failures.

This reverts commit f7288e166c696da15e790c28fc3ed78531fd362d.
---
 .../org/apache/spark/sql/hive/test/TestHive.scala | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/test/TestHive.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/test/TestHive.scala
index 1d1d5e3f7bd6..81964db5477c 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/test/TestHive.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/test/TestHive.scala
@@ -179,8 +179,19 @@ private[hive] class TestHiveSparkSession(
   hiveFilesTemp.mkdir()
   ShutdownHookManager.registerShutdownDeleteDir(hiveFilesTemp)
 
+  val inRepoTests = if (System.getProperty("user.dir").endsWith("sql" + File.separator + "hive")) {
+    new File("src" + File.separator + "test" + File.separator + "resources" + File.separator)
+  } else {
+    new File("sql" + File.separator + "hive" + File.separator + "src" + File.separator + "test" +
+      File.separator + "resources")
+  }
+
   def getHiveFile(path: String): File = {
-    new File(Thread.currentThread().getContextClassLoader.getResource(path).getFile)
+    val stripped = path.replaceAll("""\.\.\/""", "").replace('/', File.separatorChar)
+    hiveDevHome
+      .map(new File(_, stripped))
+      .filter(_.exists)
+      .getOrElse(new File(inRepoTests, stripped))
   }
 
   val describedTable = "DESCRIBE (\\w+)".r

From 4431daa9552426f02a993778eb5a5f2c2d87b183 Mon Sep 17 00:00:00 2001
From: Holden Karau
Date: Tue, 14 Jun 2016 10:59:27 -0700
Subject: [PATCH 21/23] Support both methods

---
 .../apache/spark/sql/hive/test/TestHive.scala | 26 ++++++++++++-------
 1 file changed, 16 insertions(+), 10 deletions(-)

diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/test/TestHive.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/test/TestHive.scala
index 81964db5477c..30b0fb5b0cfc 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/test/TestHive.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/test/TestHive.scala
@@ -179,19 +179,25 @@ private[hive] class TestHiveSparkSession(
   hiveFilesTemp.mkdir()
   ShutdownHookManager.registerShutdownDeleteDir(hiveFilesTemp)
 
-  val inRepoTests = if (System.getProperty("user.dir").endsWith("sql" + File.separator + "hive")) {
-    new File("src" + File.separator + "test" + File.separator + "resources" + File.separator)
-  } else {
-    new File("sql" + File.separator + "hive" + File.separator + "src" + File.separator + "test" +
-      File.separator + "resources")
+  lazy val inRepoTests = {
+    if (System.getProperty("user.dir").endsWith("sql" + File.separator + "hive")) {
+      new File("src" + File.separator + "test" + File.separator + "resources" + File.separator)
+    } else {
File("sql" + File.separator + "hive" + File.separator + "src" + File.separator + "test" + + File.separator + "resources") + } } def getHiveFile(path: String): File = { - val stripped = path.replaceAll("""\.\.\/""", "").replace('/', File.separatorChar) - hiveDevHome - .map(new File(_, stripped)) - .filter(_.exists) - .getOrElse(new File(inRepoTests, stripped)) + // Attempt to load from class loader, fall back to old system property based. + val resourcePath = Option(Thread.currentThread().getContextClassLoader.getResource(path)) + resourcePath.map(rp => new File(rp.getFile)).getOrElse{ + val stripped = path.replaceAll("""\.\.\/""", "").replace('/', File.separatorChar) + hiveDevHome + .map(new File(_, stripped)) + .filter(_.exists) + .getOrElse(new File(inRepoTests, stripped)) + } } val describedTable = "DESCRIBE (\\w+)".r From d842309c749b817bdacb6a57bccba74cc7c0fbf4 Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Tue, 21 Jun 2016 10:32:57 -0700 Subject: [PATCH 22/23] Revert "Support both methods" This reverts commit 4431daa9552426f02a993778eb5a5f2c2d87b183. --- .../apache/spark/sql/hive/test/TestHive.scala | 26 +++++++------------ 1 file changed, 10 insertions(+), 16 deletions(-) diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/test/TestHive.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/test/TestHive.scala index 30b0fb5b0cfc..81964db5477c 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/test/TestHive.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/test/TestHive.scala @@ -179,25 +179,19 @@ private[hive] class TestHiveSparkSession( hiveFilesTemp.mkdir() ShutdownHookManager.registerShutdownDeleteDir(hiveFilesTemp) - lazy val inRepoTests = { - if (System.getProperty("user.dir").endsWith("sql" + File.separator + "hive")) { - new File("src" + File.separator + "test" + File.separator + "resources" + File.separator) - } else { - new File("sql" + File.separator + "hive" + File.separator + "src" + File.separator + "test" + - File.separator + "resources") - } + val inRepoTests = if (System.getProperty("user.dir").endsWith("sql" + File.separator + "hive")) { + new File("src" + File.separator + "test" + File.separator + "resources" + File.separator) + } else { + new File("sql" + File.separator + "hive" + File.separator + "src" + File.separator + "test" + + File.separator + "resources") } def getHiveFile(path: String): File = { - // Attempt to load from class loader, fall back to old system property based. - val resourcePath = Option(Thread.currentThread().getContextClassLoader.getResource(path)) - resourcePath.map(rp => new File(rp.getFile)).getOrElse{ - val stripped = path.replaceAll("""\.\.\/""", "").replace('/', File.separatorChar) - hiveDevHome - .map(new File(_, stripped)) - .filter(_.exists) - .getOrElse(new File(inRepoTests, stripped)) - } + val stripped = path.replaceAll("""\.\.\/""", "").replace('/', File.separatorChar) + hiveDevHome + .map(new File(_, stripped)) + .filter(_.exists) + .getOrElse(new File(inRepoTests, stripped)) } val describedTable = "DESCRIBE (\\w+)".r From de63f9f7451ae6e527fe383537a2afbdf273449a Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Tue, 21 Jun 2016 11:47:08 -0700 Subject: [PATCH 23/23] Revert "Revert "[SPARK-15745][SQL] Use classloader's getResource() for reading resource files in HiveTests" as it was causing Jenkins failures." This reverts commit 2be8cdf15d6431c8d2ef0d7a6c5d136e2b71410e. 
---
 .../org/apache/spark/sql/hive/test/TestHive.scala | 13 +------------
 1 file changed, 1 insertion(+), 12 deletions(-)

diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/test/TestHive.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/test/TestHive.scala
index 81964db5477c..1d1d5e3f7bd6 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/test/TestHive.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/test/TestHive.scala
@@ -179,19 +179,8 @@ private[hive] class TestHiveSparkSession(
   hiveFilesTemp.mkdir()
   ShutdownHookManager.registerShutdownDeleteDir(hiveFilesTemp)
 
-  val inRepoTests = if (System.getProperty("user.dir").endsWith("sql" + File.separator + "hive")) {
-    new File("src" + File.separator + "test" + File.separator + "resources" + File.separator)
-  } else {
-    new File("sql" + File.separator + "hive" + File.separator + "src" + File.separator + "test" +
-      File.separator + "resources")
-  }
-
   def getHiveFile(path: String): File = {
-    val stripped = path.replaceAll("""\.\.\/""", "").replace('/', File.separatorChar)
-    hiveDevHome
-      .map(new File(_, stripped))
-      .filter(_.exists)
-      .getOrElse(new File(inRepoTests, stripped))
+    new File(Thread.currentThread().getContextClassLoader.getResource(path).getFile)
   }
 
   val describedTable = "DESCRIBE (\\w+)".r
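A closing note on the TestHive lookup that patches 20-23 toggle: the restored getHiveFile resolves test resources through the context classloader, and getResource returns null when the path is missing, so the one-liner above will throw a NullPointerException for an absent resource. A defensive variant (only a sketch under that assumption, not what the patch itself does) would wrap the lookup in an Option:

    import java.io.File

    // Null-safe sketch of the classloader-based lookup; the patched getHiveFile
    // assumes the resource exists and dereferences the returned URL directly.
    def getHiveFileSafe(path: String): Option[File] =
      Option(Thread.currentThread().getContextClassLoader.getResource(path))
        .map(url => new File(url.getFile))

Patch 21 experimented with roughly this shape, combining the classloader lookup with the old path-based fallback, before the series settled on the classloader-only version.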