From a4ede3fff9c012bbd609d3fad13a8c100856f950 Mon Sep 17 00:00:00 2001 From: Bryan Cutler Date: Mon, 13 Mar 2017 16:43:41 -0700 Subject: [PATCH 01/13] added regression test case for PySpark models not owning params --- python/pyspark/ml/tests.py | 81 ++++++++++++++++++++++++++------------ 1 file changed, 55 insertions(+), 26 deletions(-) diff --git a/python/pyspark/ml/tests.py b/python/pyspark/ml/tests.py index f052f5bb770c..c71b59c638fe 100755 --- a/python/pyspark/ml/tests.py +++ b/python/pyspark/ml/tests.py @@ -405,6 +405,58 @@ def test_copy_param_extras(self): self.assertEqual(tp._paramMap, copied_no_extra) self.assertEqual(tp._defaultParamMap, tp_copy._defaultParamMap) + @staticmethod + def check_params(test_self, py_stage, check_params_exist=True): + """ + Checks common requirements for Params.params: + - set of params exist in Java and Python and are ordered by names + - param parent has the same UID as the object's UID + - default param value from Java matches value in Python + """ + py_stage_str = "%s %s" % (type(py_stage), py_stage) + if not hasattr(py_stage, "_to_java"): + return + java_stage = py_stage._to_java() + if java_stage is None: + return + test_self.assertEqual(py_stage.uid, java_stage.uid(), msg=py_stage_str) + if check_params_exist: + param_names = [p.name for p in py_stage.params] + java_params = list(java_stage.params()) + java_param_names = [jp.name() for jp in java_params] + test_self.assertEqual( + param_names, sorted(java_param_names), + "Param list in Python does not match Java for %s:\nJava = %s\nPython = %s" + % (py_stage_str, java_param_names, param_names)) + for p in py_stage.params: + test_self.assertEqual(p.parent, py_stage.uid) + java_param = java_stage.getParam(p.name) + py_has_default = py_stage.hasDefault(p) + java_has_default = java_stage.hasDefault(java_param) + test_self.assertEqual(py_has_default, java_has_default, + "Default value mismatch of param %s for Params %s" + % (p.name, str(py_stage))) + if py_has_default: + if p.name == "seed": + continue # Random seeds between Spark and PySpark are different + java_default = _java2py(test_self.sc, + java_stage.clear(java_param).getOrDefault(java_param)) + py_stage._clear(p) + py_default = py_stage.getOrDefault(p) + test_self.assertEqual( + java_default, py_default, + "Java default %s != python default %s of param %s for Params %s" + % (str(java_default), str(py_default), p.name, str(py_stage))) + ''' + test_self.assertTrue(isinstance(obj, Params)) + params = obj.params + paramNames = [p.name for p in params] + test_self.assertEqual(paramNames, sorted(paramNames)) + for p in params: + test_self.assertEqual(p.parent, obj.uid) + test_self.assertEqual(obj.getParam(p.name), p) + ''' + class EvaluatorTests(SparkSessionTestCase): @@ -461,6 +513,8 @@ def test_idf(self): "Model should inherit the UID from its parent estimator.") output = idf0m.transform(dataset) self.assertIsNotNone(output.head().idf) + # Test that parameters transferred to Python Model + ParamTests.check_params(self, idf0m) def test_ngram(self): dataset = self.spark.createDataFrame([ @@ -1271,31 +1325,6 @@ class DefaultValuesTests(PySparkTestCase): Test :py:class:`JavaParams` classes to see if their default Param values match those in their Scala counterparts. """ - - def check_params(self, py_stage): - if not hasattr(py_stage, "_to_java"): - return - java_stage = py_stage._to_java() - if java_stage is None: - return - for p in py_stage.params: - java_param = java_stage.getParam(p.name) - py_has_default = py_stage.hasDefault(p) - java_has_default = java_stage.hasDefault(java_param) - self.assertEqual(py_has_default, java_has_default, - "Default value mismatch of param %s for Params %s" - % (p.name, str(py_stage))) - if py_has_default: - if p.name == "seed": - return # Random seeds between Spark and PySpark are different - java_default =\ - _java2py(self.sc, java_stage.clear(java_param).getOrDefault(java_param)) - py_stage._clear(p) - py_default = py_stage.getOrDefault(p) - self.assertEqual(java_default, py_default, - "Java default %s != python default %s of param %s for Params %s" - % (str(java_default), str(py_default), p.name, str(py_stage))) - def test_java_params(self): import pyspark.ml.feature import pyspark.ml.classification @@ -1309,7 +1338,7 @@ def test_java_params(self): for name, cls in inspect.getmembers(module, inspect.isclass): if not name.endswith('Model') and issubclass(cls, JavaParams)\ and not inspect.isabstract(cls): - self.check_params(cls()) + ParamTests.check_params(self, cls(), check_params_exist=False) def _squared_distance(a, b): From 3b921a469a75b243c97e93548ed3de4bfae040a9 Mon Sep 17 00:00:00 2001 From: Bryan Cutler Date: Mon, 13 Mar 2017 16:44:22 -0700 Subject: [PATCH 02/13] fixed default PySpark param value that was being overlooked by return instead of continue --- python/pyspark/ml/classification.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/pyspark/ml/classification.py b/python/pyspark/ml/classification.py index b4fc357e42d7..8f22906b4606 100644 --- a/python/pyspark/ml/classification.py +++ b/python/pyspark/ml/classification.py @@ -1328,7 +1328,7 @@ def __init__(self, featuresCol="features", labelCol="label", predictionCol="pred super(MultilayerPerceptronClassifier, self).__init__() self._java_obj = self._new_java_obj( "org.apache.spark.ml.classification.MultilayerPerceptronClassifier", self.uid) - self._setDefault(maxIter=100, tol=1E-4, blockSize=128, stepSize=0.03, solver="l-bfgs") + self._setDefault(maxIter=100, tol=1E-6, blockSize=128, stepSize=0.03, solver="l-bfgs") kwargs = self._input_kwargs self.setParams(**kwargs) From dff7863d1ecd45a10c225a2ea95cc51814705ed0 Mon Sep 17 00:00:00 2001 From: Bryan Cutler Date: Mon, 13 Mar 2017 17:29:10 -0700 Subject: [PATCH 03/13] added copy of param values to python model when estimator fit is called --- python/pyspark/ml/wrapper.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/pyspark/ml/wrapper.py b/python/pyspark/ml/wrapper.py index 80a0b31cd88d..27ba91af4089 100644 --- a/python/pyspark/ml/wrapper.py +++ b/python/pyspark/ml/wrapper.py @@ -263,7 +263,8 @@ def _fit_java(self, dataset): def _fit(self, dataset): java_model = self._fit_java(dataset) - return self._create_model(java_model) + model = self._create_model(java_model) + return self._copyValues(model) @inherit_doc From 398ef27874e59c615af77b3c838880c68de97e35 Mon Sep 17 00:00:00 2001 From: Bryan Cutler Date: Tue, 14 Mar 2017 14:10:03 -0700 Subject: [PATCH 04/13] Added temporary fix to add Params when fitting and persisting models --- python/pyspark/ml/clustering.py | 5 ++++- python/pyspark/ml/wrapper.py | 20 ++++++++++++++++++++ 2 files changed, 24 insertions(+), 1 deletion(-) diff --git a/python/pyspark/ml/clustering.py b/python/pyspark/ml/clustering.py index 88ac7e275e38..4bded3aea885 100644 --- a/python/pyspark/ml/clustering.py +++ b/python/pyspark/ml/clustering.py @@ -745,7 +745,10 @@ def toLocal(self): WARNING: This involves collecting a large :py:func:`topicsMatrix` to the driver. """ - return LocalLDAModel(self._call_java("toLocal")) + model = LocalLDAModel(self._call_java("toLocal")) + # SPARK-10931: Temporary fix to be removed once LDAModel defines Params + model._transfer_params_from_java() + return model @since("2.0.0") def trainingLogLikelihood(self): diff --git a/python/pyspark/ml/wrapper.py b/python/pyspark/ml/wrapper.py index 27ba91af4089..367f8061f568 100644 --- a/python/pyspark/ml/wrapper.py +++ b/python/pyspark/ml/wrapper.py @@ -140,6 +140,18 @@ def _transfer_params_from_java(self): Transforms the embedded params from the companion Java object. """ sc = SparkContext._active_spark_context + + # SPARK-10931: Temporary fix to add params when loading model + java_params = list(self._java_obj.params()) + from pyspark.ml.param import Param + for java_param in java_params: + java_param_name = java_param.name() + if not hasattr(self, java_param_name): + param = Param(self, java_param_name, java_param.doc()) + setattr(self, java_param_name, param) + self._params = None + # end SPARK-10931 temporary fix + for param in self.params: if self._java_obj.hasParam(param.name): java_param = self._java_obj.getParam(param.name) @@ -264,6 +276,14 @@ def _fit_java(self, dataset): def _fit(self, dataset): java_model = self._fit_java(dataset) model = self._create_model(java_model) + # SPARK-10931: This is a temporary fix to allow models to own params + # from estimators. Eventually, these params should be in models through + # using common base classes between estimators and models. + for param in self.params: + if not hasattr(model, param.name) and java_model.hasParam(param.name): + setattr(model, param.name, param) + model._params = None + # end SPARK-10931 temporary fix return self._copyValues(model) From d621c8940421258518345eda41e775b7e65e8e8f Mon Sep 17 00:00:00 2001 From: Bryan Cutler Date: Tue, 2 May 2017 16:38:07 -0700 Subject: [PATCH 05/13] added check for NaN default param values --- python/pyspark/ml/tests.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/python/pyspark/ml/tests.py b/python/pyspark/ml/tests.py index 2570cde288cf..7450bc7ac2aa 100755 --- a/python/pyspark/ml/tests.py +++ b/python/pyspark/ml/tests.py @@ -442,6 +442,10 @@ def check_params(test_self, py_stage, check_params_exist=True): java_stage.clear(java_param).getOrDefault(java_param)) py_stage._clear(p) py_default = py_stage.getOrDefault(p) + # equality test for NaN is always False + if isinstance(java_default, float) and np.isnan(java_default): + java_default = True + py_default = np.isnan(py_default) test_self.assertEqual( java_default, py_default, "Java default %s != python default %s of param %s for Params %s" From acdb4b94517f2c740fe53b80ff6870d894a885aa Mon Sep 17 00:00:00 2001 From: Bryan Cutler Date: Wed, 3 May 2017 15:32:45 -0700 Subject: [PATCH 06/13] need to create params from java when model is fit and unpersisted in order to match --- python/pyspark/ml/clustering.py | 3 +++ python/pyspark/ml/wrapper.py | 30 ++++++++++++++++++------------ 2 files changed, 21 insertions(+), 12 deletions(-) diff --git a/python/pyspark/ml/clustering.py b/python/pyspark/ml/clustering.py index 4bded3aea885..66fb00508522 100644 --- a/python/pyspark/ml/clustering.py +++ b/python/pyspark/ml/clustering.py @@ -746,8 +746,11 @@ def toLocal(self): WARNING: This involves collecting a large :py:func:`topicsMatrix` to the driver. """ model = LocalLDAModel(self._call_java("toLocal")) + # SPARK-10931: Temporary fix to be removed once LDAModel defines Params + model._create_params_from_java() model._transfer_params_from_java() + return model @since("2.0.0") diff --git a/python/pyspark/ml/wrapper.py b/python/pyspark/ml/wrapper.py index 367f8061f568..ae75ec2c5731 100644 --- a/python/pyspark/ml/wrapper.py +++ b/python/pyspark/ml/wrapper.py @@ -135,22 +135,25 @@ def _transfer_param_map_to_java(self, pyParamMap): paramMap.put([pair]) return paramMap - def _transfer_params_from_java(self): + def _create_params_from_java(self): """ - Transforms the embedded params from the companion Java object. + SPARK-10931: Temporary fix to create params that are defined in the Java obj but not here """ - sc = SparkContext._active_spark_context - - # SPARK-10931: Temporary fix to add params when loading model java_params = list(self._java_obj.params()) from pyspark.ml.param import Param for java_param in java_params: java_param_name = java_param.name() if not hasattr(self, java_param_name): param = Param(self, java_param_name, java_param.doc()) + setattr(param, "created_from_java_param", True) setattr(self, java_param_name, param) - self._params = None - # end SPARK-10931 temporary fix + self._params = None # need to reset so self.params will discover new params + + def _transfer_params_from_java(self): + """ + Transforms the embedded params from the companion Java object. + """ + sc = SparkContext._active_spark_context for param in self.params: if self._java_obj.hasParam(param.name): @@ -216,6 +219,11 @@ def __get_class(clazz): # Load information from java_stage to the instance. py_stage = py_type() py_stage._java_obj = java_stage + + # SPARK-10931: Temporary fix so that persisted models would own params from Estimator + if issubclass(py_type, JavaModel): + py_stage._create_params_from_java() + py_stage._resetUid(java_stage.uid()) py_stage._transfer_params_from_java() elif hasattr(py_type, "_from_java"): @@ -276,14 +284,12 @@ def _fit_java(self, dataset): def _fit(self, dataset): java_model = self._fit_java(dataset) model = self._create_model(java_model) + # SPARK-10931: This is a temporary fix to allow models to own params # from estimators. Eventually, these params should be in models through # using common base classes between estimators and models. - for param in self.params: - if not hasattr(model, param.name) and java_model.hasParam(param.name): - setattr(model, param.name, param) - model._params = None - # end SPARK-10931 temporary fix + model._create_params_from_java() + return self._copyValues(model) From 9b7b886125eeb389d48ea398f6305d05b29840c9 Mon Sep 17 00:00:00 2001 From: Bryan Cutler Date: Wed, 3 May 2017 15:40:34 -0700 Subject: [PATCH 07/13] removed blank line --- python/pyspark/ml/wrapper.py | 1 - 1 file changed, 1 deletion(-) diff --git a/python/pyspark/ml/wrapper.py b/python/pyspark/ml/wrapper.py index ae75ec2c5731..6d96c4f2e5b0 100644 --- a/python/pyspark/ml/wrapper.py +++ b/python/pyspark/ml/wrapper.py @@ -154,7 +154,6 @@ def _transfer_params_from_java(self): Transforms the embedded params from the companion Java object. """ sc = SparkContext._active_spark_context - for param in self.params: if self._java_obj.hasParam(param.name): java_param = self._java_obj.getParam(param.name) From 765eb5f77335232eff0889fbc7401f1e77e16dc9 Mon Sep 17 00:00:00 2001 From: Bryan Cutler Date: Wed, 3 May 2017 15:55:37 -0700 Subject: [PATCH 08/13] cleaned old comment block in test --- python/pyspark/ml/tests.py | 9 --------- 1 file changed, 9 deletions(-) diff --git a/python/pyspark/ml/tests.py b/python/pyspark/ml/tests.py index 7450bc7ac2aa..195b67eff631 100755 --- a/python/pyspark/ml/tests.py +++ b/python/pyspark/ml/tests.py @@ -450,15 +450,6 @@ def check_params(test_self, py_stage, check_params_exist=True): java_default, py_default, "Java default %s != python default %s of param %s for Params %s" % (str(java_default), str(py_default), p.name, str(py_stage))) - ''' - test_self.assertTrue(isinstance(obj, Params)) - params = obj.params - paramNames = [p.name for p in params] - test_self.assertEqual(paramNames, sorted(paramNames)) - for p in params: - test_self.assertEqual(p.parent, obj.uid) - test_self.assertEqual(obj.getParam(p.name), p) - ''' class EvaluatorTests(SparkSessionTestCase): From ca52db43f949cbb7c8b0cb096d44493b48c80b74 Mon Sep 17 00:00:00 2001 From: Bryan Cutler Date: Thu, 4 May 2017 11:57:36 -0700 Subject: [PATCH 09/13] moved call to create params to JavaModel constructor for case where make a model without fitting --- python/pyspark/ml/wrapper.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/python/pyspark/ml/wrapper.py b/python/pyspark/ml/wrapper.py index 6d96c4f2e5b0..8f29414e475c 100644 --- a/python/pyspark/ml/wrapper.py +++ b/python/pyspark/ml/wrapper.py @@ -283,12 +283,6 @@ def _fit_java(self, dataset): def _fit(self, dataset): java_model = self._fit_java(dataset) model = self._create_model(java_model) - - # SPARK-10931: This is a temporary fix to allow models to own params - # from estimators. Eventually, these params should be in models through - # using common base classes between estimators and models. - model._create_params_from_java() - return self._copyValues(model) @@ -333,4 +327,10 @@ def __init__(self, java_model=None): """ super(JavaModel, self).__init__(java_model) if java_model is not None: + + # SPARK-10931: This is a temporary fix to allow models to own params + # from estimators. Eventually, these params should be in models through + # using common base classes between estimators and models. + self._create_params_from_java() + self._resetUid(java_model.uid()) From a22a2ccdd7f8824fa3fb7f7125296cb2bddb44be Mon Sep 17 00:00:00 2001 From: Bryan Cutler Date: Mon, 8 May 2017 15:01:58 -0700 Subject: [PATCH 10/13] need to copy param value if java has default but not defined in python when loading model --- python/pyspark/ml/wrapper.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/python/pyspark/ml/wrapper.py b/python/pyspark/ml/wrapper.py index 8f29414e475c..5878cff61a65 100644 --- a/python/pyspark/ml/wrapper.py +++ b/python/pyspark/ml/wrapper.py @@ -158,7 +158,9 @@ def _transfer_params_from_java(self): if self._java_obj.hasParam(param.name): java_param = self._java_obj.getParam(param.name) # SPARK-14931: Only check set params back to avoid default params mismatch. - if self._java_obj.isSet(java_param): + if self._java_obj.isSet(java_param) or ( + # SPARK-10931: Temporary fix for params that have a default in Java + self._java_obj.hasDefault(java_param) and not self.isDefined(param)): value = _java2py(sc, self._java_obj.getOrDefault(java_param)) self._set(**{param.name: value}) From 4a66e90814f14b4a64900f11c0704b83958f0b9a Mon Sep 17 00:00:00 2001 From: Bryan Cutler Date: Mon, 8 May 2017 15:10:30 -0700 Subject: [PATCH 11/13] added some comments for test additions --- python/pyspark/ml/tests.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/python/pyspark/ml/tests.py b/python/pyspark/ml/tests.py index 195b67eff631..22eb5d6efcbd 100755 --- a/python/pyspark/ml/tests.py +++ b/python/pyspark/ml/tests.py @@ -411,6 +411,7 @@ def check_params(test_self, py_stage, check_params_exist=True): - set of params exist in Java and Python and are ordered by names - param parent has the same UID as the object's UID - default param value from Java matches value in Python + - optionally check if all params from Java also exist in Python """ py_stage_str = "%s %s" % (type(py_stage), py_stage) if not hasattr(py_stage, "_to_java"): @@ -1370,6 +1371,7 @@ def test_java_params(self): for name, cls in inspect.getmembers(module, inspect.isclass): if not name.endswith('Model') and issubclass(cls, JavaParams)\ and not inspect.isabstract(cls): + # NOTE: disable check_params_exist until there is parity with Scala API ParamTests.check_params(self, cls(), check_params_exist=False) From f4a657e08698c07f658f8d23465af899d212d099 Mon Sep 17 00:00:00 2001 From: Bryan Cutler Date: Thu, 10 Aug 2017 11:55:10 -0700 Subject: [PATCH 12/13] Changed wrapper to use setDefault for undef default params from Java, made NaN check better message when fail --- python/pyspark/ml/tests.py | 4 ++-- python/pyspark/ml/wrapper.py | 9 ++++++--- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/python/pyspark/ml/tests.py b/python/pyspark/ml/tests.py index c4512a930804..5c067cdea561 100755 --- a/python/pyspark/ml/tests.py +++ b/python/pyspark/ml/tests.py @@ -458,8 +458,8 @@ def check_params(test_self, py_stage, check_params_exist=True): py_default = py_stage.getOrDefault(p) # equality test for NaN is always False if isinstance(java_default, float) and np.isnan(java_default): - java_default = True - py_default = np.isnan(py_default) + java_default = "NaN" + py_default = "NaN" if np.isnan(py_default) else "not NaN" test_self.assertEqual( java_default, py_default, "Java default %s != python default %s of param %s for Params %s" diff --git a/python/pyspark/ml/wrapper.py b/python/pyspark/ml/wrapper.py index fc38624cc72b..c2082d8bb430 100644 --- a/python/pyspark/ml/wrapper.py +++ b/python/pyspark/ml/wrapper.py @@ -158,11 +158,14 @@ def _transfer_params_from_java(self): if self._java_obj.hasParam(param.name): java_param = self._java_obj.getParam(param.name) # SPARK-14931: Only check set params back to avoid default params mismatch. - if self._java_obj.isSet(java_param) or ( - # SPARK-10931: Temporary fix for params that have a default in Java - self._java_obj.hasDefault(java_param) and not self.isDefined(param)): + if self._java_obj.isSet(java_param): value = _java2py(sc, self._java_obj.getOrDefault(java_param)) self._set(**{param.name: value}) + # SPARK-10931: Temporary fix for params that have a default in Java + if self._java_obj.hasDefault(java_param) and not self.isDefined(param): + value = _java2py(sc, self._java_obj.getDefault(java_param)).get() + self._setDefault(**{param.name: value}) + def _transfer_param_map_from_java(self, javaParamMap): """ From 07f6e8594e46106830f3a1e8c7bb66bbaa26bb5a Mon Sep 17 00:00:00 2001 From: Bryan Cutler Date: Thu, 10 Aug 2017 14:45:37 -0700 Subject: [PATCH 13/13] style fix --- python/pyspark/ml/wrapper.py | 1 - 1 file changed, 1 deletion(-) diff --git a/python/pyspark/ml/wrapper.py b/python/pyspark/ml/wrapper.py index c2082d8bb430..0f846fbc5b5e 100644 --- a/python/pyspark/ml/wrapper.py +++ b/python/pyspark/ml/wrapper.py @@ -166,7 +166,6 @@ def _transfer_params_from_java(self): value = _java2py(sc, self._java_obj.getDefault(java_param)).get() self._setDefault(**{param.name: value}) - def _transfer_param_map_from_java(self, javaParamMap): """ Transforms a Java ParamMap into a Python ParamMap.