From eb67bd16f1ce3b0ecfca03b4efef49271de1ceb3 Mon Sep 17 00:00:00 2001
From: James Lamb
Date: Tue, 19 Jan 2021 23:57:48 -0600
Subject: [PATCH] [REVIEW] Allow saving Dask RandomForest models immediately after training (fixes #3331)

---
 .../dask/ensemble/randomforestclassifier.py   | 17 ++++++
 .../dask/ensemble/randomforestregressor.py    | 17 ++++++
 python/cuml/test/dask/test_random_forest.py   | 59 +++++++++++++++++++
 3 files changed, 93 insertions(+)

diff --git a/python/cuml/dask/ensemble/randomforestclassifier.py b/python/cuml/dask/ensemble/randomforestclassifier.py
index 2767fc870d..aedfb604d3 100755
--- a/python/cuml/dask/ensemble/randomforestclassifier.py
+++ b/python/cuml/dask/ensemble/randomforestclassifier.py
@@ -206,6 +206,23 @@ def get_summary_text(self):
         """
         return self._get_summary_text()
 
+    def get_combined_model(self):
+        """
+        Return single-GPU model for serialization.
+
+        Returns
+        -------
+
+        model : Trained single-GPU model or None if the model has not
+                yet been trained.
+        """
+
+        # set internal model if it hasn't been accessed before
+        if self._get_internal_model() is None:
+            self._set_internal_model(self._concat_treelite_models())
+
+        return BaseEstimator.get_combined_model(self)
+
     def get_detailed_text(self):
         """
         Obtain the detailed information for the random forest model, as text
diff --git a/python/cuml/dask/ensemble/randomforestregressor.py b/python/cuml/dask/ensemble/randomforestregressor.py
index 0465840da9..008875568b 100755
--- a/python/cuml/dask/ensemble/randomforestregressor.py
+++ b/python/cuml/dask/ensemble/randomforestregressor.py
@@ -206,6 +206,23 @@ def get_summary_text(self):
         """
         return self._get_summary_text()
 
+    def get_combined_model(self):
+        """
+        Return single-GPU model for serialization.
+
+        Returns
+        -------
+
+        model : Trained single-GPU model or None if the model has not
+                yet been trained.
+        """
+
+        # set internal model if it hasn't been accessed before
+        if self._get_internal_model() is None:
+            self._set_internal_model(self._concat_treelite_models())
+
+        return BaseEstimator.get_combined_model(self)
+
     def get_detailed_text(self):
         """
         Obtain the detailed information for the random forest model, as text
diff --git a/python/cuml/test/dask/test_random_forest.py b/python/cuml/test/dask/test_random_forest.py
index bf161ef63d..e52c8f4a36 100644
--- a/python/cuml/test/dask/test_random_forest.py
+++ b/python/cuml/test/dask/test_random_forest.py
@@ -43,6 +43,9 @@
 from cuml.dask.ensemble import RandomForestRegressor as cuRFR_mg
 from cuml.dask.common import utils as dask_utils
 
+from cuml.ensemble import RandomForestClassifier as cuRFC_sg
+from cuml.ensemble import RandomForestRegressor as cuRFR_sg
+
 from dask.array import from_array
 from sklearn.datasets import make_regression, make_classification
 from sklearn.model_selection import train_test_split
@@ -436,6 +439,62 @@ def predict_with_json_rf_regressor(rf, x):
         np.testing.assert_almost_equal(pred, expected_pred, decimal=6)
 
 
+@pytest.mark.parametrize('estimator_type', ['regression', 'classification'])
+def test_rf_get_combined_model_right_after_fit(client, estimator_type):
+    """get_combined_model() should work immediately after fit()."""
+    max_depth = 3
+    n_estimators = 5
+    X, y = make_classification(
+        n_samples=350,
+        n_features=20,
+        n_clusters_per_class=1,
+        n_informative=10,
+        random_state=123,
+        n_classes=2
+    )
+    X = X.astype(np.float32)
+    if estimator_type == 'classification':
+        cu_rf_mg = cuRFC_mg(
+            max_features=1.0,
+            max_samples=1.0,
+            n_bins=16,
+            split_algo=0,
+            split_criterion=0,
+            min_samples_leaf=2,
+            seed=23707,
+            n_streams=1,
+            n_estimators=n_estimators,
+            max_leaves=-1,
+            max_depth=max_depth
+        )
+        y = y.astype(np.int32)
+    elif estimator_type == 'regression':
+        cu_rf_mg = cuRFR_mg(
+            max_features=1.0,
+            max_samples=1.0,
+            n_bins=16,
+            split_algo=0,
+            min_samples_leaf=2,
+            seed=23707,
+            n_streams=1,
+            n_estimators=n_estimators,
+            max_leaves=-1,
+            max_depth=max_depth
+        )
+        y = y.astype(np.float32)
+    else:
+        pytest.fail('unexpected estimator_type: {}'.format(estimator_type))
+    X_dask, y_dask = _prep_training_data(client, X, y, partitions_per_worker=2)
+    cu_rf_mg.fit(X_dask, y_dask)
+    single_gpu_model = cu_rf_mg.get_combined_model()
+    if estimator_type == 'classification':
+        assert isinstance(single_gpu_model, cuRFC_sg)
+    elif estimator_type == 'regression':
+        assert isinstance(single_gpu_model, cuRFR_sg)
+    else:
+        pytest.fail('unexpected estimator_type: {}'.format(estimator_type))
+
+
 @pytest.mark.parametrize('n_estimators', [5, 10, 20])
 @pytest.mark.parametrize('detailed_text', [True, False])
 def test_rf_get_text(client, n_estimators, detailed_text):