From eb67bd16f1ce3b0ecfca03b4efef49271de1ceb3 Mon Sep 17 00:00:00 2001
From: James Lamb <jaylamb20@gmail.com>
Date: Tue, 19 Jan 2021 23:57:48 -0600
Subject: [PATCH] [REVIEW] Allow saving Dask RandomForest models immediately
 after training (fixes #3331)

---
 .../dask/ensemble/randomforestclassifier.py   | 17 ++++++
 .../dask/ensemble/randomforestregressor.py    | 17 ++++++
 python/cuml/test/dask/test_random_forest.py   | 58 +++++++++++++++++++
 3 files changed, 92 insertions(+)

diff --git a/python/cuml/dask/ensemble/randomforestclassifier.py b/python/cuml/dask/ensemble/randomforestclassifier.py
index 2767fc870d..aedfb604d3 100755
--- a/python/cuml/dask/ensemble/randomforestclassifier.py
+++ b/python/cuml/dask/ensemble/randomforestclassifier.py
@@ -206,6 +206,23 @@ def get_summary_text(self):
         """
         return self._get_summary_text()
 
+    def get_combined_model(self):
+        """
+        Return the single-GPU model, building it first if needed.
+
+        Returns
+        -------
+        model : result of BaseEstimator.get_combined_model once the
+               internal model is set (None if untrained -- TODO confirm).
+        """
+        combined = self._get_internal_model()
+        if combined is None:
+            # no internal model set yet: build it from the treelite models
+            combined = self._concat_treelite_models()
+            self._set_internal_model(combined)
+
+        return BaseEstimator.get_combined_model(self)
+
     def get_detailed_text(self):
         """
         Obtain the detailed information for the random forest model, as text
diff --git a/python/cuml/dask/ensemble/randomforestregressor.py b/python/cuml/dask/ensemble/randomforestregressor.py
index 0465840da9..008875568b 100755
--- a/python/cuml/dask/ensemble/randomforestregressor.py
+++ b/python/cuml/dask/ensemble/randomforestregressor.py
@@ -206,6 +206,23 @@ def get_summary_text(self):
         """
         return self._get_summary_text()
 
+    def get_combined_model(self):
+        """
+        Return the single-GPU model, building it first if needed.
+
+        Returns
+        -------
+        model : result of BaseEstimator.get_combined_model once the
+               internal model is set (None if untrained -- TODO confirm).
+        """
+        combined = self._get_internal_model()
+        if combined is None:
+            # no internal model set yet: build it from the treelite models
+            combined = self._concat_treelite_models()
+            self._set_internal_model(combined)
+
+        return BaseEstimator.get_combined_model(self)
+
     def get_detailed_text(self):
         """
         Obtain the detailed information for the random forest model, as text
diff --git a/python/cuml/test/dask/test_random_forest.py b/python/cuml/test/dask/test_random_forest.py
index bf161ef63d..e52c8f4a36 100644
--- a/python/cuml/test/dask/test_random_forest.py
+++ b/python/cuml/test/dask/test_random_forest.py
@@ -43,6 +43,9 @@
 from cuml.dask.ensemble import RandomForestRegressor as cuRFR_mg
 from cuml.dask.common import utils as dask_utils
 
+from cuml.ensemble import RandomForestClassifier as cuRFC_sg
+from cuml.ensemble import RandomForestRegressor as cuRFR_sg
+
 from dask.array import from_array
 from sklearn.datasets import make_regression, make_classification
 from sklearn.model_selection import train_test_split
@@ -436,6 +439,61 @@ def predict_with_json_rf_regressor(rf, x):
         np.testing.assert_almost_equal(pred, expected_pred, decimal=6)
 
 
+@pytest.mark.parametrize('estimator_type', ['regression', 'classification'])
+def test_rf_get_combined_model_right_aftter_fit(client, estimator_type):
+    """
+    Check that get_combined_model() works immediately after fit().
+
+    The multi-GPU estimator is fit and get_combined_model() is called
+    straight away, with no other method called in between; the result
+    must be an instance of the matching single-GPU estimator class.
+
+    Parameters
+    ----------
+    client : Dask client (presumably a fixture), given to _prep_training_data.
+    estimator_type : 'regression' or 'classification'.
+    """
+    # hyperparameters shared by both estimator types, kept in one place
+    # so that the two configurations cannot drift apart
+    rf_params = dict(
+        max_features=1.0,
+        max_samples=1.0,
+        n_bins=16,
+        split_algo=0,
+        min_samples_leaf=2,
+        seed=23707,
+        n_streams=1,
+        n_estimators=5,
+        max_leaves=-1,
+        max_depth=3
+    )
+    X, y = make_classification(
+        n_samples=350,
+        n_features=20,
+        n_clusters_per_class=1,
+        n_informative=10,
+        random_state=123,
+        n_classes=2
+    )
+    X = X.astype(np.float32)
+    if estimator_type == 'classification':
+        cu_rf_mg = cuRFC_mg(split_criterion=0, **rf_params)
+        expected_type = cuRFC_sg
+        y = y.astype(np.int32)
+    elif estimator_type == 'regression':
+        # the binary labels are simply reused as float regression targets
+        cu_rf_mg = cuRFR_mg(**rf_params)
+        expected_type = cuRFR_sg
+        y = y.astype(np.float32)
+    else:
+        pytest.fail("unexpected estimator_type: %r" % (estimator_type,))
+
+    X_dask, y_dask = _prep_training_data(client, X, y, partitions_per_worker=2)
+    cu_rf_mg.fit(X_dask, y_dask)
+    single_gpu_model = cu_rf_mg.get_combined_model()
+    assert isinstance(single_gpu_model, expected_type)
+
+
 @pytest.mark.parametrize('n_estimators', [5, 10, 20])
 @pytest.mark.parametrize('detailed_text', [True, False])
 def test_rf_get_text(client, n_estimators, detailed_text):