Allow saving Dask RandomForest models immediately after training (fixes #3331) (#3388)

jameslamb · web-flow · commit df67553234d3 · 2021-02-01T19:30:05.000Z
This attempts to fix #3331. See that issue for a lot more details.

Today, `.get_combined_model()` for the Dask RandomForest model objects returns `None` if it's called immediately after training. That pattern is recommended in ["Distributed Model Pickling"](https://docs.rapids.ai/api/cuml/stable/pickling_cuml_models.html#Distributed-Model-Pickling). Without this support, there is no way to save a Dask RandomForest model using only public methods / attributes on those classes.

Per #3331 (comment), this PR proposes populating the internal model object when `get_combined_model()` is called and it has not been set yet.

## Notes for Reviewers

* I have not tested this locally. I spent about 3 hours trying to build `cuml` from source following https://github.com/rapidsai/cuml/blob/main/BUILD.md, and was not successful. If there is a containerized setup for developing `cuml`, I'd greatly appreciate a pointer to it and would be happy to try it out.

I've added a unit test for this change, so I hope that will be enough to confirm that this works and that CI will catch any mistakes I've made.

Thanks for your time and consideration.

Authors:
  - James Lamb (@jameslamb)
  - John Zedlewski (@JohnZed)

Approvers:
  - John Zedlewski (@JohnZed)

URL: #3388
diff --git a/python/cuml/dask/ensemble/base.py b/python/cuml/dask/ensemble/base.py
@@ -19,6 +19,9 @@
 import numpy as np
 import warnings
 
+from collections.abc import Iterable
+from dask.distributed import Future
+
 from cuml.dask.common.input_utils import DistributedDataHandler, \
     concatenate
 from cuml.dask.common.utils import get_client, wait_and_raise_from_futures
@@ -257,6 +260,34 @@ def _get_json(self):
             combined_dump.extend(obj)
         return json.dumps(combined_dump)
 
+    def get_combined_model(self):
+        """
+        Return single-GPU model for serialization.
+
+        Returns
+        -------
+        model : Trained single-GPU model or None if the model has not
+               yet been trained.
+        """
+        # Immediately after training no internal model has been set, so
+        # build it here on first access instead of returning None.
+        if self._get_internal_model() is None:
+            self._set_internal_model(self._concat_treelite_models())
+
+        internal_model = self._check_internal_model(self._get_internal_model())
+
+        if isinstance(self.internal_model, Iterable):
+            # This function needs to return a single instance of cuml.Base,
+            # even if the class is just a composite.
+            raise ValueError("Expected a single instance of cuml.Base "
+                             "but got %s instead." % type(self.internal_model))
+
+        elif isinstance(self.internal_model, Future):
+            # Resolve the Dask Future into the model object it wraps.
+            internal_model = self.internal_model.result()
+
+        return internal_model
+
 
 def _func_fit(model, input_data, convert_dtype):
     X = concatenate([item[0] for item in input_data])
diff --git a/python/cuml/test/dask/test_random_forest.py b/python/cuml/test/dask/test_random_forest.py
@@ -43,6 +43,9 @@
 from cuml.dask.ensemble import RandomForestRegressor as cuRFR_mg
 from cuml.dask.common import utils as dask_utils
 
+from cuml.ensemble import RandomForestClassifier as cuRFC_sg
+from cuml.ensemble import RandomForestRegressor as cuRFR_sg
+
 from dask.array import from_array
 from sklearn.datasets import make_regression, make_classification
 from sklearn.model_selection import train_test_split
@@ -436,6 +439,47 @@ def predict_with_json_rf_regressor(rf, x):
         np.testing.assert_almost_equal(pred, expected_pred, decimal=6)
 
 
+@pytest.mark.parametrize('estimator_type', ['regression', 'classification'])
+def test_rf_get_combined_model_right_aftter_fit(client, estimator_type):
+    max_depth = 3
+    n_estimators = 5
+    X, y = make_classification()  # shared data; y dtype is set per branch
+    X = X.astype(np.float32)
+    if estimator_type == 'classification':
+        cu_rf_mg = cuRFC_mg(
+            max_features=1.0,
+            max_samples=1.0,
+            n_bins=16,
+            n_streams=1,
+            n_estimators=n_estimators,
+            max_leaves=-1,
+            max_depth=max_depth
+        )
+        y = y.astype(np.int32)
+    elif estimator_type == 'regression':
+        cu_rf_mg = cuRFR_mg(
+            max_features=1.0,
+            max_samples=1.0,
+            n_bins=16,
+            n_streams=1,
+            n_estimators=n_estimators,
+            max_leaves=-1,
+            max_depth=max_depth
+        )
+        y = y.astype(np.float32)  # class labels reused as float targets
+    else:
+        assert False  # only the two parametrized values are expected
+    X_dask, y_dask = _prep_training_data(client, X, y, partitions_per_worker=2)
+    cu_rf_mg.fit(X_dask, y_dask)
+    single_gpu_model = cu_rf_mg.get_combined_model()  # directly after fit()
+    if estimator_type == 'classification':
+        assert isinstance(single_gpu_model, cuRFC_sg)
+    elif estimator_type == 'regression':
+        assert isinstance(single_gpu_model, cuRFR_sg)
+    else:
+        assert False
+
+
 @pytest.mark.parametrize('n_estimators', [5, 10, 20])
 @pytest.mark.parametrize('detailed_text', [True, False])
 def test_rf_get_text(client, n_estimators, detailed_text):