diff --git a/src/python/nimbusml/internal/core/base_pipeline_item.py b/src/python/nimbusml/internal/core/base_pipeline_item.py index e45b3d0c..fa02a3c3 100644 --- a/src/python/nimbusml/internal/core/base_pipeline_item.py +++ b/src/python/nimbusml/internal/core/base_pipeline_item.py @@ -375,6 +375,8 @@ def _get_node(self, **params): def __getstate__(self): "Selects what to pickle." odict = self.__dict__.copy() + odict['export_version'] = 1 + if hasattr(self, 'model_') and \ self.model_ is not None and os.path.isfile(self.model_): with open(self.model_, "rb") as mfile: @@ -387,8 +389,11 @@ def __getstate__(self): def __setstate__(self, state): "Restore a pickled object." for k, v in state.items(): - if k not in {'modelbytes', 'type'}: + if k not in {'modelbytes', 'type', 'export_version'}: setattr(self, k, v) + + # Note: modelbytes and type were + # added before export_version 1 if 'modelbytes' in state: (fd, modelfile) = tempfile.mkstemp() fl = os.fdopen(fd, "wb") diff --git a/src/python/nimbusml/pipeline.py b/src/python/nimbusml/pipeline.py index 5a15bac4..8e9510d1 100644 --- a/src/python/nimbusml/pipeline.py +++ b/src/python/nimbusml/pipeline.py @@ -5,6 +5,7 @@ import inspect import itertools import os +import tempfile import time import warnings from collections import OrderedDict, namedtuple, defaultdict @@ -2265,6 +2266,38 @@ def load_model(self, src): self.model = src self.steps = [] + def __getstate__(self): + odict = {'export_version': 1} + + if hasattr(self, 'steps'): + odict['steps'] = self.steps + + if (hasattr(self, 'model') and + self.model is not None and + os.path.isfile(self.model)): + + with open(self.model, "rb") as f: + odict['modelbytes'] = f.read() + + return odict + + def __setstate__(self, state): + self.steps = [] + self.model = None + self.random_state = None + + for k, v in state.items(): + if k not in {'modelbytes', 'export_version'}: + setattr(self, k, v) + + if state.get('export_version', 0) == 1: + if 'modelbytes' in state: + (fd, modelfile) = tempfile.mkstemp() + fl = os.fdopen(fd, "wb") + fl.write(state['modelbytes']) + fl.close() + self.model = modelfile + @trace def score( self, diff --git a/src/python/nimbusml/tests/pipeline/test_load_save.py b/src/python/nimbusml/tests/pipeline/test_load_save.py index 309650b5..7102c066 100644 --- a/src/python/nimbusml/tests/pipeline/test_load_save.py +++ b/src/python/nimbusml/tests/pipeline/test_load_save.py @@ -3,6 +3,7 @@ # Licensed under the MIT License. # -------------------------------------------------------------------------------------------- +import os import pickle import unittest @@ -44,8 +45,14 @@ def test_model_dataframe(self): model_nimbusml.fit(train, label) # Save with pickle - pickle.dump(model_nimbusml, open('nimbusml_model.p', 'wb')) - model_nimbusml_pickle = pickle.load(open("nimbusml_model.p", "rb")) + pickle_filename = 'nimbusml_model.p' + with open(pickle_filename, 'wb') as f: + pickle.dump(model_nimbusml, f) + + with open(pickle_filename, "rb") as f: + model_nimbusml_pickle = pickle.load(f) + + os.remove(pickle_filename) score1 = model_nimbusml.predict(test).head(5) score2 = model_nimbusml_pickle.predict(test).head(5) @@ -72,6 +79,8 @@ def test_model_dataframe(self): model_nimbusml_load.sum().sum(), decimal=2) + os.remove('model.nimbusml.m') + def test_model_datastream(self): model_nimbusml = Pipeline( steps=[ @@ -85,8 +94,14 @@ def test_model_datastream(self): model_nimbusml.fit(train, label) # Save with pickle - pickle.dump(model_nimbusml, open('nimbusml_model.p', 'wb')) - model_nimbusml_pickle = pickle.load(open("nimbusml_model.p", "rb")) + pickle_filename = 'nimbusml_model.p' + with open(pickle_filename, 'wb') as f: + pickle.dump(model_nimbusml, f) + + with open(pickle_filename, "rb") as f: + model_nimbusml_pickle = pickle.load(f) + + os.remove(pickle_filename) score1 = model_nimbusml.predict(test).head(5) score2 = model_nimbusml_pickle.predict(test).head(5) @@ -119,6 +134,94 @@ def test_model_datastream(self): model_nimbusml_load.sum().sum(), decimal=2) + os.remove('model.nimbusml.m') + + def test_pipeline_saves_complete_model_file_when_pickled(self): + model_nimbusml = Pipeline( + steps=[ + ('cat', + OneHotVectorizer() << categorical_columns), + ('linear', + FastLinearBinaryClassifier( + shuffle=False, + number_of_threads=1))]) + + model_nimbusml.fit(train, label) + metrics, score = model_nimbusml.test(test, test_label, output_scores=True) + + pickle_filename = 'nimbusml_model.p' + + # Save with pickle + with open(pickle_filename, 'wb') as f: + pickle.dump(model_nimbusml, f) + + # Remove the pipeline model from disk so + # that the unpickled pipeline is forced + # to get its model from the pickled file. + os.remove(model_nimbusml.model) + + with open(pickle_filename, "rb") as f: + model_nimbusml_pickle = pickle.load(f) + + os.remove(pickle_filename) + + metrics_pickle, score_pickle = model_nimbusml_pickle.test( + test, test_label, output_scores=True) + + assert_almost_equal(score.sum().sum(), + score_pickle.sum().sum(), + decimal=2) + + assert_almost_equal(metrics.sum().sum(), + metrics_pickle.sum().sum(), + decimal=2) + + def test_unfitted_pickled_pipeline_can_be_fit(self): + pipeline = Pipeline( + steps=[ + ('cat', + OneHotVectorizer() << categorical_columns), + ('linear', + FastLinearBinaryClassifier( + shuffle=False, + number_of_threads=1))]) + + pipeline.fit(train, label) + metrics, score = pipeline.test(test, test_label, output_scores=True) + + # Create a new unfitted pipeline + pipeline = Pipeline( + steps=[ + ('cat', + OneHotVectorizer() << categorical_columns), + ('linear', + FastLinearBinaryClassifier( + shuffle=False, + number_of_threads=1))]) + + pickle_filename = 'nimbusml_model.p' + + # Save with pickle + with open(pickle_filename, 'wb') as f: + pickle.dump(pipeline, f) + + with open(pickle_filename, "rb") as f: + pipeline_pickle = pickle.load(f) + + os.remove(pickle_filename) + + pipeline_pickle.fit(train, label) + metrics_pickle, score_pickle = pipeline_pickle.test( + test, test_label, output_scores=True) + + assert_almost_equal(score.sum().sum(), + score_pickle.sum().sum(), + decimal=2) + + assert_almost_equal(metrics.sum().sum(), + metrics_pickle.sum().sum(), + decimal=2) + if __name__ == '__main__': unittest.main() diff --git a/src/python/nimbusml/tests/scikit/test_uci_adult_scikit.py b/src/python/nimbusml/tests/scikit/test_uci_adult_scikit.py index 503c21a6..a08ce3b9 100644 --- a/src/python/nimbusml/tests/scikit/test_uci_adult_scikit.py +++ b/src/python/nimbusml/tests/scikit/test_uci_adult_scikit.py @@ -3,6 +3,7 @@ # Licensed under the MIT License. # -------------------------------------------------------------------------------------------- +import os import pickle import unittest @@ -111,6 +112,7 @@ def test_pickle_predictor(self): # Unpickle model and score. We should get the exact same accuracy as # above s = pickle.dumps(ftree) + os.remove(ftree.model_) ftree2 = pickle.loads(s) scores2 = ftree2.predict(X_test) accu2 = np.mean(y_test.values.ravel() == scores2.values) @@ -130,6 +132,7 @@ def test_pickle_transform(self): # Unpickle transform and generate output. # We should get the exact same output as above s = pickle.dumps(cat) + os.remove(cat.model_) cat2 = pickle.loads(s) out2 = cat2.transform(X_train) assert_equal( @@ -158,7 +161,10 @@ def test_pickle_pipeline(self): # Unpickle model and score. We should get the exact same accuracy as # above s = pickle.dumps(pipe) + os.remove(cat.model_) + os.remove(ftree.model_) pipe2 = pickle.loads(s) + scores2 = pipe2.predict(X_test) accu2 = np.mean(y_test.values.ravel() == scores2.values) assert_equal(