diff --git a/CHANGELOG.rst b/CHANGELOG.rst
index 9d5e7f4..b80450c 100644
--- a/CHANGELOG.rst
+++ b/CHANGELOG.rst
@@ -31,6 +31,7 @@ Version 0.1.*
 .. |Fix| replace:: :raw-html:`Fix` :raw-latex:`{\small\sc [Fix]}`
 .. |API| replace:: :raw-html:`API Change` :raw-latex:`{\small\sc [API Change]}`
 
+- |Feature| support configurable criterion (`#28 `__) @tczhao
 - |Feature| support regression prediction (`#25 `__) @tczhao
 - |Fix| fix accepted data types on the :obj:`binner` (`#23 `__) @xuyxu
 - |Feature| implement the :meth:`get_forest` method for efficient indexing (`#22 `__) @xuyxu
diff --git a/deepforest/_estimator.py b/deepforest/_estimator.py
index 501c5c9..5419af9 100644
--- a/deepforest/_estimator.py
+++ b/deepforest/_estimator.py
@@ -13,6 +13,7 @@
 
 def make_classifier_estimator(
     name,
+    criterion,
     n_trees=100,
     max_depth=None,
     min_samples_leaf=1,
@@ -22,6 +23,7 @@
     # RandomForestClassifier
     if name == "rf":
         estimator = RandomForestClassifier(
+            criterion=criterion,
             n_estimators=n_trees,
             max_depth=max_depth,
             min_samples_leaf=min_samples_leaf,
@@ -31,6 +33,7 @@
     # ExtraTreesClassifier
     elif name == "erf":
         estimator = ExtraTreesClassifier(
+            criterion=criterion,
             n_estimators=n_trees,
             max_depth=max_depth,
             min_samples_leaf=min_samples_leaf,
@@ -46,6 +49,7 @@
 
 def make_regressor_estimator(
     name,
+    criterion,
     n_trees=100,
     max_depth=None,
     min_samples_leaf=1,
@@ -55,6 +59,7 @@
     # RandomForestRegressor
     if name == "rf":
         estimator = RandomForestRegressor(
+            criterion=criterion,
             n_estimators=n_trees,
             max_depth=max_depth,
             min_samples_leaf=min_samples_leaf,
@@ -64,6 +69,7 @@
     # ExtraTreesRegressor
     elif name == "erf":
         estimator = ExtraTreesRegressor(
+            criterion=criterion,
             n_estimators=n_trees,
             max_depth=max_depth,
             min_samples_leaf=min_samples_leaf,
@@ -81,6 +87,7 @@ class Estimator(object):
     def __init__(
         self,
         name,
+        criterion,
         n_trees=100,
         max_depth=None,
         min_samples_leaf=1,
@@ -93,6 +100,7 @@ def __init__(
         if self.is_classifier:
             self.estimator_ = make_classifier_estimator(
                 name,
+                criterion,
                 n_trees,
                 max_depth,
                 min_samples_leaf,
@@ -102,6 +110,7 @@
         else:
             self.estimator_ = make_regressor_estimator(
                 name,
+                criterion,
                 n_trees,
                 max_depth,
                 min_samples_leaf,
diff --git a/deepforest/_io.py b/deepforest/_io.py
index 0db1dea..419857d 100644
--- a/deepforest/_io.py
+++ b/deepforest/_io.py
@@ -319,6 +319,7 @@ def model_loadobj(dirname, obj_type, d=None):
         layer_ = Layer(
             layer_idx=layer_idx,
             n_classes=d["n_outputs"],
+            criterion=d["criterion"],
             n_estimators=d["n_estimators"],
             partial_mode=d["partial_mode"],
             buffer=d["buffer"],
diff --git a/deepforest/_layer.py b/deepforest/_layer.py
index ee76956..4af360c 100644
--- a/deepforest/_layer.py
+++ b/deepforest/_layer.py
@@ -47,6 +47,7 @@ def __init__(
         self,
         layer_idx,
         n_classes,
+        criterion,
         n_estimators=2,
         n_trees=100,
         max_depth=None,
@@ -60,6 +61,7 @@
     ):
         self.layer_idx = layer_idx
         self.n_classes = n_classes
+        self.criterion = criterion
         self.n_estimators = n_estimators * 2  # internal conversion
         self.n_trees = n_trees
         self.max_depth = max_depth
@@ -89,6 +91,7 @@ def _make_estimator(self, estimator_idx, estimator_name):
 
         estimator = Estimator(
             name=estimator_name,
+            criterion=self.criterion,
             n_trees=self.n_trees,
             max_depth=self.max_depth,
             min_samples_leaf=self.min_samples_leaf,
diff --git a/deepforest/cascade.py b/deepforest/cascade.py
index 8d25485..25ed539 100644
--- a/deepforest/cascade.py
+++ b/deepforest/cascade.py
@@ -28,6 +28,7 @@ def _get_predictor_kwargs(predictor_kwargs, **kwargs) -> dict:
 
 def _build_classifier_predictor(
     predictor_name,
+    criterion,
     n_estimators,
     n_outputs,
     max_depth=None,
@@ -46,6 +47,7 @@ def _build_classifier_predictor(
         predictor = RandomForestClassifier(
             **_get_predictor_kwargs(
                 predictor_kwargs,
+                criterion=criterion,
                 n_estimators=n_estimators,
                 max_depth=max_depth,
                 min_samples_leaf=min_samples_leaf,
@@ -110,6 +112,7 @@ def _build_classifier_predictor(
 
 def _build_regressor_predictor(
     predictor_name,
+    criterion,
     n_estimators,
     n_outputs,
     max_depth=None,
@@ -128,6 +131,7 @@ def _build_regressor_predictor(
         predictor = RandomForestRegressor(
             **_get_predictor_kwargs(
                 predictor_kwargs,
+                criterion=criterion,
                 n_estimators=n_estimators,
                 max_depth=max_depth,
                 min_samples_leaf=min_samples_leaf,
@@ -205,6 +209,10 @@
         The maximum number of cascade layers in the deep forest. Notice
         that the actual number of layers can be smaller than ``max_layers``
         because of the internal early stopping stage.
+    criterion : :obj:`{"gini", "entropy"}`, default="gini"
+        The function to measure the quality of a split. Supported criteria
+        are ``gini`` for the Gini impurity and ``entropy`` for the
+        information gain. Note: this parameter is tree-specific.
     n_estimators : :obj:`int`, default=2
         The number of estimators in each cascade layer. It will be
         multiplied by 2 internally because each estimator contains a
@@ -311,6 +319,10 @@
         The maximum number of cascade layers in the deep forest. Notice
         that the actual number of layers can be smaller than ``max_layers``
         because of the internal early stopping stage.
+    criterion : :obj:`{"mse", "mae"}`, default="mse"
+        The function to measure the quality of a split. Supported criteria are
+        ``mse`` for the mean squared error, which is equal to variance reduction
+        as feature selection criterion, and ``mae`` for the mean absolute error.
     n_estimators : :obj:`int`, default=2
         The number of estimators in each cascade layer. It will be
         multiplied by 2 internally because each estimator contains a
@@ -441,6 +453,7 @@ def __init__(
         bin_subsample=2e5,
         bin_type="percentile",
         max_layers=20,
+        criterion="",
         n_estimators=2,
         n_trees=100,
         max_depth=None,
@@ -459,6 +472,7 @@
         self.bin_subsample = bin_subsample
         self.bin_type = bin_type
         self.max_layers = max_layers
+        self.criterion = criterion
         self.n_estimators = n_estimators
         self.n_trees = n_trees
         self.max_depth = max_depth
@@ -710,6 +724,7 @@ def fit(self, X, y, sample_weight=None):
             layer_ = Layer(
                 0,
                 self.n_outputs_,
+                self.criterion,
                 self.n_estimators,
                 self._set_n_trees(0),
                 self.max_depth,
@@ -785,6 +800,7 @@ def fit(self, X, y, sample_weight=None):
             layer_ = Layer(
                 layer_idx,
                 self.n_outputs_,
+                self.criterion,
                 self.n_estimators,
                 self._set_n_trees(layer_idx),
                 self.max_depth,
@@ -881,6 +897,7 @@ def fit(self, X, y, sample_weight=None):
         if is_classifier(self):
             self.predictor_ = _build_classifier_predictor(
                 self.predictor_name,
+                self.criterion,
                 self.n_trees,
                 self.n_outputs_,
                 self.max_depth,
@@ -892,6 +909,7 @@ def fit(self, X, y, sample_weight=None):
         else:
             self.predictor_ = _build_regressor_predictor(
                 self.predictor_name,
+                self.criterion,
                 self.n_trees,
                 self.n_outputs_,
                 self.max_depth,
@@ -1016,6 +1034,7 @@ def save(self, dirname="model"):
         # Save each object sequentially
         d = {}
         d["n_estimators"] = self.n_estimators
+        d["criterion"] = self.criterion
         d["n_layers"] = self.n_layers_
         d["n_features"] = self.n_features_
         d["n_outputs"] = self.n_outputs_
@@ -1107,6 +1126,7 @@ def __init__(
         bin_subsample=2e5,
         bin_type="percentile",
         max_layers=20,
+        criterion="gini",
         n_estimators=2,
         n_trees=100,
         max_depth=None,
@@ -1126,6 +1146,7 @@
             bin_subsample=bin_subsample,
             bin_type=bin_type,
             max_layers=max_layers,
+            criterion=criterion,
             n_estimators=n_estimators,
             n_trees=n_trees,
             max_depth=max_depth,
@@ -1302,6 +1323,7 @@ def __init__(
         bin_subsample=2e5,
         bin_type="percentile",
         max_layers=20,
+        criterion="mse",
         n_estimators=2,
         n_trees=100,
         max_depth=None,
@@ -1321,6 +1343,7 @@
             bin_subsample=bin_subsample,
             bin_type=bin_type,
             max_layers=max_layers,
+            criterion=criterion,
             n_estimators=n_estimators,
             n_trees=n_trees,
             max_depth=max_depth,
diff --git a/deepforest/tree/tree.py b/deepforest/tree/tree.py
index ead6dc2..971b1b1 100644
--- a/deepforest/tree/tree.py
+++ b/deepforest/tree/tree.py
@@ -49,7 +49,7 @@
 DOUBLE = _tree.DOUBLE
 
 CRITERIA_CLF = {"gini": _criterion.Gini, "entropy": _criterion.Entropy}
-CRITERIA_REG = {"mse": _criterion.MSE}
+CRITERIA_REG = {"mse": _criterion.MSE, "mae": _criterion.MAE}
 
 DENSE_SPLITTERS = {
     "best": _splitter.BestSplitter,
diff --git a/tests/test_layer_estimator.py b/tests/test_layer_estimator.py
index 425f93c..e596689 100644
--- a/tests/test_layer_estimator.py
+++ b/tests/test_layer_estimator.py
@@ -19,9 +19,10 @@
 )
 
 # Parameters
-layer_kwargs = {
+classifier_layer_kwargs = {
     "layer_idx": 0,
     "n_classes": 10,
+    "criterion": "gini",
     "n_estimators": 1,
     "n_trees": 10,
     "max_depth": 3,
@@ -33,8 +34,34 @@
     "min_samples_leaf": 10,
     "partial_mode": False,
     "buffer": None,
     "n_jobs": -1,
     "random_state": 42,
     "verbose": 2,
 }
 
-estimator_kwargs = {
+classifier_estimator_kwargs = {
     "name": "rf",
+    "criterion": "gini",
+    "n_trees": 10,
+    "max_depth": 3,
+    "min_samples_leaf": 10,
+    "n_jobs": -1,
+    "random_state": 42,
+}
+
+regressor_layer_kwargs = {
+    "layer_idx": 0,
+    "n_classes": 1,
+    "criterion": "mse",
+    "n_estimators": 1,
+    "n_trees": 10,
+    "max_depth": 3,
+    "min_samples_leaf": 10,
+    "partial_mode": False,
+    "buffer": None,
+    "n_jobs": -1,
+    "random_state": 42,
+    "verbose": 2,
+}
+
+regressor_estimator_kwargs = {
+    "name": "rf",
+    "criterion": "mse",
"mse", "n_trees": 10, "max_depth": 3, "min_samples_leaf": 10, @@ -45,18 +72,24 @@ def test_classifier_layer_properties_after_fitting(): - layer = Layer(**layer_kwargs) + layer = Layer(**classifier_layer_kwargs) X_aug = layer.fit_transform(X_train, y_train) y_pred_full = layer.predict_full(X_test, is_classifier=True) # n_trees assert ( layer.n_trees_ - == 2 * layer_kwargs["n_estimators"] * layer_kwargs["n_trees"] + == 2 + * classifier_layer_kwargs["n_estimators"] + * classifier_layer_kwargs["n_trees"] ) # Output dim - expect_dim = 2 * layer_kwargs["n_classes"] * layer_kwargs["n_estimators"] + expect_dim = ( + 2 + * classifier_layer_kwargs["n_classes"] + * classifier_layer_kwargs["n_estimators"] + ) assert X_aug.shape[1] == expect_dim assert y_pred_full.shape[1] == expect_dim @@ -70,7 +103,7 @@ def test_regressor_layer_properties_after_fitting(): X_train, X_test, y_train, y_test = train_test_split( X_binned, y, test_size=0.42, random_state=42 ) - layer = Layer(**layer_kwargs) + layer = Layer(**regressor_layer_kwargs) layer.is_classifier = False X_aug = layer.fit_transform(X_train, y_train) y_pred_full = layer.predict_full(X_test, is_classifier=False) @@ -78,11 +111,13 @@ def test_regressor_layer_properties_after_fitting(): # n_trees assert ( layer.n_trees_ - == 2 * layer_kwargs["n_estimators"] * layer_kwargs["n_trees"] + == 2 + * regressor_layer_kwargs["n_estimators"] + * regressor_layer_kwargs["n_trees"] ) # Output dim - expect_dim = 2 * layer_kwargs["n_estimators"] + expect_dim = 2 * regressor_layer_kwargs["n_estimators"] assert X_aug.shape[1] == expect_dim assert y_pred_full.shape[1] == expect_dim @@ -90,7 +125,10 @@ def test_regressor_layer_properties_after_fitting(): @pytest.mark.parametrize( "param", [(0, {"n_estimators": 0}), (1, {"n_trees": 0})] ) -def test_layer_invalid_training_params(param): +@pytest.mark.parametrize( + "layer_kwargs", [(classifier_layer_kwargs), (regressor_layer_kwargs)] +) +def test_layer_invalid_training_params(param, layer_kwargs): case_kwargs = copy.deepcopy(layer_kwargs) case_kwargs.update(param[1]) @@ -105,7 +143,11 @@ def test_layer_invalid_training_params(param): layer.fit_transform(X_train, y_train) -def test_estimator_unknown(): +@pytest.mark.parametrize( + "estimator_kwargs", + [(classifier_estimator_kwargs), (regressor_estimator_kwargs)], +) +def test_estimator_unknown(estimator_kwargs): case_kwargs = copy.deepcopy(estimator_kwargs) case_kwargs.update({"name": "unknown"}) diff --git a/tests/test_model_classifier.py b/tests/test_model_classifier.py index b304de4..6546858 100644 --- a/tests/test_model_classifier.py +++ b/tests/test_model_classifier.py @@ -25,6 +25,7 @@ "bin_subsample": 2e5, "max_layers": 10, "n_estimators": 1, + "criterion": "gini", "n_trees": 100, "max_depth": 3, "min_samples_leaf": 1, @@ -43,6 +44,7 @@ "bin_subsample": 2e5, "max_layers": 10, "n_estimators": 2, + "criterion": "gini", "n_trees": 100, "max_depth": None, "min_samples_leaf": 1, @@ -239,14 +241,14 @@ def test_model_invalid_training_params(param): @pytest.mark.parametrize("predictor", ["forest", "xgboost", "lightgbm"]) def test_classifier_predictor_normal(predictor): deepforest.cascade._build_classifier_predictor( - predictor, n_estimators=1, n_outputs=2 + predictor, criterion="gini", n_estimators=1, n_outputs=2 ) def test_classifier_predictor_unknown(): with pytest.raises(NotImplementedError) as excinfo: deepforest.cascade._build_classifier_predictor( - "unknown", n_estimators=1, n_outputs=2 + "unknown", criterion="gini", n_estimators=1, n_outputs=2 ) assert "name of the 
diff --git a/tests/test_model_regressor.py b/tests/test_model_regressor.py
index 1243e09..6d13916 100644
--- a/tests/test_model_regressor.py
+++ b/tests/test_model_regressor.py
@@ -24,6 +24,7 @@
     "n_bins": 10,
     "bin_subsample": 2e5,
     "max_layers": 10,
+    "criterion": "mse",
     "n_estimators": 1,
     "n_trees": 100,
     "max_depth": 3,
@@ -42,6 +43,7 @@
     "n_bins": 255,
     "bin_subsample": 2e5,
     "max_layers": 10,
+    "criterion": "mse",
     "n_estimators": 2,
     "n_trees": 100,
     "max_depth": None,
@@ -186,14 +188,14 @@ def test_model_invalid_training_params(param):
 @pytest.mark.parametrize("predictor", ["forest", "xgboost", "lightgbm"])
 def test_regressor_predictor_normal(predictor):
     deepforest.cascade._build_regressor_predictor(
-        predictor, n_estimators=1, n_outputs=2
+        predictor, criterion="mse", n_estimators=1, n_outputs=2
    )
 
 
 def test_regressor_predictor_unknown():
     with pytest.raises(NotImplementedError) as excinfo:
         deepforest.cascade._build_regressor_predictor(
-            "unknown", n_estimators=1, n_outputs=2
+            "unknown", criterion="mse", n_estimators=1, n_outputs=2
         )
     assert "name of the predictor should be one of" in str(excinfo.value)
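
Usage sketch (not part of the patch): a minimal end-to-end example of the new ``criterion`` parameter, assuming the public ``CascadeForestClassifier`` and ``CascadeForestRegressor`` classes exported by the ``deepforest`` package; the synthetic scikit-learn data and hyperparameter values below are illustrative only.

# Illustrative only -- assumes the public deepforest API and synthetic data.
from sklearn.datasets import make_classification, make_regression

from deepforest import CascadeForestClassifier, CascadeForestRegressor

# Classification: override the default "gini" with "entropy"
# (information gain), one of the two keys registered in CRITERIA_CLF.
X, y = make_classification(n_samples=300, n_features=20, random_state=42)
clf = CascadeForestClassifier(criterion="entropy", random_state=42)
clf.fit(X, y)
print(clf.predict(X[:5]))

# Regression: override the default "mse" with the newly registered "mae"
# (mean absolute error), enabled by the CRITERIA_REG change in tree.py.
X, y = make_regression(n_samples=300, n_features=20, random_state=42)
reg = CascadeForestRegressor(criterion="mae", random_state=42)
reg.fit(X, y)
print(reg.predict(X[:5]))

Note that ``mae`` is typically much slower to train than ``mse``, mirroring the behaviour of the corresponding scikit-learn criteria, so the small sample sizes above keep the run short.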