feat(model): add criterion #28

Merged

merged 2 commits on Feb 12, 2021
1 change: 1 addition & 0 deletions CHANGELOG.rst
@@ -31,6 +31,7 @@ Version 0.1.*
.. |Fix| replace:: :raw-html:`<span class="badge badge-danger">Fix</span>` :raw-latex:`{\small\sc [Fix]}`
.. |API| replace:: :raw-html:`<span class="badge badge-warning">API Change</span>` :raw-latex:`{\small\sc [API Change]}`

- |Feature| support configurable criterion (`#28 <https://github.com/LAMDA-NJU/Deep-Forest/issues/28>`__) @tczhao
- |Feature| support regression prediction (`#25 <https://github.com/LAMDA-NJU/Deep-Forest/issues/25>`__) @tczhao
- |Fix| fix accepted data types on the :obj:`binner` (`#23 <https://github.com/LAMDA-NJU/Deep-Forest/pull/23>`__) @xuyxu
- |Feature| implement the :meth:`get_forest` method for efficient indexing (`#22 <https://github.com/LAMDA-NJU/Deep-Forest/pull/22>`__) @xuyxu
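Taken together, the changes below thread a new criterion argument from the public estimators down to every forest in the cascade. A minimal usage sketch of the feature, assuming only the public API shown later in this diff (CascadeForestClassifier, whose criterion defaults to "gini"); the dataset and split are placeholders:

from sklearn.datasets import load_digits
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

from deepforest import CascadeForestClassifier

X, y = load_digits(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# "entropy" switches every forest in every cascade layer from the
# default Gini impurity to information gain.
model = CascadeForestClassifier(criterion="entropy")
model.fit(X_train, y_train)
print(accuracy_score(y_test, model.predict(X_test)))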
9 changes: 9 additions & 0 deletions deepforest/_estimator.py
@@ -13,6 +13,7 @@

def make_classifier_estimator(
name,
criterion,
n_trees=100,
max_depth=None,
min_samples_leaf=1,
@@ -22,6 +23,7 @@ def make_classifier_estimator(
# RandomForestClassifier
if name == "rf":
estimator = RandomForestClassifier(
criterion=criterion,
n_estimators=n_trees,
max_depth=max_depth,
min_samples_leaf=min_samples_leaf,
@@ -31,6 +33,7 @@
# ExtraTreesClassifier
elif name == "erf":
estimator = ExtraTreesClassifier(
criterion=criterion,
n_estimators=n_trees,
max_depth=max_depth,
min_samples_leaf=min_samples_leaf,
@@ -46,6 +49,7 @@

def make_regressor_estimator(
name,
criterion,
n_trees=100,
max_depth=None,
min_samples_leaf=1,
@@ -55,6 +59,7 @@
# RandomForestRegressor
if name == "rf":
estimator = RandomForestRegressor(
criterion=criterion,
n_estimators=n_trees,
max_depth=max_depth,
min_samples_leaf=min_samples_leaf,
@@ -64,6 +69,7 @@
# ExtraTreesRegressor
elif name == "erf":
estimator = ExtraTreesRegressor(
criterion=criterion,
n_estimators=n_trees,
max_depth=max_depth,
min_samples_leaf=min_samples_leaf,
@@ -81,6 +87,7 @@ class Estimator(object):
def __init__(
self,
name,
criterion,
n_trees=100,
max_depth=None,
min_samples_leaf=1,
@@ -93,6 +100,7 @@ def __init__(
if self.is_classifier:
self.estimator_ = make_classifier_estimator(
name,
criterion,
n_trees,
max_depth,
min_samples_leaf,
@@ -102,6 +110,7 @@
else:
self.estimator_ = make_regressor_estimator(
name,
criterion,
n_trees,
max_depth,
min_samples_leaf,
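In short, both factory functions gain criterion as their second positional argument and forward it verbatim to the underlying forest. A small sketch of the updated call, assuming make_classifier_estimator returns the constructed forest as the surrounding code suggests; whether the forest classes come from scikit-learn or from this package's own forest module is outside the visible diff:

from deepforest._estimator import make_classifier_estimator

# name="rf" selects the RandomForestClassifier branch above;
# criterion is passed through unchanged.
forest = make_classifier_estimator("rf", criterion="entropy", n_trees=10)
print(type(forest).__name__)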
1 change: 1 addition & 0 deletions deepforest/_io.py
@@ -319,6 +319,7 @@ def model_loadobj(dirname, obj_type, d=None):
layer_ = Layer(
layer_idx=layer_idx,
n_classes=d["n_outputs"],
criterion=d["criterion"],
n_estimators=d["n_estimators"],
partial_mode=d["partial_mode"],
buffer=d["buffer"],
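This one-line change matters for persistence: model_loadobj now rebuilds each Layer with the criterion the model was trained with, matching the d["criterion"] = self.criterion line added to save further down in cascade.py. A round-trip sketch, assuming the existing public save/load methods and placeholder training data:

from deepforest import CascadeForestRegressor

model = CascadeForestRegressor(criterion="mae")
model.fit(X_train, y_train)
model.save("model")       # "mae" is written into the saved metadata

restored = CascadeForestRegressor()
restored.load("model")    # layers are rebuilt with criterion="mae"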
3 changes: 3 additions & 0 deletions deepforest/_layer.py
@@ -47,6 +47,7 @@ def __init__(
self,
layer_idx,
n_classes,
criterion,
n_estimators=2,
n_trees=100,
max_depth=None,
@@ -60,6 +61,7 @@
):
self.layer_idx = layer_idx
self.n_classes = n_classes
self.criterion = criterion
self.n_estimators = n_estimators * 2 # internal conversion
self.n_trees = n_trees
self.max_depth = max_depth
@@ -89,6 +91,7 @@ def _make_estimator(self, estimator_idx, estimator_name):

estimator = Estimator(
name=estimator_name,
criterion=self.criterion,
n_trees=self.n_trees,
max_depth=self.max_depth,
min_samples_leaf=self.min_samples_leaf,
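Note the n_estimators * 2 conversion: each estimator slot in a layer holds both a random forest ("rf") and an extra-trees forest ("erf"), and the new criterion is shared by all of them. A sketch of the resulting layer width, assuming the Layer signature shown above with the remaining parameters left at their defaults:

from deepforest._layer import Layer

layer = Layer(
    layer_idx=0,
    n_classes=10,
    criterion="gini",   # one criterion for every forest in the layer
    n_estimators=2,     # stored as 4 internally: 2 "rf" + 2 "erf"
)
assert layer.n_estimators == 4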
23 changes: 23 additions & 0 deletions deepforest/cascade.py
@@ -28,6 +28,7 @@ def _get_predictor_kwargs(predictor_kwargs, **kwargs) -> dict:

def _build_classifier_predictor(
predictor_name,
criterion,
n_estimators,
n_outputs,
max_depth=None,
Expand All @@ -46,6 +47,7 @@ def _build_classifier_predictor(
predictor = RandomForestClassifier(
**_get_predictor_kwargs(
predictor_kwargs,
criterion=criterion,
n_estimators=n_estimators,
max_depth=max_depth,
min_samples_leaf=min_samples_leaf,
@@ -110,6 +112,7 @@ def _build_classifier_predictor(

def _build_regressor_predictor(
predictor_name,
criterion,
n_estimators,
n_outputs,
max_depth=None,
@@ -128,6 +131,7 @@
predictor = RandomForestRegressor(
**_get_predictor_kwargs(
predictor_kwargs,
criterion=criterion,
n_estimators=n_estimators,
max_depth=max_depth,
min_samples_leaf=min_samples_leaf,
@@ -205,6 +209,10 @@ def _build_regressor_predictor(
The maximum number of cascade layers in the deep forest. Notice that
the actual number of layers can be smaller than ``max_layers`` because
of the internal early stopping stage.
criterion : :obj:`{"gini", "entropy"}`, default="gini"
The function to measure the quality of a split. Supported criteria
are ``gini`` for the Gini impurity and ``entropy`` for the information
gain. Note: this parameter is tree-specific.
n_estimators : :obj:`int`, default=2
The number of estimators in each cascade layer. It will be multiplied
by 2 internally because each estimator contains a
@@ -311,6 +319,10 @@ def _build_regressor_predictor(
The maximum number of cascade layers in the deep forest. Notice that
the actual number of layers can be smaller than ``max_layers`` because
of the internal early stopping stage.
criterion : :obj:`{"mse", "mae"}`, default="mse"
The function to measure the quality of a split. Supported criteria are
``mse`` for the mean squared error, which is equal to variance reduction
as a feature selection criterion, and ``mae`` for the mean absolute error.
n_estimators : :obj:`int`, default=2
The number of estimators in each cascade layer. It will be multiplied
by 2 internally because each estimator contains a
@@ -441,6 +453,7 @@ def __init__(
bin_subsample=2e5,
bin_type="percentile",
max_layers=20,
criterion="",
n_estimators=2,
n_trees=100,
max_depth=None,
@@ -459,6 +472,7 @@ def __init__(
self.bin_subsample = bin_subsample
self.bin_type = bin_type
self.max_layers = max_layers
self.criterion = criterion
self.n_estimators = n_estimators
self.n_trees = n_trees
self.max_depth = max_depth
@@ -710,6 +724,7 @@ def fit(self, X, y, sample_weight=None):
layer_ = Layer(
0,
self.n_outputs_,
self.criterion,
self.n_estimators,
self._set_n_trees(0),
self.max_depth,
@@ -785,6 +800,7 @@ def fit(self, X, y, sample_weight=None):
layer_ = Layer(
layer_idx,
self.n_outputs_,
self.criterion,
self.n_estimators,
self._set_n_trees(layer_idx),
self.max_depth,
@@ -881,6 +897,7 @@ def fit(self, X, y, sample_weight=None):
if is_classifier(self):
self.predictor_ = _build_classifier_predictor(
self.predictor_name,
self.criterion,
self.n_trees,
self.n_outputs_,
self.max_depth,
@@ -892,6 +909,7 @@
else:
self.predictor_ = _build_regressor_predictor(
self.predictor_name,
self.criterion,
self.n_trees,
self.n_outputs_,
self.max_depth,
Expand Down Expand Up @@ -1016,6 +1034,7 @@ def save(self, dirname="model"):
# Save each object sequentially
d = {}
d["n_estimators"] = self.n_estimators
d["criterion"] = self.criterion
d["n_layers"] = self.n_layers_
d["n_features"] = self.n_features_
d["n_outputs"] = self.n_outputs_
@@ -1107,6 +1126,7 @@ def __init__(
bin_subsample=2e5,
bin_type="percentile",
max_layers=20,
criterion="gini",
n_estimators=2,
n_trees=100,
max_depth=None,
@@ -1126,6 +1146,7 @@
bin_subsample=bin_subsample,
bin_type=bin_type,
max_layers=max_layers,
criterion=criterion,
n_estimators=n_estimators,
n_trees=n_trees,
max_depth=max_depth,
@@ -1302,6 +1323,7 @@ def __init__(
bin_subsample=2e5,
bin_type="percentile",
max_layers=20,
criterion="mse",
n_estimators=2,
n_trees=100,
max_depth=None,
@@ -1321,6 +1343,7 @@
bin_subsample=bin_subsample,
bin_type=bin_type,
max_layers=max_layers,
criterion=criterion,
n_estimators=n_estimators,
n_trees=n_trees,
max_depth=max_depth,
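Worth noting in cascade.py: the criterion reaches not only the cascade layers built in fit but also the optional final predictor via _build_classifier_predictor / _build_regressor_predictor. A sketch, assuming the estimator's existing use_predictor and predictor options (which are not part of this diff):

from deepforest import CascadeForestClassifier

model = CascadeForestClassifier(
    criterion="entropy",
    use_predictor=True,   # appends a final predictor after the cascade
    predictor="forest",   # handled by _build_classifier_predictor above
)
model.fit(X_train, y_train)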
2 changes: 1 addition & 1 deletion deepforest/tree/tree.py
@@ -49,7 +49,7 @@
DOUBLE = _tree.DOUBLE

CRITERIA_CLF = {"gini": _criterion.Gini, "entropy": _criterion.Entropy}
CRITERIA_REG = {"mse": _criterion.MSE}
CRITERIA_REG = {"mse": _criterion.MSE, "mae": _criterion.MAE}

DENSE_SPLITTERS = {
"best": _splitter.BestSplitter,
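These module-level tables are where the string chosen by the user is finally resolved to a Cython criterion class; the one-line change makes "mae" a legal value for regressors. A hypothetical helper illustrating the lookup (the real dispatch happens inside the tree-building code, not in a function of this name):

def resolve_criterion(criterion, is_classification):
    # CRITERIA_CLF / CRITERIA_REG are the dicts defined above.
    table = CRITERIA_CLF if is_classification else CRITERIA_REG
    if criterion not in table:
        raise ValueError("Unknown criterion: %r" % criterion)
    return table[criterion]

resolve_criterion("mae", is_classification=False)  # _criterion.MAE after this PR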
62 changes: 52 additions & 10 deletions tests/test_layer_estimator.py
@@ -19,9 +19,10 @@
)

# Parameters
-layer_kwargs = {
+classifier_layer_kwargs = {
"layer_idx": 0,
"n_classes": 10,
"criterion": "gini",
"n_estimators": 1,
"n_trees": 10,
"max_depth": 3,
@@ -33,8 +34,34 @@
"verbose": 2,
}

-estimator_kwargs = {
+classifier_estimator_kwargs = {
"name": "rf",
"criterion": "gini",
"n_trees": 10,
"max_depth": 3,
"min_samples_leaf": 10,
"n_jobs": -1,
"random_state": 42,
}

regressor_layer_kwargs = {
"layer_idx": 0,
"n_classes": 1,
"criterion": "mse",
"n_estimators": 1,
"n_trees": 10,
"max_depth": 3,
"min_samples_leaf": 10,
"partial_mode": False,
"buffer": None,
"n_jobs": -1,
"random_state": 42,
"verbose": 2,
}

regressor_estimator_kwargs = {
"name": "rf",
"criterion": "mse",
"n_trees": 10,
"max_depth": 3,
"min_samples_leaf": 10,
@@ -45,18 +72,24 @@

def test_classifier_layer_properties_after_fitting():

-layer = Layer(**layer_kwargs)
+layer = Layer(**classifier_layer_kwargs)
X_aug = layer.fit_transform(X_train, y_train)
y_pred_full = layer.predict_full(X_test, is_classifier=True)

# n_trees
assert (
layer.n_trees_
== 2 * layer_kwargs["n_estimators"] * layer_kwargs["n_trees"]
== 2
* classifier_layer_kwargs["n_estimators"]
* classifier_layer_kwargs["n_trees"]
)

# Output dim
expect_dim = 2 * layer_kwargs["n_classes"] * layer_kwargs["n_estimators"]
expect_dim = (
2
* classifier_layer_kwargs["n_classes"]
* classifier_layer_kwargs["n_estimators"]
)
assert X_aug.shape[1] == expect_dim
assert y_pred_full.shape[1] == expect_dim

@@ -70,27 +103,32 @@ def test_regressor_layer_properties_after_fitting():
X_train, X_test, y_train, y_test = train_test_split(
X_binned, y, test_size=0.42, random_state=42
)
-layer = Layer(**layer_kwargs)
+layer = Layer(**regressor_layer_kwargs)
layer.is_classifier = False
X_aug = layer.fit_transform(X_train, y_train)
y_pred_full = layer.predict_full(X_test, is_classifier=False)

# n_trees
assert (
layer.n_trees_
== 2 * layer_kwargs["n_estimators"] * layer_kwargs["n_trees"]
== 2
* regressor_layer_kwargs["n_estimators"]
* regressor_layer_kwargs["n_trees"]
)

# Output dim
expect_dim = 2 * layer_kwargs["n_estimators"]
expect_dim = 2 * regressor_layer_kwargs["n_estimators"]
assert X_aug.shape[1] == expect_dim
assert y_pred_full.shape[1] == expect_dim


@pytest.mark.parametrize(
"param", [(0, {"n_estimators": 0}), (1, {"n_trees": 0})]
)
-def test_layer_invalid_training_params(param):
+@pytest.mark.parametrize(
+"layer_kwargs", [(classifier_layer_kwargs), (regressor_layer_kwargs)]
+)
+def test_layer_invalid_training_params(param, layer_kwargs):
case_kwargs = copy.deepcopy(layer_kwargs)
case_kwargs.update(param[1])

@@ -105,7 +143,11 @@ def test_layer_invalid_training_params(param):
layer.fit_transform(X_train, y_train)


-def test_estimator_unknown():
+@pytest.mark.parametrize(
+"estimator_kwargs",
+[(classifier_estimator_kwargs), (regressor_estimator_kwargs)],
+)
+def test_estimator_unknown(estimator_kwargs):
case_kwargs = copy.deepcopy(estimator_kwargs)
case_kwargs.update({"name": "unknown"})
