diff --git a/CHANGELOG.rst b/CHANGELOG.rst
index 9d5e7f4..b80450c 100644
--- a/CHANGELOG.rst
+++ b/CHANGELOG.rst
@@ -31,6 +31,7 @@ Version 0.1.*
.. |Fix| replace:: :raw-html:`Fix` :raw-latex:`{\small\sc [Fix]}`
.. |API| replace:: :raw-html:`API Change` :raw-latex:`{\small\sc [API Change]}`
+- |Feature| support configurable criterion (`#28 <https://github.com/LAMDA-NJU/Deep-Forest/pull/28>`__) @tczhao
- |Feature| support regression prediction (`#25 <https://github.com/LAMDA-NJU/Deep-Forest/pull/25>`__) @tczhao
- |Fix| fix accepted data types on the :obj:`binner` (`#23 <https://github.com/LAMDA-NJU/Deep-Forest/pull/23>`__) @xuyxu
- |Feature| implement the :meth:`get_forest` method for efficient indexing (`#22 <https://github.com/LAMDA-NJU/Deep-Forest/pull/22>`__) @xuyxu
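To make the changelog entry concrete, here is a minimal usage sketch of the new option (a sketch, assuming the `criterion="gini"` / `criterion="mse"` constructors in `cascade.py` below belong to the package's public `CascadeForestClassifier` and `CascadeForestRegressor`):

    from sklearn.datasets import load_digits
    from deepforest import CascadeForestClassifier  # assumed public entry point

    X, y = load_digits(return_X_y=True)

    # "entropy" now works alongside the previously hard-coded "gini".
    model = CascadeForestClassifier(criterion="entropy", random_state=42)
    model.fit(X, y)
    print(model.predict(X[:5]))
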
diff --git a/deepforest/_estimator.py b/deepforest/_estimator.py
index 501c5c9..5419af9 100644
--- a/deepforest/_estimator.py
+++ b/deepforest/_estimator.py
@@ -13,6 +13,7 @@
def make_classifier_estimator(
name,
+ criterion,
n_trees=100,
max_depth=None,
min_samples_leaf=1,
@@ -22,6 +23,7 @@ def make_classifier_estimator(
# RandomForestClassifier
if name == "rf":
estimator = RandomForestClassifier(
+ criterion=criterion,
n_estimators=n_trees,
max_depth=max_depth,
min_samples_leaf=min_samples_leaf,
@@ -31,6 +33,7 @@ def make_classifier_estimator(
# ExtraTreesClassifier
elif name == "erf":
estimator = ExtraTreesClassifier(
+ criterion=criterion,
n_estimators=n_trees,
max_depth=max_depth,
min_samples_leaf=min_samples_leaf,
@@ -46,6 +49,7 @@ def make_classifier_estimator(
def make_regressor_estimator(
name,
+ criterion,
n_trees=100,
max_depth=None,
min_samples_leaf=1,
@@ -55,6 +59,7 @@ def make_regressor_estimator(
# RandomForestRegressor
if name == "rf":
estimator = RandomForestRegressor(
+ criterion=criterion,
n_estimators=n_trees,
max_depth=max_depth,
min_samples_leaf=min_samples_leaf,
@@ -64,6 +69,7 @@ def make_regressor_estimator(
# ExtraTreesRegressor
elif name == "erf":
estimator = ExtraTreesRegressor(
+ criterion=criterion,
n_estimators=n_trees,
max_depth=max_depth,
min_samples_leaf=min_samples_leaf,
@@ -81,6 +87,7 @@ class Estimator(object):
def __init__(
self,
name,
+ criterion,
n_trees=100,
max_depth=None,
min_samples_leaf=1,
@@ -93,6 +100,7 @@ def __init__(
if self.is_classifier:
self.estimator_ = make_classifier_estimator(
name,
+ criterion,
n_trees,
max_depth,
min_samples_leaf,
@@ -102,6 +110,7 @@ def __init__(
else:
self.estimator_ = make_regressor_estimator(
name,
+ criterion,
n_trees,
max_depth,
min_samples_leaf,
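For illustration, `criterion` is threaded through as the second positional argument of both factories above; a hedged sketch of a direct call (the module is internal, and the parameter values below are arbitrary):

    from deepforest._estimator import make_classifier_estimator

    # Builds a RandomForestClassifier with criterion="entropy" under the hood.
    est = make_classifier_estimator(
        "rf",       # name: "rf" or "erf", per the branches above
        "entropy",  # criterion: the new positional parameter
        n_trees=10,
        max_depth=3,
    )
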
diff --git a/deepforest/_io.py b/deepforest/_io.py
index 0db1dea..419857d 100644
--- a/deepforest/_io.py
+++ b/deepforest/_io.py
@@ -319,6 +319,7 @@ def model_loadobj(dirname, obj_type, d=None):
layer_ = Layer(
layer_idx=layer_idx,
n_classes=d["n_outputs"],
+ criterion=d["criterion"],
n_estimators=d["n_estimators"],
partial_mode=d["partial_mode"],
buffer=d["buffer"],
diff --git a/deepforest/_layer.py b/deepforest/_layer.py
index ee76956..4af360c 100644
--- a/deepforest/_layer.py
+++ b/deepforest/_layer.py
@@ -47,6 +47,7 @@ def __init__(
self,
layer_idx,
n_classes,
+ criterion,
n_estimators=2,
n_trees=100,
max_depth=None,
@@ -60,6 +61,7 @@ def __init__(
):
self.layer_idx = layer_idx
self.n_classes = n_classes
+ self.criterion = criterion
self.n_estimators = n_estimators * 2 # internal conversion
self.n_trees = n_trees
self.max_depth = max_depth
@@ -89,6 +91,7 @@ def _make_estimator(self, estimator_idx, estimator_name):
estimator = Estimator(
name=estimator_name,
+ criterion=self.criterion,
n_trees=self.n_trees,
max_depth=self.max_depth,
min_samples_leaf=self.min_samples_leaf,
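One behavior worth noting from the hunk above: `Layer` stores `n_estimators * 2` because each unit pairs a random forest with an extra-trees forest, and `criterion` has no default at this level. A hypothetical construction, mirroring the test kwargs later in this diff (internal API, for illustration only):

    from deepforest._layer import Layer  # internal module

    layer = Layer(
        layer_idx=0,
        n_classes=10,
        criterion="gini",  # required; no default in Layer.__init__
        n_estimators=2,    # doubled internally to 4 forests
        n_trees=10,
    )
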
diff --git a/deepforest/cascade.py b/deepforest/cascade.py
index 8d25485..25ed539 100644
--- a/deepforest/cascade.py
+++ b/deepforest/cascade.py
@@ -28,6 +28,7 @@ def _get_predictor_kwargs(predictor_kwargs, **kwargs) -> dict:
def _build_classifier_predictor(
predictor_name,
+ criterion,
n_estimators,
n_outputs,
max_depth=None,
@@ -46,6 +47,7 @@ def _build_classifier_predictor(
predictor = RandomForestClassifier(
**_get_predictor_kwargs(
predictor_kwargs,
+ criterion=criterion,
n_estimators=n_estimators,
max_depth=max_depth,
min_samples_leaf=min_samples_leaf,
@@ -110,6 +112,7 @@ def _build_classifier_predictor(
def _build_regressor_predictor(
predictor_name,
+ criterion,
n_estimators,
n_outputs,
max_depth=None,
@@ -128,6 +131,7 @@ def _build_regressor_predictor(
predictor = RandomForestRegressor(
**_get_predictor_kwargs(
predictor_kwargs,
+ criterion=criterion,
n_estimators=n_estimators,
max_depth=max_depth,
min_samples_leaf=min_samples_leaf,
@@ -205,6 +209,10 @@ def _build_regressor_predictor(
The maximum number of cascade layers in the deep forest. Notice that
the actual number of layers can be smaller than ``max_layers`` because
of the internal early stopping stage.
+ criterion : :obj:`{"gini", "entropy"}`, default="gini"
+ The function to measure the quality of a split. Supported criteria
+ are ``gini`` for the Gini impurity and ``entropy`` for the information
+ gain. Note: this parameter is tree-specific.
n_estimators : :obj:`int`, default=2
The number of estimators in each cascade layer. It will be multiplied
by 2 internally because each estimator contains a
@@ -311,6 +319,10 @@ def _build_regressor_predictor(
The maximum number of cascade layers in the deep forest. Notice that
the actual number of layers can be smaller than ``max_layers`` because
of the internal early stopping stage.
+ criterion : :obj:`{"mse", "mae"}`, default="mse"
+ The function to measure the quality of a split. Supported criteria are
+ ``mse`` for the mean squared error, which is equal to variance reduction
+ as the feature selection criterion, and ``mae`` for the mean absolute error.
n_estimators : :obj:`int`, default=2
The number of estimators in each cascade layer. It will be multiplied
by 2 internally because each estimator contains a
@@ -441,6 +453,7 @@ def __init__(
bin_subsample=2e5,
bin_type="percentile",
max_layers=20,
+ criterion="",
n_estimators=2,
n_trees=100,
max_depth=None,
@@ -459,6 +472,7 @@ def __init__(
self.bin_subsample = bin_subsample
self.bin_type = bin_type
self.max_layers = max_layers
+ self.criterion = criterion
self.n_estimators = n_estimators
self.n_trees = n_trees
self.max_depth = max_depth
@@ -710,6 +724,7 @@ def fit(self, X, y, sample_weight=None):
layer_ = Layer(
0,
self.n_outputs_,
+ self.criterion,
self.n_estimators,
self._set_n_trees(0),
self.max_depth,
@@ -785,6 +800,7 @@ def fit(self, X, y, sample_weight=None):
layer_ = Layer(
layer_idx,
self.n_outputs_,
+ self.criterion,
self.n_estimators,
self._set_n_trees(layer_idx),
self.max_depth,
@@ -881,6 +897,7 @@ def fit(self, X, y, sample_weight=None):
if is_classifier(self):
self.predictor_ = _build_classifier_predictor(
self.predictor_name,
+ self.criterion,
self.n_trees,
self.n_outputs_,
self.max_depth,
@@ -892,6 +909,7 @@ def fit(self, X, y, sample_weight=None):
else:
self.predictor_ = _build_regressor_predictor(
self.predictor_name,
+ self.criterion,
self.n_trees,
self.n_outputs_,
self.max_depth,
@@ -1016,6 +1034,7 @@ def save(self, dirname="model"):
# Save each object sequentially
d = {}
d["n_estimators"] = self.n_estimators
+ d["criterion"] = self.criterion
d["n_layers"] = self.n_layers_
d["n_features"] = self.n_features_
d["n_outputs"] = self.n_outputs_
@@ -1107,6 +1126,7 @@ def __init__(
bin_subsample=2e5,
bin_type="percentile",
max_layers=20,
+ criterion="gini",
n_estimators=2,
n_trees=100,
max_depth=None,
@@ -1126,6 +1146,7 @@ def __init__(
bin_subsample=bin_subsample,
bin_type=bin_type,
max_layers=max_layers,
+ criterion=criterion,
n_estimators=n_estimators,
n_trees=n_trees,
max_depth=max_depth,
@@ -1302,6 +1323,7 @@ def __init__(
bin_subsample=2e5,
bin_type="percentile",
max_layers=20,
+ criterion="mse",
n_estimators=2,
n_trees=100,
max_depth=None,
@@ -1321,6 +1343,7 @@ def __init__(
bin_subsample=bin_subsample,
bin_type=bin_type,
max_layers=max_layers,
+ criterion=criterion,
n_estimators=n_estimators,
n_trees=n_trees,
max_depth=max_depth,
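Because `save` now writes `d["criterion"]` and `model_loadobj` in `_io.py` reads it back, the chosen criterion survives a save/load round trip. A minimal sketch, assuming the model's `save`/`load` directory API and using toy data:

    import numpy as np
    from deepforest import CascadeForestRegressor  # assumed public entry point

    rng = np.random.RandomState(0)
    X, y = rng.rand(100, 4), rng.rand(100)

    model = CascadeForestRegressor(criterion="mae", max_layers=2)
    model.fit(X, y)
    model.save("saved_model")

    restored = CascadeForestRegressor()
    restored.load("saved_model")  # criterion="mae" comes back from the saved dict
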
diff --git a/deepforest/tree/tree.py b/deepforest/tree/tree.py
index ead6dc2..971b1b1 100644
--- a/deepforest/tree/tree.py
+++ b/deepforest/tree/tree.py
@@ -49,7 +49,7 @@
DOUBLE = _tree.DOUBLE
CRITERIA_CLF = {"gini": _criterion.Gini, "entropy": _criterion.Entropy}
-CRITERIA_REG = {"mse": _criterion.MSE}
+CRITERIA_REG = {"mse": _criterion.MSE, "mae": _criterion.MAE}
DENSE_SPLITTERS = {
"best": _splitter.BestSplitter,
diff --git a/tests/test_layer_estimator.py b/tests/test_layer_estimator.py
index 425f93c..e596689 100644
--- a/tests/test_layer_estimator.py
+++ b/tests/test_layer_estimator.py
@@ -19,9 +19,10 @@
)
# Parameters
-layer_kwargs = {
+classifier_layer_kwargs = {
"layer_idx": 0,
"n_classes": 10,
+ "criterion": "gini",
"n_estimators": 1,
"n_trees": 10,
"max_depth": 3,
@@ -33,8 +34,34 @@
"verbose": 2,
}
-estimator_kwargs = {
+classifier_estimator_kwargs = {
"name": "rf",
+ "criterion": "gini",
+ "n_trees": 10,
+ "max_depth": 3,
+ "min_samples_leaf": 10,
+ "n_jobs": -1,
+ "random_state": 42,
+}
+
+regressor_layer_kwargs = {
+ "layer_idx": 0,
+ "n_classes": 1,
+ "criterion": "mse",
+ "n_estimators": 1,
+ "n_trees": 10,
+ "max_depth": 3,
+ "min_samples_leaf": 10,
+ "partial_mode": False,
+ "buffer": None,
+ "n_jobs": -1,
+ "random_state": 42,
+ "verbose": 2,
+}
+
+regressor_estimator_kwargs = {
+ "name": "rf",
+ "criterion": "mse",
"n_trees": 10,
"max_depth": 3,
"min_samples_leaf": 10,
@@ -45,18 +72,24 @@
def test_classifier_layer_properties_after_fitting():
- layer = Layer(**layer_kwargs)
+ layer = Layer(**classifier_layer_kwargs)
X_aug = layer.fit_transform(X_train, y_train)
y_pred_full = layer.predict_full(X_test, is_classifier=True)
# n_trees
assert (
layer.n_trees_
- == 2 * layer_kwargs["n_estimators"] * layer_kwargs["n_trees"]
+ == 2
+ * classifier_layer_kwargs["n_estimators"]
+ * classifier_layer_kwargs["n_trees"]
)
# Output dim
- expect_dim = 2 * layer_kwargs["n_classes"] * layer_kwargs["n_estimators"]
+ expect_dim = (
+ 2
+ * classifier_layer_kwargs["n_classes"]
+ * classifier_layer_kwargs["n_estimators"]
+ )
assert X_aug.shape[1] == expect_dim
assert y_pred_full.shape[1] == expect_dim
@@ -70,7 +103,7 @@ def test_regressor_layer_properties_after_fitting():
X_train, X_test, y_train, y_test = train_test_split(
X_binned, y, test_size=0.42, random_state=42
)
- layer = Layer(**layer_kwargs)
+ layer = Layer(**regressor_layer_kwargs)
layer.is_classifier = False
X_aug = layer.fit_transform(X_train, y_train)
y_pred_full = layer.predict_full(X_test, is_classifier=False)
@@ -78,11 +111,13 @@ def test_regressor_layer_properties_after_fitting():
# n_trees
assert (
layer.n_trees_
- == 2 * layer_kwargs["n_estimators"] * layer_kwargs["n_trees"]
+ == 2
+ * regressor_layer_kwargs["n_estimators"]
+ * regressor_layer_kwargs["n_trees"]
)
# Output dim
- expect_dim = 2 * layer_kwargs["n_estimators"]
+ expect_dim = 2 * regressor_layer_kwargs["n_estimators"]
assert X_aug.shape[1] == expect_dim
assert y_pred_full.shape[1] == expect_dim
@@ -90,7 +125,10 @@ def test_regressor_layer_properties_after_fitting():
@pytest.mark.parametrize(
"param", [(0, {"n_estimators": 0}), (1, {"n_trees": 0})]
)
-def test_layer_invalid_training_params(param):
+@pytest.mark.parametrize(
+ "layer_kwargs", [(classifier_layer_kwargs), (regressor_layer_kwargs)]
+)
+def test_layer_invalid_training_params(param, layer_kwargs):
case_kwargs = copy.deepcopy(layer_kwargs)
case_kwargs.update(param[1])
@@ -105,7 +143,11 @@ def test_layer_invalid_training_params(param):
layer.fit_transform(X_train, y_train)
-def test_estimator_unknown():
+@pytest.mark.parametrize(
+ "estimator_kwargs",
+ [(classifier_estimator_kwargs), (regressor_estimator_kwargs)],
+)
+def test_estimator_unknown(estimator_kwargs):
case_kwargs = copy.deepcopy(estimator_kwargs)
case_kwargs.update({"name": "unknown"})
diff --git a/tests/test_model_classifier.py b/tests/test_model_classifier.py
index b304de4..6546858 100644
--- a/tests/test_model_classifier.py
+++ b/tests/test_model_classifier.py
@@ -25,6 +25,7 @@
"bin_subsample": 2e5,
"max_layers": 10,
"n_estimators": 1,
+ "criterion": "gini",
"n_trees": 100,
"max_depth": 3,
"min_samples_leaf": 1,
@@ -43,6 +44,7 @@
"bin_subsample": 2e5,
"max_layers": 10,
"n_estimators": 2,
+ "criterion": "gini",
"n_trees": 100,
"max_depth": None,
"min_samples_leaf": 1,
@@ -239,14 +241,14 @@ def test_model_invalid_training_params(param):
@pytest.mark.parametrize("predictor", ["forest", "xgboost", "lightgbm"])
def test_classifier_predictor_normal(predictor):
deepforest.cascade._build_classifier_predictor(
- predictor, n_estimators=1, n_outputs=2
+ predictor, criterion="gini", n_estimators=1, n_outputs=2
)
def test_classifier_predictor_unknown():
with pytest.raises(NotImplementedError) as excinfo:
deepforest.cascade._build_classifier_predictor(
- "unknown", n_estimators=1, n_outputs=2
+ "unknown", criterion="gini", n_estimators=1, n_outputs=2
)
assert "name of the predictor should be one of" in str(excinfo.value)
diff --git a/tests/test_model_regressor.py b/tests/test_model_regressor.py
index 1243e09..6d13916 100644
--- a/tests/test_model_regressor.py
+++ b/tests/test_model_regressor.py
@@ -24,6 +24,7 @@
"n_bins": 10,
"bin_subsample": 2e5,
"max_layers": 10,
+ "criterion": "mse",
"n_estimators": 1,
"n_trees": 100,
"max_depth": 3,
@@ -42,6 +43,7 @@
"n_bins": 255,
"bin_subsample": 2e5,
"max_layers": 10,
+ "criterion": "mse",
"n_estimators": 2,
"n_trees": 100,
"max_depth": None,
@@ -186,14 +188,14 @@ def test_model_invalid_training_params(param):
@pytest.mark.parametrize("predictor", ["forest", "xgboost", "lightgbm"])
def test_regressor_predictor_normal(predictor):
deepforest.cascade._build_regressor_predictor(
- predictor, n_estimators=1, n_outputs=2
+ predictor, criterion="mse", n_estimators=1, n_outputs=2
)
def test_regressor_predictor_unknown():
with pytest.raises(NotImplementedError) as excinfo:
deepforest.cascade._build_regressor_predictor(
- "unknown", n_estimators=1, n_outputs=2
+ "unknown", criterion="mse", n_estimators=1, n_outputs=2
)
assert "name of the predictor should be one of" in str(excinfo.value)