[ENH] Add criterion for cascade forest (#28)
* feat(model): add criterion

* doc(changelog): configurable criterion
tczhao authored Feb 12, 2021
1 parent 39e61bf commit 313a5fa
Showing 9 changed files with 98 additions and 15 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.rst
@@ -31,6 +31,7 @@ Version 0.1.*
.. |Fix| replace:: :raw-html:`<span class="badge badge-danger">Fix</span>` :raw-latex:`{\small\sc [Fix]}`
.. |API| replace:: :raw-html:`<span class="badge badge-warning">API Change</span>` :raw-latex:`{\small\sc [API Change]}`

- |Feature| support configurable criterion (`#28 <https://github.com/LAMDA-NJU/Deep-Forest/issues/28>`__) @tczhao
- |Feature| support regression prediction (`#25 <https://github.com/LAMDA-NJU/Deep-Forest/issues/25>`__) @tczhao
- |Fix| fix accepted data types on the :obj:`binner` (`#23 <https://github.com/LAMDA-NJU/Deep-Forest/pull/23>`__) @xuyxu
- |Feature| implement the :meth:`get_forest` method for efficient indexing (`#22 <https://github.com/LAMDA-NJU/Deep-Forest/pull/22>`__) @xuyxu
9 changes: 9 additions & 0 deletions deepforest/_estimator.py
@@ -13,6 +13,7 @@

def make_classifier_estimator(
name,
criterion,
n_trees=100,
max_depth=None,
min_samples_leaf=1,
@@ -22,6 +23,7 @@ def make_classifier_estimator(
# RandomForestClassifier
if name == "rf":
estimator = RandomForestClassifier(
criterion=criterion,
n_estimators=n_trees,
max_depth=max_depth,
min_samples_leaf=min_samples_leaf,
@@ -31,6 +33,7 @@
# ExtraTreesClassifier
elif name == "erf":
estimator = ExtraTreesClassifier(
criterion=criterion,
n_estimators=n_trees,
max_depth=max_depth,
min_samples_leaf=min_samples_leaf,
@@ -46,6 +49,7 @@

def make_regressor_estimator(
name,
criterion,
n_trees=100,
max_depth=None,
min_samples_leaf=1,
@@ -55,6 +59,7 @@
# RandomForestRegressor
if name == "rf":
estimator = RandomForestRegressor(
criterion=criterion,
n_estimators=n_trees,
max_depth=max_depth,
min_samples_leaf=min_samples_leaf,
@@ -64,6 +69,7 @@
# ExtraTreesRegressor
elif name == "erf":
estimator = ExtraTreesRegressor(
criterion=criterion,
n_estimators=n_trees,
max_depth=max_depth,
min_samples_leaf=min_samples_leaf,
@@ -81,6 +87,7 @@ class Estimator(object):
def __init__(
self,
name,
criterion,
n_trees=100,
max_depth=None,
min_samples_leaf=1,
@@ -93,6 +100,7 @@ def __init__(
if self.is_classifier:
self.estimator_ = make_classifier_estimator(
name,
criterion,
n_trees,
max_depth,
min_samples_leaf,
@@ -102,6 +110,7 @@ def __init__(
else:
self.estimator_ = make_regressor_estimator(
name,
criterion,
n_trees,
max_depth,
min_samples_leaf,
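With this change, both factory helpers in deepforest/_estimator.py take the criterion as their second argument and forward it to the underlying forest estimators ("rf" builds a random forest, "erf" an extra-trees forest). Below is a minimal sketch of the updated call, restricted to the parameters visible in the diff; _estimator is an internal module, so treat this as an illustration rather than public API:

```python
# Illustration only: deepforest._estimator is an internal helper module.
from deepforest._estimator import make_classifier_estimator, make_regressor_estimator

# Classification forest that splits on information gain instead of Gini impurity.
clf = make_classifier_estimator("rf", criterion="entropy", n_trees=50, max_depth=5)

# Extra-trees regressor that splits on mean absolute error instead of MSE.
reg = make_regressor_estimator("erf", criterion="mae", n_trees=50, max_depth=5)
```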
1 change: 1 addition & 0 deletions deepforest/_io.py
@@ -319,6 +319,7 @@ def model_loadobj(dirname, obj_type, d=None):
layer_ = Layer(
layer_idx=layer_idx,
n_classes=d["n_outputs"],
criterion=d["criterion"],
n_estimators=d["n_estimators"],
partial_mode=d["partial_mode"],
buffer=d["buffer"],
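Together with the d["criterion"] entry written by save() further down in cascade.py, this makes the criterion part of the persisted model state, so a reloaded cascade rebuilds its layers with the same splitting rule. A short round-trip sketch follows, assuming the public save()/load() methods on the cascade classes; the data and directory name are purely illustrative:

```python
import numpy as np
from deepforest import CascadeForestClassifier

rng = np.random.RandomState(0)
X, y = rng.rand(120, 8), rng.randint(0, 2, size=120)

model = CascadeForestClassifier(criterion="entropy", n_trees=20)
model.fit(X, y)
model.save("df_checkpoint")        # criterion is stored with the other settings

restored = CascadeForestClassifier()
restored.load("df_checkpoint")     # layers are rebuilt with criterion="entropy"
```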
3 changes: 3 additions & 0 deletions deepforest/_layer.py
@@ -47,6 +47,7 @@ def __init__(
self,
layer_idx,
n_classes,
criterion,
n_estimators=2,
n_trees=100,
max_depth=None,
@@ -60,6 +61,7 @@
):
self.layer_idx = layer_idx
self.n_classes = n_classes
self.criterion = criterion
self.n_estimators = n_estimators * 2 # internal conversion
self.n_trees = n_trees
self.max_depth = max_depth
@@ -89,6 +91,7 @@ def _make_estimator(self, estimator_idx, estimator_name):

estimator = Estimator(
name=estimator_name,
criterion=self.criterion,
n_trees=self.n_trees,
max_depth=self.max_depth,
min_samples_leaf=self.min_samples_leaf,
23 changes: 23 additions & 0 deletions deepforest/cascade.py
@@ -28,6 +28,7 @@ def _get_predictor_kwargs(predictor_kwargs, **kwargs) -> dict:

def _build_classifier_predictor(
predictor_name,
criterion,
n_estimators,
n_outputs,
max_depth=None,
@@ -46,6 +47,7 @@ def _build_classifier_predictor(
predictor = RandomForestClassifier(
**_get_predictor_kwargs(
predictor_kwargs,
criterion=criterion,
n_estimators=n_estimators,
max_depth=max_depth,
min_samples_leaf=min_samples_leaf,
@@ -110,6 +112,7 @@ def _build_classifier_predictor(

def _build_regressor_predictor(
predictor_name,
criterion,
n_estimators,
n_outputs,
max_depth=None,
@@ -128,6 +131,7 @@ def _build_regressor_predictor(
predictor = RandomForestRegressor(
**_get_predictor_kwargs(
predictor_kwargs,
criterion=criterion,
n_estimators=n_estimators,
max_depth=max_depth,
min_samples_leaf=min_samples_leaf,
@@ -205,6 +209,10 @@ def _build_regressor_predictor(
The maximum number of cascade layers in the deep forest. Notice that
the actual number of layers can be smaller than ``max_layers`` because
of the internal early stopping stage.
criterion : :obj:`{"gini", "entropy"}`, default="gini"
The function to measure the quality of a split. Supported criteria
are ``gini`` for the Gini impurity and ``entropy`` for the information
gain. Note: this parameter is tree-specific.
n_estimators : :obj:`int`, default=2
The number of estimator in each cascade layer. It will be multiplied
by 2 internally because each estimator contains a
@@ -311,6 +319,10 @@ def _build_regressor_predictor(
The maximum number of cascade layers in the deep forest. Notice that
the actual number of layers can be smaller than ``max_layers`` because
of the internal early stopping stage.
criterion : :obj:`{"mse", "mae"}`, default="mse"
The function to measure the quality of a split. Supported criteria are
``mse`` for the mean squared error, which is equal to variance reduction
as feature selection criterion, and ``mae`` for the mean absolute error.
n_estimators : :obj:`int`, default=2
The number of estimator in each cascade layer. It will be multiplied
by 2 internally because each estimator contains a
@@ -441,6 +453,7 @@ def __init__(
bin_subsample=2e5,
bin_type="percentile",
max_layers=20,
criterion="",
n_estimators=2,
n_trees=100,
max_depth=None,
@@ -459,6 +472,7 @@ def __init__(
self.bin_subsample = bin_subsample
self.bin_type = bin_type
self.max_layers = max_layers
self.criterion = criterion
self.n_estimators = n_estimators
self.n_trees = n_trees
self.max_depth = max_depth
@@ -710,6 +724,7 @@ def fit(self, X, y, sample_weight=None):
layer_ = Layer(
0,
self.n_outputs_,
self.criterion,
self.n_estimators,
self._set_n_trees(0),
self.max_depth,
@@ -785,6 +800,7 @@ def fit(self, X, y, sample_weight=None):
layer_ = Layer(
layer_idx,
self.n_outputs_,
self.criterion,
self.n_estimators,
self._set_n_trees(layer_idx),
self.max_depth,
@@ -881,6 +897,7 @@ def fit(self, X, y, sample_weight=None):
if is_classifier(self):
self.predictor_ = _build_classifier_predictor(
self.predictor_name,
self.criterion,
self.n_trees,
self.n_outputs_,
self.max_depth,
@@ -892,6 +909,7 @@ def fit(self, X, y, sample_weight=None):
else:
self.predictor_ = _build_regressor_predictor(
self.predictor_name,
self.criterion,
self.n_trees,
self.n_outputs_,
self.max_depth,
@@ -1016,6 +1034,7 @@ def save(self, dirname="model"):
# Save each object sequentially
d = {}
d["n_estimators"] = self.n_estimators
d["criterion"] = self.criterion
d["n_layers"] = self.n_layers_
d["n_features"] = self.n_features_
d["n_outputs"] = self.n_outputs_
@@ -1107,6 +1126,7 @@ def __init__(
bin_subsample=2e5,
bin_type="percentile",
max_layers=20,
criterion="gini",
n_estimators=2,
n_trees=100,
max_depth=None,
@@ -1126,6 +1146,7 @@ def __init__(
bin_subsample=bin_subsample,
bin_type=bin_type,
max_layers=max_layers,
criterion=criterion,
n_estimators=n_estimators,
n_trees=n_trees,
max_depth=max_depth,
@@ -1302,6 +1323,7 @@ def __init__(
bin_subsample=2e5,
bin_type="percentile",
max_layers=20,
criterion="mse",
n_estimators=2,
n_trees=100,
max_depth=None,
@@ -1321,6 +1343,7 @@ def __init__(
bin_subsample=bin_subsample,
bin_type=bin_type,
max_layers=max_layers,
criterion=criterion,
n_estimators=n_estimators,
n_trees=n_trees,
max_depth=max_depth,
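At the user-facing level, the docstring and __init__ additions above expose criterion on both cascade classes, defaulting to "gini" for classification and "mse" for regression. A minimal usage sketch on synthetic data follows; the class names and keywords are the ones the package documents, while the dataset and settings are only illustrative:

```python
import numpy as np
from deepforest import CascadeForestClassifier, CascadeForestRegressor

rng = np.random.RandomState(42)
X = rng.rand(200, 10)

# Classification: switch from the default "gini" to information gain.
clf = CascadeForestClassifier(criterion="entropy", n_estimators=2, n_trees=50)
clf.fit(X, rng.randint(0, 3, size=200))

# Regression: switch from the default "mse" to mean absolute error.
reg = CascadeForestRegressor(criterion="mae", n_estimators=2, n_trees=50)
reg.fit(X, rng.rand(200))
```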
2 changes: 1 addition & 1 deletion deepforest/tree/tree.py
@@ -49,7 +49,7 @@
DOUBLE = _tree.DOUBLE

CRITERIA_CLF = {"gini": _criterion.Gini, "entropy": _criterion.Entropy}
CRITERIA_REG = {"mse": _criterion.MSE}
CRITERIA_REG = {"mse": _criterion.MSE, "mae": _criterion.MAE}

DENSE_SPLITTERS = {
"best": _splitter.BestSplitter,
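The one-line change to CRITERIA_REG is what lets the "mae" string reach the Cython criterion implementation when a regression tree is fitted. A minimal sketch of how the lookup is used is shown below; the constructor signature (n_outputs, n_samples) is an assumption carried over from the scikit-learn tree code this module adapts:

```python
from deepforest.tree.tree import CRITERIA_REG

# Assumed call pattern (mirrors scikit-learn's BaseDecisionTree.fit, from which
# this module is derived): the criterion string resolves to a Cython class that
# is instantiated before the tree builder runs.
n_outputs, n_samples = 1, 200
mae_criterion = CRITERIA_REG["mae"](n_outputs, n_samples)  # an _criterion.MAE instance
print(sorted(CRITERIA_REG))                                 # ['mae', 'mse']
```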
62 changes: 52 additions & 10 deletions tests/test_layer_estimator.py
@@ -19,9 +19,10 @@
)

# Parameters
layer_kwargs = {
classifier_layer_kwargs = {
"layer_idx": 0,
"n_classes": 10,
"criterion": "gini",
"n_estimators": 1,
"n_trees": 10,
"max_depth": 3,
@@ -33,8 +34,34 @@
"verbose": 2,
}

estimator_kwargs = {
classifier_estimator_kwargs = {
"name": "rf",
"criterion": "gini",
"n_trees": 10,
"max_depth": 3,
"min_samples_leaf": 10,
"n_jobs": -1,
"random_state": 42,
}

regressor_layer_kwargs = {
"layer_idx": 0,
"n_classes": 1,
"criterion": "mse",
"n_estimators": 1,
"n_trees": 10,
"max_depth": 3,
"min_samples_leaf": 10,
"partial_mode": False,
"buffer": None,
"n_jobs": -1,
"random_state": 42,
"verbose": 2,
}

regressor_estimator_kwargs = {
"name": "rf",
"criterion": "mse",
"n_trees": 10,
"max_depth": 3,
"min_samples_leaf": 10,
@@ -45,18 +72,24 @@

def test_classifier_layer_properties_after_fitting():

layer = Layer(**layer_kwargs)
layer = Layer(**classifier_layer_kwargs)
X_aug = layer.fit_transform(X_train, y_train)
y_pred_full = layer.predict_full(X_test, is_classifier=True)

# n_trees
assert (
layer.n_trees_
== 2 * layer_kwargs["n_estimators"] * layer_kwargs["n_trees"]
== 2
* classifier_layer_kwargs["n_estimators"]
* classifier_layer_kwargs["n_trees"]
)

# Output dim
expect_dim = 2 * layer_kwargs["n_classes"] * layer_kwargs["n_estimators"]
expect_dim = (
2
* classifier_layer_kwargs["n_classes"]
* classifier_layer_kwargs["n_estimators"]
)
assert X_aug.shape[1] == expect_dim
assert y_pred_full.shape[1] == expect_dim

@@ -70,27 +103,32 @@ def test_regressor_layer_properties_after_fitting():
X_train, X_test, y_train, y_test = train_test_split(
X_binned, y, test_size=0.42, random_state=42
)
layer = Layer(**layer_kwargs)
layer = Layer(**regressor_layer_kwargs)
layer.is_classifier = False
X_aug = layer.fit_transform(X_train, y_train)
y_pred_full = layer.predict_full(X_test, is_classifier=False)

# n_trees
assert (
layer.n_trees_
== 2 * layer_kwargs["n_estimators"] * layer_kwargs["n_trees"]
== 2
* regressor_layer_kwargs["n_estimators"]
* regressor_layer_kwargs["n_trees"]
)

# Output dim
expect_dim = 2 * layer_kwargs["n_estimators"]
expect_dim = 2 * regressor_layer_kwargs["n_estimators"]
assert X_aug.shape[1] == expect_dim
assert y_pred_full.shape[1] == expect_dim


@pytest.mark.parametrize(
"param", [(0, {"n_estimators": 0}), (1, {"n_trees": 0})]
)
def test_layer_invalid_training_params(param):
@pytest.mark.parametrize(
"layer_kwargs", [(classifier_layer_kwargs), (regressor_layer_kwargs)]
)
def test_layer_invalid_training_params(param, layer_kwargs):
case_kwargs = copy.deepcopy(layer_kwargs)
case_kwargs.update(param[1])

@@ -105,7 +143,11 @@ def test_layer_invalid_training_params(param):
layer.fit_transform(X_train, y_train)


def test_estimator_unknown():
@pytest.mark.parametrize(
"estimator_kwargs",
[(classifier_estimator_kwargs), (regressor_estimator_kwargs)],
)
def test_estimator_unknown(estimator_kwargs):
case_kwargs = copy.deepcopy(estimator_kwargs)
case_kwargs.update({"name": "unknown"})
