feat(model): add criterion #28

Merged

merged 2 commits on Feb 12, 2021
1 change: 1 addition & 0 deletions CHANGELOG.rst
@@ -31,6 +31,7 @@ Version 0.1.*
.. |Fix| replace:: :raw-html:`<span class="badge badge-danger">Fix</span>` :raw-latex:`{\small\sc [Fix]}`
.. |API| replace:: :raw-html:`<span class="badge badge-warning">API Change</span>` :raw-latex:`{\small\sc [API Change]}`

- |Feature| support configurable criterion (`#28 <https://github.com/LAMDA-NJU/Deep-Forest/issues/28>`__) @tczhao
- |Feature| support regression prediction (`#25 <https://github.com/LAMDA-NJU/Deep-Forest/issues/25>`__) @tczhao
- |Fix| fix accepted data types on the :obj:`binner` (`#23 <https://github.com/LAMDA-NJU/Deep-Forest/pull/23>`__) @xuyxu
- |Feature| implement the :meth:`get_forest` method for efficient indexing (`#22 <https://github.com/LAMDA-NJU/Deep-Forest/pull/22>`__) @xuyxu
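Taken together, the changes below thread a new criterion argument from the public estimators down to every forest in the cascade. A minimal usage sketch of the feature, assuming only the public API shown later in this diff (CascadeForestClassifier, whose criterion defaults to "gini"); the dataset and split are placeholders:

from sklearn.datasets import load_digits
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

from deepforest import CascadeForestClassifier

X, y = load_digits(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# "entropy" switches every forest in every cascade layer from the
# default Gini impurity to information gain.
model = CascadeForestClassifier(criterion="entropy")
model.fit(X_train, y_train)
print(accuracy_score(y_test, model.predict(X_test)))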
9 changes: 9 additions & 0 deletions deepforest/_estimator.py
@@ -13,6 +13,7 @@

def make_classifier_estimator(
name,
criterion,
n_trees=100,
max_depth=None,
min_samples_leaf=1,
@@ -22,6 +23,7 @@ def make_classifier_estimator(
# RandomForestClassifier
if name == "rf":
estimator = RandomForestClassifier(
criterion=criterion,
n_estimators=n_trees,
max_depth=max_depth,
min_samples_leaf=min_samples_leaf,
@@ -31,6 +33,7 @@
# ExtraTreesClassifier
elif name == "erf":
estimator = ExtraTreesClassifier(
criterion=criterion,
n_estimators=n_trees,
max_depth=max_depth,
min_samples_leaf=min_samples_leaf,
@@ -46,6 +49,7 @@

def make_regressor_estimator(
name,
criterion,
n_trees=100,
max_depth=None,
min_samples_leaf=1,
@@ -55,6 +59,7 @@
# RandomForestRegressor
if name == "rf":
estimator = RandomForestRegressor(
criterion=criterion,
n_estimators=n_trees,
max_depth=max_depth,
min_samples_leaf=min_samples_leaf,
@@ -64,6 +69,7 @@
# ExtraTreesRegressor
elif name == "erf":
estimator = ExtraTreesRegressor(
criterion=criterion,
n_estimators=n_trees,
max_depth=max_depth,
min_samples_leaf=min_samples_leaf,
@@ -81,6 +87,7 @@ class Estimator(object):
def __init__(
self,
name,
criterion,
n_trees=100,
max_depth=None,
min_samples_leaf=1,
@@ -93,6 +100,7 @@ def __init__(
if self.is_classifier:
self.estimator_ = make_classifier_estimator(
name,
criterion,
n_trees,
max_depth,
min_samples_leaf,
@@ -102,6 +110,7 @@
else:
self.estimator_ = make_regressor_estimator(
name,
criterion,
n_trees,
max_depth,
min_samples_leaf,
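In short, both factory functions gain criterion as their second positional argument and forward it verbatim to the underlying forest. A small sketch of the updated call, assuming make_classifier_estimator returns the constructed forest as the surrounding code suggests; whether the forest classes come from scikit-learn or from this package's own forest module is outside the visible diff:

from deepforest._estimator import make_classifier_estimator

# name="rf" selects the RandomForestClassifier branch above;
# criterion is passed through unchanged.
forest = make_classifier_estimator("rf", criterion="entropy", n_trees=10)
print(type(forest).__name__)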
1 change: 1 addition & 0 deletions deepforest/_io.py
@@ -319,6 +319,7 @@ def model_loadobj(dirname, obj_type, d=None):
layer_ = Layer(
layer_idx=layer_idx,
n_classes=d["n_outputs"],
criterion=d["criterion"],
n_estimators=d["n_estimators"],
partial_mode=d["partial_mode"],
buffer=d["buffer"],
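This one-line change matters for persistence: model_loadobj now rebuilds each Layer with the criterion the model was trained with, matching the d["criterion"] = self.criterion line added to save further down in cascade.py. A round-trip sketch, assuming the existing public save/load methods and placeholder training data:

from deepforest import CascadeForestRegressor

model = CascadeForestRegressor(criterion="mae")
model.fit(X_train, y_train)
model.save("model")       # "mae" is written into the saved metadata

restored = CascadeForestRegressor()
restored.load("model")    # layers are rebuilt with criterion="mae"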
3 changes: 3 additions & 0 deletions deepforest/_layer.py
@@ -47,6 +47,7 @@ def __init__(
self,
layer_idx,
n_classes,
criterion,
n_estimators=2,
n_trees=100,
max_depth=None,
@@ -60,6 +61,7 @@
):
self.layer_idx = layer_idx
self.n_classes = n_classes
self.criterion = criterion
self.n_estimators = n_estimators * 2 # internal conversion
self.n_trees = n_trees
self.max_depth = max_depth
@@ -89,6 +91,7 @@ def _make_estimator(self, estimator_idx, estimator_name):

estimator = Estimator(
name=estimator_name,
criterion=self.criterion,
n_trees=self.n_trees,
max_depth=self.max_depth,
min_samples_leaf=self.min_samples_leaf,
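Note the n_estimators * 2 conversion: each estimator slot in a layer holds both a random forest ("rf") and an extra-trees forest ("erf"), and the new criterion is shared by all of them. A sketch of the resulting layer width, assuming the Layer signature shown above with the remaining parameters left at their defaults:

from deepforest._layer import Layer

layer = Layer(
    layer_idx=0,
    n_classes=10,
    criterion="gini",   # one criterion for every forest in the layer
    n_estimators=2,     # stored as 4 internally: 2 "rf" + 2 "erf"
)
assert layer.n_estimators == 4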
23 changes: 23 additions & 0 deletions deepforest/cascade.py
@@ -28,6 +28,7 @@ def _get_predictor_kwargs(predictor_kwargs, **kwargs) -> dict:

def _build_classifier_predictor(
predictor_name,
criterion,
n_estimators,
n_outputs,
max_depth=None,
Expand All @@ -46,6 +47,7 @@ def _build_classifier_predictor(
predictor = RandomForestClassifier(
**_get_predictor_kwargs(
predictor_kwargs,
criterion=criterion,
n_estimators=n_estimators,
max_depth=max_depth,
min_samples_leaf=min_samples_leaf,
@@ -110,6 +112,7 @@ def _build_classifier_predictor(

def _build_regressor_predictor(
predictor_name,
criterion,
n_estimators,
n_outputs,
max_depth=None,
@@ -128,6 +131,7 @@
predictor = RandomForestRegressor(
**_get_predictor_kwargs(
predictor_kwargs,
criterion=criterion,
n_estimators=n_estimators,
max_depth=max_depth,
min_samples_leaf=min_samples_leaf,
@@ -205,6 +209,10 @@ def _build_regressor_predictor(
The maximum number of cascade layers in the deep forest. Notice that
the actual number of layers can be smaller than ``max_layers`` because
of the internal early stopping stage.
criterion : :obj:`{"gini", "entropy"}`, default="gini"
The function to measure the quality of a split. Supported criteria
are ``gini`` for the Gini impurity and ``entropy`` for the information
gain. Note: this parameter is tree-specific.
n_estimators : :obj:`int`, default=2
The number of estimators in each cascade layer. It will be multiplied
by 2 internally because each estimator contains a
@@ -311,6 +319,10 @@ def _build_regressor_predictor(
The maximum number of cascade layers in the deep forest. Notice that
the actual number of layers can be smaller than ``max_layers`` because
of the internal early stopping stage.
criterion : :obj:`{"mse", "mae"}`, default="mse"
The function to measure the quality of a split. Supported criteria are
``mse`` for the mean squared error, which is equal to variance reduction
as a feature selection criterion, and ``mae`` for the mean absolute error.
n_estimators : :obj:`int`, default=2
The number of estimators in each cascade layer. It will be multiplied
by 2 internally because each estimator contains a
@@ -441,6 +453,7 @@ def __init__(
bin_subsample=2e5,
bin_type="percentile",
max_layers=20,
criterion="",
n_estimators=2,
n_trees=100,
max_depth=None,
@@ -459,6 +472,7 @@ def __init__(
self.bin_subsample = bin_subsample
self.bin_type = bin_type
self.max_layers = max_layers
self.criterion = criterion
self.n_estimators = n_estimators
self.n_trees = n_trees
self.max_depth = max_depth
@@ -710,6 +724,7 @@ def fit(self, X, y, sample_weight=None):
layer_ = Layer(
0,
self.n_outputs_,
self.criterion,
self.n_estimators,
self._set_n_trees(0),
self.max_depth,
@@ -785,6 +800,7 @@ def fit(self, X, y, sample_weight=None):
layer_ = Layer(
layer_idx,
self.n_outputs_,
self.criterion,
self.n_estimators,
self._set_n_trees(layer_idx),
self.max_depth,
@@ -881,6 +897,7 @@ def fit(self, X, y, sample_weight=None):
if is_classifier(self):
self.predictor_ = _build_classifier_predictor(
self.predictor_name,
self.criterion,
self.n_trees,
self.n_outputs_,
self.max_depth,
@@ -892,6 +909,7 @@
else:
self.predictor_ = _build_regressor_predictor(
self.predictor_name,
self.criterion,
self.n_trees,
self.n_outputs_,
self.max_depth,
Expand Down Expand Up @@ -1016,6 +1034,7 @@ def save(self, dirname="model"):
# Save each object sequentially
d = {}
d["n_estimators"] = self.n_estimators
d["criterion"] = self.criterion
d["n_layers"] = self.n_layers_
d["n_features"] = self.n_features_
d["n_outputs"] = self.n_outputs_
@@ -1107,6 +1126,7 @@ def __init__(
bin_subsample=2e5,
bin_type="percentile",
max_layers=20,
criterion="gini",
n_estimators=2,
n_trees=100,
max_depth=None,
@@ -1126,6 +1146,7 @@
bin_subsample=bin_subsample,
bin_type=bin_type,
max_layers=max_layers,
criterion=criterion,
n_estimators=n_estimators,
n_trees=n_trees,
max_depth=max_depth,
@@ -1302,6 +1323,7 @@ def __init__(
bin_subsample=2e5,
bin_type="percentile",
max_layers=20,
criterion="mse",
n_estimators=2,
n_trees=100,
max_depth=None,
@@ -1321,6 +1343,7 @@
bin_subsample=bin_subsample,
bin_type=bin_type,
max_layers=max_layers,
criterion=criterion,
n_estimators=n_estimators,
n_trees=n_trees,
max_depth=max_depth,
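Worth noting in cascade.py: the criterion reaches not only the cascade layers built in fit but also the optional final predictor via _build_classifier_predictor / _build_regressor_predictor. A sketch, assuming the estimator's existing use_predictor and predictor options (which are not part of this diff):

from deepforest import CascadeForestClassifier

model = CascadeForestClassifier(
    criterion="entropy",
    use_predictor=True,   # appends a final predictor after the cascade
    predictor="forest",   # handled by _build_classifier_predictor above
)
model.fit(X_train, y_train)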
2 changes: 1 addition & 1 deletion deepforest/tree/tree.py
@@ -49,7 +49,7 @@
DOUBLE = _tree.DOUBLE

CRITERIA_CLF = {"gini": _criterion.Gini, "entropy": _criterion.Entropy}
CRITERIA_REG = {"mse": _criterion.MSE}
CRITERIA_REG = {"mse": _criterion.MSE, "mae": _criterion.MAE}

DENSE_SPLITTERS = {
"best": _splitter.BestSplitter,
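These module-level tables are where the string chosen by the user is finally resolved to a Cython criterion class; the one-line change makes "mae" a legal value for regressors. A hypothetical helper illustrating the lookup (the real dispatch happens inside the tree-building code, not in a function of this name):

def resolve_criterion(criterion, is_classification):
    # CRITERIA_CLF / CRITERIA_REG are the dicts defined above.
    table = CRITERIA_CLF if is_classification else CRITERIA_REG
    if criterion not in table:
        raise ValueError("Unknown criterion: %r" % criterion)
    return table[criterion]

resolve_criterion("mae", is_classification=False)  # _criterion.MAE after this PR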
62 changes: 52 additions & 10 deletions tests/test_layer_estimator.py
@@ -19,9 +19,10 @@
)

# Parameters
-layer_kwargs = {
+classifier_layer_kwargs = {
"layer_idx": 0,
"n_classes": 10,
"criterion": "gini",
"n_estimators": 1,
"n_trees": 10,
"max_depth": 3,
@@ -33,8 +34,34 @@
"verbose": 2,
}

-estimator_kwargs = {
+classifier_estimator_kwargs = {
"name": "rf",
"criterion": "gini",
"n_trees": 10,
"max_depth": 3,
"min_samples_leaf": 10,
"n_jobs": -1,
"random_state": 42,
}

regressor_layer_kwargs = {
"layer_idx": 0,
"n_classes": 1,
"criterion": "mse",
"n_estimators": 1,
"n_trees": 10,
"max_depth": 3,
"min_samples_leaf": 10,
"partial_mode": False,
"buffer": None,
"n_jobs": -1,
"random_state": 42,
"verbose": 2,
}

regressor_estimator_kwargs = {
"name": "rf",
"criterion": "mse",
"n_trees": 10,
"max_depth": 3,
"min_samples_leaf": 10,
@@ -45,18 +72,24 @@

def test_classifier_layer_properties_after_fitting():

-layer = Layer(**layer_kwargs)
+layer = Layer(**classifier_layer_kwargs)
X_aug = layer.fit_transform(X_train, y_train)
y_pred_full = layer.predict_full(X_test, is_classifier=True)

# n_trees
assert (
layer.n_trees_
== 2 * layer_kwargs["n_estimators"] * layer_kwargs["n_trees"]
== 2
* classifier_layer_kwargs["n_estimators"]
* classifier_layer_kwargs["n_trees"]
)

# Output dim
expect_dim = 2 * layer_kwargs["n_classes"] * layer_kwargs["n_estimators"]
expect_dim = (
2
* classifier_layer_kwargs["n_classes"]
* classifier_layer_kwargs["n_estimators"]
)
assert X_aug.shape[1] == expect_dim
assert y_pred_full.shape[1] == expect_dim

@@ -70,27 +103,32 @@ def test_regressor_layer_properties_after_fitting():
X_train, X_test, y_train, y_test = train_test_split(
X_binned, y, test_size=0.42, random_state=42
)
-layer = Layer(**layer_kwargs)
+layer = Layer(**regressor_layer_kwargs)
layer.is_classifier = False
X_aug = layer.fit_transform(X_train, y_train)
y_pred_full = layer.predict_full(X_test, is_classifier=False)

# n_trees
assert (
layer.n_trees_
== 2 * layer_kwargs["n_estimators"] * layer_kwargs["n_trees"]
== 2
* regressor_layer_kwargs["n_estimators"]
* regressor_layer_kwargs["n_trees"]
)

# Output dim
expect_dim = 2 * layer_kwargs["n_estimators"]
expect_dim = 2 * regressor_layer_kwargs["n_estimators"]
assert X_aug.shape[1] == expect_dim
assert y_pred_full.shape[1] == expect_dim


@pytest.mark.parametrize(
"param", [(0, {"n_estimators": 0}), (1, {"n_trees": 0})]
)
-def test_layer_invalid_training_params(param):
+@pytest.mark.parametrize(
+"layer_kwargs", [(classifier_layer_kwargs), (regressor_layer_kwargs)]
+)
+def test_layer_invalid_training_params(param, layer_kwargs):
case_kwargs = copy.deepcopy(layer_kwargs)
case_kwargs.update(param[1])

@@ -105,7 +143,11 @@ def test_layer_invalid_training_params(param):
layer.fit_transform(X_train, y_train)


-def test_estimator_unknown():
+@pytest.mark.parametrize(
+"estimator_kwargs",
+[(classifier_estimator_kwargs), (regressor_estimator_kwargs)],
+)
+def test_estimator_unknown(estimator_kwargs):
case_kwargs = copy.deepcopy(estimator_kwargs)
case_kwargs.update({"name": "unknown"})
