From cd50598a9f0082de30b350a7995b4164a2770613 Mon Sep 17 00:00:00 2001
From: Thierry Moudiki
Date: Sun, 27 Oct 2024 16:59:48 +0100
Subject: [PATCH 01/26] init version with hist-gbooster

---
 examples/hist_genboost_classifier.py         |  55 +++++
 examples/hist_genboost_regressor.py          |  45 ++++
 examples/lazy_histbooster_classification.py  |  29 +++
 examples/lazy_histbooster_regression.py      |  59 +++++
 mlsauce/__init__.py                          |   5 +
 mlsauce/booster/__init__.py                  |   4 +
 mlsauce/booster/_booster_classifier.py       | 197 ++++++++++++++++-
 mlsauce/booster/_booster_regressor.py        | 208 +++++++++++++++++-
 mlsauce/lazybooster/lazyboosterclassif.py    | 116 +++++++---
 mlsauce/lazybooster/lazyboosterregression.py |  46 +++-
 mlsauce/utils/__init__.py                    |   2 +
 mlsauce/utils/histofeatures/__init__.py      |   3 +
 .../utils/histofeatures/gethistofeatures.py  |  99 +++++++++
 13 files changed, 815 insertions(+), 53 deletions(-)
 create mode 100644 examples/hist_genboost_classifier.py
 create mode 100644 examples/hist_genboost_regressor.py
 create mode 100644 examples/lazy_histbooster_classification.py
 create mode 100644 examples/lazy_histbooster_regression.py
 create mode 100644 mlsauce/utils/histofeatures/__init__.py
 create mode 100644 mlsauce/utils/histofeatures/gethistofeatures.py

diff --git a/examples/hist_genboost_classifier.py b/examples/hist_genboost_classifier.py
new file mode 100644
index 0000000..f0ef839
--- /dev/null
+++ b/examples/hist_genboost_classifier.py
@@ -0,0 +1,55 @@
+import numpy as np
+from sklearn.datasets import load_digits, load_breast_cancer, load_wine, load_iris
+from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
+from sklearn.tree import DecisionTreeRegressor, ExtraTreeRegressor
+from sklearn.kernel_ridge import KernelRidge
+from sklearn.linear_model import LinearRegression
+from time import time
+from os import chdir
+from sklearn import metrics
+import os
+
+print(f"\n ----- Running: {os.path.basename(__file__)}... ----- \n")
+
+print(os.path.relpath(os.path.dirname(__file__)))
+
+#wd="/workspace/mlsauce/mlsauce/examples"
+#
+#chdir(wd)
+
+import mlsauce as ms
+
+# solver: ridge (default)
+
+print("\n")
+print("GenericBoosting Decision tree -----")
+print("\n")
+
+print("\n")
+print("breast_cancer data -----")
+
+# data 1
+breast_cancer = load_breast_cancer()
+X = breast_cancer.data
+y = breast_cancer.target
+# split data into training set and test set
+np.random.seed(15029)
+X_train, X_test, y_train, y_test = train_test_split(X, y,
+                                                    test_size=0.2)
+
+clf = ExtraTreeRegressor()
+clf2 = LinearRegression()
+
+obj = ms.HistGenericBoostingClassifier(clf)
+print(obj.get_params())
+start = time()
+obj.fit(X_train, y_train)
+print(time()-start)
+start = time()
+print(obj.score(X_test, y_test))
+print(time()-start)
+
+print(obj.obj['loss'])
+
+print(obj.obj['fit_obj_i'])
+
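[Editor's note: since `HistGenericBoostingClassifier` differs from `GenericBoostingClassifier` only by a histogram discretization of the inputs, the two can be benchmarked side by side on the same split. A minimal sketch along the lines of the example above; it is not part of the patch and assumes this branch of mlsauce is installed:]

```python
import mlsauce as ms
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.tree import ExtraTreeRegressor

X, y = load_breast_cancer(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                    random_state=15029)

# same base learner, with and without histogram features
for Booster in (ms.GenericBoostingClassifier, ms.HistGenericBoostingClassifier):
    clf = Booster(base_model=ExtraTreeRegressor())
    clf.fit(X_train, y_train)
    print(Booster.__name__, clf.score(X_test, y_test))
```

[Nothing here is specific to trees: any scikit-learn-style regressor can be passed as `base_model`.]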
diff --git a/examples/hist_genboost_regressor.py b/examples/hist_genboost_regressor.py
new file mode 100644
index 0000000..857d96a
--- /dev/null
+++ b/examples/hist_genboost_regressor.py
@@ -0,0 +1,45 @@
+import subprocess
+import sys
+import os
+
+print(f"\n ----- Running: {os.path.basename(__file__)}... ----- \n")
+
+
+subprocess.check_call([sys.executable, "-m", "pip", "install", "matplotlib"])
+
+import mlsauce as ms
+import numpy as np
+import matplotlib.pyplot as plt
+from sklearn.datasets import load_diabetes, fetch_california_housing
+from sklearn.linear_model import Ridge, LinearRegression
+from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
+from sklearn.tree import ExtraTreeRegressor
+from time import time
+from os import chdir
+from sklearn import metrics
+
+
+print("\n")
+print("diabetes data -----")
+
+regr = ExtraTreeRegressor()
+
+diabetes = load_diabetes()
+X = diabetes.data
+y = diabetes.target
+# split data into training set and test set
+np.random.seed(15029)
+X_train, X_test, y_train, y_test = train_test_split(X, y,
+                                                    test_size=0.2)
+
+
+obj = ms.HistGenericBoostingRegressor(regr)
+print(obj.get_params())
+start = time()
+obj.fit(X_train, y_train)
+print(time()-start)
+start = time()
+print(np.sqrt(np.mean(np.square(obj.predict(X_test) - y_test))))
+print(time()-start)
+print(obj.obj['loss'])
+
diff --git a/examples/lazy_histbooster_classification.py b/examples/lazy_histbooster_classification.py
new file mode 100644
index 0000000..2b625e6
--- /dev/null
+++ b/examples/lazy_histbooster_classification.py
@@ -0,0 +1,29 @@
+import os
+import mlsauce as ms
+from sklearn.datasets import load_breast_cancer, load_iris, load_wine, load_digits
+from sklearn.model_selection import train_test_split
+from time import time
+
+print(f"\n ----- Running: {os.path.basename(__file__)}... ----- \n")
+
+#load_models = [load_breast_cancer, load_iris, load_wine, load_digits]
+load_models = [load_breast_cancer, load_iris, load_wine]
+#load_models = [load_digits]
+
+for model in load_models:
+
+    data = model()
+    X = data.data
+    y = data.target
+
+    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .2, random_state = 13)
+
+    clf = ms.LazyBoostingClassifier(verbose=0, ignore_warnings=True, #n_jobs=2,
+                                    custom_metric=None, preprocess=False)
+
+    start = time()
+    models, predictions = clf.fit(X_train, X_test, y_train, y_test, hist=True)
+    print(f"\nElapsed: {time() - start} seconds\n")
+
+    print(models)
+
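[Editor's note: what `hist=True` does under the hood is discretize each numeric column with bins learned on the training set (the patch introduces `get_histo_features` for this, further below), and reuse the same bins at prediction time. The idea in isolation, as a rough NumPy sketch; the helper names here are illustrative, not the package's API:]

```python
import numpy as np

def fit_bins(x):
    # learn bin edges on a training column; bins="auto" picks the bin count
    _, edges = np.histogram(x, bins="auto")
    return edges

def apply_bins(x, edges):
    # map each value to its bin index; out-of-range values are clipped
    # into the first/last bin
    return np.clip(np.digitize(x, edges), 1, len(edges) - 1).astype(float)

rng = np.random.default_rng(0)
x_train, x_test = rng.normal(size=200), rng.normal(size=50)
edges = fit_bins(x_train)
print(apply_bins(x_test, edges)[:10])  # discretized test column
```

[The first version of gethistofeatures.py below maps values to per-bin medians rather than indices; later commits in this series switch to plain bin indices.]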
diff --git a/examples/lazy_histbooster_regression.py b/examples/lazy_histbooster_regression.py
new file mode 100644
index 0000000..aa68da6
--- /dev/null
+++ b/examples/lazy_histbooster_regression.py
@@ -0,0 +1,59 @@
+import os
+import mlsauce as ms
+import numpy as np
+from sklearn.datasets import load_diabetes
+from sklearn.datasets import fetch_california_housing
+from sklearn.model_selection import train_test_split
+
+print(f"\n ----- Running: {os.path.basename(__file__)}... ----- \n")
+
+data = load_diabetes()
+X = data.data
+y = data.target
+X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .2, random_state = 123)
+
+regr = ms.LazyBoostingRegressor(verbose=0, ignore_warnings=True, #n_jobs=2,
+                                custom_metric=None, preprocess=True)
+models, predictions = regr.fit(X_train, X_test, y_train, y_test)
+model_dictionary = regr.provide_models(X_train, X_test, y_train, y_test)
+print(models)
+
+data = fetch_california_housing()
+X = data.data[0:1000,:]
+y = data.target[0:1000]
+X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .2, random_state = 123)
+
+regr = ms.LazyBoostingRegressor(verbose=0, ignore_warnings=True,
+                                custom_metric=None, preprocess=True)
+models, predictions = regr.fit(X_train, X_test, y_train, y_test, hist=True)
+model_dictionary = regr.provide_models(X_train, X_test, y_train, y_test)
+print(models)
+
+
+from sklearn.datasets import fetch_openml
+
+# Load the dataset from OpenML
+boston = fetch_openml(name='boston', version=1, as_frame=True)
+
+# Get the features and target
+X = boston.data
+y = boston.target
+
+# Display the first few rows
+print(X.head())
+print(y.head())
+
+np.random.seed(1509)
+X_train, X_test, y_train, y_test = train_test_split(X, y,
+                                                    test_size=0.2)
+
+X_train = X_train.astype(np.float64)
+X_test = X_test.astype(np.float64)
+y_train = y_train.astype(np.float64)
+y_test = y_test.astype(np.float64)
+
+regr = ms.LazyBoostingRegressor(verbose=0, ignore_warnings=True, #n_jobs=2,
+                                custom_metric=None, preprocess=True)
+models, predictions = regr.fit(X_train, X_test, y_train, y_test, hist=True)
+model_dictionary = regr.provide_models(X_train, X_test, y_train, y_test)
+print(models)
diff --git a/mlsauce/__init__.py b/mlsauce/__init__.py
index 2dddb7d..9767523 100644
--- a/mlsauce/__init__.py
+++ b/mlsauce/__init__.py
@@ -61,6 +61,9 @@
     LSBoostRegressor,
     GenericBoostingClassifier,
     GenericBoostingRegressor,
+    HistGenericBoostingRegressor,
+    HistGenericBoostingClassifier,
+
 )
 from .lazybooster import LazyBoostingClassifier, LazyBoostingRegressor
 from .multitaskregressor import MultiTaskRegressor
@@ -77,6 +80,8 @@
     "LSBoostClassifier",
     "GenericBoostingClassifier",
     "GenericBoostingRegressor",
+    "HistGenericBoostingClassifier",
+    "HistGenericBoostingRegressor",
     "StumpClassifier",
     "ElasticNetRegressor",
     "LassoRegressor",
diff --git a/mlsauce/booster/__init__.py b/mlsauce/booster/__init__.py
index 786fe8f..b8941dc 100644
--- a/mlsauce/booster/__init__.py
+++ b/mlsauce/booster/__init__.py
@@ -1,11 +1,15 @@
 from ._booster_regressor import LSBoostRegressor
 from ._booster_regressor import GenericBoostingRegressor
+from ._booster_regressor import HistGenericBoostingRegressor
 from ._booster_classifier import LSBoostClassifier
 from ._booster_classifier import GenericBoostingClassifier
+from ._booster_classifier import HistGenericBoostingClassifier

 __all__ = [
     "LSBoostClassifier",
     "LSBoostRegressor",
     "GenericBoostingClassifier",
     "GenericBoostingRegressor",
+    "HistGenericBoostingRegressor",
+    "HistGenericBoostingClassifier"
 ]
diff --git a/mlsauce/booster/_booster_classifier.py b/mlsauce/booster/_booster_classifier.py
index 0d395a8..ef126d0 100644
--- a/mlsauce/booster/_booster_classifier.py
+++ b/mlsauce/booster/_booster_classifier.py
@@ -11,7 +11,7 @@
     from . 
import _boosterc as boosterc except ImportError: import _boosterc as boosterc -from ..utils import cluster, check_and_install +from ..utils import cluster, check_and_install, get_histo_features class LSBoostClassifier(BaseEstimator, ClassifierMixin): @@ -672,3 +672,198 @@ def __init__( weights_distr=weights_distr, base_model=self.base_model, ) + +class HistGenericBoostingClassifier(GenericBoostingClassifier): + """Histogram-based Generic Boosting classifier (using any classifier as base learner). + + Attributes: + + base_model: object + base learner (default is ExtraTreeRegressor) to be boosted. + + n_estimators: int + number of boosting iterations. + + learning_rate: float + controls the learning speed at training time. + + n_hidden_features: int + number of nodes in successive hidden layers. + + reg_lambda: float + L2 regularization parameter for successive errors in the optimizer + (at training time). + + alpha: float + compromise between L1 and L2 regularization (must be in [0, 1]), + for `solver` == 'enet'. + + row_sample: float + percentage of rows chosen from the training set. + + col_sample: float + percentage of columns chosen from the training set. + + dropout: float + percentage of nodes dropped from the training set. + + tolerance: float + controls early stopping in gradient descent (at training time). + + direct_link: bool + indicates whether the original features are included (True) in model's + fitting or not (False). + + verbose: int + progress bar (yes = 1) or not (no = 0) (currently). + + seed: int + reproducibility seed for nodes_sim=='uniform', clustering and dropout. + + backend: str + type of backend; must be in ('cpu', 'gpu', 'tpu') + + solver: str + type of 'weak' learner; currently in ('ridge', 'lasso', 'enet'). + 'enet' is a combination of 'ridge' and 'lasso' called Elastic Net. 
+ + activation: str + activation function: currently 'relu', 'relu6', 'sigmoid', 'tanh' + + n_clusters: int + number of clusters for clustering the features + + clustering_method: str + clustering method: currently 'kmeans', 'gmm' + + cluster_scaling: str + scaling method for clustering: currently 'standard', 'robust', 'minmax' + + degree: int + degree of features interactions to include in the model + + weights_distr: str + distribution of weights for constructing the model's hidden layer; + currently 'uniform', 'gaussian' + + """ + + def __init__( + self, + base_model=ExtraTreeRegressor(), + n_estimators=100, + learning_rate=0, + n_hidden_features=5, + reg_lambda=0.1, + alpha=0.5, + row_sample=1, + col_sample=1, + dropout=0, + tolerance=1e-4, + direct_link=1, + verbose=1, + seed=123, + backend="cpu", + solver="ridge", + activation="relu", + n_clusters=0, + clustering_method="kmeans", + cluster_scaling="standard", + degree=None, + weights_distr="uniform", + ): + super().__init__( + base_model=base_model, + n_estimators=n_estimators, + learning_rate=learning_rate, + n_hidden_features=n_hidden_features, + reg_lambda=reg_lambda, + alpha=alpha, + row_sample=row_sample, + col_sample=col_sample, + dropout=dropout, + tolerance=tolerance, + direct_link=direct_link, + verbose=verbose, + seed=seed, + backend=backend, + solver=solver, + activation=activation, + n_clusters=n_clusters, + clustering_method=clustering_method, + cluster_scaling=cluster_scaling, + degree=degree, + weights_distr=weights_distr, + ) + self.base_model = base_model + self.hist_bins = None + super().__init__( + base_model=base_model, + n_estimators=n_estimators, + learning_rate=learning_rate, + n_hidden_features=n_hidden_features, + reg_lambda=reg_lambda, + alpha=alpha, + row_sample=row_sample, + col_sample=col_sample, + dropout=dropout, + tolerance=tolerance, + direct_link=direct_link, + verbose=verbose, + seed=seed, + backend=backend, + solver=solver, + activation=activation, + n_clusters=n_clusters, + clustering_method=clustering_method, + cluster_scaling=cluster_scaling, + degree=degree, + weights_distr=weights_distr, + ) + + def fit(self, X, y, **kwargs): + """Fit Booster (classifier) to training data (X, y) + + Args: + + X: {array-like}, shape = [n_samples, n_features] + Training vectors, where n_samples is the number + of samples and n_features is the number of features. + + y: array-like, shape = [n_samples] + Target values. + + **kwargs: additional parameters to be passed to self.cook_training_set. + + Returns: + + self: object. + """ + X, self.hist_bins = get_histo_features(X) + return super().fit(X, y, **kwargs) + + def predict_proba(self, X, **kwargs): + """Predict probabilites for test data X. + + Args: + + X: {array-like}, shape = [n_samples, n_features] + Training vectors, where n_samples is the number + of samples and n_features is the number of features. 
+ + **kwargs: additional parameters to be passed to + self.cook_test_set + + Returns: + + predicted values estimates for test data: {array-like} + """ + assert self.hist_bins is not None, "You must fit the model first" + X = get_histo_features(X, self.hist_bins) + try: + return super().predict_proba(np.asarray(X, order="C"), + **kwargs) + except Exception: + return super().predict_proba(X, + **kwargs) + diff --git a/mlsauce/booster/_booster_regressor.py b/mlsauce/booster/_booster_regressor.py index 09a2ec5..3ff146d 100644 --- a/mlsauce/booster/_booster_regressor.py +++ b/mlsauce/booster/_booster_regressor.py @@ -12,7 +12,7 @@ except ImportError: import _boosterc as boosterc from ..predictioninterval import PredictionInterval -from ..utils import cluster, check_and_install +from ..utils import cluster, check_and_install, get_histo_features class LSBoostRegressor(BaseEstimator, RegressorMixin): @@ -240,7 +240,7 @@ def __init__( check_and_install("jax") check_and_install("jaxlib") - def fit(self, X, y, **kwargs): + def fit(self, X, y, histo=False, **kwargs): """Fit Booster (regressor) to training data (X, y) Args: @@ -311,8 +311,8 @@ def fit(self, X, y, **kwargs): return self - def predict(self, X, level=95, method=None, **kwargs): - """Predict probabilities for test data X. + def predict(self, X, level=95, method=None, histo=False, **kwargs): + """Predict values for test data X. Args: @@ -326,13 +326,16 @@ def predict(self, X, level=95, method=None, **kwargs): method: str `None`, or 'splitconformal', 'localconformal' prediction (if you specify `return_pi = True`) + + histo: bool + whether to use histogram features or not **kwargs: additional parameters to be passed to self.cook_test_set Returns: - probability estimates for test data: {array-like} + predicted values estimates for test data: {array-like} """ if isinstance(X, pd.DataFrame): @@ -432,7 +435,7 @@ def update(self, X, y, eta=0.9): class GenericBoostingRegressor(LSBoostRegressor): - """LSBoost regressor. + """Generic Boosting regressor. Attributes: @@ -569,3 +572,196 @@ def __init__( weights_distr=weights_distr, base_model=self.base_model, ) + +class HistGenericBoostingRegressor(GenericBoostingRegressor): + """Generic Boosting regressor with histogram-based features. + + Attributes: + + base_model: object + base learner (default is ExtraTreeRegressor) to be boosted. + + n_estimators: int + number of boosting iterations. + + learning_rate: float + controls the learning speed at training time. + + n_hidden_features: int + number of nodes in successive hidden layers. + + reg_lambda: float + L2 regularization parameter for successive errors in the optimizer + (at training time). + + alpha: float + compromise between L1 and L2 regularization (must be in [0, 1]), + for `solver` == 'enet' + + row_sample: float + percentage of rows chosen from the training set. + + col_sample: float + percentage of columns chosen from the training set. + + dropout: float + percentage of nodes dropped from the training set. + + tolerance: float + controls early stopping in gradient descent (at training time). + + direct_link: bool + indicates whether the original features are included (True) in model's + fitting or not (False). + + verbose: int + progress bar (yes = 1) or not (no = 0) (currently). + + seed: int + reproducibility seed for nodes_sim=='uniform', clustering and dropout. 
+ + backend: str + type of backend; must be in ('cpu', 'gpu', 'tpu') + + solver: str + type of 'weak' learner; currently in ('ridge', 'lasso') + + activation: str + activation function: currently 'relu', 'relu6', 'sigmoid', 'tanh' + + type_pi: str. + type of prediction interval; currently "kde" (default) or "bootstrap". + Used only in `self.predict`, for `self.replications` > 0 and `self.kernel` + in ('gaussian', 'tophat'). Default is `None`. + + replications: int. + number of replications (if needed) for predictive simulation. + Used only in `self.predict`, for `self.kernel` in ('gaussian', + 'tophat') and `self.type_pi = 'kde'`. Default is `None`. + + n_clusters: int + number of clusters for clustering the features + + clustering_method: str + clustering method: currently 'kmeans', 'gmm' + + cluster: bool + whether to cluster the features or not + + cluster_scaling: str + scaling method for clustering: currently 'standard', 'robust', 'minmax' + + degree: int + degree of features interactions to include in the model + + weights_distr: str + distribution of weights for constructing the model's hidden layer; + either 'uniform' or 'gaussian' + """ + def __init__( + self, + base_model=ExtraTreeRegressor(), + n_estimators=100, + learning_rate=0.1, + n_hidden_features=5, + reg_lambda=0.1, + alpha=0.5, + row_sample=1, + col_sample=1, + dropout=0, + tolerance=1e-4, + direct_link=1, + verbose=1, + seed=123, + backend="cpu", + solver="ridge", + activation="relu", + type_pi=None, + replications=None, + kernel=None, + n_clusters=0, + clustering_method="kmeans", + cluster_scaling="standard", + degree=None, + weights_distr="uniform", + ): + self.base_model = base_model + self.hist_bins = None + super().__init__( + n_estimators=n_estimators, + learning_rate=learning_rate, + n_hidden_features=n_hidden_features, + reg_lambda=reg_lambda, + alpha=alpha, + row_sample=row_sample, + col_sample=col_sample, + dropout=dropout, + tolerance=tolerance, + direct_link=direct_link, + verbose=verbose, + seed=seed, + backend=backend, + solver=solver, + activation=activation, + type_pi=type_pi, + replications=replications, + kernel=kernel, + n_clusters=n_clusters, + clustering_method=clustering_method, + cluster_scaling=cluster_scaling, + degree=degree, + weights_distr=weights_distr, + base_model=self.base_model, + ) + + def fit(self, X, y, **kwargs): + """Fit Booster (regressor) to training data (X, y) + + Args: + + X: {array-like}, shape = [n_samples, n_features] + Training vectors, where n_samples is the number + of samples and n_features is the number of features. + + y: array-like, shape = [n_samples] + Target values. + + **kwargs: additional parameters to be passed to self.cook_training_set. + + Returns: + + self: object. + """ + X, self.hist_bins = get_histo_features(X) + return super().fit(X, y, **kwargs) + + def predict(self, X, level=95, method=None, **kwargs): + """Predict values for test data X. + + Args: + + X: {array-like}, shape = [n_samples, n_features] + Training vectors, where n_samples is the number + of samples and n_features is the number of features. 
+ + level: int + Level of confidence (default = 95) + + method: str + `None`, or 'splitconformal', 'localconformal' + prediction (if you specify `return_pi = True`) + + histo: bool + whether to use histogram features or not + + **kwargs: additional parameters to be passed to + self.cook_test_set + + Returns: + + predicted values estimates for test data: {array-like} + """ + assert self.hist_bins is not None, "You must fit the model first" + X = get_histo_features(X, self.hist_bins) + return super().predict(X, level=level, method=method, **kwargs) + diff --git a/mlsauce/lazybooster/lazyboosterclassif.py b/mlsauce/lazybooster/lazyboosterclassif.py index b76ceab..2ef1398 100644 --- a/mlsauce/lazybooster/lazyboosterclassif.py +++ b/mlsauce/lazybooster/lazyboosterclassif.py @@ -26,7 +26,7 @@ f1_score, ) from .config import REGRESSORS, MTASKREGRESSORS -from ..booster import GenericBoostingClassifier +from ..booster import GenericBoostingClassifier, HistGenericBoostingClassifier from ..multitaskregressor import MultiTaskRegressor import warnings @@ -194,7 +194,7 @@ def __init__( self.preprocess = preprocess self.n_jobs = n_jobs - def fit(self, X_train, X_test, y_train, y_test, **kwargs): + def fit(self, X_train, X_test, y_train, y_test, hist=False, **kwargs): """Fit classifiers to X_train and y_train, predict and score on X_test, y_test. @@ -215,6 +215,9 @@ def fit(self, X_train, X_test, y_train, y_test, **kwargs): y_test: array-like, Testing vectors, where rows is the number of samples and columns is the number of features. + + hist: bool, optional (default=False) + When set to True, the model is a HistGenericBoostingClassifier. **kwargs: dict, Additional arguments to be passed to the fit GenericBoostingClassifier. @@ -376,19 +379,34 @@ def fit(self, X_train, X_test, y_train, y_test, **kwargs): try: if "random_state" in model().get_params().keys(): - fitted_clf = GenericBoostingClassifier( - {**other_args, **kwargs}, - verbose=self.verbose, - base_model=model( - random_state=self.random_state - ), - ) + if hist: + fitted_clf = GenericBoostingClassifier( + {**other_args, **kwargs}, + verbose=self.verbose, + base_model=model( + random_state=self.random_state + ), + ) + else: + fitted_clf = HistGenericBoostingClassifier( + {**other_args, **kwargs}, + verbose=self.verbose, + base_model=model( + random_state=self.random_state + ), + ) else: - fitted_clf = GenericBoostingClassifier( - base_model=model(**kwargs), - verbose=self.verbose, - ) + if hist: + fitted_clf = GenericBoostingClassifier( + base_model=model(**kwargs), + verbose=self.verbose, + ) + else: + fitted_clf = HistGenericBoostingClassifier( + base_model=model(**kwargs), + verbose=self.verbose, + ) if self.verbose > 0: print("\n Fitting boosted " + name + " model...") @@ -500,20 +518,36 @@ def fit(self, X_train, X_test, y_train, y_test, **kwargs): start = time.time() try: if "random_state" in model().get_params().keys(): - fitted_clf = GenericBoostingClassifier( - base_model=model( - random_state=self.random_state - ), - verbose=self.verbose, - **kwargs - ) + if hist: + fitted_clf = GenericBoostingClassifier( + base_model=model( + random_state=self.random_state + ), + verbose=self.verbose, + **kwargs + ) + else: + fitted_clf = HistGenericBoostingClassifier( + base_model=model( + random_state=self.random_state + ), + verbose=self.verbose, + **kwargs + ) else: - fitted_clf = GenericBoostingClassifier( - base_model=model(), - verbose=self.verbose, - **kwargs - ) + if hist: + fitted_clf = GenericBoostingClassifier( + base_model=model(), + 
verbose=self.verbose, + **kwargs + ) + else: + fitted_clf = HistGenericBoostingClassifier( + base_model=model(), + verbose=self.verbose, + **kwargs + ) fitted_clf.fit(X_train, y_train) @@ -689,6 +723,7 @@ def train_model( y_test, use_preprocessing=False, preprocessor=None, + hist=False, **kwargs ): """ @@ -711,16 +746,29 @@ def train_model( try: # Handle random_state parameter if "random_state" in model().get_params().keys(): - fitted_clf = GenericBoostingClassifier( - {**other_args, **kwargs}, - verbose=self.verbose, - base_model=model(random_state=self.random_state), - ) + if hist: + fitted_clf = GenericBoostingClassifier( + {**other_args, **kwargs}, + verbose=self.verbose, + base_model=model(random_state=self.random_state), + ) + else: + fitted_clf = HistGenericBoostingClassifier( + {**other_args, **kwargs}, + verbose=self.verbose, + base_model=model(random_state=self.random_state), + ) else: - fitted_clf = GenericBoostingClassifier( - base_model=model(**kwargs), - verbose=self.verbose, - ) + if hist: + fitted_clf = GenericBoostingClassifier( + base_model=model(**kwargs), + verbose=self.verbose, + ) + else: + fitted_clf = HistGenericBoostingClassifier( + base_model=model(**kwargs), + verbose=self.verbose, + ) if self.verbose > 0: print("\n Fitting boosted " + name + " model...") diff --git a/mlsauce/lazybooster/lazyboosterregression.py b/mlsauce/lazybooster/lazyboosterregression.py index 2957383..8230222 100644 --- a/mlsauce/lazybooster/lazyboosterregression.py +++ b/mlsauce/lazybooster/lazyboosterregression.py @@ -22,7 +22,7 @@ r2_score ) from .config import REGRESSORS -from ..booster import GenericBoostingRegressor +from ..booster import GenericBoostingRegressor, HistGenericBoostingRegressor import warnings @@ -185,7 +185,7 @@ def __init__( self.preprocess = preprocess self.n_jobs = n_jobs - def fit(self, X_train, X_test, y_train, y_test, **kwargs): + def fit(self, X_train, X_test, y_train, y_test, hist=False, **kwargs): """Fit Regression algorithms to X_train and y_train, predict and score on X_test, y_test. Parameters: @@ -205,6 +205,9 @@ def fit(self, X_train, X_test, y_train, y_test, **kwargs): y_test : array-like, Testing vectors, where rows is the number of samples and columns is the number of features. + + hist: bool, optional (default=False) + When set to True, the model is a HistGenericBoostingRegressor. **kwargs: dict, Additional parameters to be passed to the GenericBoostingRegressor. 
@@ -345,9 +348,17 @@ def fit(self, X_train, X_test, y_train, y_test, **kwargs): try: - model = GenericBoostingRegressor( - base_model=regr(), verbose=self.verbose, **kwargs - ) + if hist: + + model = GenericBoostingRegressor( + base_model=regr(), verbose=self.verbose, **kwargs + ) + + else: + + model = HistGenericBoostingRegressor( + base_model=regr(), verbose=self.verbose, **kwargs + ) model.fit(X_train, y_train) @@ -456,10 +467,15 @@ def fit(self, X_train, X_test, y_train, y_test, **kwargs): for name, regr in tqdm(self.regressors): # do parallel exec start = time.time() try: - - model = GenericBoostingRegressor( - base_model=regr(), verbose=self.verbose, **kwargs - ) + + if hist: + model = GenericBoostingRegressor( + base_model=regr(), verbose=self.verbose, **kwargs + ) + else: + model = HistGenericBoostingRegressor( + base_model=regr(), verbose=self.verbose, **kwargs + ) if self.verbose > 0: print("\n Fitting boosted " + name + " model...") @@ -630,6 +646,7 @@ def train_model( y_test, use_preprocessing=False, preprocessor=None, + hist=False, **kwargs ): """ @@ -638,9 +655,14 @@ def train_model( start = time.time() try: - model = GenericBoostingRegressor( - base_model=regr(), verbose=self.verbose, **kwargs - ) + if hist: + model = GenericBoostingRegressor( + base_model=regr(), verbose=self.verbose, **kwargs + ) + else: + model = HistGenericBoostingRegressor( + base_model=regr(), verbose=self.verbose, **kwargs + ) if use_preprocessing and preprocessor is not None: pipe = Pipeline( diff --git a/mlsauce/utils/__init__.py b/mlsauce/utils/__init__.py index 99dec6d..156e393 100644 --- a/mlsauce/utils/__init__.py +++ b/mlsauce/utils/__init__.py @@ -10,6 +10,7 @@ ) from .progress_bar import Progbar from .get_beta import get_beta +from .histofeatures.gethistofeatures import get_histo_features __all__ = [ "cluster", @@ -22,4 +23,5 @@ "get_beta", "check_and_install", "is_multitask_estimator", + "get_histo_features" ] diff --git a/mlsauce/utils/histofeatures/__init__.py b/mlsauce/utils/histofeatures/__init__.py new file mode 100644 index 0000000..a20796b --- /dev/null +++ b/mlsauce/utils/histofeatures/__init__.py @@ -0,0 +1,3 @@ +from .gethistofeatures import get_histo_features + +__all__ = ['get_histo_features'] \ No newline at end of file diff --git a/mlsauce/utils/histofeatures/gethistofeatures.py b/mlsauce/utils/histofeatures/gethistofeatures.py new file mode 100644 index 0000000..2b762c8 --- /dev/null +++ b/mlsauce/utils/histofeatures/gethistofeatures.py @@ -0,0 +1,99 @@ +import numpy as np +import pandas as pd + +def create_histogram_with_bin_values(x): + """ + Computes a histogram for the input data and assigns a value to each bin + reflecting the ordering of the input. + + Args: + x (list or np.array): Input data. + num_bins (int): Number of bins for the histogram. + + Returns: + bin_edges (np.array): The edges of the bins. + bin_value_dict (dict): A dictionary where keys are the bin ranges (tuples) and values reflect the ordering. + """ + # Compute the histogram + hist, bin_edges = np.histogram(x, bins="auto") + + bin_edges = np.concatenate([[1e-10], bin_edges, [1e10]]).ravel() + + # Create a dict to store bin ranges and assigned values + bin_value_dict = {} + + for i in range(len(bin_edges) - 1): + bin_range = (bin_edges[i], bin_edges[i + 1]) + bin_value_dict[i] = (bin_range, np.median(list(bin_range))) + + return bin_edges, bin_value_dict + +def assign_values_to_input(new_data, bin_value_dict): + """ + Assigns values to a new input based on the provided bin ranges and values. 
+ + Args: + new_data (list or np.array): New input data to assign values to. + bin_value_dict (dict): Dictionary where keys are bin ranges (tuples) and values are the assigned values. + + Returns: + assigned_values (list): List of assigned values for the new input data. + """ + assigned_values = [] + + for value in new_data: + assigned = None + # Find the appropriate bin for each value + for elt in bin_value_dict.items(): + if elt[1][0][0] <= value < elt[1][0][1]: + assigned = elt[1][1] + break + + assigned_values.append(assigned) + + return assigned_values + +def get_histo_features(X, bin_value_dict=None): + """ + Computes histogram features for the input data. + + Args: + X {np.array or pd.DataFrame}: Input data. + + Returns: + X_hist {np.array or pd.DataFrame}: Input data with histogram features. + """ + + if bin_value_dict is None: # training set case + + if isinstance(X, pd.DataFrame): + colnames = X.columns + X = X.values + X_hist = pd.DataFrame(np.zeros(X.shape), + columns=colnames) + for i, col in enumerate(colnames): + _, bin_value_dict = create_histogram_with_bin_values(X[:, i]) + X_hist[col] = assign_values_to_input(X[:, i], bin_value_dict) + else: + X_hist = np.zeros(X.shape) + for i in range(X.shape[1]): + _, bin_value_dict = create_histogram_with_bin_values(X[:, i]) + X_hist[:, i] = assign_values_to_input(X[:, i], bin_value_dict) + + return X_hist, bin_value_dict + + else: # test set case + + if isinstance(X, pd.DataFrame): + colnames = X.columns + X = X.values + X_hist = pd.DataFrame(np.zeros(X.shape), + columns=colnames) + for i, col in enumerate(colnames): + X_hist[col] = assign_values_to_input(X[:, i], bin_value_dict) + else: + X_hist = np.zeros(X.shape) + for i in range(X.shape[1]): + X_hist[:, i] = assign_values_to_input(X[:, i], bin_value_dict) + + return X_hist \ No newline at end of file From 24c830eba55829afc46149edf6f994e03564718b Mon Sep 17 00:00:00 2001 From: Thierry Moudiki Date: Sun, 27 Oct 2024 17:23:46 +0100 Subject: [PATCH 02/26] ravel responses --- mlsauce/booster/_booster_classifier.py | 4 ++-- mlsauce/booster/_booster_regressor.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/mlsauce/booster/_booster_classifier.py b/mlsauce/booster/_booster_classifier.py index ef126d0..98bd5cb 100644 --- a/mlsauce/booster/_booster_classifier.py +++ b/mlsauce/booster/_booster_classifier.py @@ -414,7 +414,7 @@ def fit(self, X, y, **kwargs): self.obj = boosterc.fit_booster_classifier( np.asarray(X, order="C"), - np.asarray(y, order="C"), + np.asarray(y, order="C").ravel(), n_estimators=self.n_estimators, learning_rate=self.learning_rate, n_hidden_features=self.n_hidden_features, @@ -840,7 +840,7 @@ def fit(self, X, y, **kwargs): self: object. """ X, self.hist_bins = get_histo_features(X) - return super().fit(X, y, **kwargs) + return super().fit(X, y.ravel(), **kwargs) def predict_proba(self, X, **kwargs): """Predict probabilites for test data X. diff --git a/mlsauce/booster/_booster_regressor.py b/mlsauce/booster/_booster_regressor.py index 3ff146d..b9ab67e 100644 --- a/mlsauce/booster/_booster_regressor.py +++ b/mlsauce/booster/_booster_regressor.py @@ -284,7 +284,7 @@ def fit(self, X, y, histo=False, **kwargs): self.obj = boosterc.fit_booster_regressor( X=np.asarray(X, order="C"), - y=np.asarray(y, order="C"), + y=np.asarray(y, order="C").ravel(), n_estimators=self.n_estimators, learning_rate=self.learning_rate, n_hidden_features=self.n_hidden_features, @@ -733,7 +733,7 @@ def fit(self, X, y, **kwargs): self: object. 
""" X, self.hist_bins = get_histo_features(X) - return super().fit(X, y, **kwargs) + return super().fit(X, y.ravel(), **kwargs) def predict(self, X, level=95, method=None, **kwargs): """Predict values for test data X. From 8b5b0f8769f41780fae6e8ecce1144904ffabeb5 Mon Sep 17 00:00:00 2001 From: Thierry Moudiki Date: Sun, 27 Oct 2024 17:56:25 +0100 Subject: [PATCH 03/26] fix gbooster Pt.1 --- mlsauce/booster/_booster_classifier.py | 5 +++-- mlsauce/booster/_booster_regressor.py | 4 ++-- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/mlsauce/booster/_booster_classifier.py b/mlsauce/booster/_booster_classifier.py index 98bd5cb..f845104 100644 --- a/mlsauce/booster/_booster_classifier.py +++ b/mlsauce/booster/_booster_classifier.py @@ -414,7 +414,7 @@ def fit(self, X, y, **kwargs): self.obj = boosterc.fit_booster_classifier( np.asarray(X, order="C"), - np.asarray(y, order="C").ravel(), + np.asarray(y, order="C"), n_estimators=self.n_estimators, learning_rate=self.learning_rate, n_hidden_features=self.n_hidden_features, @@ -543,7 +543,8 @@ def update(self, X, y, eta=0.9): ) self.obj = boosterc.update_booster( - self.obj, np.asarray(X, order="C"), np.asarray(y, order="C"), eta + self.obj, np.asarray(X, order="C"), + np.asarray(y, order="C").ravel(), eta ) return self diff --git a/mlsauce/booster/_booster_regressor.py b/mlsauce/booster/_booster_regressor.py index b9ab67e..af226ce 100644 --- a/mlsauce/booster/_booster_regressor.py +++ b/mlsauce/booster/_booster_regressor.py @@ -240,7 +240,7 @@ def __init__( check_and_install("jax") check_and_install("jaxlib") - def fit(self, X, y, histo=False, **kwargs): + def fit(self, X, y, **kwargs): """Fit Booster (regressor) to training data (X, y) Args: @@ -284,7 +284,7 @@ def fit(self, X, y, histo=False, **kwargs): self.obj = boosterc.fit_booster_regressor( X=np.asarray(X, order="C"), - y=np.asarray(y, order="C").ravel(), + y=np.asarray(y, order="C"), n_estimators=self.n_estimators, learning_rate=self.learning_rate, n_hidden_features=self.n_hidden_features, From 7731f1ab6f5656f3fbd4ee7a81cc8f0b49a1944c Mon Sep 17 00:00:00 2001 From: Thierry Moudiki Date: Sun, 27 Oct 2024 18:17:47 +0100 Subject: [PATCH 04/26] fix gbooster Pt.2 --- mlsauce/booster/_booster_classifier.py | 6 +++++- mlsauce/booster/_booster_regressor.py | 3 +++ mlsauce/lazybooster/lazyboosterregression.py | 14 +++++++------- 3 files changed, 15 insertions(+), 8 deletions(-) diff --git a/mlsauce/booster/_booster_classifier.py b/mlsauce/booster/_booster_classifier.py index f845104..94fd2b5 100644 --- a/mlsauce/booster/_booster_classifier.py +++ b/mlsauce/booster/_booster_classifier.py @@ -391,6 +391,9 @@ def fit(self, X, y, **kwargs): if isinstance(X, pd.DataFrame): X = X.values + + if isinstance(y, pd.Series): + y = y.values.ravel() if self.degree is not None: assert isinstance(self.degree, int), "`degree` must be an integer" @@ -433,7 +436,8 @@ def fit(self, X, y, **kwargs): obj=self.base_model, ) - self.n_classes_ = len(np.unique(y)) # for compatibility with sklearn + self.classes_ = np.unique(y) # for compatibility with sklearn + self.n_classes_ = len(self.classes_) # for compatibility with sklearn self.n_estimators = self.obj["n_estimators"] return self diff --git a/mlsauce/booster/_booster_regressor.py b/mlsauce/booster/_booster_regressor.py index af226ce..9bd1a3a 100644 --- a/mlsauce/booster/_booster_regressor.py +++ b/mlsauce/booster/_booster_regressor.py @@ -261,6 +261,9 @@ def fit(self, X, y, **kwargs): if isinstance(X, pd.DataFrame): X = X.values + + if isinstance(y, 
pd.Series): + y = y.values.ravel() if self.degree is not None: assert isinstance(self.degree, int), "`degree` must be an integer" diff --git a/mlsauce/lazybooster/lazyboosterregression.py b/mlsauce/lazybooster/lazyboosterregression.py index 8230222..d954965 100644 --- a/mlsauce/lazybooster/lazyboosterregression.py +++ b/mlsauce/lazybooster/lazyboosterregression.py @@ -287,7 +287,7 @@ def fit(self, X_train, X_test, y_train, y_test, hist=False, **kwargs): for name, model in tqdm(zip(baseline_names, baseline_models)): start = time.time() try: - model.fit(X_train, y_train) + model.fit(X_train, y_train.ravel()) self.models_[name] = model y_pred = model.predict(X_test) r_squared = r2_score(y_test, y_pred) @@ -360,7 +360,7 @@ def fit(self, X_train, X_test, y_train, y_test, hist=False, **kwargs): base_model=regr(), verbose=self.verbose, **kwargs ) - model.fit(X_train, y_train) + model.fit(X_train, y_train.ravel()) pipe = Pipeline( steps=[ @@ -370,7 +370,7 @@ def fit(self, X_train, X_test, y_train, y_test, hist=False, **kwargs): ) if self.verbose > 0: print("\n Fitting boosted " + name + " model...") - pipe.fit(X_train, y_train) + pipe.fit(X_train, y_train.ravel()) self.models_[name] = pipe y_pred = pipe.predict(X_test) @@ -479,7 +479,7 @@ def fit(self, X_train, X_test, y_train, y_test, hist=False, **kwargs): if self.verbose > 0: print("\n Fitting boosted " + name + " model...") - model.fit(X_train, y_train) + model.fit(X_train, y_train.ravel()) self.models_[name] = model y_pred = model.predict(X_test) @@ -632,7 +632,7 @@ def provide_models(self, X_train, X_test, y_train, y_test): """ if len(self.models_.keys()) == 0: - self.fit(X_train, X_test, y_train, y_test) + self.fit(X_train, X_test, y_train.ravel(), y_test.values) return self.models_ @@ -677,7 +677,7 @@ def train_model( + name + " model with preprocessing..." ) - pipe.fit(X_train, y_train) + pipe.fit(X_train, y_train.ravel()) y_pred = pipe.predict(X_test) fitted_model = pipe else: @@ -688,7 +688,7 @@ def train_model( + name + " model without preprocessing..." ) - model.fit(X_train, y_train) + model.fit(X_train, y_train.ravel()) y_pred = model.predict(X_test) fitted_model = model From 347d58e194a8a3f1558daff3e9741f4e108ae2d8 Mon Sep 17 00:00:00 2001 From: Thierry Moudiki Date: Sun, 27 Oct 2024 18:25:35 +0100 Subject: [PATCH 05/26] fix gbooster Pt.3 --- mlsauce/booster/_booster_classifier.py | 4 +++- mlsauce/booster/_booster_regressor.py | 4 +++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/mlsauce/booster/_booster_classifier.py b/mlsauce/booster/_booster_classifier.py index 94fd2b5..0f707d2 100644 --- a/mlsauce/booster/_booster_classifier.py +++ b/mlsauce/booster/_booster_classifier.py @@ -394,6 +394,8 @@ def fit(self, X, y, **kwargs): if isinstance(y, pd.Series): y = y.values.ravel() + else: + y = y.ravel() if self.degree is not None: assert isinstance(self.degree, int), "`degree` must be an integer" @@ -845,7 +847,7 @@ def fit(self, X, y, **kwargs): self: object. """ X, self.hist_bins = get_histo_features(X) - return super().fit(X, y.ravel(), **kwargs) + return super().fit(X, y, **kwargs) def predict_proba(self, X, **kwargs): """Predict probabilites for test data X. 
diff --git a/mlsauce/booster/_booster_regressor.py b/mlsauce/booster/_booster_regressor.py index 9bd1a3a..1135ce2 100644 --- a/mlsauce/booster/_booster_regressor.py +++ b/mlsauce/booster/_booster_regressor.py @@ -264,6 +264,8 @@ def fit(self, X, y, **kwargs): if isinstance(y, pd.Series): y = y.values.ravel() + else: + y = y.ravel() if self.degree is not None: assert isinstance(self.degree, int), "`degree` must be an integer" @@ -736,7 +738,7 @@ def fit(self, X, y, **kwargs): self: object. """ X, self.hist_bins = get_histo_features(X) - return super().fit(X, y.ravel(), **kwargs) + return super().fit(X, y, **kwargs) def predict(self, X, level=95, method=None, **kwargs): """Predict values for test data X. From 2fcd945797eee248a3cf625c940fbfa91fbcea24 Mon Sep 17 00:00:00 2001 From: Thierry Moudiki Date: Sun, 27 Oct 2024 18:53:23 +0100 Subject: [PATCH 06/26] fix gbooster Pt.4 --- mlsauce/lazybooster/lazyboosterclassif.py | 12 ++++++------ mlsauce/lazybooster/lazyboosterregression.py | 6 +++--- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/mlsauce/lazybooster/lazyboosterclassif.py b/mlsauce/lazybooster/lazyboosterclassif.py index 2ef1398..c58c812 100644 --- a/mlsauce/lazybooster/lazyboosterclassif.py +++ b/mlsauce/lazybooster/lazyboosterclassif.py @@ -379,7 +379,7 @@ def fit(self, X_train, X_test, y_train, y_test, hist=False, **kwargs): try: if "random_state" in model().get_params().keys(): - if hist: + if hist is False: fitted_clf = GenericBoostingClassifier( {**other_args, **kwargs}, verbose=self.verbose, @@ -397,7 +397,7 @@ def fit(self, X_train, X_test, y_train, y_test, hist=False, **kwargs): ) else: - if hist: + if hist is False: fitted_clf = GenericBoostingClassifier( base_model=model(**kwargs), verbose=self.verbose, @@ -518,7 +518,7 @@ def fit(self, X_train, X_test, y_train, y_test, hist=False, **kwargs): start = time.time() try: if "random_state" in model().get_params().keys(): - if hist: + if hist is False: fitted_clf = GenericBoostingClassifier( base_model=model( random_state=self.random_state @@ -536,7 +536,7 @@ def fit(self, X_train, X_test, y_train, y_test, hist=False, **kwargs): ) else: - if hist: + if hist is False: fitted_clf = GenericBoostingClassifier( base_model=model(), verbose=self.verbose, @@ -746,7 +746,7 @@ def train_model( try: # Handle random_state parameter if "random_state" in model().get_params().keys(): - if hist: + if hist is False: fitted_clf = GenericBoostingClassifier( {**other_args, **kwargs}, verbose=self.verbose, @@ -759,7 +759,7 @@ def train_model( base_model=model(random_state=self.random_state), ) else: - if hist: + if hist is False: fitted_clf = GenericBoostingClassifier( base_model=model(**kwargs), verbose=self.verbose, diff --git a/mlsauce/lazybooster/lazyboosterregression.py b/mlsauce/lazybooster/lazyboosterregression.py index d954965..5d94b75 100644 --- a/mlsauce/lazybooster/lazyboosterregression.py +++ b/mlsauce/lazybooster/lazyboosterregression.py @@ -348,7 +348,7 @@ def fit(self, X_train, X_test, y_train, y_test, hist=False, **kwargs): try: - if hist: + if hist is False: model = GenericBoostingRegressor( base_model=regr(), verbose=self.verbose, **kwargs @@ -468,7 +468,7 @@ def fit(self, X_train, X_test, y_train, y_test, hist=False, **kwargs): start = time.time() try: - if hist: + if hist is False: model = GenericBoostingRegressor( base_model=regr(), verbose=self.verbose, **kwargs ) @@ -655,7 +655,7 @@ def train_model( start = time.time() try: - if hist: + if hist is False: model = GenericBoostingRegressor( base_model=regr(), 
verbose=self.verbose, **kwargs ) From 559a9e59014f34839d2368abb6d7257e75bc7935 Mon Sep 17 00:00:00 2001 From: Thierry Moudiki Date: Sun, 27 Oct 2024 19:06:32 +0100 Subject: [PATCH 07/26] return np array assigned values --- mlsauce/utils/histofeatures/gethistofeatures.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mlsauce/utils/histofeatures/gethistofeatures.py b/mlsauce/utils/histofeatures/gethistofeatures.py index 2b762c8..0cc80d5 100644 --- a/mlsauce/utils/histofeatures/gethistofeatures.py +++ b/mlsauce/utils/histofeatures/gethistofeatures.py @@ -51,7 +51,7 @@ def assign_values_to_input(new_data, bin_value_dict): assigned_values.append(assigned) - return assigned_values + return np.asarray(assigned_values) def get_histo_features(X, bin_value_dict=None): """ From a695c72b86cb1db8f4d8e72ac831f4d7a43e549b Mon Sep 17 00:00:00 2001 From: Thierry Moudiki Date: Sun, 27 Oct 2024 19:07:05 +0100 Subject: [PATCH 08/26] return np array assigned values Pt.2 --- mlsauce/utils/histofeatures/gethistofeatures.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mlsauce/utils/histofeatures/gethistofeatures.py b/mlsauce/utils/histofeatures/gethistofeatures.py index 0cc80d5..badc1a5 100644 --- a/mlsauce/utils/histofeatures/gethistofeatures.py +++ b/mlsauce/utils/histofeatures/gethistofeatures.py @@ -51,7 +51,7 @@ def assign_values_to_input(new_data, bin_value_dict): assigned_values.append(assigned) - return np.asarray(assigned_values) + return np.asarray(assigned_values).ravel() def get_histo_features(X, bin_value_dict=None): """ From e38dceee72a0b5f701839a40deac5708a7ce9e14 Mon Sep 17 00:00:00 2001 From: Thierry Moudiki Date: Sun, 27 Oct 2024 19:21:36 +0100 Subject: [PATCH 09/26] return np array assigned values Pt.3 --- mlsauce/booster/_booster_classifier.py | 4 +++- mlsauce/booster/_booster_regressor.py | 4 +++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/mlsauce/booster/_booster_classifier.py b/mlsauce/booster/_booster_classifier.py index 0f707d2..2c5ed1f 100644 --- a/mlsauce/booster/_booster_classifier.py +++ b/mlsauce/booster/_booster_classifier.py @@ -846,7 +846,9 @@ def fit(self, X, y, **kwargs): self: object. """ - X, self.hist_bins = get_histo_features(X) + print(f"\n before: {X} \n") + X, self.hist_bins = get_histo_features(X) + print(f"\n after: {X} \n") return super().fit(X, y, **kwargs) def predict_proba(self, X, **kwargs): diff --git a/mlsauce/booster/_booster_regressor.py b/mlsauce/booster/_booster_regressor.py index 1135ce2..f8779f6 100644 --- a/mlsauce/booster/_booster_regressor.py +++ b/mlsauce/booster/_booster_regressor.py @@ -737,7 +737,9 @@ def fit(self, X, y, **kwargs): self: object. 
""" - X, self.hist_bins = get_histo_features(X) + print(f"\n before: {X} \n") + X, self.hist_bins = get_histo_features(X) + print(f"\n after: {X} \n") return super().fit(X, y, **kwargs) def predict(self, X, level=95, method=None, **kwargs): From 656cfb8fcb918df7efda179ff060bbb62ec08e04 Mon Sep 17 00:00:00 2001 From: Thierry Moudiki Date: Sun, 27 Oct 2024 19:39:17 +0100 Subject: [PATCH 10/26] fix bounds --- mlsauce/utils/histofeatures/gethistofeatures.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/mlsauce/utils/histofeatures/gethistofeatures.py b/mlsauce/utils/histofeatures/gethistofeatures.py index badc1a5..efe0cbd 100644 --- a/mlsauce/utils/histofeatures/gethistofeatures.py +++ b/mlsauce/utils/histofeatures/gethistofeatures.py @@ -17,14 +17,19 @@ def create_histogram_with_bin_values(x): # Compute the histogram hist, bin_edges = np.histogram(x, bins="auto") - bin_edges = np.concatenate([[1e-10], bin_edges, [1e10]]).ravel() + bin_edges = np.concatenate([[-1e10], bin_edges, [1e10]]).ravel() # Create a dict to store bin ranges and assigned values bin_value_dict = {} for i in range(len(bin_edges) - 1): bin_range = (bin_edges[i], bin_edges[i + 1]) - bin_value_dict[i] = (bin_range, np.median(list(bin_range))) + if bin_edges[i] == -1e10: + bin_value_dict[i] = (bin_range, bin_edges[i + 1]) + elif bin_edges[i + 1] == 1e10: + bin_value_dict[i] = (bin_range, bin_edges[i]) + else: + bin_value_dict[i] = (bin_range, np.median(list(bin_range))) return bin_edges, bin_value_dict From 6c1d3cd716ee0cbe6847c5039035fc4b42ef9078 Mon Sep 17 00:00:00 2001 From: Thierry Moudiki Date: Sun, 27 Oct 2024 19:49:53 +0100 Subject: [PATCH 11/26] change no. of bins --- mlsauce/utils/histofeatures/gethistofeatures.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mlsauce/utils/histofeatures/gethistofeatures.py b/mlsauce/utils/histofeatures/gethistofeatures.py index efe0cbd..adb9309 100644 --- a/mlsauce/utils/histofeatures/gethistofeatures.py +++ b/mlsauce/utils/histofeatures/gethistofeatures.py @@ -15,7 +15,7 @@ def create_histogram_with_bin_values(x): bin_value_dict (dict): A dictionary where keys are the bin ranges (tuples) and values reflect the ordering. """ # Compute the histogram - hist, bin_edges = np.histogram(x, bins="auto") + hist, bin_edges = np.histogram(x) bin_edges = np.concatenate([[-1e10], bin_edges, [1e10]]).ravel() From 0db67a8a4c50ce63a3c541e4787ea37f175f68d8 Mon Sep 17 00:00:00 2001 From: Thierry Moudiki Date: Sun, 27 Oct 2024 19:53:29 +0100 Subject: [PATCH 12/26] change no. of bins Pt.2 --- mlsauce/booster/_booster_regressor.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mlsauce/booster/_booster_regressor.py b/mlsauce/booster/_booster_regressor.py index f8779f6..3fd153a 100644 --- a/mlsauce/booster/_booster_regressor.py +++ b/mlsauce/booster/_booster_regressor.py @@ -737,9 +737,9 @@ def fit(self, X, y, **kwargs): self: object. """ - print(f"\n before: {X} \n") + #print(f"\n before: {X} \n") X, self.hist_bins = get_histo_features(X) - print(f"\n after: {X} \n") + #print(f"\n after: {X} \n") return super().fit(X, y, **kwargs) def predict(self, X, level=95, method=None, **kwargs): From 3b051064516a40ff22f12d6e5d58ba95a8c2a23f Mon Sep 17 00:00:00 2001 From: Thierry Moudiki Date: Sun, 27 Oct 2024 19:59:38 +0100 Subject: [PATCH 13/26] change no. 
of bins Pt.3 --- mlsauce/booster/_booster_classifier.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mlsauce/booster/_booster_classifier.py b/mlsauce/booster/_booster_classifier.py index 2c5ed1f..52a4b1d 100644 --- a/mlsauce/booster/_booster_classifier.py +++ b/mlsauce/booster/_booster_classifier.py @@ -846,9 +846,9 @@ def fit(self, X, y, **kwargs): self: object. """ - print(f"\n before: {X} \n") + #print(f"\n before: {X} \n") X, self.hist_bins = get_histo_features(X) - print(f"\n after: {X} \n") + #print(f"\n after: {X} \n") return super().fit(X, y, **kwargs) def predict_proba(self, X, **kwargs): From e567d5b89ac1255f18ffd267e66e715b4843602c Mon Sep 17 00:00:00 2001 From: Thierry Moudiki Date: Mon, 28 Oct 2024 07:00:01 +0100 Subject: [PATCH 14/26] bins = 'auto' --- mlsauce/utils/histofeatures/gethistofeatures.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mlsauce/utils/histofeatures/gethistofeatures.py b/mlsauce/utils/histofeatures/gethistofeatures.py index adb9309..efe0cbd 100644 --- a/mlsauce/utils/histofeatures/gethistofeatures.py +++ b/mlsauce/utils/histofeatures/gethistofeatures.py @@ -15,7 +15,7 @@ def create_histogram_with_bin_values(x): bin_value_dict (dict): A dictionary where keys are the bin ranges (tuples) and values reflect the ordering. """ # Compute the histogram - hist, bin_edges = np.histogram(x) + hist, bin_edges = np.histogram(x, bins="auto") bin_edges = np.concatenate([[-1e10], bin_edges, [1e10]]).ravel() From becf078b8f041b2e79fa18630992662e105af6dd Mon Sep 17 00:00:00 2001 From: Thierry Moudiki Date: Mon, 28 Oct 2024 07:14:44 +0100 Subject: [PATCH 15/26] change no. of bins Pt.4 --- mlsauce/booster/_booster_regressor.py | 8 ++++---- mlsauce/utils/histofeatures/gethistofeatures.py | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/mlsauce/booster/_booster_regressor.py b/mlsauce/booster/_booster_regressor.py index 3fd153a..4833323 100644 --- a/mlsauce/booster/_booster_regressor.py +++ b/mlsauce/booster/_booster_regressor.py @@ -738,9 +738,9 @@ def fit(self, X, y, **kwargs): self: object. """ #print(f"\n before: {X} \n") - X, self.hist_bins = get_histo_features(X) + X_, self.hist_bins = get_histo_features(X) #print(f"\n after: {X} \n") - return super().fit(X, y, **kwargs) + return super().fit(X_, y, **kwargs) def predict(self, X, level=95, method=None, **kwargs): """Predict values for test data X. 
@@ -769,6 +769,6 @@ def predict(self, X, level=95, method=None, **kwargs): predicted values estimates for test data: {array-like} """ assert self.hist_bins is not None, "You must fit the model first" - X = get_histo_features(X, self.hist_bins) - return super().predict(X, level=level, method=method, **kwargs) + X_ = get_histo_features(X, self.hist_bins) + return super().predict(X_, level=level, method=method, **kwargs) diff --git a/mlsauce/utils/histofeatures/gethistofeatures.py b/mlsauce/utils/histofeatures/gethistofeatures.py index efe0cbd..a4a2d1c 100644 --- a/mlsauce/utils/histofeatures/gethistofeatures.py +++ b/mlsauce/utils/histofeatures/gethistofeatures.py @@ -50,7 +50,7 @@ def assign_values_to_input(new_data, bin_value_dict): assigned = None # Find the appropriate bin for each value for elt in bin_value_dict.items(): - if elt[1][0][0] <= value < elt[1][0][1]: + if elt[1][0][0] < value <= elt[1][0][1]: assigned = elt[1][1] break From e278f38a17d7e6a1522aa556d75912370dcd6b62 Mon Sep 17 00:00:00 2001 From: Thierry Moudiki Date: Mon, 28 Oct 2024 08:40:46 +0100 Subject: [PATCH 16/26] modif gethistofeatures --- .../utils/histofeatures/gethistofeatures.py | 42 +++++++++---------- 1 file changed, 19 insertions(+), 23 deletions(-) diff --git a/mlsauce/utils/histofeatures/gethistofeatures.py b/mlsauce/utils/histofeatures/gethistofeatures.py index a4a2d1c..896679b 100644 --- a/mlsauce/utils/histofeatures/gethistofeatures.py +++ b/mlsauce/utils/histofeatures/gethistofeatures.py @@ -1,6 +1,7 @@ import numpy as np import pandas as pd + def create_histogram_with_bin_values(x): """ Computes a histogram for the input data and assigns a value to each bin @@ -15,49 +16,44 @@ def create_histogram_with_bin_values(x): bin_value_dict (dict): A dictionary where keys are the bin ranges (tuples) and values reflect the ordering. """ # Compute the histogram - hist, bin_edges = np.histogram(x, bins="auto") + _, bin_edges = np.histogram(x, bins="auto") - bin_edges = np.concatenate([[-1e10], bin_edges, [1e10]]).ravel() - - # Create a dict to store bin ranges and assigned values - bin_value_dict = {} - - for i in range(len(bin_edges) - 1): - bin_range = (bin_edges[i], bin_edges[i + 1]) - if bin_edges[i] == -1e10: - bin_value_dict[i] = (bin_range, bin_edges[i + 1]) - elif bin_edges[i + 1] == 1e10: - bin_value_dict[i] = (bin_range, bin_edges[i]) - else: - bin_value_dict[i] = (bin_range, np.median(list(bin_range))) + bin_edges = np.concatenate([[-1e10], bin_edges, [1e10]]).ravel() - return bin_edges, bin_value_dict + return {i: (bin_edges[i], bin_edges[i + 1]) for i in range(len(bin_edges) - 1)} + -def assign_values_to_input(new_data, bin_value_dict): +def assign_values_to_input(x, bin_value_dict): """ Assigns values to a new input based on the provided bin ranges and values. Args: - new_data (list or np.array): New input data to assign values to. + x (list or np.array): New input data to assign values to. bin_value_dict (dict): Dictionary where keys are bin ranges (tuples) and values are the assigned values. Returns: assigned_values (list): List of assigned values for the new input data. 
""" + + if np.issubdtype(x.dtype, np.integer) or np.issubdtype(x.dtype, np.object_): + + return x + assigned_values = [] - for value in new_data: + for value in x: assigned = None # Find the appropriate bin for each value - for elt in bin_value_dict.items(): - if elt[1][0][0] < value <= elt[1][0][1]: - assigned = elt[1][1] + for i, elt in enumerate(bin_value_dict.items()): + if elt[1][0] < value <= elt[1][1]: + assigned = i break assigned_values.append(assigned) return np.asarray(assigned_values).ravel() + def get_histo_features(X, bin_value_dict=None): """ Computes histogram features for the input data. @@ -77,12 +73,12 @@ def get_histo_features(X, bin_value_dict=None): X_hist = pd.DataFrame(np.zeros(X.shape), columns=colnames) for i, col in enumerate(colnames): - _, bin_value_dict = create_histogram_with_bin_values(X[:, i]) + bin_value_dict = create_histogram_with_bin_values(X[:, i]) X_hist[col] = assign_values_to_input(X[:, i], bin_value_dict) else: X_hist = np.zeros(X.shape) for i in range(X.shape[1]): - _, bin_value_dict = create_histogram_with_bin_values(X[:, i]) + bin_value_dict = create_histogram_with_bin_values(X[:, i]) X_hist[:, i] = assign_values_to_input(X[:, i], bin_value_dict) return X_hist, bin_value_dict From 83847554d8ffea4f8dbb0db2acba07883ab7ded7 Mon Sep 17 00:00:00 2001 From: Thierry Moudiki Date: Mon, 28 Oct 2024 08:57:46 +0100 Subject: [PATCH 17/26] fix histo --- mlsauce/utils/histofeatures/gethistofeatures.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mlsauce/utils/histofeatures/gethistofeatures.py b/mlsauce/utils/histofeatures/gethistofeatures.py index 896679b..70024b5 100644 --- a/mlsauce/utils/histofeatures/gethistofeatures.py +++ b/mlsauce/utils/histofeatures/gethistofeatures.py @@ -46,7 +46,7 @@ def assign_values_to_input(x, bin_value_dict): # Find the appropriate bin for each value for i, elt in enumerate(bin_value_dict.items()): if elt[1][0] < value <= elt[1][1]: - assigned = i + assigned = float(i) break assigned_values.append(assigned) From 62859d34bba629bbe5702d3c6b10c5ca1d04f1b3 Mon Sep 17 00:00:00 2001 From: Thierry Moudiki Date: Mon, 28 Oct 2024 09:06:36 +0100 Subject: [PATCH 18/26] raise warnings in histgenbooster --- mlsauce/booster/_booster_classifier.py | 5 ++++- mlsauce/booster/_booster_regressor.py | 8 ++++++-- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/mlsauce/booster/_booster_classifier.py b/mlsauce/booster/_booster_classifier.py index 52a4b1d..1aeecd5 100644 --- a/mlsauce/booster/_booster_classifier.py +++ b/mlsauce/booster/_booster_classifier.py @@ -681,7 +681,7 @@ def __init__( ) class HistGenericBoostingClassifier(GenericBoostingClassifier): - """Histogram-based Generic Boosting classifier (using any classifier as base learner). + """EXPERIMENTAL Histogram-based Generic Boosting classifier (using any classifier as base learner). Attributes: @@ -779,6 +779,9 @@ def __init__( degree=None, weights_distr="uniform", ): + + warnings.warn("This class is highly experimental", UserWarning) + super().__init__( base_model=base_model, n_estimators=n_estimators, diff --git a/mlsauce/booster/_booster_regressor.py b/mlsauce/booster/_booster_regressor.py index 4833323..ae1a19e 100644 --- a/mlsauce/booster/_booster_regressor.py +++ b/mlsauce/booster/_booster_regressor.py @@ -579,7 +579,7 @@ def __init__( ) class HistGenericBoostingRegressor(GenericBoostingRegressor): - """Generic Boosting regressor with histogram-based features. + """EXPERIMENTAL Generic Boosting regressor with histogram-based features. 
Attributes: @@ -690,6 +690,9 @@ def __init__( degree=None, weights_distr="uniform", ): + + warnings.warn("This class is highly experimental", UserWarning) + self.base_model = base_model self.hist_bins = None super().__init__( @@ -717,7 +720,8 @@ def __init__( degree=degree, weights_distr=weights_distr, base_model=self.base_model, - ) + ) + def fit(self, X, y, **kwargs): """Fit Booster (regressor) to training data (X, y) From 5af9da82bf7adc8bf8196c3920566521b3a238a9 Mon Sep 17 00:00:00 2001 From: Thierry Moudiki Date: Mon, 28 Oct 2024 09:44:49 +0100 Subject: [PATCH 19/26] fix histo Pt.2 --- mlsauce/booster/_booster_classifier.py | 20 ++++++++------------ mlsauce/booster/_booster_regressor.py | 16 +++++++++------- setup.py | 2 +- 3 files changed, 18 insertions(+), 20 deletions(-) diff --git a/mlsauce/booster/_booster_classifier.py b/mlsauce/booster/_booster_classifier.py index 1aeecd5..c15f51a 100644 --- a/mlsauce/booster/_booster_classifier.py +++ b/mlsauce/booster/_booster_classifier.py @@ -780,7 +780,7 @@ def __init__( weights_distr="uniform", ): - warnings.warn("This class is highly experimental", UserWarning) + #warnings.warn("This class is highly experimental", UserWarning) super().__init__( base_model=base_model, @@ -806,7 +806,8 @@ def __init__( weights_distr=weights_distr, ) self.base_model = base_model - self.hist_bins = None + self.hist_bins_ = None + super().__init__( base_model=base_model, n_estimators=n_estimators, @@ -850,9 +851,10 @@ def fit(self, X, y, **kwargs): self: object. """ #print(f"\n before: {X} \n") - X, self.hist_bins = get_histo_features(X) + res = get_histo_features(X) + self.hist_bins_ = res[1] #print(f"\n after: {X} \n") - return super().fit(X, y, **kwargs) + return self.fit(res[0], y, **kwargs) def predict_proba(self, X, **kwargs): """Predict probabilites for test data X. @@ -870,12 +872,6 @@ def predict_proba(self, X, **kwargs): predicted values estimates for test data: {array-like} """ - assert self.hist_bins is not None, "You must fit the model first" - X = get_histo_features(X, self.hist_bins) - try: - return super().predict_proba(np.asarray(X, order="C"), - **kwargs) - except Exception: - return super().predict_proba(X, - **kwargs) + assert self.hist_bins_ is not None, "You must fit the model first" + return self.predict_proba(get_histo_features(X, self.hist_bins_)) diff --git a/mlsauce/booster/_booster_regressor.py b/mlsauce/booster/_booster_regressor.py index ae1a19e..399c39b 100644 --- a/mlsauce/booster/_booster_regressor.py +++ b/mlsauce/booster/_booster_regressor.py @@ -691,10 +691,11 @@ def __init__( weights_distr="uniform", ): - warnings.warn("This class is highly experimental", UserWarning) + #warnings.warn("This class is highly experimental", UserWarning) self.base_model = base_model - self.hist_bins = None + self.hist_bins_ = None + super().__init__( n_estimators=n_estimators, learning_rate=learning_rate, @@ -742,9 +743,10 @@ def fit(self, X, y, **kwargs): self: object. """ #print(f"\n before: {X} \n") - X_, self.hist_bins = get_histo_features(X) + res = get_histo_features(X) + self.hist_bins_ = res[1] #print(f"\n after: {X} \n") - return super().fit(X_, y, **kwargs) + return self.fit(res[0], y, **kwargs) def predict(self, X, level=95, method=None, **kwargs): """Predict values for test data X. 
@@ -772,7 +774,7 @@ def predict(self, X, level=95, method=None, **kwargs): predicted values estimates for test data: {array-like} """ - assert self.hist_bins is not None, "You must fit the model first" - X_ = get_histo_features(X, self.hist_bins) - return super().predict(X_, level=level, method=method, **kwargs) + assert self.hist_bins_ is not None, "You must fit the model first" + X_ = get_histo_features(X, self.hist_bins_) + return self.predict(X_, level=level, method=method, **kwargs) diff --git a/setup.py b/setup.py index 188ab86..9b8871a 100644 --- a/setup.py +++ b/setup.py @@ -37,7 +37,7 @@ MAINTAINER_EMAIL = 'thierry.moudiki@gmail.com' LICENSE = 'BSD3 Clause Clear' -__version__ = '0.22.4' +__version__ = '0.23.0' VERSION = __version__ From f9e7b7e7ab14d65cfd258dbc0629dc1ac2462694 Mon Sep 17 00:00:00 2001 From: Thierry Moudiki Date: Mon, 28 Oct 2024 09:52:25 +0100 Subject: [PATCH 20/26] fix histo Pt.3 --- mlsauce/booster/_booster_classifier.py | 4 ++-- mlsauce/booster/_booster_regressor.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/mlsauce/booster/_booster_classifier.py b/mlsauce/booster/_booster_classifier.py index c15f51a..bd59dd9 100644 --- a/mlsauce/booster/_booster_classifier.py +++ b/mlsauce/booster/_booster_classifier.py @@ -807,7 +807,7 @@ def __init__( ) self.base_model = base_model self.hist_bins_ = None - + super().__init__( base_model=base_model, n_estimators=n_estimators, @@ -854,7 +854,7 @@ def fit(self, X, y, **kwargs): res = get_histo_features(X) self.hist_bins_ = res[1] #print(f"\n after: {X} \n") - return self.fit(res[0], y, **kwargs) + return super().fit(res[0], y, **kwargs) def predict_proba(self, X, **kwargs): """Predict probabilites for test data X. diff --git a/mlsauce/booster/_booster_regressor.py b/mlsauce/booster/_booster_regressor.py index 399c39b..e3d9a21 100644 --- a/mlsauce/booster/_booster_regressor.py +++ b/mlsauce/booster/_booster_regressor.py @@ -746,7 +746,7 @@ def fit(self, X, y, **kwargs): res = get_histo_features(X) self.hist_bins_ = res[1] #print(f"\n after: {X} \n") - return self.fit(res[0], y, **kwargs) + return super().fit(res[0], y, **kwargs) def predict(self, X, level=95, method=None, **kwargs): """Predict values for test data X. 
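The two patches above (19 and 20) circle a classic subclassing pitfall: inside an overridden fit(), calling self.fit(...) dispatches straight back to the override itself and recurses until Python raises RecursionError, whereas super().fit(...) hands off to the parent implementation, which is what the Pt.3 fix restores (Pt.4, next, applies the identical fix to the predict paths). A minimal, self-contained sketch of the difference; the class names are illustrative stand-ins, not the mlsauce API:

import numpy as np

class Base:
    def fit(self, X, y):
        # stands in for the real boosting loop in LSBoostRegressor.fit
        print("Base.fit runs the actual training iterations")
        return self

class BadHistBooster(Base):
    def fit(self, X, y):
        X = np.asarray(X)          # histogram preprocessing would happen here
        return self.fit(X, y)      # BUG: re-enters BadHistBooster.fit forever

class GoodHistBooster(Base):
    def fit(self, X, y):
        X = np.asarray(X)          # same preprocessing
        return super().fit(X, y)   # delegates to Base.fit, as Pt.3 does

GoodHistBooster().fit([[1.0]], [0.0])    # trains normally
# BadHistBooster().fit([[1.0]], [0.0])   # would raise RecursionError

The same reasoning applies to predict and predict_proba, which is why the next patch replaces their self.* calls with super().* as well.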
From 4fc7dd7b2a685bdcfef12e5a92b8b835d5bc68bc Mon Sep 17 00:00:00 2001 From: Thierry Moudiki Date: Mon, 28 Oct 2024 10:04:12 +0100 Subject: [PATCH 21/26] fix histo Pt.4 --- mlsauce/booster/_booster_classifier.py | 2 +- mlsauce/booster/_booster_regressor.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/mlsauce/booster/_booster_classifier.py b/mlsauce/booster/_booster_classifier.py index bd59dd9..666fe25 100644 --- a/mlsauce/booster/_booster_classifier.py +++ b/mlsauce/booster/_booster_classifier.py @@ -873,5 +873,5 @@ def predict_proba(self, X, **kwargs): predicted values estimates for test data: {array-like} """ assert self.hist_bins_ is not None, "You must fit the model first" - return self.predict_proba(get_histo_features(X, self.hist_bins_)) + return super().predict_proba(get_histo_features(X, self.hist_bins_)) diff --git a/mlsauce/booster/_booster_regressor.py b/mlsauce/booster/_booster_regressor.py index e3d9a21..5abdada 100644 --- a/mlsauce/booster/_booster_regressor.py +++ b/mlsauce/booster/_booster_regressor.py @@ -776,5 +776,5 @@ def predict(self, X, level=95, method=None, **kwargs): """ assert self.hist_bins_ is not None, "You must fit the model first" X_ = get_histo_features(X, self.hist_bins_) - return self.predict(X_, level=level, method=method, **kwargs) + return super().predict(X_, level=level, method=method, **kwargs) From 7c8a45ed9d6ec6950be9dac67a64e3efa3b594ed Mon Sep 17 00:00:00 2001 From: Thierry Moudiki Date: Mon, 28 Oct 2024 10:23:49 +0100 Subject: [PATCH 22/26] fix histo Pt.5 --- mlsauce/__init__.py | 5 - mlsauce/booster/__init__.py | 4 - mlsauce/booster/_booster_classifier.py | 222 +++--------------------- mlsauce/booster/_booster_regressor.py | 225 +++---------------------- 4 files changed, 51 insertions(+), 405 deletions(-) diff --git a/mlsauce/__init__.py b/mlsauce/__init__.py index 9767523..2dddb7d 100644 --- a/mlsauce/__init__.py +++ b/mlsauce/__init__.py @@ -61,9 +61,6 @@ LSBoostRegressor, GenericBoostingClassifier, GenericBoostingRegressor, - HistGenericBoostingRegressor, - HistGenericBoostingClassifier, - ) from .lazybooster import LazyBoostingClassifier, LazyBoostingRegressor from .multitaskregressor import MultiTaskRegressor @@ -80,8 +77,6 @@ "LSBoostClassifier", "GenericBoostingClassifier", "GenericBoostingRegressor", - "HistGenericBoostingClassifier", - "HistGenericBoostingRegressor", "StumpClassifier", "ElasticNetRegressor", "LassoRegressor", diff --git a/mlsauce/booster/__init__.py b/mlsauce/booster/__init__.py index b8941dc..786fe8f 100644 --- a/mlsauce/booster/__init__.py +++ b/mlsauce/booster/__init__.py @@ -1,15 +1,11 @@ from ._booster_regressor import LSBoostRegressor from ._booster_regressor import GenericBoostingRegressor -from ._booster_regressor import HistGenericBoostingRegressor from ._booster_classifier import LSBoostClassifier from ._booster_classifier import GenericBoostingClassifier -from ._booster_classifier import HistGenericBoostingClassifier __all__ = [ "LSBoostClassifier", "LSBoostRegressor", "GenericBoostingClassifier", "GenericBoostingRegressor", - "HistGenericBoostingRegressor", - "HistGenericBoostingClassifier" ] diff --git a/mlsauce/booster/_booster_classifier.py b/mlsauce/booster/_booster_classifier.py index 666fe25..ef5d6ec 100644 --- a/mlsauce/booster/_booster_classifier.py +++ b/mlsauce/booster/_booster_classifier.py @@ -83,6 +83,12 @@ class LSBoostClassifier(BaseEstimator, ClassifierMixin): weights_distr: str distribution of weights for constructing the model's hidden layer; currently 'uniform', 
'gaussian' + + hist: bool + indicates whether histogram features are used or not (default is False) + + bins: int or str + number of bins for histogram features (same as numpy.histogram, default is 'auto') Examples: @@ -307,9 +313,14 @@ def __init__( degree=None, weights_distr="uniform", base_model=None, + hist=False, + bins="auto", ): self.base_model = base_model + self.hist = hist + self.bins = bins + self.hist_bins_ = None if n_clusters > 0: assert clustering_method in ( @@ -392,6 +403,9 @@ def fit(self, X, y, **kwargs): if isinstance(X, pd.DataFrame): X = X.values + if self.hist: + X, self.hist_bins_ = get_histo_features(X) + if isinstance(y, pd.Series): y = y.values.ravel() else: @@ -628,6 +642,12 @@ class GenericBoostingClassifier(LSBoostClassifier): weights_distr: str distribution of weights for constructing the model's hidden layer; currently 'uniform', 'gaussian' + + hist: bool + indicates whether histogram features are used or not (default is False) + + bins: int or str + number of bins for histogram features (same as numpy.histogram, default is 'auto') """ @@ -654,162 +674,15 @@ def __init__( cluster_scaling="standard", degree=None, weights_distr="uniform", + hist=False, + bins="auto", ): self.base_model = base_model - super().__init__( - n_estimators=n_estimators, - learning_rate=learning_rate, - n_hidden_features=n_hidden_features, - reg_lambda=reg_lambda, - alpha=alpha, - row_sample=row_sample, - col_sample=col_sample, - dropout=dropout, - tolerance=tolerance, - direct_link=direct_link, - verbose=verbose, - seed=seed, - backend=backend, - solver=solver, - activation=activation, - n_clusters=n_clusters, - clustering_method=clustering_method, - cluster_scaling=cluster_scaling, - degree=degree, - weights_distr=weights_distr, - base_model=self.base_model, - ) - -class HistGenericBoostingClassifier(GenericBoostingClassifier): - """EXPERIMENTAL Histogram-based Generic Boosting classifier (using any classifier as base learner). - - Attributes: - - base_model: object - base learner (default is ExtraTreeRegressor) to be boosted. - - n_estimators: int - number of boosting iterations. - - learning_rate: float - controls the learning speed at training time. - - n_hidden_features: int - number of nodes in successive hidden layers. - - reg_lambda: float - L2 regularization parameter for successive errors in the optimizer - (at training time). - - alpha: float - compromise between L1 and L2 regularization (must be in [0, 1]), - for `solver` == 'enet'. - - row_sample: float - percentage of rows chosen from the training set. - - col_sample: float - percentage of columns chosen from the training set. - - dropout: float - percentage of nodes dropped from the training set. - - tolerance: float - controls early stopping in gradient descent (at training time). - - direct_link: bool - indicates whether the original features are included (True) in model's - fitting or not (False). - - verbose: int - progress bar (yes = 1) or not (no = 0) (currently). - - seed: int - reproducibility seed for nodes_sim=='uniform', clustering and dropout. - - backend: str - type of backend; must be in ('cpu', 'gpu', 'tpu') - - solver: str - type of 'weak' learner; currently in ('ridge', 'lasso', 'enet'). - 'enet' is a combination of 'ridge' and 'lasso' called Elastic Net. 
- - activation: str - activation function: currently 'relu', 'relu6', 'sigmoid', 'tanh' - - n_clusters: int - number of clusters for clustering the features - - clustering_method: str - clustering method: currently 'kmeans', 'gmm' - - cluster_scaling: str - scaling method for clustering: currently 'standard', 'robust', 'minmax' - - degree: int - degree of features interactions to include in the model - - weights_distr: str - distribution of weights for constructing the model's hidden layer; - currently 'uniform', 'gaussian' - - """ - - def __init__( - self, - base_model=ExtraTreeRegressor(), - n_estimators=100, - learning_rate=0, - n_hidden_features=5, - reg_lambda=0.1, - alpha=0.5, - row_sample=1, - col_sample=1, - dropout=0, - tolerance=1e-4, - direct_link=1, - verbose=1, - seed=123, - backend="cpu", - solver="ridge", - activation="relu", - n_clusters=0, - clustering_method="kmeans", - cluster_scaling="standard", - degree=None, - weights_distr="uniform", - ): - - #warnings.warn("This class is highly experimental", UserWarning) - - super().__init__( - base_model=base_model, - n_estimators=n_estimators, - learning_rate=learning_rate, - n_hidden_features=n_hidden_features, - reg_lambda=reg_lambda, - alpha=alpha, - row_sample=row_sample, - col_sample=col_sample, - dropout=dropout, - tolerance=tolerance, - direct_link=direct_link, - verbose=verbose, - seed=seed, - backend=backend, - solver=solver, - activation=activation, - n_clusters=n_clusters, - clustering_method=clustering_method, - cluster_scaling=cluster_scaling, - degree=degree, - weights_distr=weights_distr, - ) - self.base_model = base_model + self.hist = hist + self.bins = bins self.hist_bins_ = None super().__init__( - base_model=base_model, n_estimators=n_estimators, learning_rate=learning_rate, n_hidden_features=n_hidden_features, @@ -830,48 +703,5 @@ def __init__( cluster_scaling=cluster_scaling, degree=degree, weights_distr=weights_distr, - ) - - def fit(self, X, y, **kwargs): - """Fit Booster (classifier) to training data (X, y) - - Args: - - X: {array-like}, shape = [n_samples, n_features] - Training vectors, where n_samples is the number - of samples and n_features is the number of features. - - y: array-like, shape = [n_samples] - Target values. - - **kwargs: additional parameters to be passed to self.cook_training_set. - - Returns: - - self: object. - """ - #print(f"\n before: {X} \n") - res = get_histo_features(X) - self.hist_bins_ = res[1] - #print(f"\n after: {X} \n") - return super().fit(res[0], y, **kwargs) - - def predict_proba(self, X, **kwargs): - """Predict probabilites for test data X. - - Args: - - X: {array-like}, shape = [n_samples, n_features] - Training vectors, where n_samples is the number - of samples and n_features is the number of features. 
- - **kwargs: additional parameters to be passed to - self.cook_test_set - - Returns: - - predicted values estimates for test data: {array-like} - """ - assert self.hist_bins_ is not None, "You must fit the model first" - return super().predict_proba(get_histo_features(X, self.hist_bins_)) - + base_model=self.base_model, + ) \ No newline at end of file diff --git a/mlsauce/booster/_booster_regressor.py b/mlsauce/booster/_booster_regressor.py index 5abdada..2ceeedb 100644 --- a/mlsauce/booster/_booster_regressor.py +++ b/mlsauce/booster/_booster_regressor.py @@ -93,6 +93,12 @@ class LSBoostRegressor(BaseEstimator, RegressorMixin): weights_distr: str distribution of weights for constructing the model's hidden layer; either 'uniform' or 'gaussian' + + hist: bool + whether to use histogram features or not + + bins: int or str + number of bins for histogram features (same as numpy.histogram, default is 'auto') Examples: @@ -174,9 +180,14 @@ def __init__( degree=None, weights_distr="uniform", base_model=None, + hist=False, + bins="auto", ): self.base_model = base_model + self.hist = hist + self.bins = bins + self.hist_bins_ = None if n_clusters > 0: assert clustering_method in ( @@ -262,6 +273,9 @@ def fit(self, X, y, **kwargs): if isinstance(X, pd.DataFrame): X = X.values + if self.hist: + X, self.hist_bins_ = get_histo_features(X) + if isinstance(y, pd.Series): y = y.values.ravel() else: @@ -520,6 +534,12 @@ class GenericBoostingRegressor(LSBoostRegressor): weights_distr: str distribution of weights for constructing the model's hidden layer; either 'uniform' or 'gaussian' + + hist: bool + whether to use histogram features or not + + bins: int or str + number of bins for histogram features (same as numpy.histogram, default is 'auto') """ @@ -549,151 +569,12 @@ def __init__( cluster_scaling="standard", degree=None, weights_distr="uniform", + hist=False, + bins="auto", ): self.base_model = base_model - super().__init__( - n_estimators=n_estimators, - learning_rate=learning_rate, - n_hidden_features=n_hidden_features, - reg_lambda=reg_lambda, - alpha=alpha, - row_sample=row_sample, - col_sample=col_sample, - dropout=dropout, - tolerance=tolerance, - direct_link=direct_link, - verbose=verbose, - seed=seed, - backend=backend, - solver=solver, - activation=activation, - type_pi=type_pi, - replications=replications, - kernel=kernel, - n_clusters=n_clusters, - clustering_method=clustering_method, - cluster_scaling=cluster_scaling, - degree=degree, - weights_distr=weights_distr, - base_model=self.base_model, - ) - -class HistGenericBoostingRegressor(GenericBoostingRegressor): - """EXPERIMENTAL Generic Boosting regressor with histogram-based features. - - Attributes: - - base_model: object - base learner (default is ExtraTreeRegressor) to be boosted. - - n_estimators: int - number of boosting iterations. - - learning_rate: float - controls the learning speed at training time. - - n_hidden_features: int - number of nodes in successive hidden layers. - - reg_lambda: float - L2 regularization parameter for successive errors in the optimizer - (at training time). - - alpha: float - compromise between L1 and L2 regularization (must be in [0, 1]), - for `solver` == 'enet' - - row_sample: float - percentage of rows chosen from the training set. - - col_sample: float - percentage of columns chosen from the training set. - - dropout: float - percentage of nodes dropped from the training set. - - tolerance: float - controls early stopping in gradient descent (at training time). 
- - direct_link: bool - indicates whether the original features are included (True) in model's - fitting or not (False). - - verbose: int - progress bar (yes = 1) or not (no = 0) (currently). - - seed: int - reproducibility seed for nodes_sim=='uniform', clustering and dropout. - - backend: str - type of backend; must be in ('cpu', 'gpu', 'tpu') - - solver: str - type of 'weak' learner; currently in ('ridge', 'lasso') - - activation: str - activation function: currently 'relu', 'relu6', 'sigmoid', 'tanh' - - type_pi: str. - type of prediction interval; currently "kde" (default) or "bootstrap". - Used only in `self.predict`, for `self.replications` > 0 and `self.kernel` - in ('gaussian', 'tophat'). Default is `None`. - - replications: int. - number of replications (if needed) for predictive simulation. - Used only in `self.predict`, for `self.kernel` in ('gaussian', - 'tophat') and `self.type_pi = 'kde'`. Default is `None`. - - n_clusters: int - number of clusters for clustering the features - - clustering_method: str - clustering method: currently 'kmeans', 'gmm' - - cluster: bool - whether to cluster the features or not - - cluster_scaling: str - scaling method for clustering: currently 'standard', 'robust', 'minmax' - - degree: int - degree of features interactions to include in the model - - weights_distr: str - distribution of weights for constructing the model's hidden layer; - either 'uniform' or 'gaussian' - """ - def __init__( - self, - base_model=ExtraTreeRegressor(), - n_estimators=100, - learning_rate=0.1, - n_hidden_features=5, - reg_lambda=0.1, - alpha=0.5, - row_sample=1, - col_sample=1, - dropout=0, - tolerance=1e-4, - direct_link=1, - verbose=1, - seed=123, - backend="cpu", - solver="ridge", - activation="relu", - type_pi=None, - replications=None, - kernel=None, - n_clusters=0, - clustering_method="kmeans", - cluster_scaling="standard", - degree=None, - weights_distr="uniform", - ): - - #warnings.warn("This class is highly experimental", UserWarning) - - self.base_model = base_model + self.hist = hist + self.bins = bins self.hist_bins_ = None super().__init__( @@ -721,60 +602,4 @@ def __init__( degree=degree, weights_distr=weights_distr, base_model=self.base_model, - ) - - - def fit(self, X, y, **kwargs): - """Fit Booster (regressor) to training data (X, y) - - Args: - - X: {array-like}, shape = [n_samples, n_features] - Training vectors, where n_samples is the number - of samples and n_features is the number of features. - - y: array-like, shape = [n_samples] - Target values. - - **kwargs: additional parameters to be passed to self.cook_training_set. - - Returns: - - self: object. - """ - #print(f"\n before: {X} \n") - res = get_histo_features(X) - self.hist_bins_ = res[1] - #print(f"\n after: {X} \n") - return super().fit(res[0], y, **kwargs) - - def predict(self, X, level=95, method=None, **kwargs): - """Predict values for test data X. - - Args: - - X: {array-like}, shape = [n_samples, n_features] - Training vectors, where n_samples is the number - of samples and n_features is the number of features. 
- - level: int - Level of confidence (default = 95) - - method: str - `None`, or 'splitconformal', 'localconformal' - prediction (if you specify `return_pi = True`) - - histo: bool - whether to use histogram features or not - - **kwargs: additional parameters to be passed to - self.cook_test_set - - Returns: - - predicted values estimates for test data: {array-like} - """ - assert self.hist_bins_ is not None, "You must fit the model first" - X_ = get_histo_features(X, self.hist_bins_) - return super().predict(X_, level=level, method=method, **kwargs) - + ) From b53f9e595e9116aa8efaaa387e677fe449639213 Mon Sep 17 00:00:00 2001 From: Thierry Moudiki Date: Mon, 28 Oct 2024 10:27:42 +0100 Subject: [PATCH 23/26] fix histo Pt.6 --- mlsauce/lazybooster/lazyboosterclassif.py | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/mlsauce/lazybooster/lazyboosterclassif.py b/mlsauce/lazybooster/lazyboosterclassif.py index c58c812..963cb46 100644 --- a/mlsauce/lazybooster/lazyboosterclassif.py +++ b/mlsauce/lazybooster/lazyboosterclassif.py @@ -26,7 +26,7 @@ f1_score, ) from .config import REGRESSORS, MTASKREGRESSORS -from ..booster import GenericBoostingClassifier, HistGenericBoostingClassifier +from ..booster import GenericBoostingClassifier, GenericBoostingClassifier from ..multitaskregressor import MultiTaskRegressor import warnings @@ -217,7 +217,7 @@ def fit(self, X_train, X_test, y_train, y_test, hist=False, **kwargs): and columns is the number of features. hist: bool, optional (default=False) - When set to True, the model is a HistGenericBoostingClassifier. + When set to True, the model is a GenericBoostingClassifier. **kwargs: dict, Additional arguments to be passed to the fit GenericBoostingClassifier. @@ -388,12 +388,13 @@ def fit(self, X_train, X_test, y_train, y_test, hist=False, **kwargs): ), ) else: - fitted_clf = HistGenericBoostingClassifier( + fitted_clf = GenericBoostingClassifier( {**other_args, **kwargs}, verbose=self.verbose, base_model=model( random_state=self.random_state ), + hist=True, ) else: @@ -403,9 +404,10 @@ def fit(self, X_train, X_test, y_train, y_test, hist=False, **kwargs): verbose=self.verbose, ) else: - fitted_clf = HistGenericBoostingClassifier( + fitted_clf = GenericBoostingClassifier( base_model=model(**kwargs), verbose=self.verbose, + hist=True, ) if self.verbose > 0: @@ -527,11 +529,12 @@ def fit(self, X_train, X_test, y_train, y_test, hist=False, **kwargs): **kwargs ) else: - fitted_clf = HistGenericBoostingClassifier( + fitted_clf = GenericBoostingClassifier( base_model=model( random_state=self.random_state ), verbose=self.verbose, + hist=True, **kwargs ) @@ -543,9 +546,10 @@ def fit(self, X_train, X_test, y_train, y_test, hist=False, **kwargs): **kwargs ) else: - fitted_clf = HistGenericBoostingClassifier( + fitted_clf = GenericBoostingClassifier( base_model=model(), verbose=self.verbose, + hist=True, **kwargs ) @@ -753,10 +757,11 @@ def train_model( base_model=model(random_state=self.random_state), ) else: - fitted_clf = HistGenericBoostingClassifier( + fitted_clf = GenericBoostingClassifier( {**other_args, **kwargs}, verbose=self.verbose, base_model=model(random_state=self.random_state), + hist=True, ) else: if hist is False: @@ -765,9 +770,10 @@ def train_model( verbose=self.verbose, ) else: - fitted_clf = HistGenericBoostingClassifier( + fitted_clf = GenericBoostingClassifier( base_model=model(**kwargs), verbose=self.verbose, + hist=True, ) if self.verbose > 0: From 4b81c72c443bf0ffd2b813690a31a1bae87b8960 Mon Sep 17 
00:00:00 2001 From: Thierry Moudiki Date: Mon, 28 Oct 2024 10:29:29 +0100 Subject: [PATCH 24/26] fix histo Pt.7 --- mlsauce/lazybooster/lazyboosterclassif.py | 2 +- mlsauce/lazybooster/lazyboosterregression.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/mlsauce/lazybooster/lazyboosterclassif.py b/mlsauce/lazybooster/lazyboosterclassif.py index 963cb46..6b40a94 100644 --- a/mlsauce/lazybooster/lazyboosterclassif.py +++ b/mlsauce/lazybooster/lazyboosterclassif.py @@ -26,7 +26,7 @@ f1_score, ) from .config import REGRESSORS, MTASKREGRESSORS -from ..booster import GenericBoostingClassifier, GenericBoostingClassifier +from ..booster import GenericBoostingClassifier from ..multitaskregressor import MultiTaskRegressor import warnings diff --git a/mlsauce/lazybooster/lazyboosterregression.py b/mlsauce/lazybooster/lazyboosterregression.py index 5d94b75..0fc6a03 100644 --- a/mlsauce/lazybooster/lazyboosterregression.py +++ b/mlsauce/lazybooster/lazyboosterregression.py @@ -22,7 +22,7 @@ r2_score ) from .config import REGRESSORS -from ..booster import GenericBoostingRegressor, HistGenericBoostingRegressor +from ..booster import GenericBoostingRegressor import warnings From 3c32436a07344286f94d7d21d64ee2c7b03d54c9 Mon Sep 17 00:00:00 2001 From: Thierry Moudiki Date: Mon, 28 Oct 2024 11:10:02 +0100 Subject: [PATCH 25/26] fix histo Pt.8 --- mlsauce/booster/_booster_classifier.py | 3 +++ mlsauce/booster/_booster_regressor.py | 3 +++ 2 files changed, 6 insertions(+) diff --git a/mlsauce/booster/_booster_classifier.py b/mlsauce/booster/_booster_classifier.py index ef5d6ec..f6d87d3 100644 --- a/mlsauce/booster/_booster_classifier.py +++ b/mlsauce/booster/_booster_classifier.py @@ -496,6 +496,9 @@ def predict_proba(self, X, **kwargs): if isinstance(X, pd.DataFrame): X = X.values + if self.hist: + X = get_histo_features(X, bins=self.hist_bins_) + if self.degree is not None: X = self.poly_.transform(X) diff --git a/mlsauce/booster/_booster_regressor.py b/mlsauce/booster/_booster_regressor.py index 2ceeedb..3d170e5 100644 --- a/mlsauce/booster/_booster_regressor.py +++ b/mlsauce/booster/_booster_regressor.py @@ -359,6 +359,9 @@ def predict(self, X, level=95, method=None, histo=False, **kwargs): if isinstance(X, pd.DataFrame): X = X.values + + if self.hist: + X = get_histo_features(X, bins=self.hist_bins_) if self.degree is not None: X = self.poly_.transform(X) From 4a530bba99970855aed3001b58870f0253b05f06 Mon Sep 17 00:00:00 2001 From: Thierry Moudiki Date: Mon, 28 Oct 2024 11:27:54 +0100 Subject: [PATCH 26/26] fix histo Pt.9 --- mlsauce/booster/_booster_classifier.py | 4 ++-- mlsauce/booster/_booster_regressor.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/mlsauce/booster/_booster_classifier.py b/mlsauce/booster/_booster_classifier.py index f6d87d3..eb4938f 100644 --- a/mlsauce/booster/_booster_classifier.py +++ b/mlsauce/booster/_booster_classifier.py @@ -403,7 +403,7 @@ def fit(self, X, y, **kwargs): if isinstance(X, pd.DataFrame): X = X.values - if self.hist: + if self.hist == True: X, self.hist_bins_ = get_histo_features(X) if isinstance(y, pd.Series): @@ -496,7 +496,7 @@ def predict_proba(self, X, **kwargs): if isinstance(X, pd.DataFrame): X = X.values - if self.hist: + if self.hist == True: X = get_histo_features(X, bins=self.hist_bins_) if self.degree is not None: diff --git a/mlsauce/booster/_booster_regressor.py b/mlsauce/booster/_booster_regressor.py index 3d170e5..8954113 100644 --- a/mlsauce/booster/_booster_regressor.py +++ 
b/mlsauce/booster/_booster_regressor.py @@ -273,7 +273,7 @@ def fit(self, X, y, **kwargs): if isinstance(X, pd.DataFrame): X = X.values - if self.hist: + if self.hist == True: X, self.hist_bins_ = get_histo_features(X) if isinstance(y, pd.Series): @@ -360,7 +360,7 @@ def predict(self, X, level=95, method=None, histo=False, **kwargs): if isinstance(X, pd.DataFrame): X = X.values - if self.hist: + if self.hist == True: X = get_histo_features(X, bins=self.hist_bins_) if self.degree is not None:
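End state of the series, for orientation: the standalone Hist* classes are gone, and histogram features live behind the hist=True / bins="auto" keywords of GenericBoostingClassifier and GenericBoostingRegressor (patch 22); fit() bins each column with numpy's "auto" histogram, pads the edges with +/-1e10 sentinels so every value lands in some bin, replaces each value by the float index of the half-open bin (lo, hi] containing it, caches the edges in hist_bins_, and the predict paths re-bin incoming data with the cached edges (patches 16, 17, 25 and 26). The sketch below re-implements just that binning rule in plain numpy so it can be run without mlsauce; the helper names hist_bin_edges and bin_column are invented for the illustration:

import numpy as np

def hist_bin_edges(x):
    # numpy's "auto" rule picks the bin count; the +/-1e10 sentinels mean
    # out-of-sample values still fall into the first or last bin
    _, edges = np.histogram(x, bins="auto")
    return np.concatenate([[-1e10], edges, [1e10]])

def bin_column(x, edges):
    # float index i such that edges[i] < x <= edges[i + 1], mirroring the
    # (lo, hi] rule of assign_values_to_input after patch 17
    return (np.searchsorted(edges, x, side="left") - 1).astype(float)

rng = np.random.default_rng(123)
X_train = rng.normal(size=(200, 3))
X_test = rng.normal(size=(50, 3))

# "fit": learn one set of edges per column; "predict": re-bin with them
edges_per_col = [hist_bin_edges(X_train[:, j]) for j in range(X_train.shape[1])]
X_train_hist = np.column_stack(
    [bin_column(X_train[:, j], e) for j, e in enumerate(edges_per_col)]
)
X_test_hist = np.column_stack(
    [bin_column(X_test[:, j], e) for j, e in enumerate(edges_per_col)]
)
print(X_train_hist[:3])
print(X_test_hist[:3])

Compared with the per-value walk over the bin dictionary in assign_values_to_input, np.searchsorted performs the same (lo, hi] lookup vectorized, in O(n log n_bins) rather than O(n * n_bins); it is a natural follow-up optimization, not what the patched code currently does.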