diff --git a/examples/hist_genboost_classifier.py b/examples/hist_genboost_classifier.py
new file mode 100644
index 0000000..f0ef839
--- /dev/null
+++ b/examples/hist_genboost_classifier.py
@@ -0,0 +1,42 @@
+import numpy as np
+import os
+from sklearn.datasets import load_breast_cancer
+from sklearn.model_selection import train_test_split
+from sklearn.tree import ExtraTreeRegressor
+from time import time
+
+print(f"\n ----- Running: {os.path.basename(__file__)}... ----- \n")
+
+print(os.path.relpath(os.path.dirname(__file__)))
+
+import mlsauce as ms
+
+print("\n")
+print("HistGenericBoosting ExtraTree -----")
+print("\n")
+
+print("\n")
+print("breast_cancer data -----")
+
+# data 1
+breast_cancer = load_breast_cancer()
+X = breast_cancer.data
+y = breast_cancer.target
+# split data into training set and test set
+np.random.seed(15029)
+X_train, X_test, y_train, y_test = train_test_split(X, y,
+                                                    test_size=0.2)
+
+clf = ExtraTreeRegressor()
+
+obj = ms.HistGenericBoostingClassifier(clf)
+print(obj.get_params())
+start = time()
+obj.fit(X_train, y_train)
+print(time()-start)
+start = time()
+print(obj.score(X_test, y_test))
+print(time()-start)
+
+print(obj.obj['loss'])
+
+print(obj.obj['fit_obj_i'])
diff --git a/examples/hist_genboost_regressor.py b/examples/hist_genboost_regressor.py
new file mode 100644
index 0000000..857d96a
--- /dev/null
+++ b/examples/hist_genboost_regressor.py
@@ -0,0 +1,32 @@
+import os
+import numpy as np
+import mlsauce as ms
+from sklearn.datasets import load_diabetes
+from sklearn.model_selection import train_test_split
+from sklearn.tree import ExtraTreeRegressor
+from time import time
+
+print(f"\n ----- Running: {os.path.basename(__file__)}... ----- \n")
+
+print("\n")
+print("diabetes data -----")
+
+regr = ExtraTreeRegressor()
+
+diabetes = load_diabetes()
+X = diabetes.data
+y = diabetes.target
+# split data into training set and test set
+np.random.seed(15029)
+X_train, X_test, y_train, y_test = train_test_split(X, y,
+                                                    test_size=0.2)
+
+obj = ms.HistGenericBoostingRegressor(regr)
+print(obj.get_params())
+start = time()
+obj.fit(X_train, y_train)
+print(time()-start)
+start = time()
+print(np.sqrt(np.mean(np.square(obj.predict(X_test) - y_test))))
+print(time()-start)
+print(obj.obj['loss'])
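The two examples above only exercise the histogram path. As a quick sanity check, histogram features can be toggled on the same split to measure their effect; a minimal sketch, assuming GenericBoostingRegressor accepts the `hist` keyword added by this patch and follows the scikit-learn fit/predict API used above:

    # Illustrative sketch, not part of the patch: raw vs. histogram features.
    import numpy as np
    import mlsauce as ms
    from sklearn.datasets import load_diabetes
    from sklearn.model_selection import train_test_split
    from sklearn.tree import ExtraTreeRegressor

    X, y = load_diabetes(return_X_y=True)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                        random_state=15029)

    for use_hist in (False, True):
        regr = ms.GenericBoostingRegressor(base_model=ExtraTreeRegressor(),
                                           hist=use_hist)
        regr.fit(X_train, y_train)
        rmse = np.sqrt(np.mean((regr.predict(X_test) - y_test) ** 2))
        print(f"hist={use_hist}: test RMSE = {rmse:.3f}")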
----- \n") + +#load_models = [load_breast_cancer, load_iris, load_wine, load_digits] +load_models = [load_breast_cancer, load_iris, load_wine] +#load_models = [load_digits] + +for model in load_models: + + data = model() + X = data.data + y= data.target + + X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .2, random_state = 13) + + clf = ms.LazyBoostingClassifier(verbose=0, ignore_warnings=True, #n_jobs=2, + custom_metric=None, preprocess=False) + + start = time() + models, predictioms = clf.fit(X_train, X_test, y_train, y_test, hist=True) + print(f"\nElapsed: {time() - start} seconds\n") + + print(models) + diff --git a/examples/lazy_histbooster_regression.py b/examples/lazy_histbooster_regression.py new file mode 100644 index 0000000..aa68da6 --- /dev/null +++ b/examples/lazy_histbooster_regression.py @@ -0,0 +1,59 @@ +import os +import mlsauce as ms +import numpy as np +from sklearn.datasets import load_diabetes +from sklearn.datasets import fetch_california_housing +from sklearn.model_selection import train_test_split + +print(f"\n ----- Running: {os.path.basename(__file__)}... ----- \n") + +data = load_diabetes() +X = data.data +y= data.target +X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .2, random_state = 123) + +regr = ms.LazyBoostingRegressor(verbose=0, ignore_warnings=True, #n_jobs=2, + custom_metric=None, preprocess=True) +models, predictioms = regr.fit(X_train, X_test, y_train, y_test) +model_dictionary = regr.provide_models(X_train, X_test, y_train, y_test) +print(models) + +data = fetch_california_housing() +X = data.data[0:1000,:] +y= data.target[0:1000] +X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .2, random_state = 123) + +regr = ms.LazyBoostingRegressor(verbose=0, ignore_warnings=True, + custom_metric=None, preprocess=True) +models, predictioms = regr.fit(X_train, X_test, y_train, y_test, hist=True) +model_dictionary = regr.provide_models(X_train, X_test, y_train, y_test) +print(models) + + +from sklearn.datasets import fetch_openml + +# Load the dataset from OpenML +boston = fetch_openml(name='boston', version=1, as_frame=True) + +# Get the features and target +X = boston.data +y = boston.target + +# Display the first few rows +print(X.head()) +print(y.head()) + +np.random.seed(1509) +X_train, X_test, y_train, y_test = train_test_split(X, y, + test_size=0.2) + +X_train = X_train.astype(np.float64) +X_test = X_test.astype(np.float64) +y_train = y_train.astype(np.float64) +y_test = y_test.astype(np.float64) + +regr = ms.LazyBoostingRegressor(verbose=0, ignore_warnings=True, #n_jobs=2, + custom_metric=None, preprocess=True) +models, predictioms = regr.fit(X_train, X_test, y_train, y_test, hist=True) +model_dictionary = regr.provide_models(X_train, X_test, y_train, y_test) +print(models) diff --git a/mlsauce/booster/_booster_classifier.py b/mlsauce/booster/_booster_classifier.py index 0d395a8..eb4938f 100644 --- a/mlsauce/booster/_booster_classifier.py +++ b/mlsauce/booster/_booster_classifier.py @@ -11,7 +11,7 @@ from . 
diff --git a/mlsauce/booster/_booster_classifier.py b/mlsauce/booster/_booster_classifier.py
index 0d395a8..eb4938f 100644
--- a/mlsauce/booster/_booster_classifier.py
+++ b/mlsauce/booster/_booster_classifier.py
@@ -11,7 +11,7 @@
 try:
     from . import _boosterc as boosterc
 except ImportError:
     import _boosterc as boosterc
-from ..utils import cluster, check_and_install
+from ..utils import cluster, check_and_install, get_histo_features
 
 
 class LSBoostClassifier(BaseEstimator, ClassifierMixin):
@@ -83,6 +83,12 @@
         weights_distr: str
             distribution of weights for constructing the model's hidden layer;
             currently 'uniform', 'gaussian'
+
+        hist: bool
+            indicates whether histogram features are used or not (default is False)
+
+        bins: int or str
+            number of bins for histogram features (same as numpy.histogram, default is 'auto')
 
     Examples:
 
@@ -307,9 +313,14 @@
         degree=None,
         weights_distr="uniform",
         base_model=None,
+        hist=False,
+        bins="auto",
     ):
         self.base_model = base_model
+        self.hist = hist
+        self.bins = bins
+        self.hist_bins_ = None
 
         if n_clusters > 0:
             assert clustering_method in (
@@ -391,6 +402,14 @@
 
         if isinstance(X, pd.DataFrame):
             X = X.values
+
+        if self.hist:
+            X, self.hist_bins_ = get_histo_features(X, bins=self.bins)
+
+        if isinstance(y, pd.Series):
+            y = y.values.ravel()
+        else:
+            y = y.ravel()
 
         if self.degree is not None:
             assert isinstance(self.degree, int), "`degree` must be an integer"
@@ -433,7 +452,8 @@
             obj=self.base_model,
         )
 
-        self.n_classes_ = len(np.unique(y))  # for compatibility with sklearn
+        self.classes_ = np.unique(y)  # for compatibility with sklearn
+        self.n_classes_ = len(self.classes_)  # for compatibility with sklearn
         self.n_estimators = self.obj["n_estimators"]
 
         return self
@@ -476,6 +496,9 @@
 
         if isinstance(X, pd.DataFrame):
             X = X.values
+
+        if self.hist:
+            X = get_histo_features(X, bin_value_dict=self.hist_bins_)
 
         if self.degree is not None:
             X = self.poly_.transform(X)
@@ -543,7 +566,8 @@
             )
 
         self.obj = boosterc.update_booster(
-            self.obj, np.asarray(X, order="C"), np.asarray(y, order="C"), eta
+            self.obj, np.asarray(X, order="C"),
+            np.asarray(y, order="C").ravel(), eta
         )
 
         return self
@@ -621,6 +645,12 @@
         weights_distr: str
             distribution of weights for constructing the model's hidden layer;
             currently 'uniform', 'gaussian'
+
+        hist: bool
+            indicates whether histogram features are used or not (default is False)
+
+        bins: int or str
+            number of bins for histogram features (same as numpy.histogram, default is 'auto')
 
     """
 
@@ -647,8 +677,14 @@
         cluster_scaling="standard",
         degree=None,
         weights_distr="uniform",
+        hist=False,
+        bins="auto",
     ):
         self.base_model = base_model
+        self.hist = hist
+        self.bins = bins
+        self.hist_bins_ = None
+
         super().__init__(
             n_estimators=n_estimators,
             learning_rate=learning_rate,
@@ -671,4 +707,6 @@
             degree=degree,
             weights_distr=weights_distr,
             base_model=self.base_model,
+            hist=hist,
+            bins=bins,
         )
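With the plumbing above, fit() learns per-feature bins into hist_bins_ and predict_proba() re-bins test data with them; a minimal sketch, assuming the scikit-learn API shown in this file (any scikit-learn regressor works as base_model):

    # Illustrative sketch, not part of the patch: hist round trip, classifier side.
    import mlsauce as ms
    from sklearn.datasets import load_wine
    from sklearn.model_selection import train_test_split
    from sklearn.tree import ExtraTreeRegressor

    X, y = load_wine(return_X_y=True)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                        random_state=13)

    clf = ms.GenericBoostingClassifier(base_model=ExtraTreeRegressor(), hist=True)
    clf.fit(X_train, y_train)             # learns per-feature bins into hist_bins_
    print(clf.classes_)                   # now set by fit(), as sklearn expects
    print(clf.predict_proba(X_test)[:3])  # X_test is re-binned with the training bins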
diff --git a/mlsauce/booster/_booster_regressor.py b/mlsauce/booster/_booster_regressor.py
index 09a2ec5..8954113 100644
--- a/mlsauce/booster/_booster_regressor.py
+++ b/mlsauce/booster/_booster_regressor.py
@@ -12,7 +12,7 @@
     from . import _boosterc as boosterc
 except ImportError:
     import _boosterc as boosterc
 from ..predictioninterval import PredictionInterval
-from ..utils import cluster, check_and_install
+from ..utils import cluster, check_and_install, get_histo_features
 
 
 class LSBoostRegressor(BaseEstimator, RegressorMixin):
@@ -93,6 +93,12 @@
         weights_distr: str
             distribution of weights for constructing the model's hidden layer;
             either 'uniform' or 'gaussian'
+
+        hist: bool
+            whether to use histogram features or not (default is False)
+
+        bins: int or str
+            number of bins for histogram features (same as numpy.histogram, default is 'auto')
 
     Examples:
 
@@ -174,9 +180,14 @@
         degree=None,
         weights_distr="uniform",
         base_model=None,
+        hist=False,
+        bins="auto",
     ):
         self.base_model = base_model
+        self.hist = hist
+        self.bins = bins
+        self.hist_bins_ = None
 
         if n_clusters > 0:
             assert clustering_method in (
@@ -261,6 +272,14 @@
 
         if isinstance(X, pd.DataFrame):
             X = X.values
+
+        if self.hist:
+            X, self.hist_bins_ = get_histo_features(X, bins=self.bins)
+
+        if isinstance(y, pd.Series):
+            y = y.values.ravel()
+        else:
+            y = y.ravel()
 
         if self.degree is not None:
             assert isinstance(self.degree, int), "`degree` must be an integer"
@@ -311,7 +330,7 @@
 
         return self
 
     def predict(self, X, level=95, method=None, **kwargs):
-        """Predict probabilities for test data X.
+        """Predict values for test data X.
 
         Args:
@@ -326,17 +345,20 @@
             method: str
                 `None`, or 'splitconformal', 'localconformal'
                 prediction (if you specify `return_pi = True`)
 
             **kwargs: additional parameters to be passed to
                 self.cook_test_set
 
         Returns:
 
-            probability estimates for test data: {array-like}
+            predicted values for test data: {array-like}
         """
 
         if isinstance(X, pd.DataFrame):
             X = X.values
+
+        if self.hist:
+            X = get_histo_features(X, bin_value_dict=self.hist_bins_)
 
         if self.degree is not None:
             X = self.poly_.transform(X)
@@ -432,7 +457,7 @@
 
 
 class GenericBoostingRegressor(LSBoostRegressor):
-    """LSBoost regressor.
+    """Generic Boosting regressor.
 
     Attributes:
@@ -512,6 +537,12 @@
         weights_distr: str
             distribution of weights for constructing the model's hidden layer;
             either 'uniform' or 'gaussian'
+
+        hist: bool
+            whether to use histogram features or not (default is False)
+
+        bins: int or str
+            number of bins for histogram features (same as numpy.histogram, default is 'auto')
 
     """
 
@@ -541,8 +572,16 @@
         cluster_scaling="standard",
         degree=None,
         weights_distr="uniform",
+        hist=False,
+        bins="auto",
     ):
         self.base_model = base_model
+        self.hist = hist
+        self.bins = bins
+        self.hist_bins_ = None
+
         super().__init__(
             n_estimators=n_estimators,
+            hist=hist,
+            bins=bins,
             learning_rate=learning_rate,
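The predict() docstring above mentions split conformal prediction via `return_pi=True`; a minimal sketch of that path, assuming the keyword is forwarded as documented (the exact layout of the returned object is also an assumption, not confirmed by this patch):

    # Illustrative sketch, not part of the patch: prediction intervals + hist.
    import mlsauce as ms
    from sklearn.datasets import load_diabetes
    from sklearn.model_selection import train_test_split
    from sklearn.tree import ExtraTreeRegressor

    X, y = load_diabetes(return_X_y=True)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                        random_state=123)

    regr = ms.GenericBoostingRegressor(base_model=ExtraTreeRegressor(), hist=True)
    regr.fit(X_train, y_train)
    # Assumed: returns point predictions bundled with 95% lower/upper bounds.
    preds = regr.predict(X_test, level=95, method="splitconformal", return_pi=True)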
diff --git a/mlsauce/lazybooster/lazyboosterclassif.py b/mlsauce/lazybooster/lazyboosterclassif.py
index b76ceab..6b40a94 100644
--- a/mlsauce/lazybooster/lazyboosterclassif.py
+++ b/mlsauce/lazybooster/lazyboosterclassif.py
@@ -194,7 +194,7 @@
         self.preprocess = preprocess
         self.n_jobs = n_jobs
 
-    def fit(self, X_train, X_test, y_train, y_test, **kwargs):
+    def fit(self, X_train, X_test, y_train, y_test, hist=False, **kwargs):
         """Fit classifiers to X_train and y_train, predict and score on X_test,
         y_test.
 
@@ -215,6 +215,10 @@
             y_test: array-like,
                 Testing vectors, where rows is the number of samples
                 and columns is the number of features.
+
+            hist: bool, optional (default=False)
+                When set to True, histogram features are used (each boosted
+                model is fitted with `hist=True`).
 
             **kwargs: dict,
                 Additional arguments to be passed to the fit GenericBoostingClassifier.
@@ -376,19 +380,21 @@
 
                 try:
                     if "random_state" in model().get_params().keys():
                         fitted_clf = GenericBoostingClassifier(
                             {**other_args, **kwargs},
                             verbose=self.verbose,
                             base_model=model(
                                 random_state=self.random_state
                             ),
+                            hist=hist,
                         )
 
                     else:
                         fitted_clf = GenericBoostingClassifier(
                             base_model=model(**kwargs),
                             verbose=self.verbose,
+                            hist=hist,
                         )
 
                     if self.verbose > 0:
                         print("\n Fitting boosted " + name + " model...")
@@ -500,20 +526,22 @@
                 start = time.time()
                 try:
                     if "random_state" in model().get_params().keys():
                         fitted_clf = GenericBoostingClassifier(
                             base_model=model(
                                 random_state=self.random_state
                             ),
                             verbose=self.verbose,
+                            hist=hist,
                             **kwargs
                         )
                     else:
                         fitted_clf = GenericBoostingClassifier(
                             base_model=model(),
                             verbose=self.verbose,
+                            hist=hist,
                             **kwargs
                         )
 
                     fitted_clf.fit(X_train, y_train)
@@ -689,6 +727,7 @@
         y_test,
         use_preprocessing=False,
         preprocessor=None,
+        hist=False,
         **kwargs
     ):
         """
@@ -711,16 +750,18 @@
         try:
             # Handle random_state parameter
             if "random_state" in model().get_params().keys():
                 fitted_clf = GenericBoostingClassifier(
                     {**other_args, **kwargs},
                     verbose=self.verbose,
                     base_model=model(random_state=self.random_state),
+                    hist=hist,
                 )
             else:
                 fitted_clf = GenericBoostingClassifier(
                     base_model=model(**kwargs),
                     verbose=self.verbose,
+                    hist=hist,
                 )
 
             if self.verbose > 0:
                 print("\n Fitting boosted " + name + " model...")
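Since `hist` is simply forwarded to each candidate's constructor, the manual equivalent of what LazyBoostingClassifier.fit(..., hist=True) does for a single base learner looks like this; a minimal sketch, with Ridge chosen arbitrarily as the base regressor:

    # Illustrative sketch, not part of the patch: one candidate, built by hand.
    import mlsauce as ms
    from sklearn.datasets import load_breast_cancer
    from sklearn.linear_model import Ridge
    from sklearn.model_selection import train_test_split

    X, y = load_breast_cancer(return_X_y=True)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                        random_state=13)

    # This is what the lazy loop constructs for each candidate when hist=True
    booster = ms.GenericBoostingClassifier(base_model=Ridge(), hist=True)
    booster.fit(X_train, y_train)
    print(booster.score(X_test, y_test))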
diff --git a/mlsauce/lazybooster/lazyboosterregression.py b/mlsauce/lazybooster/lazyboosterregression.py
index 2957383..0fc6a03 100644
--- a/mlsauce/lazybooster/lazyboosterregression.py
+++ b/mlsauce/lazybooster/lazyboosterregression.py
@@ -185,7 +185,7 @@
         self.preprocess = preprocess
         self.n_jobs = n_jobs
 
-    def fit(self, X_train, X_test, y_train, y_test, **kwargs):
+    def fit(self, X_train, X_test, y_train, y_test, hist=False, **kwargs):
         """Fit Regression algorithms to X_train and y_train, predict and score on X_test,
         y_test.
 
         Parameters:
@@ -205,6 +205,10 @@
             y_test : array-like,
                 Testing vectors, where rows is the number of samples
                 and columns is the number of features.
+
+            hist: bool, optional (default=False)
+                When set to True, histogram features are used (each boosted
+                model is fitted with `hist=True`).
 
             **kwargs: dict,
                 Additional parameters to be passed to the GenericBoostingRegressor.
@@ -284,7 +288,7 @@
         for name, model in tqdm(zip(baseline_names, baseline_models)):
             start = time.time()
             try:
-                model.fit(X_train, y_train)
+                model.fit(X_train, y_train.ravel())
                 self.models_[name] = model
                 y_pred = model.predict(X_test)
                 r_squared = r2_score(y_test, y_pred)
@@ -345,11 +349,12 @@
 
                 try:
 
-                    model = GenericBoostingRegressor(
-                        base_model=regr(), verbose=self.verbose, **kwargs
-                    )
+                    model = GenericBoostingRegressor(
+                        base_model=regr(), verbose=self.verbose,
+                        hist=hist, **kwargs
+                    )
 
-                    model.fit(X_train, y_train)
+                    model.fit(X_train, y_train.ravel())
 
                     pipe = Pipeline(
                         steps=[
@@ -359,7 +371,7 @@
                     )
                     if self.verbose > 0:
                         print("\n Fitting boosted " + name + " model...")
-                    pipe.fit(X_train, y_train)
+                    pipe.fit(X_train, y_train.ravel())
 
                     self.models_[name] = pipe
                     y_pred = pipe.predict(X_test)
@@ -456,14 +468,15 @@
         for name, regr in tqdm(self.regressors):  # do parallel exec
             start = time.time()
             try:
-                model = GenericBoostingRegressor(
-                    base_model=regr(), verbose=self.verbose, **kwargs
-                )
+                model = GenericBoostingRegressor(
+                    base_model=regr(), verbose=self.verbose,
+                    hist=hist, **kwargs
+                )
 
                 if self.verbose > 0:
                     print("\n Fitting boosted " + name + " model...")
-                model.fit(X_train, y_train)
+                model.fit(X_train, y_train.ravel())
 
                 self.models_[name] = model
                 y_pred = model.predict(X_test)
@@ -616,7 +632,7 @@
         """
 
         if len(self.models_.keys()) == 0:
-            self.fit(X_train, X_test, y_train, y_test)
+            self.fit(X_train, X_test, y_train.ravel(), y_test)
 
         return self.models_
 
@@ -630,6 +646,7 @@
         y_test,
         use_preprocessing=False,
         preprocessor=None,
+        hist=False,
         **kwargs
     ):
         """
@@ -638,9 +656,10 @@
         start = time.time()
 
         try:
-            model = GenericBoostingRegressor(
-                base_model=regr(), verbose=self.verbose, **kwargs
-            )
+            model = GenericBoostingRegressor(
+                base_model=regr(), verbose=self.verbose,
+                hist=hist, **kwargs
+            )
 
             if use_preprocessing and preprocessor is not None:
                 pipe = Pipeline(
@@ -655,7 +678,7 @@
                         + name
                        + " model with preprocessing..."
                     )
-                pipe.fit(X_train, y_train)
+                pipe.fit(X_train, y_train.ravel())
                 y_pred = pipe.predict(X_test)
                 fitted_model = pipe
             else:
@@ -666,7 +689,7 @@
                         + name
                         + " model without preprocessing..."
                     )
-                model.fit(X_train, y_train)
+                model.fit(X_train, y_train.ravel())
                 y_pred = model.predict(X_test)
                 fitted_model = model
diff --git a/mlsauce/utils/__init__.py b/mlsauce/utils/__init__.py
index 99dec6d..156e393 100644
--- a/mlsauce/utils/__init__.py
+++ b/mlsauce/utils/__init__.py
@@ -10,6 +10,7 @@
 )
 from .progress_bar import Progbar
 from .get_beta import get_beta
+from .histofeatures.gethistofeatures import get_histo_features
 
 __all__ = [
     "cluster",
@@ -22,4 +23,5 @@
     "get_beta",
     "check_and_install",
     "is_multitask_estimator",
+    "get_histo_features",
 ]
diff --git a/mlsauce/utils/histofeatures/__init__.py b/mlsauce/utils/histofeatures/__init__.py
new file mode 100644
index 0000000..a20796b
--- /dev/null
+++ b/mlsauce/utils/histofeatures/__init__.py
@@ -0,0 +1,3 @@
+from .gethistofeatures import get_histo_features
+
+__all__ = ['get_histo_features']
diff --git a/mlsauce/utils/histofeatures/gethistofeatures.py b/mlsauce/utils/histofeatures/gethistofeatures.py
new file mode 100644
index 0000000..70024b5
--- /dev/null
+++ b/mlsauce/utils/histofeatures/gethistofeatures.py
@@ -0,0 +1,105 @@
+import numpy as np
+import pandas as pd
+
+
+def create_histogram_with_bin_values(x, bins="auto"):
+    """
+    Computes a histogram for the input data and maps each bin index to its
+    bin range.
+
+    Args:
+        x (list or np.array): Input data.
+        bins (int or str): Number of bins (same as numpy.histogram).
+
+    Returns:
+        bin_value_dict (dict): A dictionary mapping bin indices to bin ranges
+            (tuples), with two unbounded bins added at the ends.
+    """
+    # Compute the histogram
+    _, bin_edges = np.histogram(x, bins=bins)
+
+    # Sentinel edges so that out-of-range test values still fall into a bin
+    bin_edges = np.concatenate([[-1e10], bin_edges, [1e10]]).ravel()
+
+    return {i: (bin_edges[i], bin_edges[i + 1]) for i in range(len(bin_edges) - 1)}
+
+
+def assign_values_to_input(x, bin_value_dict):
+    """
+    Assigns bin indices to a new input based on the provided bin ranges.
+
+    Args:
+        x (list or np.array): New input data to assign values to.
+        bin_value_dict (dict): Dictionary mapping bin indices to bin ranges (tuples).
+
+    Returns:
+        assigned_values (np.array): Assigned bin indices for the new input data.
+    """
+
+    # Integer and object columns are left unchanged (already discrete)
+    if np.issubdtype(x.dtype, np.integer) or np.issubdtype(x.dtype, np.object_):
+        return x
+
+    assigned_values = []
+
+    for value in x:
+        assigned = None
+        # Find the appropriate bin for each value
+        for i, elt in enumerate(bin_value_dict.items()):
+            if elt[1][0] < value <= elt[1][1]:
+                assigned = float(i)
+                break
+        assigned_values.append(assigned)
+
+    return np.asarray(assigned_values).ravel()
+
+
+def get_histo_features(X, bins="auto", bin_value_dict=None):
+    """
+    Computes histogram features for the input data.
+
+    Args:
+        X {np.array or pd.DataFrame}: Input data.
+        bins (int or str): Number of bins (same as numpy.histogram).
+        bin_value_dict (dict): Per-column bin dictionaries learned on the
+            training set; when None (training case), bins are computed on X.
+
+    Returns:
+        X_hist {np.array or pd.DataFrame}: Input data with histogram features,
+            plus the per-column bin dictionaries in the training case.
+    """
+
+    if bin_value_dict is None:  # training set case
+
+        bin_value_dicts = {}
+
+        if isinstance(X, pd.DataFrame):
+            colnames = X.columns
+            X = X.values
+            X_hist = pd.DataFrame(np.zeros(X.shape), columns=colnames)
+            for i, col in enumerate(colnames):
+                # One histogram per column; keep each column's bins
+                bin_value_dicts[i] = create_histogram_with_bin_values(X[:, i], bins=bins)
+                X_hist[col] = assign_values_to_input(X[:, i], bin_value_dicts[i])
+        else:
+            X_hist = np.zeros(X.shape)
+            for i in range(X.shape[1]):
+                bin_value_dicts[i] = create_histogram_with_bin_values(X[:, i], bins=bins)
+                X_hist[:, i] = assign_values_to_input(X[:, i], bin_value_dicts[i])
+
+        return X_hist, bin_value_dicts
+
+    else:  # test set case
+
+        if isinstance(X, pd.DataFrame):
+            colnames = X.columns
+            X = X.values
+            X_hist = pd.DataFrame(np.zeros(X.shape), columns=colnames)
+            for i, col in enumerate(colnames):
+                X_hist[col] = assign_values_to_input(X[:, i], bin_value_dict[i])
+        else:
+            X_hist = np.zeros(X.shape)
+            for i in range(X.shape[1]):
+                X_hist[:, i] = assign_values_to_input(X[:, i], bin_value_dict[i])
+
+        return X_hist
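The train/test contract of get_histo_features is: training returns the binned data plus one bin dictionary per column; passing those dictionaries back re-bins new data with the bins learned on the training set. A minimal sketch of that usage, as defined above:

    # Illustrative sketch, not part of the patch: train/test binning round trip.
    import numpy as np
    from mlsauce.utils import get_histo_features

    rng = np.random.default_rng(42)
    X_train = rng.normal(size=(100, 3))
    X_test = rng.normal(size=(20, 3))

    # Training: learn one bin dictionary per column and bin the data
    X_train_hist, bin_value_dicts = get_histo_features(X_train)
    # Test: reuse the training bins; sentinel edges catch out-of-range values
    X_test_hist = get_histo_features(X_test, bin_value_dict=bin_value_dicts)
    print(X_train_hist[:2])  # each entry is the index of the bin the value fell in
    print(X_test_hist.shape)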
+ """ + + if bin_value_dict is None: # training set case + + if isinstance(X, pd.DataFrame): + colnames = X.columns + X = X.values + X_hist = pd.DataFrame(np.zeros(X.shape), + columns=colnames) + for i, col in enumerate(colnames): + bin_value_dict = create_histogram_with_bin_values(X[:, i]) + X_hist[col] = assign_values_to_input(X[:, i], bin_value_dict) + else: + X_hist = np.zeros(X.shape) + for i in range(X.shape[1]): + bin_value_dict = create_histogram_with_bin_values(X[:, i]) + X_hist[:, i] = assign_values_to_input(X[:, i], bin_value_dict) + + return X_hist, bin_value_dict + + else: # test set case + + if isinstance(X, pd.DataFrame): + colnames = X.columns + X = X.values + X_hist = pd.DataFrame(np.zeros(X.shape), + columns=colnames) + for i, col in enumerate(colnames): + X_hist[col] = assign_values_to_input(X[:, i], bin_value_dict) + else: + X_hist = np.zeros(X.shape) + for i in range(X.shape[1]): + X_hist[:, i] = assign_values_to_input(X[:, i], bin_value_dict) + + return X_hist \ No newline at end of file diff --git a/setup.py b/setup.py index 188ab86..9b8871a 100644 --- a/setup.py +++ b/setup.py @@ -37,7 +37,7 @@ MAINTAINER_EMAIL = 'thierry.moudiki@gmail.com' LICENSE = 'BSD3 Clause Clear' -__version__ = '0.22.4' +__version__ = '0.23.0' VERSION = __version__