From 4d1c79d6baf98d9ec7ffab2e24e25e717cb67589 Mon Sep 17 00:00:00 2001 From: Thierry Moudiki Date: Mon, 2 Sep 2024 10:51:43 +0200 Subject: [PATCH] conformal GBDT v0.7.0 Pt.2 --- examples/conformal-classification.py | 59 ++++++++++ examples/conformal-regression.py | 108 +++++++++++++++++ examples/conformal.py | 59 ++++++++++ unifiedbooster.egg-info/PKG-INFO | 29 +++++ unifiedbooster.egg-info/SOURCES.txt | 27 +++++ unifiedbooster.egg-info/dependency_links.txt | 1 + unifiedbooster.egg-info/entry_points.txt | 2 + unifiedbooster.egg-info/not-zip-safe | 1 + unifiedbooster.egg-info/requires.txt | 8 ++ unifiedbooster.egg-info/top_level.txt | 1 + unifiedbooster/predictionset/__init__.py | 3 + unifiedbooster/predictionset/predictionset.py | 111 ++++++++++++++++++ 12 files changed, 409 insertions(+) create mode 100644 examples/conformal-classification.py create mode 100644 examples/conformal-regression.py create mode 100644 examples/conformal.py create mode 100644 unifiedbooster.egg-info/PKG-INFO create mode 100644 unifiedbooster.egg-info/SOURCES.txt create mode 100644 unifiedbooster.egg-info/dependency_links.txt create mode 100644 unifiedbooster.egg-info/entry_points.txt create mode 100644 unifiedbooster.egg-info/not-zip-safe create mode 100644 unifiedbooster.egg-info/requires.txt create mode 100644 unifiedbooster.egg-info/top_level.txt create mode 100644 unifiedbooster/predictionset/__init__.py create mode 100644 unifiedbooster/predictionset/predictionset.py diff --git a/examples/conformal-classification.py b/examples/conformal-classification.py new file mode 100644 index 0000000..a289139 --- /dev/null +++ b/examples/conformal-classification.py @@ -0,0 +1,59 @@ +import numpy as np +import os +import unifiedbooster as ub +from sklearn.datasets import load_iris, load_breast_cancer, load_wine +from sklearn.model_selection import train_test_split +from sklearn.metrics import mean_squared_error +from time import time + + +print(f"\n ----- Running: {os.path.basename(__file__)}... 
----- \n") + +load_datasets = [load_iris(), load_breast_cancer(), load_wine()] +dataset_names = ["Iris", "Breast Cancer", "Wine"] + +for i, dataset in enumerate(load_datasets): + + print(f"\n ----- Running: {dataset_names[i]} ----- \n") + X, y = dataset.data, dataset.target + + # Split dataset into training and testing sets + X_train, X_test, y_train, y_test = train_test_split( + X, y, test_size=0.2, random_state=42 + ) + + # Initialize the unified clf (example with XGBoost) + print("\n ---------- Initialize the unified clf (example with XGBoost)") + clf1 = ub.GBDTClassifier(model_type="xgboost", + level=95, + pi_method="tcp") + + # Fit the model + start = time() + clf1.fit(X_train, y_train) + print(f"Time taken: {time() - start} seconds") + # Predict with the model + y_pred1 = clf1.predict(X_test) + print(y_test) + print(y_pred1.argmax(axis=1)) + # Calculate accuracy + accuracy = (y_test == y_pred1.argmax(axis=1)).mean() + print(f"\nAccuracy: {accuracy:.4f}") + + print("\n ---------- Initialize the unified clf (example with LightGBM)") + clf2 = ub.GBDTClassifier(model_type="lightgbm", + level=95, + pi_method="icp") + # Fit the model + start = time() + clf2.fit(X_train, y_train) + print(f"Time taken: {time() - start} seconds") + # Predict with the model + y_pred2 = clf2.predict(X_test) + print(y_pred2) + + # Calculate accuracy + print(y_test) + print(y_pred2.argmax(axis=1)) + accuracy = (y_test == y_pred2.argmax(axis=1)).mean() + print(f"\nAccuracy: {accuracy:.4f}") \ No newline at end of file diff --git a/examples/conformal-regression.py b/examples/conformal-regression.py new file mode 100644 index 0000000..93f8dc7 --- /dev/null +++ b/examples/conformal-regression.py @@ -0,0 +1,108 @@ +import matplotlib.pyplot as plt +import numpy as np +import os +import unifiedbooster as ub +import warnings +from sklearn.datasets import load_diabetes, fetch_california_housing +from sklearn.model_selection import train_test_split +from time import time + + +print(f"\n ----- 
Running: {os.path.basename(__file__)}... ----- \n") + +load_datasets = [fetch_california_housing(), load_diabetes()] +dataset_names = ["California Housing", "Diabetes"] + +warnings.filterwarnings('ignore') + +split_color = 'green' +split_color2 = 'orange' +local_color = 'gray' + +def plot_func(x, + y, + y_u=None, + y_l=None, + pred=None, + shade_color="lightblue", + method_name="", + title=""): + + fig = plt.figure() + + plt.plot(x, y, 'k.', alpha=.3, markersize=10, + fillstyle='full', label=u'Test set observations') + + if (y_u is not None) and (y_l is not None): + plt.fill(np.concatenate([x, x[::-1]]), + np.concatenate([y_u, y_l[::-1]]), + alpha=.3, fc=shade_color, ec='None', + label = method_name + ' Prediction interval') + + if pred is not None: + plt.plot(x, pred, 'k--', lw=2, alpha=0.9, + label=u'Predicted value') + + #plt.ylim([-2.5, 7]) + plt.xlabel('$X$') + plt.ylabel('$Y$') + plt.legend(loc='upper right') + plt.title(title) + + plt.show() + +for i, dataset in enumerate(load_datasets): + + print(f"\n ----- Running: {dataset_names[i]} ----- \n") + X, y = dataset.data, dataset.target + + # Split dataset into training and testing sets + X_train, X_test, y_train, y_test = train_test_split( + X, y, test_size=0.2, random_state=42 + ) + + # Initialize the unified regr (example with XGBoost) + print("\n ---------- Initialize the unified regr (example with XGBoost)") + regr1 = ub.GBDTRegressor(model_type="xgboost", + level=95, + pi_method="splitconformal") + + # Fit the model + start = time() + regr1.fit(X_train, y_train) + print(f"Time taken: {time() - start} seconds") + # Predict with the model + y_pred1 = regr1.predict(X_test) + # Coverage error + coverage_error = (y_test >= y_pred1.lower) & (y_test <= y_pred1.upper) + print(f"Coverage rate: {coverage_error.mean():.4f}") + #x, + #y, + #y_u=None, + #y_l=None, + #pred=None, + plot_func(range(len(y_test))[0:30], y_test[0:30], + y_pred1.upper[0:30], y_pred1.lower[0:30], + y_pred1.mean[0:30], method_name="Split 
Conformal") + + print("\n ---------- Initialize the unified regr (example with LightGBM)") + regr2 = ub.GBDTRegressor(model_type="lightgbm", + level=95, + pi_method="localconformal") + # Fit the model + start = time() + regr2.fit(X_train, y_train) + print(f"Time taken: {time() - start} seconds") + # Predict with the model + y_pred2 = regr2.predict(X_test) + # Coverage error + coverage_error = (y_test >= y_pred2.lower) & (y_test <= y_pred2.upper) + print(f"Coverage rate: {coverage_error.mean():.4f}") + #x, + #y, + #y_u=None, + #y_l=None, + #pred=None, + plot_func(range(len(y_test))[0:30], y_test[0:30], + y_pred2.upper[0:30], y_pred2.lower[0:30], + y_pred2.mean[0:30], method_name="Local Conformal") \ No newline at end of file diff --git a/examples/conformal.py b/examples/conformal.py new file mode 100644 index 0000000..a289139 --- /dev/null +++ b/examples/conformal.py @@ -0,0 +1,59 @@ +import numpy as np +import os +import unifiedbooster as ub +from sklearn.datasets import load_iris, load_breast_cancer, load_wine +from sklearn.model_selection import train_test_split +from sklearn.metrics import mean_squared_error +from time import time + + +print(f"\n ----- Running: {os.path.basename(__file__)}... 
----- \n") + +load_datasets = [load_iris(), load_breast_cancer(), load_wine()] +dataset_names = ["Iris", "Breast Cancer", "Wine"] + +for i, dataset in enumerate(load_datasets): + + print(f"\n ----- Running: {dataset_names[i]} ----- \n") + X, y = dataset.data, dataset.target + + # Split dataset into training and testing sets + X_train, X_test, y_train, y_test = train_test_split( + X, y, test_size=0.2, random_state=42 + ) + + # Initialize the unified clf (example with XGBoost) + print("\n ---------- Initialize the unified clf (example with XGBoost)") + clf1 = ub.GBDTClassifier(model_type="xgboost", + level=95, + pi_method="tcp") + + # Fit the model + start = time() + clf1.fit(X_train, y_train) + print(f"Time taken: {time() - start} seconds") + # Predict with the model + y_pred1 = clf1.predict(X_test) + print(y_test) + print(y_pred1.argmax(axis=1)) + # Calculate accuracy + accuracy = (y_test == y_pred1.argmax(axis=1)).mean() + print(f"\nAccuracy: {accuracy:.4f}") + + print("\n ---------- Initialize the unified clf (example with LightGBM)") + clf2 = ub.GBDTClassifier(model_type="lightgbm", + level=95, + pi_method="icp") + # Fit the model + start = time() + clf2.fit(X_train, y_train) + print(f"Time taken: {time() - start} seconds") + # Predict with the model + y_pred2 = clf2.predict(X_test) + print(y_pred2) + + # Calculate accuracy + print(y_test) + print(y_pred2.argmax(axis=1)) + accuracy = (y_test == y_pred2.argmax(axis=1)).mean() + print(f"\nAccuracy: {accuracy:.4f}") \ No newline at end of file diff --git a/unifiedbooster.egg-info/PKG-INFO b/unifiedbooster.egg-info/PKG-INFO new file mode 100644 index 0000000..ffacaca --- /dev/null +++ b/unifiedbooster.egg-info/PKG-INFO @@ -0,0 +1,29 @@ +Metadata-Version: 2.1 +Name: unifiedbooster +Version: 0.7.0 +Summary: Unified interface for Gradient Boosted Decision Trees +Home-page: https://github.com/thierrymoudiki/unifiedbooster +Author: T. 
Moudiki +Author-email: thierry.moudiki@gmail.com +License: BSD license +Keywords: unifiedbooster +Classifier: Development Status :: 2 - Pre-Alpha +Classifier: Intended Audience :: Developers +Classifier: License :: OSI Approved :: BSD License +Classifier: Natural Language :: English +Classifier: Programming Language :: Python :: 3 +Classifier: Programming Language :: Python :: 3.6 +Classifier: Programming Language :: Python :: 3.7 +Classifier: Programming Language :: Python :: 3.8 +Requires-Python: >=3.6 +License-File: LICENSE +Requires-Dist: Cython +Requires-Dist: numpy +Requires-Dist: scikit-learn +Requires-Dist: xgboost +Requires-Dist: lightgbm +Requires-Dist: catboost +Requires-Dist: GPopt +Requires-Dist: nnetsauce + +Unified interface for Gradient Boosted Decision Trees diff --git a/unifiedbooster.egg-info/SOURCES.txt b/unifiedbooster.egg-info/SOURCES.txt new file mode 100644 index 0000000..b5dcee2 --- /dev/null +++ b/unifiedbooster.egg-info/SOURCES.txt @@ -0,0 +1,27 @@ +LICENSE +README.md +setup.py +unifiedbooster/__init__.py +unifiedbooster/gbdt.py +unifiedbooster/gbdt_classification.py +unifiedbooster/gbdt_regression.py +unifiedbooster/gpoptimization.py +unifiedbooster.egg-info/PKG-INFO +unifiedbooster.egg-info/SOURCES.txt +unifiedbooster.egg-info/dependency_links.txt +unifiedbooster.egg-info/entry_points.txt +unifiedbooster.egg-info/not-zip-safe +unifiedbooster.egg-info/requires.txt +unifiedbooster.egg-info/top_level.txt +unifiedbooster/nonconformist/__init__.py +unifiedbooster/nonconformist/acp.py +unifiedbooster/nonconformist/base.py +unifiedbooster/nonconformist/cp.py +unifiedbooster/nonconformist/evaluation.py +unifiedbooster/nonconformist/icp.py +unifiedbooster/nonconformist/nc.py +unifiedbooster/nonconformist/util.py +unifiedbooster/predictioninterval/__init__.py +unifiedbooster/predictioninterval/predictioninterval.py +unifiedbooster/predictionset/__init__.py +unifiedbooster/predictionset/predictionset.py \ No newline at end of file diff --git 
a/unifiedbooster.egg-info/dependency_links.txt b/unifiedbooster.egg-info/dependency_links.txt new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/unifiedbooster.egg-info/dependency_links.txt @@ -0,0 +1 @@ + diff --git a/unifiedbooster.egg-info/entry_points.txt b/unifiedbooster.egg-info/entry_points.txt new file mode 100644 index 0000000..3fc43f8 --- /dev/null +++ b/unifiedbooster.egg-info/entry_points.txt @@ -0,0 +1,2 @@ +[console_scripts] +unifiedbooster = unifiedbooster.cli:main diff --git a/unifiedbooster.egg-info/not-zip-safe b/unifiedbooster.egg-info/not-zip-safe new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/unifiedbooster.egg-info/not-zip-safe @@ -0,0 +1 @@ + diff --git a/unifiedbooster.egg-info/requires.txt b/unifiedbooster.egg-info/requires.txt new file mode 100644 index 0000000..0d8325e --- /dev/null +++ b/unifiedbooster.egg-info/requires.txt @@ -0,0 +1,8 @@ +Cython +numpy +scikit-learn +xgboost +lightgbm +catboost +GPopt +nnetsauce diff --git a/unifiedbooster.egg-info/top_level.txt b/unifiedbooster.egg-info/top_level.txt new file mode 100644 index 0000000..75a229a --- /dev/null +++ b/unifiedbooster.egg-info/top_level.txt @@ -0,0 +1 @@ +unifiedbooster diff --git a/unifiedbooster/predictionset/__init__.py b/unifiedbooster/predictionset/__init__.py new file mode 100644 index 0000000..e7f99a8 --- /dev/null +++ b/unifiedbooster/predictionset/__init__.py @@ -0,0 +1,3 @@ +from .predictionset import PredictionSet + +__all__ = ["PredictionSet"] diff --git a/unifiedbooster/predictionset/predictionset.py b/unifiedbooster/predictionset/predictionset.py new file mode 100644 index 0000000..0ba767d --- /dev/null +++ b/unifiedbooster/predictionset/predictionset.py @@ -0,0 +1,111 @@ +from locale import normalize +import numpy as np +import pickle +from collections import namedtuple +from sklearn.base import BaseEstimator, ClassifierMixin +from sklearn.model_selection import train_test_split +from sklearn.ensemble import ExtraTreesRegressor 
from sklearn.preprocessing import StandardScaler
from scipy.stats import gaussian_kde
from tqdm import tqdm
from ..nonconformist import ClassifierAdapter, IcpClassifier, TcpClassifier, ClassifierNc, MarginErrFunc


class PredictionSet(BaseEstimator, ClassifierMixin):
    """Class PredictionSet: Obtain prediction sets.

    Wraps a classifier in a conformal predictor built from
    `ClassifierNc(ClassifierAdapter(obj), MarginErrFunc())`.

    Attributes:

        obj: an object;
            classifier to be wrapped, containing methods `fit` and `predict`
            (it is handed to `ClassifierAdapter`; training happens in
            `PredictionSet.fit`)

        method: a string;
            method for constructing the prediction sets.
            Currently "icp" (default, inductive conformal) and "tcp" (transductive conformal)

        level: a float;
            Confidence level for prediction sets. Default is None,
            95 is equivalent to a miscoverage error of 5 (%).
            When None, `alpha_` is None and is forwarded as-is to the
            underlying conformal predictor.

        seed: an integer;
            Reproducibility of fit (there's a random split between fitting and calibration data,
            used by the "icp" method only)

    Raises:

        ValueError: if `method` is not one of ('icp', 'tcp').
    """

    def __init__(
        self,
        obj,
        method="icp",
        level=None,
        seed=123,
    ):

        self.obj = obj
        self.method = method
        self.level = level
        self.seed = seed
        # Always define `alpha_`: `predict` reads it unconditionally, so leaving
        # it unset for the default `level=None` made every `predict` call fail
        # with AttributeError.
        self.alpha_ = None if self.level is None else 1 - self.level / 100
        self.quantile_ = None
        self.icp_ = None
        self.tcp_ = None

        if self.method not in ("icp", "tcp"):
            raise ValueError(
                "`self.method` must be in ('icp', 'tcp')"
            )

        # Both methods share the same non-conformity scorer; build it once.
        nc_scorer = ClassifierNc(ClassifierAdapter(self.obj), MarginErrFunc())
        if self.method == "icp":
            self.icp_ = IcpClassifier(nc_scorer)
        else:
            self.tcp_ = TcpClassifier(nc_scorer)

    def _conformal_predictor(self):
        """Return the predictor matching `self.method`, or raise ValueError."""
        if self.method == "icp":
            return self.icp_
        if self.method == "tcp":
            return self.tcp_
        raise ValueError(
            "`self.method` must be in ('icp', 'tcp')"
        )

    def fit(self, X, y):
        """Fit the `method` to training data (X, y).

        For "icp", (X, y) is split 50/50 (seeded by `self.seed`): one half
        fits the predictor, the other half calibrates it. For "tcp", the
        predictor is fitted on all of (X, y).

        Args:

            X: array-like, shape = [n_samples, n_features];
                Training set vectors, where n_samples is the number
                of samples and n_features is the number of features.

            y: array-like, shape = [n_samples, ]; Target values.

        Returns:

            self

        Raises:

            ValueError: if `self.method` is not one of ('icp', 'tcp')
                (previously `fit` silently did nothing in that case).
        """
        predictor = self._conformal_predictor()

        if self.method == "icp":
            X_train, X_calibration, y_train, y_calibration = train_test_split(
                X, y, test_size=0.5, random_state=self.seed
            )
            predictor.fit(X_train, y_train)
            predictor.calibrate(X_calibration, y_calibration)
        else:
            predictor.fit(X, y)

        return self

    def predict(self, X):
        """Obtain predictions and prediction sets

        Args:

            X: array-like, shape = [n_samples, n_features];
                Testing set vectors, where n_samples is the number
                of samples and n_features is the number of features.

        Returns:

            Whatever the underlying conformal predictor's `predict` returns
            for `significance=self.alpha_`.
            NOTE(review): in upstream nonconformist this is a boolean
            class-membership matrix when a significance is given and the
            matrix of p-values when it is None -- confirm against the
            vendored `nonconformist` copy.

        Raises:

            ValueError: if `self.method` is not one of ('icp', 'tcp').
        """
        return self._conformal_predictor().predict(X, significance=self.alpha_)