From 4d1c79d6baf98d9ec7ffab2e24e25e717cb67589 Mon Sep 17 00:00:00 2001
From: Thierry Moudiki <thierry.moudiki@gmail.com>
Date: Mon, 2 Sep 2024 10:51:43 +0200
Subject: [PATCH] conformal GBDT v0.7.0 Pt.2

---
 examples/conformal-classification.py          |  59 ++++++++++
 examples/conformal-regression.py              | 108 +++++++++++++++++
 examples/conformal.py                         |  59 ++++++++++
 unifiedbooster.egg-info/PKG-INFO              |  29 +++++
 unifiedbooster.egg-info/SOURCES.txt           |  27 +++++
 unifiedbooster.egg-info/dependency_links.txt  |   1 +
 unifiedbooster.egg-info/entry_points.txt      |   2 +
 unifiedbooster.egg-info/not-zip-safe          |   1 +
 unifiedbooster.egg-info/requires.txt          |   8 ++
 unifiedbooster.egg-info/top_level.txt         |   1 +
 unifiedbooster/predictionset/__init__.py      |   3 +
 unifiedbooster/predictionset/predictionset.py | 111 ++++++++++++++++++
 12 files changed, 409 insertions(+)
 create mode 100644 examples/conformal-classification.py
 create mode 100644 examples/conformal-regression.py
 create mode 100644 examples/conformal.py
 create mode 100644 unifiedbooster.egg-info/PKG-INFO
 create mode 100644 unifiedbooster.egg-info/SOURCES.txt
 create mode 100644 unifiedbooster.egg-info/dependency_links.txt
 create mode 100644 unifiedbooster.egg-info/entry_points.txt
 create mode 100644 unifiedbooster.egg-info/not-zip-safe
 create mode 100644 unifiedbooster.egg-info/requires.txt
 create mode 100644 unifiedbooster.egg-info/top_level.txt
 create mode 100644 unifiedbooster/predictionset/__init__.py
 create mode 100644 unifiedbooster/predictionset/predictionset.py

diff --git a/examples/conformal-classification.py b/examples/conformal-classification.py
new file mode 100644
index 0000000..a289139
--- /dev/null
+++ b/examples/conformal-classification.py
@@ -0,0 +1,59 @@
+import numpy as np
+import os 
+import unifiedbooster as ub
+from sklearn.datasets import load_iris, load_breast_cancer, load_wine
+from sklearn.model_selection import train_test_split
+from sklearn.metrics import mean_squared_error
+from time import time
+
+
+print(f"\n ----- Running: {os.path.basename(__file__)}... ----- \n")
+
+load_datasets = [load_iris(), load_breast_cancer(), load_wine()]
+dataset_names = ["Iris", "Breast Cancer", "Wine"]
+
+for i, dataset in enumerate(load_datasets):
+    # For each dataset: split, then fit/predict/score an XGBoost and a LightGBM classifier
+    print(f"\n ----- Running: {dataset_names[i]} ----- \n")
+    X, y = dataset.data, dataset.target
+
+    # Hold out 20% of the samples for testing; random_state=42 fixes the split
+    X_train, X_test, y_train, y_test = train_test_split(
+        X, y, test_size=0.2, random_state=42
+    )
+
+    # First classifier: XGBoost backend with level=95 and pi_method="tcp"
+    print("\n ---------- Initialize the unified clf (example with XGBoost)")
+    clf1 = ub.GBDTClassifier(model_type="xgboost", 
+                            level=95, 
+                            pi_method="tcp")
+
+    # Fit on the training split and print the elapsed wall-clock time
+    start = time()
+    clf1.fit(X_train, y_train)
+    print(f"Time taken: {time() - start} seconds")
+    # Predict on the test split; print the true labels, then the argmax along axis 1
+    y_pred1 = clf1.predict(X_test)
+    print(y_test)
+    print(y_pred1.argmax(axis=1))
+    # Share of test rows whose argmax matches the true label (NOTE(review): assumes predict() returns one column per class -- confirm)
+    accuracy = (y_test == y_pred1.argmax(axis=1)).mean()
+    print(f"\nAccuracy: {accuracy:.4f}")
+
+    print("\n ---------- Initialize the unified clf (example with LightGBM)")
+    clf2 = ub.GBDTClassifier(model_type="lightgbm", 
+                            level=95, 
+                            pi_method="icp")
+    # Second classifier (LightGBM backend, pi_method="icp"): same fit-and-time step
+    start = time()
+    clf2.fit(X_train, y_train)
+    print(f"Time taken: {time() - start} seconds")
+    # Predict on the test split and print the raw prediction output
+    y_pred2 = clf2.predict(X_test)
+    print(y_pred2)
+
+    # Same argmax-based accuracy as above (NOTE(review): same assumption on the shape of predict()'s output)
+    print(y_test)
+    print(y_pred2.argmax(axis=1))
+    accuracy = (y_test == y_pred2.argmax(axis=1)).mean()
+    print(f"\nAccuracy: {accuracy:.4f}")
\ No newline at end of file
diff --git a/examples/conformal-regression.py b/examples/conformal-regression.py
new file mode 100644
index 0000000..93f8dc7
--- /dev/null
+++ b/examples/conformal-regression.py
@@ -0,0 +1,108 @@
+import matplotlib.pyplot as plt
+import numpy as np
+import os 
+import unifiedbooster as ub
+import warnings
+from sklearn.datasets import load_diabetes, fetch_california_housing
+from sklearn.model_selection import train_test_split
+from time import time
+
+
+print(f"\n ----- Running: {os.path.basename(__file__)}... ----- \n")
+
+load_datasets = [fetch_california_housing(), load_diabetes()]
+dataset_names = ["California Housing", "Diabetes"]
+# Suppress every warning for the remainder of the script
+warnings.filterwarnings('ignore')
+# Plot colors; not referenced anywhere else in this script
+split_color = 'green'
+split_color2 = 'orange'
+local_color = 'gray'
+
+def plot_func(x,
+              y,
+              y_u=None,
+              y_l=None,
+              pred=None,
+              shade_color="lightblue",
+              method_name="",
+              title=""):
+    """Plot `y` against `x`, optionally with a shaded band between `y_l` and `y_u` and a `pred` line, then show the figure."""
+    fig = plt.figure()
+    # Observations as semi-transparent black dots
+    plt.plot(x, y, 'k.', alpha=.3, markersize=10,
+             fillstyle='full', label=u'Test set observations')
+    # Band drawn only when both bounds are given: the polygon runs along y_u, then back along reversed y_l
+    if (y_u is not None) and (y_l is not None):
+        plt.fill(np.concatenate([x, x[::-1]]),
+                 np.concatenate([y_u, y_l[::-1]]),
+                 alpha=.3, fc=shade_color, ec='None',
+                 label = method_name + ' Prediction interval')
+    # Predictions, when provided, as a dashed black line
+    if pred is not None:
+        plt.plot(x, pred, 'k--', lw=2, alpha=0.9,
+                 label=u'Predicted value')
+
+    # A fixed y-range (plt.ylim([-2.5, 7])) was disabled here; axis limits are left to matplotlib
+    plt.xlabel('$X$')
+    plt.ylabel('$Y$')
+    plt.legend(loc='upper right')
+    plt.title(title)
+
+    plt.show()
+
+for i, dataset in enumerate(load_datasets):
+    # For each dataset: split, then fit, check interval coverage and plot for XGBoost and LightGBM
+    print(f"\n ----- Running: {dataset_names[i]} ----- \n")
+    X, y = dataset.data, dataset.target
+
+    # Hold out 20% of the samples for testing; random_state=42 fixes the split
+    X_train, X_test, y_train, y_test = train_test_split(
+        X, y, test_size=0.2, random_state=42
+    )
+
+    # First regressor: XGBoost backend with level=95 and pi_method="splitconformal"
+    print("\n ---------- Initialize the unified regr (example with XGBoost)")
+    regr1 = ub.GBDTRegressor(model_type="xgboost", 
+                            level=95, 
+                            pi_method="splitconformal")
+
+    # Fit on the training split and print the elapsed wall-clock time
+    start = time()
+    regr1.fit(X_train, y_train)
+    print(f"Time taken: {time() - start} seconds")
+    # Predict on the test split; the result is read below through its .lower, .upper and .mean fields
+    y_pred1 = regr1.predict(X_test)
+    # Despite its name, `coverage_error` flags the test points lying inside [lower, upper]; its mean is the coverage rate
+    coverage_error = (y_test >= y_pred1.lower) & (y_test <= y_pred1.upper)
+    print(f"Coverage rate: {coverage_error.mean():.4f}")
+    # Plot the first 30 test observations; positional arguments follow plot_func's order:
+    #   x    = observation index, y = observed target,
+    #   y_u  = upper bound of the interval,
+    #   y_l  = lower bound of the interval,
+    #   pred = the .mean field of the prediction (presumably the point forecast -- confirm)
+    plot_func(range(len(y_test))[0:30], y_test[0:30],
+              y_pred1.upper[0:30], y_pred1.lower[0:30], 
+              y_pred1.mean[0:30], method_name="Split Conformal")
+    
+    print("\n ---------- Initialize the unified regr (example with LightGBM)")
+    regr2 = ub.GBDTRegressor(model_type="lightgbm", 
+                            level=95, 
+                            pi_method="localconformal")
+    # Second regressor (LightGBM backend, pi_method="localconformal"): same fit-and-time step
+    start = time()
+    regr2.fit(X_train, y_train)
+    print(f"Time taken: {time() - start} seconds")
+    # Predict on the test split; the result is read below through its .lower, .upper and .mean fields
+    y_pred2 = regr2.predict(X_test)
+    # As above: `coverage_error` holds per-point "inside the interval" flags, not an error
+    coverage_error = (y_test >= y_pred2.lower) & (y_test <= y_pred2.upper)
+    print(f"Coverage rate: {coverage_error.mean():.4f}")
+    # Plot the first 30 test observations; positional arguments follow plot_func's order:
+    #   x    = observation index, y = observed target,
+    #   y_u  = upper bound of the interval,
+    #   y_l  = lower bound of the interval,
+    #   pred = the .mean field of the prediction (presumably the point forecast -- confirm)
+    plot_func(range(len(y_test))[0:30], y_test[0:30], 
+              y_pred2.upper[0:30], y_pred2.lower[0:30], 
+              y_pred2.mean[0:30], method_name="Local Conformal")
\ No newline at end of file
diff --git a/examples/conformal.py b/examples/conformal.py
new file mode 100644
index 0000000..a289139
--- /dev/null
+++ b/examples/conformal.py
@@ -0,0 +1,59 @@
+import numpy as np
+import os 
+import unifiedbooster as ub
+from sklearn.datasets import load_iris, load_breast_cancer, load_wine
+from sklearn.model_selection import train_test_split
+from sklearn.metrics import mean_squared_error
+from time import time
+
+
+print(f"\n ----- Running: {os.path.basename(__file__)}... ----- \n")
+
+load_datasets = [load_iris(), load_breast_cancer(), load_wine()]
+dataset_names = ["Iris", "Breast Cancer", "Wine"]
+
+for i, dataset in enumerate(load_datasets):
+    # For each dataset: split, then fit/predict/score an XGBoost and a LightGBM classifier
+    print(f"\n ----- Running: {dataset_names[i]} ----- \n")
+    X, y = dataset.data, dataset.target
+
+    # Hold out 20% of the samples for testing; random_state=42 fixes the split
+    X_train, X_test, y_train, y_test = train_test_split(
+        X, y, test_size=0.2, random_state=42
+    )
+
+    # First classifier: XGBoost backend with level=95 and pi_method="tcp"
+    print("\n ---------- Initialize the unified clf (example with XGBoost)")
+    clf1 = ub.GBDTClassifier(model_type="xgboost", 
+                            level=95, 
+                            pi_method="tcp")
+
+    # Fit on the training split and print the elapsed wall-clock time
+    start = time()
+    clf1.fit(X_train, y_train)
+    print(f"Time taken: {time() - start} seconds")
+    # Predict on the test split; print the true labels, then the argmax along axis 1
+    y_pred1 = clf1.predict(X_test)
+    print(y_test)
+    print(y_pred1.argmax(axis=1))
+    # Share of test rows whose argmax matches the true label (NOTE(review): assumes predict() returns one column per class -- confirm)
+    accuracy = (y_test == y_pred1.argmax(axis=1)).mean()
+    print(f"\nAccuracy: {accuracy:.4f}")
+
+    print("\n ---------- Initialize the unified clf (example with LightGBM)")
+    clf2 = ub.GBDTClassifier(model_type="lightgbm", 
+                            level=95, 
+                            pi_method="icp")
+    # Second classifier (LightGBM backend, pi_method="icp"): same fit-and-time step
+    start = time()
+    clf2.fit(X_train, y_train)
+    print(f"Time taken: {time() - start} seconds")
+    # Predict on the test split and print the raw prediction output
+    y_pred2 = clf2.predict(X_test)
+    print(y_pred2)
+
+    # Same argmax-based accuracy as above (NOTE(review): same assumption on the shape of predict()'s output)
+    print(y_test)
+    print(y_pred2.argmax(axis=1))
+    accuracy = (y_test == y_pred2.argmax(axis=1)).mean()
+    print(f"\nAccuracy: {accuracy:.4f}")
\ No newline at end of file
diff --git a/unifiedbooster.egg-info/PKG-INFO b/unifiedbooster.egg-info/PKG-INFO
new file mode 100644
index 0000000..ffacaca
--- /dev/null
+++ b/unifiedbooster.egg-info/PKG-INFO
@@ -0,0 +1,29 @@
+Metadata-Version: 2.1
+Name: unifiedbooster
+Version: 0.7.0
+Summary: Unified interface for Gradient Boosted Decision Trees
+Home-page: https://github.com/thierrymoudiki/unifiedbooster
+Author: T. Moudiki
+Author-email: thierry.moudiki@gmail.com
+License: BSD license
+Keywords: unifiedbooster
+Classifier: Development Status :: 2 - Pre-Alpha
+Classifier: Intended Audience :: Developers
+Classifier: License :: OSI Approved :: BSD License
+Classifier: Natural Language :: English
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.6
+Classifier: Programming Language :: Python :: 3.7
+Classifier: Programming Language :: Python :: 3.8
+Requires-Python: >=3.6
+License-File: LICENSE
+Requires-Dist: Cython
+Requires-Dist: numpy
+Requires-Dist: scikit-learn
+Requires-Dist: xgboost
+Requires-Dist: lightgbm
+Requires-Dist: catboost
+Requires-Dist: GPopt
+Requires-Dist: nnetsauce
+
+Unified interface for Gradient Boosted Decision Trees
diff --git a/unifiedbooster.egg-info/SOURCES.txt b/unifiedbooster.egg-info/SOURCES.txt
new file mode 100644
index 0000000..b5dcee2
--- /dev/null
+++ b/unifiedbooster.egg-info/SOURCES.txt
@@ -0,0 +1,27 @@
+LICENSE
+README.md
+setup.py
+unifiedbooster/__init__.py
+unifiedbooster/gbdt.py
+unifiedbooster/gbdt_classification.py
+unifiedbooster/gbdt_regression.py
+unifiedbooster/gpoptimization.py
+unifiedbooster.egg-info/PKG-INFO
+unifiedbooster.egg-info/SOURCES.txt
+unifiedbooster.egg-info/dependency_links.txt
+unifiedbooster.egg-info/entry_points.txt
+unifiedbooster.egg-info/not-zip-safe
+unifiedbooster.egg-info/requires.txt
+unifiedbooster.egg-info/top_level.txt
+unifiedbooster/nonconformist/__init__.py
+unifiedbooster/nonconformist/acp.py
+unifiedbooster/nonconformist/base.py
+unifiedbooster/nonconformist/cp.py
+unifiedbooster/nonconformist/evaluation.py
+unifiedbooster/nonconformist/icp.py
+unifiedbooster/nonconformist/nc.py
+unifiedbooster/nonconformist/util.py
+unifiedbooster/predictioninterval/__init__.py
+unifiedbooster/predictioninterval/predictioninterval.py
+unifiedbooster/predictionset/__init__.py
+unifiedbooster/predictionset/predictionset.py
\ No newline at end of file
diff --git a/unifiedbooster.egg-info/dependency_links.txt b/unifiedbooster.egg-info/dependency_links.txt
new file mode 100644
index 0000000..8b13789
--- /dev/null
+++ b/unifiedbooster.egg-info/dependency_links.txt
@@ -0,0 +1 @@
+
diff --git a/unifiedbooster.egg-info/entry_points.txt b/unifiedbooster.egg-info/entry_points.txt
new file mode 100644
index 0000000..3fc43f8
--- /dev/null
+++ b/unifiedbooster.egg-info/entry_points.txt
@@ -0,0 +1,2 @@
+[console_scripts]
+unifiedbooster = unifiedbooster.cli:main
diff --git a/unifiedbooster.egg-info/not-zip-safe b/unifiedbooster.egg-info/not-zip-safe
new file mode 100644
index 0000000..8b13789
--- /dev/null
+++ b/unifiedbooster.egg-info/not-zip-safe
@@ -0,0 +1 @@
+
diff --git a/unifiedbooster.egg-info/requires.txt b/unifiedbooster.egg-info/requires.txt
new file mode 100644
index 0000000..0d8325e
--- /dev/null
+++ b/unifiedbooster.egg-info/requires.txt
@@ -0,0 +1,8 @@
+Cython
+numpy
+scikit-learn
+xgboost
+lightgbm
+catboost
+GPopt
+nnetsauce
diff --git a/unifiedbooster.egg-info/top_level.txt b/unifiedbooster.egg-info/top_level.txt
new file mode 100644
index 0000000..75a229a
--- /dev/null
+++ b/unifiedbooster.egg-info/top_level.txt
@@ -0,0 +1 @@
+unifiedbooster
diff --git a/unifiedbooster/predictionset/__init__.py b/unifiedbooster/predictionset/__init__.py
new file mode 100644
index 0000000..e7f99a8
--- /dev/null
+++ b/unifiedbooster/predictionset/__init__.py
@@ -0,0 +1,3 @@
+from .predictionset import PredictionSet
+
+__all__ = ["PredictionSet"]
diff --git a/unifiedbooster/predictionset/predictionset.py b/unifiedbooster/predictionset/predictionset.py
new file mode 100644
index 0000000..0ba767d
--- /dev/null
+++ b/unifiedbooster/predictionset/predictionset.py
@@ -0,0 +1,111 @@
+from locale import normalize
+import numpy as np
+import pickle
+from collections import namedtuple
+from sklearn.base import BaseEstimator, ClassifierMixin
+from sklearn.model_selection import train_test_split
+from sklearn.ensemble import ExtraTreesRegressor
+from sklearn.preprocessing import StandardScaler
+from scipy.stats import gaussian_kde
+from tqdm import tqdm
+from ..nonconformist import ClassifierAdapter, IcpClassifier, TcpClassifier, ClassifierNc, MarginErrFunc
+
+
+class PredictionSet(BaseEstimator, ClassifierMixin):
+    """Class PredictionSet: Obtain prediction sets.
+
+    Attributes:
+
+        obj: an object;
+            fitted object containing methods `fit` and `predict`
+
+        method: a string;
+            method for constructing the prediction sets.
+            Currently "icp" (default, inductive conformal) and "tcp" (transductive conformal)
+
+        level: a float;
+            Confidence level for prediction sets, in percent (95 is equivalent
+            to a miscoverage error of 5%). Default is None: `predict` then
+            forwards `significance=None` to the conformal classifier
+
+        seed: an integer;
+            Reproducibility of fit (there's a random split between fitting and calibration data)
+    """
+
+    def __init__(
+        self,
+        obj,
+        method="icp",
+        level=None,
+        seed=123,
+    ):
+        self.obj = obj
+        self.method = method
+        self.level = level
+        self.seed = seed
+        # `alpha_` must exist even for the default level=None; leaving it
+        # unset made `predict` fail with AttributeError.
+        self.alpha_ = None if level is None else 1 - level / 100
+        self.quantile_ = None
+        self.icp_ = None
+        self.tcp_ = None
+
+        if self.method == "icp":
+            self.icp_ = IcpClassifier(
+                ClassifierNc(ClassifierAdapter(self.obj), MarginErrFunc()),
+            )
+        elif self.method == "tcp":
+            self.tcp_ = TcpClassifier(
+                ClassifierNc(ClassifierAdapter(self.obj), MarginErrFunc()),
+            )
+        else:
+            raise ValueError("`self.method` must be in ('icp', 'tcp')")
+
+    def fit(self, X, y):
+        """Fit the `method` to training data (X, y).
+
+        Args:
+
+            X: array-like, shape = [n_samples, n_features];
+                Training set vectors, where n_samples is the number
+                of samples and n_features is the number of features.
+
+            y: array-like, shape = [n_samples, ]; Target values.
+
+        Returns:
+
+            self: the fitted object
+
+        """
+        if self.method == "icp":
+            # Half of the data fits the model, the other half calibrates it.
+            X_train, X_calibration, y_train, y_calibration = train_test_split(
+                X, y, test_size=0.5, random_state=self.seed
+            )
+            self.icp_.fit(X_train, y_train)
+            self.icp_.calibrate(X_calibration, y_calibration)
+        elif self.method == "tcp":
+            self.tcp_.fit(X, y)
+
+        return self
+
+    def predict(self, X):
+        """Obtain predictions and prediction sets
+
+        Args:
+
+            X: array-like, shape = [n_samples, n_features];
+                Testing set vectors, where n_samples is the number
+                of samples and n_features is the number of features.
+
+        Returns:
+
+            The output of the conformal classifier's `predict`, called
+            with `significance=self.alpha_` (None when `level` is None).
+
+        """
+        if self.method == "icp":
+            return self.icp_.predict(X, significance=self.alpha_)
+        if self.method == "tcp":
+            return self.tcp_.predict(X, significance=self.alpha_)
+        raise ValueError("`self.method` must be in ('icp', 'tcp')")
\ No newline at end of file