diff --git a/CHANGES.md b/CHANGES.md
index 1eab268..f57a005 100644
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -1,4 +1,4 @@
-# version 0.15.0
+# version 0.15.1
 
 - Can use pandas DataFrame in `LSBoostRegressor`, `LSBoostClassifier` and `AdaOpt`
diff --git a/mlsauce/__init__.py b/mlsauce/__init__.py
index 350d908..c2efe87 100644
--- a/mlsauce/__init__.py
+++ b/mlsauce/__init__.py
@@ -23,7 +23,7 @@
 # Dev branch marker is: 'X.Y.dev' or 'X.Y.devN' where N is an integer.
 # 'X.Y.dev0' is the canonical version of 'X.Y.dev'
 #
-__version__ = "0.10.0"
+# __version__ = "0.10.0"
 
 # On OSX, we can get a runtime error due to multiple OpenMP libraries loaded
diff --git a/mlsauce/adaopt/_adaopt.py b/mlsauce/adaopt/_adaopt.py
index 5919f9e..87ea295 100644
--- a/mlsauce/adaopt/_adaopt.py
+++ b/mlsauce/adaopt/_adaopt.py
@@ -9,13 +9,12 @@ from tqdm import tqdm
 from ..utils import subsample
 from ..utils import cluster
+from ..utils import cythonize_file
 
 try:
     from . import _adaoptc as adaoptc
 except ImportError:
-    import pyximport
-
-    pyximport.install()
+    cythonize_file("_adaoptc.pyx")
     import _adaoptc
 
@@ -203,8 +202,8 @@ def fit(self, X, y, **kwargs):
         n_classes = len(np.unique(y_))
 
         assert n == len(y_), "must have X.shape[0] == len(y)"
-
-        try:
+
+        try:
 
             res = adaoptc.fit_adaopt(
                 X=np.asarray(X_).astype(np.float64),
@@ -220,7 +219,7 @@ def fit(self, X, y, **kwargs):
                 gamma=self.gamma,
                 tolerance=self.tolerance,
             )
-
+
         except ValueError:
 
             res = _adaoptc.fit_adaopt(
@@ -306,7 +305,7 @@ def predict_proba(self, X, **kwargs):
         n_test = X.shape[0]
 
         if self.n_jobs is None:
-            try:
+            try:
                 return adaoptc.predict_proba_adaopt(
                     X_test=np.asarray(X, order="C").astype(np.float64),
                     scaled_X_train=np.asarray(
@@ -348,7 +347,7 @@ def predict_proba(self, X, **kwargs):
 
             scaled_X_test = X / norm(X, ord=2, axis=1)[:, None]
 
-            try:
+            try:
 
                 if self.type_dist == "euclidean":
@@ -538,7 +537,6 @@ def multiproc_func(i, *args):
                     probs=probs_test_i, weights=weights_test_i
                 )
 
-
             if self.verbose == 1:
                 res = Parallel(n_jobs=self.n_jobs, prefer="threads")(
                     (multiproc_func)(m) for m in tqdm(range(n_test))
diff --git a/mlsauce/booster/_booster_classifier.py b/mlsauce/booster/_booster_classifier.py
index 600b9b1..aa791ce 100644
--- a/mlsauce/booster/_booster_classifier.py
+++ b/mlsauce/booster/_booster_classifier.py
@@ -1,9 +1,4 @@
-try:
-    from . import _boosterc as boosterc
-except ImportError:
-    import pyximport
-    pyximport.install()
-    import _boosterc
+
 import numpy as np
 import pandas as pd
 import platform
@@ -11,9 +6,13 @@ from sklearn.preprocessing import PolynomialFeatures
 from sklearn.base import BaseEstimator
 from sklearn.base import ClassifierMixin
-
 from ..utils import cluster
-
+from ..utils import cythonize_file
+try:
+    from . import _boosterc as boosterc
+except ModuleNotFoundError:
+    cythonize_file("_boosterc.pyx")
+    import _boosterc
 
 
 class LSBoostClassifier(BaseEstimator, ClassifierMixin):
     """LSBoost classifier.
@@ -192,7 +191,7 @@ def fit(self, X, y, **kwargs):
             )
             X = np.column_stack((X, clustered_X))
 
-        try:
+        try:
             self.obj = boosterc.fit_booster_classifier(
                 np.asarray(X, order="C"),
                 np.asarray(y, order="C"),
@@ -291,7 +290,7 @@ def predict_proba(self, X, **kwargs):
                     ),
                 )
             )
-        try:
+        try:
             return boosterc.predict_proba_booster_classifier(
                 self.obj, np.asarray(X, order="C")
             )
diff --git a/mlsauce/booster/_booster_regressor.py b/mlsauce/booster/_booster_regressor.py
index b0b8d65..73a6ab8 100644
--- a/mlsauce/booster/_booster_regressor.py
+++ b/mlsauce/booster/_booster_regressor.py
@@ -1,10 +1,3 @@
-try:
-    from . import _boosterc as boosterc
-except ImportError:
-    import pyximport
-
-    pyximport.install()
-    import _boosterc
 import numpy as np
 import pandas as pd
 import platform
@@ -14,8 +7,12 @@ from sklearn.preprocessing import PolynomialFeatures
 from . import _boosterc as boosterc
 from ..predictioninterval import PredictionInterval
-from ..utils import cluster
-
+from ..utils import cluster, cythonize_file
+try:
+    from . import _boosterc as boosterc
+except ModuleNotFoundError:
+    cythonize_file("_boosterc.pyx")
+    import _boosterc
 
 
 class LSBoostRegressor(BaseEstimator, RegressorMixin):
     """LSBoost regressor.
@@ -210,7 +207,7 @@ def fit(self, X, y, **kwargs):
             )
             X = np.column_stack((X, clustered_X))
 
-        try:
+        try:
             self.obj = boosterc.fit_booster_regressor(
                 X=np.asarray(X, order="C"),
                 y=np.asarray(y, order="C"),
@@ -320,7 +317,7 @@ def predict(self, X, level=95, method=None, **kwargs):
             preds = self.pi.predict(X, return_pi=True)
             return preds
 
-        try:
+        try:
             return boosterc.predict_booster_regressor(
                 self.obj, np.asarray(X, order="C")
             )
diff --git a/mlsauce/lasso/__init__.py b/mlsauce/lasso/__init__.py
index fd2e799..acb895a 100644
--- a/mlsauce/lasso/__init__.py
+++ b/mlsauce/lasso/__init__.py
@@ -1,6 +1,6 @@
 try:
     from ._lasso import LassoRegressor
-except ImportError:
+except ModuleNotFoundError:
     pass
 
 __all__ = ["LassoRegressor"]
diff --git a/mlsauce/lasso/_lasso.py b/mlsauce/lasso/_lasso.py
index 76948e9..5efd195 100644
--- a/mlsauce/lasso/_lasso.py
+++ b/mlsauce/lasso/_lasso.py
@@ -5,21 +5,19 @@ from sklearn.base import BaseEstimator
 from sklearn.base import RegressorMixin
 from numpy.linalg import inv
-
-try:
-    from . import _lassoc as mo
-except ImportError:
-    import pyximport
-
-    pyximport.install()
-    import _lassoc
-from ..utils import get_beta
+from ..utils import get_beta, cythonize_file
 
 if platform.system() in ("Linux", "Darwin"):
     import jax.numpy as jnp
     from jax import device_put
     from jax.numpy.linalg import inv as jinv
 
+try:
+    from . import _lassoc as mo
+except ModuleNotFoundError:
+    cythonize_file("_lassoc.pyx")
+    import _lassoc
+
 
 class LassoRegressor(BaseEstimator, RegressorMixin):
     """Lasso.
@@ -79,7 +77,7 @@ def fit(self, X, y, **kwargs):
             self: object.
         """
 
-        try:
+        try:
 
             self.ym, centered_y = mo.center_response(y)
             self.xm = X.mean(axis=0)
@@ -141,7 +139,7 @@ def fit(self, X, y, **kwargs):
                 )
                 self.beta = res[0]
                 return self
-
+
         except ValueError:
 
             self.ym, centered_y = _lassoc.center_response(y)
@@ -205,8 +203,6 @@ def fit(self, X, y, **kwargs):
                 self.beta = res[0]
                 return self
 
-
-
     def predict(self, X, **kwargs):
         """Predict test data X.
diff --git a/mlsauce/ridge/__init__.py b/mlsauce/ridge/__init__.py
index 7b0a897..661c375 100644
--- a/mlsauce/ridge/__init__.py
+++ b/mlsauce/ridge/__init__.py
@@ -1,6 +1,6 @@
 try:
     from ._ridge import RidgeRegressor
-except ImportError:
+except ModuleNotFoundError:
     pass
 
 __all__ = ["RidgeRegressor"]
diff --git a/mlsauce/ridge/_ridge.py b/mlsauce/ridge/_ridge.py
index aa944ed..d35d456 100644
--- a/mlsauce/ridge/_ridge.py
+++ b/mlsauce/ridge/_ridge.py
@@ -5,20 +5,19 @@ from sklearn.base import RegressorMixin
 from numpy.linalg import inv
 
-try:
-    from . import _ridgec as mo
-except ImportError:
-    import pyximport
-
-    pyximport.install()
-    import _ridgec
-from ..utils import get_beta
+from ..utils import get_beta, cythonize_file
 
 if platform.system() in ("Linux", "Darwin"):
     import jax.numpy as jnp
     from jax import device_put
     from jax.numpy.linalg import inv as jinv
 
+try:
+    from . import _ridgec as mo
+except ModuleNotFoundError:
+    cythonize_file("_ridgec.pyx")
+    import _ridgec
+
 
 class RidgeRegressor(BaseEstimator, RegressorMixin):
     """Ridge.
@@ -70,7 +69,7 @@ def fit(self, X, y, **kwargs):
             self: object.
         """
 
-        try:
+        try:
             self.ym, centered_y = mo.center_response(y)
         except ValueError:
             self.ym, centered_y = _ridgec.center_response(y)
@@ -99,21 +98,25 @@ def fit(self, X, y, **kwargs):
                 # self.beta, _, _, _ = np.linalg.lstsq(X_, y_, rcond=None)
                 self.beta = get_beta(X_, y_)
             except Exception:
-                try:
+                try:
                     x = inv(
-                        mo.crossprod(X_) + self.reg_lambda * np.eye(X_.shape[1])
+                        mo.crossprod(X_)
+                        + self.reg_lambda * np.eye(X_.shape[1])
                     )
                     hat_matrix = mo.tcrossprod(x, X_)
                     self.beta = mo.safe_sparse_dot(hat_matrix, centered_y)
                 except ValueError:
                     x = inv(
-                        _ridgec.crossprod(X_) + self.reg_lambda * np.eye(X_.shape[1])
+                        _ridgec.crossprod(X_)
+                        + self.reg_lambda * np.eye(X_.shape[1])
                     )
                     hat_matrix = _ridgec.tcrossprod(x, X_)
-                    self.beta = _ridgec.safe_sparse_dot(hat_matrix, centered_y)
+                    self.beta = _ridgec.safe_sparse_dot(
+                        hat_matrix, centered_y
+                    )
             return self
 
-        try:
+        try:
             x = jinv(
                 mo.crossprod(X_, backend=self.backend)
                 + self.reg_lambda * jnp.eye(X_.shape[1])
             )
@@ -151,7 +154,7 @@ def predict(self, X, **kwargs):
         """
         X_ = (X - self.xm[None, :]) / self.xsd[None, :]
 
-        try:
+        try:
             if self.backend == "cpu":
                 if isinstance(self.ym, float):
                     return self.ym + mo.safe_sparse_dot(X_, self.beta)
@@ -170,11 +173,12 @@ def predict(self, X, **kwargs):
                 if isinstance(self.ym, float):
                     return self.ym + _ridgec.safe_sparse_dot(X_, self.beta)
                 return self.ym[None, :] + _ridgec.safe_sparse_dot(X_, self.beta)
-
+
             # if self.backend in ("gpu", "tpu"):
             if isinstance(self.ym, float):
-                return self.ym + _ridgec.safe_sparse_dot(X_, self.beta, backend=self.backend)
-            return self.ym[None, :] + _ridgec.safe_sparse_dot(X_, self.beta, backend=self.backend)
-
-
-
+                return self.ym + _ridgec.safe_sparse_dot(
+                    X_, self.beta, backend=self.backend
+                )
+            return self.ym[None, :] + _ridgec.safe_sparse_dot(
+                X_, self.beta, backend=self.backend
+            )
diff --git a/mlsauce/stump/__init__.py b/mlsauce/stump/__init__.py
index 64ee412..b312bc1 100644
--- a/mlsauce/stump/__init__.py
+++ b/mlsauce/stump/__init__.py
@@ -1,6 +1,6 @@
 try:
     from ._stump_classifier import StumpClassifier
-except ImportError:
+except ModuleNotFoundError:
     pass
 
 __all__ = ["StumpClassifier"]
diff --git a/mlsauce/stump/_stump_classifier.py b/mlsauce/stump/_stump_classifier.py
index c314e27..6fcfbd1 100644
--- a/mlsauce/stump/_stump_classifier.py
+++ b/mlsauce/stump/_stump_classifier.py
@@ -1,13 +1,12 @@
 import numpy as np
 from sklearn.base import BaseEstimator
 from sklearn.base import ClassifierMixin
+from ..utils import cythonize_file
 
 try:
     from . import _stumpc as stumpc
-except ImportError:
-    import pyximport
-
-    pyximport.install()
+except ModuleNotFoundError:
+    cythonize_file("_stumpc.pyx")
     import _stumpc
 
@@ -45,7 +44,7 @@ def fit(self, X, y, sample_weight=None, **kwargs):
         """
 
         if sample_weight is None:
-            try:
+            try:
                 self.obj = stumpc.fit_stump_classifier(
                     X=np.asarray(X, order="C"),
                     y=np.asarray(y, order="C"),
@@ -60,7 +59,7 @@
 
             return self
 
-        try:
+        try:
             self.obj = stumpc.fit_stump_classifier(
                 X=np.asarray(X, order="C"),
                 y=np.asarray(y, order="C"),
@@ -112,7 +111,7 @@ def predict_proba(self, X, **kwargs):
             probability estimates for test data: {array-like}
         """
 
-        try:
+        try:
             return stumpc.predict_proba_stump_classifier(
                 self.obj, np.asarray(X, order="C")
             )
diff --git a/mlsauce/utils/__init__.py b/mlsauce/utils/__init__.py
index 4f57f90..61ac04a 100644
--- a/mlsauce/utils/__init__.py
+++ b/mlsauce/utils/__init__.py
@@ -1,10 +1,11 @@
 from .sampling.rowsubsampling import subsample
-from .misc.misc import cluster, merge_two_dicts, flatten, is_float, is_factor
+from .misc.misc import cluster, cythonize_file, merge_two_dicts, flatten, is_float, is_factor
 from .progress_bar import Progbar
 from .get_beta import get_beta
 
 __all__ = [
     "cluster",
+    "cythonize_file",
     "subsample",
     "merge_two_dicts",
     "flatten",
diff --git a/mlsauce/utils/misc/__init__.py b/mlsauce/utils/misc/__init__.py
index ddd6694..2cb2576 100644
--- a/mlsauce/utils/misc/__init__.py
+++ b/mlsauce/utils/misc/__init__.py
@@ -1,4 +1,4 @@
-from .misc import cluster, merge_two_dicts, flatten, is_float, is_factor
+from .misc import cluster, cythonize_file, merge_two_dicts, flatten, is_float, is_factor
 
-__all__ = ["cluster", "merge_two_dicts", "flatten", "is_float", "is_factor"]
+__all__ = ["cluster", "cythonize_file", "merge_two_dicts", "flatten", "is_float", "is_factor"]
diff --git a/mlsauce/utils/misc/misc.py b/mlsauce/utils/misc/misc.py
index d9204a3..d22062c 100644
--- a/mlsauce/utils/misc/misc.py
+++ b/mlsauce/utils/misc/misc.py
@@ -1,7 +1,11 @@
 # Authors: Thierry Moudiki
 #
 # License: BSD 3
+import os
+import re
+import subprocess
 import numpy as np
+from Cython.Build import cythonize
 from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
 from sklearn.preprocessing import OneHotEncoder
 from sklearn.cluster import KMeans
@@ -82,6 +86,47 @@ def cluster(
     ).toarray()
 
 
+def cythonize_file(filename):
+    """
+    Cythonize a given .pyx file and compile it if it doesn't exist already.
+
+    Args:
+        filename (str): Name of the .pyx file to cythonize.
+ """ + # Define a generic regex pattern to match compiled files + compiled_pattern = re.compile(r"\.cpython-\d+\.\d+-[^-]+-(?:x86|x64)_\w+\.so$") + + # Check if any matching compiled files already exist + compiled_files = [f for f in os.listdir() if compiled_pattern.search(f)] + if compiled_files: + # Compiled file(s) already exist, so we skip silently + return + + # Get a list of all .pyx files in the current directory + pyx_files = [file for file in os.listdir() if file.endswith(".pyx")] + + if not pyx_files: + print("No .pyx files found in the current directory.") + return + + for pyx_file in pyx_files: + # Cythonize the .pyx file + cythonized_modules = cythonize(pyx_file) + + # Compile the generated C code using a C compiler (GCC for Unix-like systems) + for extension in cythonized_modules: + setup_args = extension.get_metadata('setup_args') + # Extract the module name from setup_args + module_name = setup_args['ext_modules'][0].name + + # Compile the C code into a Python extension module (.so file) + compile_command = f"gcc -shared -pthread -fPIC -fwrapv -O2 -Wall -fno-strict-aliasing \ + -I/usr/include/python3.x -o {module_name}.so {module_name}.c" + + os.system(compile_command) + + print(f"Compiled {module_name}.so") + # merge two dictionaries def merge_two_dicts(x, y): z = x.copy() diff --git a/setup.py b/setup.py index c623f20..7e789c2 100644 --- a/setup.py +++ b/setup.py @@ -18,7 +18,7 @@ try: import builtins -except ImportError: +except ModuleNotFoundError: import __builtin__ as builtins subprocess.run(['pip', 'install', 'numpy'], check=False) @@ -38,7 +38,7 @@ MAINTAINER_EMAIL = 'thierry.moudiki@gmail.com' LICENSE = 'BSD3 Clause Clear' -__version__ = '0.15.0' +__version__ = '0.15.1' VERSION = __version__