From b9a6343ace9419036af55a795ab2c52e93931c85 Mon Sep 17 00:00:00 2001 From: julian fong Date: Sat, 23 Mar 2024 15:14:21 -0400 Subject: [PATCH 01/27] added class name --- skpro/regression/linear/_sklearn.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/skpro/regression/linear/_sklearn.py b/skpro/regression/linear/_sklearn.py index f85a3974..e8cf7b92 100644 --- a/skpro/regression/linear/_sklearn.py +++ b/skpro/regression/linear/_sklearn.py @@ -361,3 +361,7 @@ def get_test_params(cls, parameter_set="default"): "fit_intercept": False, } return [param1, param2] + + +class GaussianRegressor(_DelegateWithFittedParamForwarding): + pass From bebe8351416154880b0e375bb05935f99e1b75df Mon Sep 17 00:00:00 2001 From: julian fong Date: Sat, 23 Mar 2024 18:43:09 -0400 Subject: [PATCH 02/27] updates --- skpro/regression/linear/_sklearn.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/skpro/regression/linear/_sklearn.py b/skpro/regression/linear/_sklearn.py index e8cf7b92..dacc197d 100644 --- a/skpro/regression/linear/_sklearn.py +++ b/skpro/regression/linear/_sklearn.py @@ -362,6 +362,3 @@ def get_test_params(cls, parameter_set="default"): } return [param1, param2] - -class GaussianRegressor(_DelegateWithFittedParamForwarding): - pass From 63a4684d7653c644d24cd363b0c98314e534a599 Mon Sep 17 00:00:00 2001 From: julian fong Date: Sun, 24 Mar 2024 12:46:20 -0400 Subject: [PATCH 03/27] wrote docstring --- skpro/regression/linear/_glm.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 skpro/regression/linear/_glm.py diff --git a/skpro/regression/linear/_glm.py b/skpro/regression/linear/_glm.py new file mode 100644 index 00000000..e69de29b From 2e89bb289eae43dce203e4e6b148508482d1f12f Mon Sep 17 00:00:00 2001 From: julian fong Date: Sun, 24 Mar 2024 18:29:09 -0400 Subject: [PATCH 04/27] implemented init, fit and predict --- skpro/regression/linear/_glm.py | 314 ++++++++++++++++++++++++++++++++ 1 file changed, 314 insertions(+) diff --git a/skpro/regression/linear/_glm.py b/skpro/regression/linear/_glm.py index e69de29b..e6e5d7dd 100644 --- a/skpro/regression/linear/_glm.py +++ b/skpro/regression/linear/_glm.py @@ -0,0 +1,314 @@ +"""Interface adapter for the Generalized Linear Model Regressor with Gaussian Link""" +# copyright: skpro developers, BSD-3-Clause License (see LICENSE file) + +import numpy as np +import pandas as pd + +from skpro.regression.base import BaseProbaRegressor +from skpro.utils.sklearn import prep_skl_df + +from statsmodels.genmod.generalized_linear_model import GLM +from statsmodels.genmod.families.family import Gaussian + +class GaussianRegressor(BaseProbaRegressor): + """ + Fits a generalized linear model with a gaussian link. + + Direct interface to ``statsmodels.genmod.generalized_linear_model.GLM`` + from the ``statsmodels`` package. + + statsmodels uses parameters 'exog' and 'endog' to denote the X and y values + respectively and supports two separate definition of weights: frequency + and variance. + + For a direct link to statmodels' Generalized Linear Models module see: + https://www.statsmodels.org/stable/glm.html#module-reference + + Parameters + ---------- + endog : pandas DataFrame + 1d array of the endogenous (y) response variable. This array can be 1d + or 2d. Binomial family models accept a 2d array with two columns. If supplied + each observation is expected to be [success, failure]. + + exog : pandas DataFrame + A (n,k) array where n is the number of observations and k is the number + of regressors. 
An intercept is not included by default and should be + added by the user (models specified using a formula include an + intercept by default). + + family : family class instance + To specify the binomial distribution family = sm.family.Binomial() Each + family can take a link instance as an argument. + See statsmodels.family.family for more information. + + offset : array_like or None + An offset to be included in the model. If provided, must be an array + whose length is the number of rows in exog (x). + + exposure : array_like or None + Log(exposure) will be added to the linear prediction in the model. + Exposure is only valid if the log link is used. If provided, it must be + an array with the same length as endog (y). + + freq_weights : array_like + 1d array of frequency weights. The default is None. If None is selected + or a blank value, then the algorithm will replace with an array of 1s + with length equal to the endog. + + var_weights : array_like + 1d array of variance (analytic) weights. The default is None. If None + is selected or a blank value, then the algorithm will replace with an + array of 1s with length equal to the endog. + missing : str + Available options are 'none', 'drop' and 'raise'. If 'none', no nan + checking is done. If 'drop', any observations with nans are dropped. + If 'raise', an error is raised. Default = 'none' + + Attributes + ---------- + df_model : float + Model degrees of freedom is equal to p - 1, where p is the number of + regressors. Note that the intercept is not reported as a degree of freedom. + + df_resid : float + Residual degrees of freedom is equal to the number of observation n + minus the number of regressors p. + + endog : pandas DataFrame + Note that endog is a reference to the data so that if data is already + an array and it is changed, then endog changes as well. + + exposure : array_like + Include ln(exposure) in model with coefficient constrained to 1. + Can only be used if the link is the logarithm function. + + exog : pandas DataFrame + Note that exog is a reference to the data so that if data is already + an array and it is changed, then exog changes as well. + + freq_weights : ndarray + Note that freq_weights is a reference to the data so that if data + is already an array and it is changed, then freq_weights changes + as well. + + var_weights : ndarray + Note that var_weights is a reference to the data so that if + data is already an array and it is changed, then var_weights + changes as well. + + iteration : int + The number of iterations that fit has run. Initialized at 0. + + family : family class instance + he distribution family of the model. Can be any family + in statsmodels.families. Default is Gaussian. + + mu : ndarray + The mean response of the transformed variable. mu is the value of the + inverse of the link function at lin_pred, where lin_pred is the linear + predicted value of the WLS fit of the transformed variable. mu is only + available after fit is called. See statsmodels.families.family.fitted + of the distribution family for more information. + + n_trials : ndarray + Note that n_trials is a reference to the data so that if data is + already an array and it is changed, then n_trials changes as well. + n_trials is the number of binomial trials and only available with that + distribution. See statsmodels.families.Binomial for more information. + + normalized_cov_params : ndarray + The p x p normalized covariance of the design / exogenous data. 
This + is approximately equal to (X.T X)^(-1) + + offset : array_like + Include offset in model with coefficient constrained to 1. + + scale : float + The estimate of the scale / dispersion of the model fit. + Only available after fit is called. See GLM.fit and GLM.estimate_scale + for more information. + + scaletype : str + The scaling used for fitting the model. This is only available + after fit is called. The default is None. See GLM.fit for + more information. + + weights : ndarray + The value of the weights after the last iteration of fit. + Only available after fit is called. See statsmodels.families.family + for the specific distribution weighting functions. + """ + + _tags = { + "authors": ["julian-fong"], + "maintainers": ["julian-fong"], + "python_version": None, + "python_dependencies": None, + "capability:multioutput": False, + "capability:missing": False, + "X_inner_mtype": "pd_DataFrame", + "y_inner_mtype": "pd_DataFrame", + } + + def __init__( + self, + endog, + exog, + family = None, + offset = None, + exposure = None, + freq_weights = None, + var_weights = None, + missing = "none", + ): + self.endog = endog, + self.exog = exog, + self.family = Gaussian(), + self.offset = offset, + self.exposure = exposure, + self.freq_weights = freq_weights, + self.var_weights = var_weights, + self.missing = missing + + super().__init__() + + glm_estimator = GLM( + endog = endog, + exog = exog, + family = family, + offset = offset, + exposure = exposure, + freq_weights = freq_weights, + var_weights = var_weights, + missing = missing + ) + + self._estimator = glm_estimator #does this need to be cloned using some clone method? + + def _fit( + self, + start_params=None, + maxiter=100, + method='IRLS', + tol=1e-8, + scale=None, + cov_type='nonrobust', + cov_kwds=None, + use_t=None, + full_output=True, + disp=False, + max_start_irls=3 + ): + """ + Fits the regressor to the data. + + Note that the parameters X, y were defined when calling the statsmodel + GLM constructer. + + Writes to self: + Sets fitted model attributes ending in "_". + + Parameters + ---------- + start_params : array_like (optional) + Initial guess of the solution for the loglikelihood maximization. + The default is family-specific and is given by the + family.starting_mu(endog). If start_params is given then the initial + mean will be calculated as np.dot(exog, start_params). + + maxiter : int + Number of iterations + + method : str + Default is 'IRLS' for iteratively re-weighted least squares + + tol : float + Convergence tolerance. Default is 1e-8 + + scale : str/float + scale can be 'X2', 'dev', or a float. The default value is None, + which uses X2 for gamma, gaussian and inverse gaussian. X2 is + Pearson's chi-square divided by df_resid. The default is 1 for + the Bionmial and Poisson families. dev is the deviance divided + by df_resid + + cov_type : str + The type of parameter estimate covariance matrix to compute + + cov_kwds : dict-like + Extra arguments for calculating the covariance of the + parameter estimates + + use_t : bool + if True, the Student t-distribution if used for inference + + full_output : bool + Set to True to have all available output in the Results object’s + mle_retvals attribute. The output is dependent on the solver. See + LikelihoodModelResults notes section for more information. Not used + if methhod is IRLS. + + disp : bool + Set to True to print convergence messages. 
Not used if method + is IRLS + + max_start_irls : int + The number of IRLS iterations used to obtain starting values for + gradient optimization. Only relevenat if method is set to something + other than "IRLS" + + Returns + ------- + self : reference to self + """ + + fitted_glm_model = self._estimator.fit( + start_params, + maxiter, + method, + tol, + scale, + cov_type, + cov_kwds, + use_t, + full_output, + disp, + max_start_irls, + ) + + FITTED_PARAMS_TO_FORWARD = ["glm_estimator_"] + + for param in FITTED_PARAMS_TO_FORWARD: + setattr(self, param, fitted_glm_model) + + return self + + def _predict(self, X): + """Predict labels for data from features. + + State required: + Requires state to be "fitted" + + Accesses in self: + Fitted model attributes ending in "_" + + Parameters + ---------- + X : pandas DataFrame, must have same columns as X in `fit` + data to predict labels for + + Returns + ------- + y : pandas DataFrame, same length as `X`, with same columns as y in fit + """ + y_column = self.endog.columns + y_pred_series = self.glm_estimator_.predict(X) + y_pred = pd.DataFrame(y_pred_series, columns = [y_column]) + + return y_pred + + + def _predict_proba(self, X): + pass + From 905386a0fa6554fc0f2b460205effd687eeea003 Mon Sep 17 00:00:00 2001 From: julian fong Date: Sun, 24 Mar 2024 19:44:44 -0400 Subject: [PATCH 05/27] implemented _predict_proba, fixed bugs --- skpro/regression/linear/_glm.py | 46 +++++++++++++++++++++++++++------ 1 file changed, 38 insertions(+), 8 deletions(-) diff --git a/skpro/regression/linear/_glm.py b/skpro/regression/linear/_glm.py index e6e5d7dd..5b213964 100644 --- a/skpro/regression/linear/_glm.py +++ b/skpro/regression/linear/_glm.py @@ -162,13 +162,13 @@ def __init__( var_weights = None, missing = "none", ): - self.endog = endog, - self.exog = exog, - self.family = Gaussian(), - self.offset = offset, - self.exposure = exposure, - self.freq_weights = freq_weights, - self.var_weights = var_weights, + self.endog = endog + self.exog = exog + self.family = Gaussian() + self.offset = offset + self.exposure = exposure + self.freq_weights = freq_weights + self.var_weights = var_weights self.missing = missing super().__init__() @@ -310,5 +310,35 @@ def _predict(self, X): def _predict_proba(self, X): - pass + """Predict distribution over labels for data from features. + + State required: + Requires state to be "fitted". 
+ + Accesses in self: + Fitted model attributes ending in "_" + + Parameters + ---------- + X : pandas DataFrame, must have same columns as X in `fit` + data to predict labels for + + Returns + ------- + y_pred : skpro BaseDistribution, same length as `X` + labels predicted for `X` + """ + from skpro.distributions.normal import Normal + + y_pred_series = self.glm_estimator_.predict(X) + y_mu = pd.DataFrame(y_pred_series, columns = [self.endog.columns]) + y_sigma = np.std(y_mu.values) + params = { + "mu": y_mu, + "sigma": y_sigma, + "index": X.index, + "columns": y_mu.columns + } + y_pred = Normal(**params) + return y_pred From f423be5ee06b852ea8853d524453a1f4efff6974 Mon Sep 17 00:00:00 2001 From: julian fong Date: Mon, 25 Mar 2024 09:47:45 -0400 Subject: [PATCH 06/27] made changes as per 222 --- skpro/regression/linear/_glm.py | 321 ++++++++++++++-------------- skpro/regression/linear/_sklearn.py | 1 - 2 files changed, 166 insertions(+), 156 deletions(-) diff --git a/skpro/regression/linear/_glm.py b/skpro/regression/linear/_glm.py index 5b213964..df92d2ca 100644 --- a/skpro/regression/linear/_glm.py +++ b/skpro/regression/linear/_glm.py @@ -1,45 +1,45 @@ -"""Interface adapter for the Generalized Linear Model Regressor with Gaussian Link""" +"""Interface adapter for the Generalized Linear Model Regressor with Gaussian Link.""" # copyright: skpro developers, BSD-3-Clause License (see LICENSE file) import numpy as np import pandas as pd +from statsmodels.genmod.families.family import Gaussian +from statsmodels.genmod.generalized_linear_model import GLM from skpro.regression.base import BaseProbaRegressor -from skpro.utils.sklearn import prep_skl_df -from statsmodels.genmod.generalized_linear_model import GLM -from statsmodels.genmod.families.family import Gaussian class GaussianRegressor(BaseProbaRegressor): """ Fits a generalized linear model with a gaussian link. - Direct interface to ``statsmodels.genmod.generalized_linear_model.GLM`` + Direct interface to ``statsmodels.genmod.generalized_linear_model.GLM`` from the ``statsmodels`` package. statsmodels uses parameters 'exog' and 'endog' to denote the X and y values - respectively and supports two separate definition of weights: frequency + respectively and supports two separate definition of weights: frequency and variance. - + For a direct link to statmodels' Generalized Linear Models module see: https://www.statsmodels.org/stable/glm.html#module-reference Parameters ---------- - endog : pandas DataFrame - 1d array of the endogenous (y) response variable. This array can be 1d - or 2d. Binomial family models accept a 2d array with two columns. If supplied - each observation is expected to be [success, failure]. + y : pandas DataFrame + 1d array of the endogenous response variable. This array can be 1d or + 2d. Binomial family models accept a 2d array with two columns. + If supplied each observation is expected to be [success, failure]. + Equivalent to statsmodel's (endog). - exog : pandas DataFrame + X : pandas DataFrame A (n,k) array where n is the number of observations and k is the number - of regressors. An intercept is not included by default and should be - added by the user (models specified using a formula include an - intercept by default). + of regressors. An intercept is not included by default and should be + added by the user (models specified using a formula include an + intercept by default). Equivalent to statsmodel's (exog). 
family : family class instance To specify the binomial distribution family = sm.family.Binomial() Each - family can take a link instance as an argument. + family can take a link instance as an argument. See statsmodels.family.family for more information. offset : array_like or None @@ -47,19 +47,20 @@ class GaussianRegressor(BaseProbaRegressor): whose length is the number of rows in exog (x). exposure : array_like or None - Log(exposure) will be added to the linear prediction in the model. + Log(exposure) will be added to the linear prediction in the model. Exposure is only valid if the log link is used. If provided, it must be an array with the same length as endog (y). - + freq_weights : array_like 1d array of frequency weights. The default is None. If None is selected - or a blank value, then the algorithm will replace with an array of 1s + or a blank value, then the algorithm will replace with an array of 1s with length equal to the endog. - + var_weights : array_like - 1d array of variance (analytic) weights. The default is None. If None - is selected or a blank value, then the algorithm will replace with an + 1d array of variance (analytic) weights. The default is None. If None + is selected or a blank value, then the algorithm will replace with an array of 1s with length equal to the endog. + missing : str Available options are 'none', 'drop' and 'raise'. If 'none', no nan checking is done. If 'drop', any observations with nans are dropped. @@ -68,76 +69,128 @@ class GaussianRegressor(BaseProbaRegressor): Attributes ---------- df_model : float - Model degrees of freedom is equal to p - 1, where p is the number of + Model degrees of freedom is equal to p - 1, where p is the number of regressors. Note that the intercept is not reported as a degree of freedom. df_resid : float - Residual degrees of freedom is equal to the number of observation n + Residual degrees of freedom is equal to the number of observation n minus the number of regressors p. endog : pandas DataFrame - Note that endog is a reference to the data so that if data is already + Note that endog is a reference to the data so that if data is already an array and it is changed, then endog changes as well. exposure : array_like - Include ln(exposure) in model with coefficient constrained to 1. + Include ln(exposure) in model with coefficient constrained to 1. Can only be used if the link is the logarithm function. exog : pandas DataFrame - Note that exog is a reference to the data so that if data is already + Note that exog is a reference to the data so that if data is already an array and it is changed, then exog changes as well. freq_weights : ndarray - Note that freq_weights is a reference to the data so that if data - is already an array and it is changed, then freq_weights changes + Note that freq_weights is a reference to the data so that if data + is already an array and it is changed, then freq_weights changes as well. var_weights : ndarray - Note that var_weights is a reference to the data so that if - data is already an array and it is changed, then var_weights + Note that var_weights is a reference to the data so that if + data is already an array and it is changed, then var_weights changes as well. iteration : int The number of iterations that fit has run. Initialized at 0. family : family class instance - he distribution family of the model. Can be any family + he distribution family of the model. Can be any family in statsmodels.families. Default is Gaussian. 
mu : ndarray - The mean response of the transformed variable. mu is the value of the - inverse of the link function at lin_pred, where lin_pred is the linear - predicted value of the WLS fit of the transformed variable. mu is only - available after fit is called. See statsmodels.families.family.fitted + The mean response of the transformed variable. mu is the value of the + inverse of the link function at lin_pred, where lin_pred is the linear + predicted value of the WLS fit of the transformed variable. mu is only + available after fit is called. See statsmodels.families.family.fitted of the distribution family for more information. n_trials : ndarray - Note that n_trials is a reference to the data so that if data is - already an array and it is changed, then n_trials changes as well. - n_trials is the number of binomial trials and only available with that + Note that n_trials is a reference to the data so that if data is + already an array and it is changed, then n_trials changes as well. + n_trials is the number of binomial trials and only available with that distribution. See statsmodels.families.Binomial for more information. normalized_cov_params : ndarray - The p x p normalized covariance of the design / exogenous data. This + The p x p normalized covariance of the design / exogenous data. This is approximately equal to (X.T X)^(-1) offset : array_like Include offset in model with coefficient constrained to 1. scale : float - The estimate of the scale / dispersion of the model fit. - Only available after fit is called. See GLM.fit and GLM.estimate_scale + The estimate of the scale / dispersion of the model fit. + Only available after fit is called. See GLM.fit and GLM.estimate_scale for more information. scaletype : str - The scaling used for fitting the model. This is only available - after fit is called. The default is None. See GLM.fit for + The scaling used for fitting the model. This is only available + after fit is called. The default is None. See GLM.fit for more information. weights : ndarray - The value of the weights after the last iteration of fit. - Only available after fit is called. See statsmodels.families.family + The value of the weights after the last iteration of fit. + Only available after fit is called. See statsmodels.families.family for the specific distribution weighting functions. + + start_params : array_like (optional) + Initial guess of the solution for the loglikelihood maximization. + The default is family-specific and is given by the + family.starting_mu(endog). If start_params is given then the initial + mean will be calculated as np.dot(exog, start_params). + This parameter is used inside the GLM fit() function. + + maxiter : int + Number of iterations. This parameter is used inside the GLM fit() function. + + method : str + Default is 'IRLS' for iteratively re-weighted least squares. + This parameter is used inside the GLM fit() function. + + tol : float + Convergence tolerance. Default is 1e-8. This parameter is + used inside the GLM fit() function. + + scale : str/float + scale can be 'X2', 'dev', or a float. The default value is None, + which uses X2 for gamma, gaussian and inverse gaussian. X2 is + Pearson's chi-square divided by df_resid. The default is 1 for + the Bionmial and Poisson families. dev is the deviance divided + by df_resid. This parameter is used inside the GLM fit() function. + + cov_type : str + The type of parameter estimate covariance matrix to compute. + This parameter is used inside the GLM fit() function. 
+ + cov_kwds : dict-like + Extra arguments for calculating the covariance of the + parameter estimates. This parameter is used inside the GLM fit() function. + + use_t : bool + if True, the Student t-distribution if used for inference. + This parameter is used inside the GLM fit() function. + + full_output : bool + Set to True to have all available output in the Results object’s + mle_retvals attribute. The output is dependent on the solver. See + LikelihoodModelResults notes section for more information. Not used + if methhod is IRLS. This parameter is used inside the GLM fit() function. + + disp : bool + Set to True to print convergence messages. Not used if method + is IRLS. This parameter is used inside the GLM fit() function. + + max_start_irls : int + The number of IRLS iterations used to obtain starting values for + gradient optimization. Only relevenat if method is set to something + other than "IRLS". This parameter is used inside the GLM fit() function. """ _tags = { @@ -152,129 +205,89 @@ class GaussianRegressor(BaseProbaRegressor): } def __init__( - self, - endog, - exog, - family = None, - offset = None, - exposure = None, - freq_weights = None, - var_weights = None, - missing = "none", + self, + family=None, + offset=None, + exposure=None, + freq_weights=None, + var_weights=None, + missing="none", + start_params=None, + maxiter=100, + method="IRLS", + tol=1e-8, + scale=None, + cov_type="nonrobust", + cov_kwds=None, + use_t=None, + full_output=True, + disp=False, + max_start_irls=3, ): - self.endog = endog - self.exog = exog self.family = Gaussian() self.offset = offset self.exposure = exposure self.freq_weights = freq_weights self.var_weights = var_weights self.missing = missing + self.start_params = start_params + self.maxiter = maxiter + self.method = method + self.tol = tol + self.scale = scale + self.cov_type = cov_type + self.cov_kwds = cov_kwds + self.use_t = use_t + self.full_output = full_output + self.disp = disp + self.max_start_irls = max_start_irls super().__init__() - glm_estimator = GLM( - endog = endog, - exog = exog, - family = family, - offset = offset, - exposure = exposure, - freq_weights = freq_weights, - var_weights = var_weights, - missing = missing - ) + def _fit(self, X, y): + """Fit regressor to training data. - self._estimator = glm_estimator #does this need to be cloned using some clone method? - - def _fit( - self, - start_params=None, - maxiter=100, - method='IRLS', - tol=1e-8, - scale=None, - cov_type='nonrobust', - cov_kwds=None, - use_t=None, - full_output=True, - disp=False, - max_start_irls=3 - ): - """ - Fits the regressor to the data. - - Note that the parameters X, y were defined when calling the statsmodel - GLM constructer. - Writes to self: Sets fitted model attributes ending in "_". - + Parameters ---------- - start_params : array_like (optional) - Initial guess of the solution for the loglikelihood maximization. - The default is family-specific and is given by the - family.starting_mu(endog). If start_params is given then the initial - mean will be calculated as np.dot(exog, start_params). - - maxiter : int - Number of iterations - - method : str - Default is 'IRLS' for iteratively re-weighted least squares - - tol : float - Convergence tolerance. Default is 1e-8 - - scale : str/float - scale can be 'X2', 'dev', or a float. The default value is None, - which uses X2 for gamma, gaussian and inverse gaussian. X2 is - Pearson's chi-square divided by df_resid. The default is 1 for - the Bionmial and Poisson families. 
dev is the deviance divided - by df_resid - - cov_type : str - The type of parameter estimate covariance matrix to compute - - cov_kwds : dict-like - Extra arguments for calculating the covariance of the - parameter estimates - - use_t : bool - if True, the Student t-distribution if used for inference - - full_output : bool - Set to True to have all available output in the Results object’s - mle_retvals attribute. The output is dependent on the solver. See - LikelihoodModelResults notes section for more information. Not used - if methhod is IRLS. - - disp : bool - Set to True to print convergence messages. Not used if method - is IRLS - - max_start_irls : int - The number of IRLS iterations used to obtain starting values for - gradient optimization. Only relevenat if method is set to something - other than "IRLS" + X : pandas DataFrame + feature instances to fit regressor to + y : pandas DataFrame, must be same length as X + labels to fit regressor to Returns ------- self : reference to self """ + glm_estimator = GLM( + endog=y, + exog=X, + family=self.family, + offset=self.offset, + exposure=self.exposure, + freq_weights=self.freq_weights, + var_weights=self.var_weights, + missing=self.missing, + ) + + self._estimator = ( + glm_estimator # does this need to be cloned using some clone method? + ) fitted_glm_model = self._estimator.fit( - start_params, - maxiter, - method, - tol, - scale, - cov_type, - cov_kwds, - use_t, - full_output, - disp, - max_start_irls, + self.start_params, + self.maxiter, + self.method, + self.tol, + self.scale, + self.cov_type, + self.cov_kwds, + self.use_t, + self.full_output, + self.disp, + self.max_start_irls, ) FITTED_PARAMS_TO_FORWARD = ["glm_estimator_"] @@ -304,10 +317,9 @@ def _predict(self, X): """ y_column = self.endog.columns y_pred_series = self.glm_estimator_.predict(X) - y_pred = pd.DataFrame(y_pred_series, columns = [y_column]) + y_pred = pd.DataFrame(y_pred_series, columns=[y_column]) return y_pred - def _predict_proba(self, X): """Predict distribution over labels for data from features. 
@@ -331,14 +343,13 @@ def _predict_proba(self, X): from skpro.distributions.normal import Normal y_pred_series = self.glm_estimator_.predict(X) - y_mu = pd.DataFrame(y_pred_series, columns = [self.endog.columns]) + y_mu = pd.DataFrame(y_pred_series, columns=[self.endog.columns]) y_sigma = np.std(y_mu.values) params = { - "mu": y_mu, - "sigma": y_sigma, - "index": X.index, - "columns": y_mu.columns + "mu": y_mu, + "sigma": y_sigma, + "index": X.index, + "columns": y_mu.columns, } y_pred = Normal(**params) return y_pred - diff --git a/skpro/regression/linear/_sklearn.py b/skpro/regression/linear/_sklearn.py index dacc197d..f85a3974 100644 --- a/skpro/regression/linear/_sklearn.py +++ b/skpro/regression/linear/_sklearn.py @@ -361,4 +361,3 @@ def get_test_params(cls, parameter_set="default"): "fit_intercept": False, } return [param1, param2] - From 5700e294a880a4cea8b30f84be602205b21f0187 Mon Sep 17 00:00:00 2001 From: julian fong Date: Mon, 25 Mar 2024 10:17:35 -0400 Subject: [PATCH 07/27] added parameter for intercept and implemented get_test_params --- skpro/regression/linear/_glm.py | 177 +++++++++++++++++++------------- 1 file changed, 105 insertions(+), 72 deletions(-) diff --git a/skpro/regression/linear/_glm.py b/skpro/regression/linear/_glm.py index df92d2ca..db577d84 100644 --- a/skpro/regression/linear/_glm.py +++ b/skpro/regression/linear/_glm.py @@ -5,6 +5,7 @@ import pandas as pd from statsmodels.genmod.families.family import Gaussian from statsmodels.genmod.generalized_linear_model import GLM +from statsmodels.tools import add_constant from skpro.regression.base import BaseProbaRegressor @@ -25,18 +26,6 @@ class GaussianRegressor(BaseProbaRegressor): Parameters ---------- - y : pandas DataFrame - 1d array of the endogenous response variable. This array can be 1d or - 2d. Binomial family models accept a 2d array with two columns. - If supplied each observation is expected to be [success, failure]. - Equivalent to statsmodel's (endog). - - X : pandas DataFrame - A (n,k) array where n is the number of observations and k is the number - of regressors. An intercept is not included by default and should be - added by the user (models specified using a formula include an - intercept by default). Equivalent to statsmodel's (exog). - family : family class instance To specify the binomial distribution family = sm.family.Binomial() Each family can take a link instance as an argument. @@ -66,6 +55,64 @@ class GaussianRegressor(BaseProbaRegressor): checking is done. If 'drop', any observations with nans are dropped. If 'raise', an error is raised. Default = 'none' + start_params : array_like (optional) + Initial guess of the solution for the loglikelihood maximization. + The default is family-specific and is given by the + family.starting_mu(endog). If start_params is given then the initial + mean will be calculated as np.dot(exog, start_params). + This parameter is used inside the GLM fit() function. + + maxiter : int + Number of iterations. This parameter is used inside the GLM fit() function. + + method : str + Default is 'IRLS' for iteratively re-weighted least squares. + This parameter is used inside the GLM fit() function. + + tol : float + Convergence tolerance. Default is 1e-8. This parameter is + used inside the GLM fit() function. + + scale : str/float + scale can be 'X2', 'dev', or a float. The default value is None, + which uses X2 for gamma, gaussian and inverse gaussian. X2 is + Pearson's chi-square divided by df_resid. The default is 1 for + the Bionmial and Poisson families. 
dev is the deviance divided + by df_resid. This parameter is used inside the GLM fit() function. + + cov_type : str + The type of parameter estimate covariance matrix to compute. + This parameter is used inside the GLM fit() function. + + cov_kwds : dict-like + Extra arguments for calculating the covariance of the + parameter estimates. This parameter is used inside the GLM fit() function. + + use_t : bool + if True, the Student t-distribution if used for inference. + This parameter is used inside the GLM fit() function. + + full_output : bool + Set to True to have all available output in the Results object’s + mle_retvals attribute. The output is dependent on the solver. See + LikelihoodModelResults notes section for more information. Not used + if methhod is IRLS. This parameter is used inside the GLM fit() function. + + disp : bool + Set to True to print convergence messages. Not used if method + is IRLS. This parameter is used inside the GLM fit() function. + + max_start_irls : int + The number of IRLS iterations used to obtain starting values for + gradient optimization. Only relevenat if method is set to something + other than "IRLS". This parameter is used inside the GLM fit() function. + + add_constant : bool + statsmodels does not include an intercept by default. Specify this as + True if you would like to add an intercept (floats of 1s) to the + dataset X. Default = False. Note that when the input is a pandas + Series or DataFrame, the added column's name is 'const'. + Attributes ---------- df_model : float @@ -139,58 +186,6 @@ class GaussianRegressor(BaseProbaRegressor): The value of the weights after the last iteration of fit. Only available after fit is called. See statsmodels.families.family for the specific distribution weighting functions. - - start_params : array_like (optional) - Initial guess of the solution for the loglikelihood maximization. - The default is family-specific and is given by the - family.starting_mu(endog). If start_params is given then the initial - mean will be calculated as np.dot(exog, start_params). - This parameter is used inside the GLM fit() function. - - maxiter : int - Number of iterations. This parameter is used inside the GLM fit() function. - - method : str - Default is 'IRLS' for iteratively re-weighted least squares. - This parameter is used inside the GLM fit() function. - - tol : float - Convergence tolerance. Default is 1e-8. This parameter is - used inside the GLM fit() function. - - scale : str/float - scale can be 'X2', 'dev', or a float. The default value is None, - which uses X2 for gamma, gaussian and inverse gaussian. X2 is - Pearson's chi-square divided by df_resid. The default is 1 for - the Bionmial and Poisson families. dev is the deviance divided - by df_resid. This parameter is used inside the GLM fit() function. - - cov_type : str - The type of parameter estimate covariance matrix to compute. - This parameter is used inside the GLM fit() function. - - cov_kwds : dict-like - Extra arguments for calculating the covariance of the - parameter estimates. This parameter is used inside the GLM fit() function. - - use_t : bool - if True, the Student t-distribution if used for inference. - This parameter is used inside the GLM fit() function. - - full_output : bool - Set to True to have all available output in the Results object’s - mle_retvals attribute. The output is dependent on the solver. See - LikelihoodModelResults notes section for more information. Not used - if methhod is IRLS. 
This parameter is used inside the GLM fit() function. - - disp : bool - Set to True to print convergence messages. Not used if method - is IRLS. This parameter is used inside the GLM fit() function. - - max_start_irls : int - The number of IRLS iterations used to obtain starting values for - gradient optimization. Only relevenat if method is set to something - other than "IRLS". This parameter is used inside the GLM fit() function. """ _tags = { @@ -223,6 +218,7 @@ def __init__( full_output=True, disp=False, max_start_irls=3, + add_constant=False, ): self.family = Gaussian() self.offset = offset @@ -241,6 +237,7 @@ def __init__( self.full_output = full_output self.disp = disp self.max_start_irls = max_start_irls + self.add_constant = add_constant super().__init__() @@ -253,14 +250,26 @@ def _fit(self, X, y): Parameters ---------- X : pandas DataFrame - feature instances to fit regressor to - y : pandas DataFrame, must be same length as X - labels to fit regressor to + A (n,k) array where n is the number of observations and k is the number + of regressors. An intercept is not included by default and should be + added by the user (models specified using a formula include an + intercept by default). Equivalent to statsmodel's (exog). + + y : pandas DataFrame + 1d array of the endogenous response variable. This array can be 1d or + 2d. Binomial family models accept a 2d array with two columns. + If supplied each observation is expected to be [success, failure]. + Equivalent to statsmodel's (endog). Returns ------- self : reference to self """ + if self.add_constant: + X = add_constant(X) + + y_col = y.columns + glm_estimator = GLM( endog=y, exog=X, @@ -290,10 +299,11 @@ def _fit(self, X, y): self.max_start_irls, ) - FITTED_PARAMS_TO_FORWARD = ["glm_estimator_"] + # forward some parameters to self + FITTED_PARAMS_TO_FORWARD = {"glm_estimator_": fitted_glm_model, "y_col": y_col} - for param in FITTED_PARAMS_TO_FORWARD: - setattr(self, param, fitted_glm_model) + for k, v in FITTED_PARAMS_TO_FORWARD.items(): + setattr(self, k, v) return self @@ -315,7 +325,7 @@ def _predict(self, X): ------- y : pandas DataFrame, same length as `X`, with same columns as y in fit """ - y_column = self.endog.columns + y_column = self.y_col y_pred_series = self.glm_estimator_.predict(X) y_pred = pd.DataFrame(y_pred_series, columns=[y_column]) @@ -343,7 +353,7 @@ def _predict_proba(self, X): from skpro.distributions.normal import Normal y_pred_series = self.glm_estimator_.predict(X) - y_mu = pd.DataFrame(y_pred_series, columns=[self.endog.columns]) + y_mu = pd.DataFrame(y_pred_series, columns=[self.y_col]) y_sigma = np.std(y_mu.values) params = { "mu": y_mu, @@ -353,3 +363,26 @@ def _predict_proba(self, X): } y_pred = Normal(**params) return y_pred + + @classmethod + def get_test_params(cls, parameter_set="default"): + """Return testing parameter settings for the estimator. + + Parameters + ---------- + parameter_set : str, default="default" + Name of the set of test parameters to return, for use in tests. If no + special parameters are defined for a value, will return `"default"` set. + + Returns + ------- + params : dict or list of dict, default = {} + Parameters to create testing instances of the class + Each dict are parameters to construct an "interesting" test instance, i.e., + `MyClass(**params)` or `MyClass(**params[i])` creates a valid test instance. 
+ `create_test_instance` uses the first (or only) dictionary in `params` + """ + params1 = {} + params2 = {"add_constant": True} + + return [params1, params2] From cefb3dba3248c38735df0f4fb0b02d0f6ab35363 Mon Sep 17 00:00:00 2001 From: julian fong Date: Mon, 25 Mar 2024 10:22:31 -0400 Subject: [PATCH 08/27] minor change --- skpro/regression/linear/_glm.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/skpro/regression/linear/_glm.py b/skpro/regression/linear/_glm.py index db577d84..90bff094 100644 --- a/skpro/regression/linear/_glm.py +++ b/skpro/regression/linear/_glm.py @@ -27,8 +27,7 @@ class GaussianRegressor(BaseProbaRegressor): Parameters ---------- family : family class instance - To specify the binomial distribution family = sm.family.Binomial() Each - family can take a link instance as an argument. + To specify the Gaussian link. See statsmodels.family.family for more information. offset : array_like or None From 1ecda586c9cc9fc3e180be391cc1950cc986e11c Mon Sep 17 00:00:00 2001 From: julian fong Date: Tue, 26 Mar 2024 13:27:18 -0400 Subject: [PATCH 09/27] updated docstring, fixed issues --- skpro/regression/linear/_glm.py | 95 ++++++++++++++++++++++----------- 1 file changed, 65 insertions(+), 30 deletions(-) diff --git a/skpro/regression/linear/_glm.py b/skpro/regression/linear/_glm.py index 90bff094..143ce1f8 100644 --- a/skpro/regression/linear/_glm.py +++ b/skpro/regression/linear/_glm.py @@ -1,7 +1,6 @@ """Interface adapter for the Generalized Linear Model Regressor with Gaussian Link.""" # copyright: skpro developers, BSD-3-Clause License (see LICENSE file) -import numpy as np import pandas as pd from statsmodels.genmod.families.family import Gaussian from statsmodels.genmod.generalized_linear_model import GLM @@ -114,77 +113,77 @@ class GaussianRegressor(BaseProbaRegressor): Attributes ---------- - df_model : float + df_model_ : float Model degrees of freedom is equal to p - 1, where p is the number of regressors. Note that the intercept is not reported as a degree of freedom. - df_resid : float + df_resid_ : float Residual degrees of freedom is equal to the number of observation n minus the number of regressors p. - endog : pandas DataFrame + endog_ : pandas DataFrame Note that endog is a reference to the data so that if data is already an array and it is changed, then endog changes as well. - exposure : array_like - Include ln(exposure) in model with coefficient constrained to 1. - Can only be used if the link is the logarithm function. - - exog : pandas DataFrame + exog_ : pandas DataFrame Note that exog is a reference to the data so that if data is already an array and it is changed, then exog changes as well. - freq_weights : ndarray + freq_weights_ : ndarray Note that freq_weights is a reference to the data so that if data is already an array and it is changed, then freq_weights changes as well. - var_weights : ndarray + var_weights_ : ndarray Note that var_weights is a reference to the data so that if data is already an array and it is changed, then var_weights changes as well. - iteration : int + iteration_ : int The number of iterations that fit has run. Initialized at 0. + Only available after fit is called - family : family class instance + family_ : family class instance he distribution family of the model. Can be any family in statsmodels.families. Default is Gaussian. - mu : ndarray + mu_ : ndarray The mean response of the transformed variable. 
mu is the value of the inverse of the link function at lin_pred, where lin_pred is the linear predicted value of the WLS fit of the transformed variable. mu is only available after fit is called. See statsmodels.families.family.fitted of the distribution family for more information. - n_trials : ndarray + n_trials_ : ndarray Note that n_trials is a reference to the data so that if data is already an array and it is changed, then n_trials changes as well. n_trials is the number of binomial trials and only available with that distribution. See statsmodels.families.Binomial for more information. - normalized_cov_params : ndarray + normalized_cov_params_ : ndarray The p x p normalized covariance of the design / exogenous data. This is approximately equal to (X.T X)^(-1) - offset : array_like + offset_ : array_like Include offset in model with coefficient constrained to 1. - scale : float + scale_ : float The estimate of the scale / dispersion of the model fit. Only available after fit is called. See GLM.fit and GLM.estimate_scale for more information. - scaletype : str + scaletype_ : str The scaling used for fitting the model. This is only available after fit is called. The default is None. See GLM.fit for more information. - weights : ndarray + weights_ : ndarray The value of the weights after the last iteration of fit. Only available after fit is called. See statsmodels.families.family for the specific distribution weighting functions. + + glm_fit_ : GLM + fitted generalized linear model """ _tags = { @@ -280,11 +279,26 @@ def _fit(self, X, y): missing=self.missing, ) - self._estimator = ( - glm_estimator # does this need to be cloned using some clone method? - ) + self._estimator = glm_estimator + + PARAMS_TO_FORWARD = { + "df_model_": glm_estimator.df_model, + "df_resid_": glm_estimator.df_resid, + "endog_": glm_estimator.endog, + "exog_": glm_estimator.exog, + "freq_weights_": glm_estimator.freq_weights, + "var_weights": glm_estimator.var_weights, + "family_": glm_estimator.family, + "mu_": glm_estimator.mu, + "n_trials_": glm_estimator.n_trials, + "weights_": glm_estimator.weights, + "scaletype_": glm_estimator.scaletype, + } - fitted_glm_model = self._estimator.fit( + for k, v in PARAMS_TO_FORWARD.items(): + setattr(self, k, v) + + fitted_glm_model = glm_estimator.fit( self.start_params, self.maxiter, self.method, @@ -299,7 +313,19 @@ def _fit(self, X, y): ) # forward some parameters to self - FITTED_PARAMS_TO_FORWARD = {"glm_estimator_": fitted_glm_model, "y_col": y_col} + FITTED_PARAMS_TO_FORWARD = { + "glm_fit_": fitted_glm_model, + "y_col": y_col, + "fit_history_": fitted_glm_model.fit_history, + "iterations_": fitted_glm_model.fit_history["iteration"], + "model_": fitted_glm_model.model, + "nobs_": fitted_glm_model.nobs, + "normalized_cov_params_": fitted_glm_model.normalized_cov_params, + "params_": fitted_glm_model.params, + "pvalues_": fitted_glm_model.pvalues, + "scale_": fitted_glm_model.scale, + "stand_errors_": fitted_glm_model.bse, + } for k, v in FITTED_PARAMS_TO_FORWARD.items(): setattr(self, k, v) @@ -324,9 +350,10 @@ def _predict(self, X): ------- y : pandas DataFrame, same length as `X`, with same columns as y in fit """ + index = X.index y_column = self.y_col - y_pred_series = self.glm_estimator_.predict(X) - y_pred = pd.DataFrame(y_pred_series, columns=[y_column]) + y_pred_series = self.glm_fit_.predict(X) + y_pred = pd.DataFrame(y_pred_series, index=index, columns=[y_column]) return y_pred @@ -351,9 +378,17 @@ def _predict_proba(self, X): """ from 
skpro.distributions.normal import Normal - y_pred_series = self.glm_estimator_.predict(X) - y_mu = pd.DataFrame(y_pred_series, columns=[self.y_col]) - y_sigma = np.std(y_mu.values) + index = X.index + y_column = self.y_col + + # instead of using the conventional predict() method, we use statsmodels + # get_prediction method, which returns a pandas df that contains + # the prediction and prediction variance i.e mu and sigma + y_predictions_df = self.glm_fit_.get_predictions(X).summary_frame() + y_mu = pd.DataFrame(y_predictions_df["mean"], index=index, columns=[y_column]) + y_sigma = pd.DataFrame( + y_predictions_df["mean_se"], index=index, columns=[y_column] + ) params = { "mu": y_mu, "sigma": y_sigma, From cac37ee0f05a795a7d2fb6d28978bd915433af28 Mon Sep 17 00:00:00 2001 From: julian fong Date: Tue, 26 Mar 2024 14:24:25 -0400 Subject: [PATCH 10/27] updated docstring, minor fixes --- skpro/regression/linear/_glm.py | 66 ++++++++++++++++++++++----------- 1 file changed, 44 insertions(+), 22 deletions(-) diff --git a/skpro/regression/linear/_glm.py b/skpro/regression/linear/_glm.py index 143ce1f8..58871965 100644 --- a/skpro/regression/linear/_glm.py +++ b/skpro/regression/linear/_glm.py @@ -160,13 +160,6 @@ class GaussianRegressor(BaseProbaRegressor): n_trials is the number of binomial trials and only available with that distribution. See statsmodels.families.Binomial for more information. - normalized_cov_params_ : ndarray - The p x p normalized covariance of the design / exogenous data. This - is approximately equal to (X.T X)^(-1) - - offset_ : array_like - Include offset in model with coefficient constrained to 1. - scale_ : float The estimate of the scale / dispersion of the model fit. Only available after fit is called. See GLM.fit and GLM.estimate_scale @@ -184,6 +177,35 @@ class GaussianRegressor(BaseProbaRegressor): glm_fit_ : GLM fitted generalized linear model + + fit_history_ : dict + Contains information about the iterations. + Its keys are iterations, deviance and params. Only available after + fit is called. + + model_ : class instance + Pointer to GLM model instance that called fit. + + nobs_ : float + The number of observations n. Only available after fit is called. + + normalized_cov_params_ : ndarray + For Gaussian link: This is the p x p normalized covariance of the + design / exogenous data. This is approximately equal to (X.T X)^(-1) + + params_ : ndarray + The coefficients of the fitted model. Note that interpretation of the + coefficients often depends on the distribution family and the data. + + pvalues_ : ndarray + The two-tailed p-values for the parameters. + + scale_ : float + The estimate of the scale / dispersion for the model fit. + See GLM.fit and GLM.estimate_scale for more information. + + stand_errors_ : ndarray + The standard errors of the fitted GLM. 
""" _tags = { @@ -281,6 +303,20 @@ def _fit(self, X, y): self._estimator = glm_estimator + fitted_glm_model = glm_estimator.fit( + self.start_params, + self.maxiter, + self.method, + self.tol, + self.scale, + self.cov_type, + self.cov_kwds, + self.use_t, + self.full_output, + self.disp, + self.max_start_irls, + ) + PARAMS_TO_FORWARD = { "df_model_": glm_estimator.df_model, "df_resid_": glm_estimator.df_resid, @@ -298,26 +334,12 @@ def _fit(self, X, y): for k, v in PARAMS_TO_FORWARD.items(): setattr(self, k, v) - fitted_glm_model = glm_estimator.fit( - self.start_params, - self.maxiter, - self.method, - self.tol, - self.scale, - self.cov_type, - self.cov_kwds, - self.use_t, - self.full_output, - self.disp, - self.max_start_irls, - ) - # forward some parameters to self FITTED_PARAMS_TO_FORWARD = { "glm_fit_": fitted_glm_model, "y_col": y_col, "fit_history_": fitted_glm_model.fit_history, - "iterations_": fitted_glm_model.fit_history["iteration"], + "iteration_": fitted_glm_model.fit_history["iteration"], "model_": fitted_glm_model.model, "nobs_": fitted_glm_model.nobs, "normalized_cov_params_": fitted_glm_model.normalized_cov_params, From a51cc5ef09fa4c72a4985ed03298d6c00cec3060 Mon Sep 17 00:00:00 2001 From: julian fong Date: Tue, 26 Mar 2024 14:45:06 -0400 Subject: [PATCH 11/27] removed family parameter as we are only using the Gaussian link --- skpro/regression/linear/_glm.py | 1 - 1 file changed, 1 deletion(-) diff --git a/skpro/regression/linear/_glm.py b/skpro/regression/linear/_glm.py index 58871965..f1f2292b 100644 --- a/skpro/regression/linear/_glm.py +++ b/skpro/regression/linear/_glm.py @@ -221,7 +221,6 @@ class GaussianRegressor(BaseProbaRegressor): def __init__( self, - family=None, offset=None, exposure=None, freq_weights=None, From 34c94affc64fb6f5b7da30e299002960e61df29b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Franz=20Kir=C3=A1ly?= Date: Thu, 28 Mar 2024 01:46:36 +0100 Subject: [PATCH 12/27] _Table --- skpro/regression/linear/_glm.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/skpro/regression/linear/_glm.py b/skpro/regression/linear/_glm.py index f1f2292b..7dd37feb 100644 --- a/skpro/regression/linear/_glm.py +++ b/skpro/regression/linear/_glm.py @@ -215,8 +215,8 @@ class GaussianRegressor(BaseProbaRegressor): "python_dependencies": None, "capability:multioutput": False, "capability:missing": False, - "X_inner_mtype": "pd_DataFrame", - "y_inner_mtype": "pd_DataFrame", + "X_inner_mtype": "pd_DataFrame_Table", + "y_inner_mtype": "pd_DataFrame_Table", } def __init__( From 97af171f528c286ad4661cac96c28379522825d5 Mon Sep 17 00:00:00 2001 From: julian fong Date: Thu, 28 Mar 2024 12:42:18 -0400 Subject: [PATCH 13/27] moved a couple imports to methods --- skpro/regression/linear/_glm.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/skpro/regression/linear/_glm.py b/skpro/regression/linear/_glm.py index 7dd37feb..b2eaaa59 100644 --- a/skpro/regression/linear/_glm.py +++ b/skpro/regression/linear/_glm.py @@ -3,8 +3,6 @@ import pandas as pd from statsmodels.genmod.families.family import Gaussian -from statsmodels.genmod.generalized_linear_model import GLM -from statsmodels.tools import add_constant from skpro.regression.base import BaseProbaRegressor @@ -284,6 +282,9 @@ def _fit(self, X, y): ------- self : reference to self """ + from statsmodels.genmod.generalized_linear_model import GLM + from statsmodels.tools import add_constant + if self.add_constant: X = add_constant(X) From 7952454d20110ad796938df2195ef01cdb8413b7 Mon Sep 17 
00:00:00 2001 From: julian fong Date: Thu, 28 Mar 2024 15:17:33 -0400 Subject: [PATCH 14/27] moved Gaussian import to after super call --- skpro/regression/linear/_glm.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/skpro/regression/linear/_glm.py b/skpro/regression/linear/_glm.py index b2eaaa59..80388c74 100644 --- a/skpro/regression/linear/_glm.py +++ b/skpro/regression/linear/_glm.py @@ -2,7 +2,6 @@ # copyright: skpro developers, BSD-3-Clause License (see LICENSE file) import pandas as pd -from statsmodels.genmod.families.family import Gaussian from skpro.regression.base import BaseProbaRegressor @@ -237,6 +236,9 @@ def __init__( max_start_irls=3, add_constant=False, ): + super().__init__() + from statsmodels.genmod.families.family import Gaussian + self.family = Gaussian() self.offset = offset self.exposure = exposure @@ -256,8 +258,6 @@ def __init__( self.max_start_irls = max_start_irls self.add_constant = add_constant - super().__init__() - def _fit(self, X, y): """Fit regressor to training data. From 893936cc4d6df69d916b4386c6cfe545cba71a35 Mon Sep 17 00:00:00 2001 From: julian fong Date: Thu, 28 Mar 2024 16:54:07 -0400 Subject: [PATCH 15/27] added GaussianRegressor to linear --- skpro/regression/linear/__init__.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/skpro/regression/linear/__init__.py b/skpro/regression/linear/__init__.py index 21c1083c..6871be85 100644 --- a/skpro/regression/linear/__init__.py +++ b/skpro/regression/linear/__init__.py @@ -1,9 +1,11 @@ """Linear regression models.""" # copyright: skpro developers, BSD-3-Clause License (see LICENSE file) +from skpro.regression.linear._glm import GaussianRegressor from skpro.regression.linear._sklearn import ARDRegression, BayesianRidge __all__ = [ "ARDRegression", "BayesianRidge", + "GaussianRegressor", ] From 85fb3343884e565a34baec8d8a1f34061964a8cc Mon Sep 17 00:00:00 2001 From: julian fong Date: Thu, 28 Mar 2024 16:55:18 -0400 Subject: [PATCH 16/27] added GaussianRegressor to api_ref in docs under Linear regression --- docs/source/api_reference/regression.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/source/api_reference/regression.rst b/docs/source/api_reference/regression.rst index df7e140c..0fd9a6bc 100644 --- a/docs/source/api_reference/regression.rst +++ b/docs/source/api_reference/regression.rst @@ -101,6 +101,7 @@ Linear regression ARDRegression BayesianRidge + GaussianRegressor Gaussian process and kernel regression -------------------------------------- From 76489055c2398e3fed37b838627d0ca1038aa276 Mon Sep 17 00:00:00 2001 From: julian fong Date: Thu, 28 Mar 2024 20:06:19 -0400 Subject: [PATCH 17/27] renamed from GaussianRegressor to GLMRegressor --- docs/source/api_reference/regression.rst | 2 +- skpro/regression/linear/__init__.py | 4 ++-- skpro/regression/linear/_glm.py | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/source/api_reference/regression.rst b/docs/source/api_reference/regression.rst index 0fd9a6bc..fd69c859 100644 --- a/docs/source/api_reference/regression.rst +++ b/docs/source/api_reference/regression.rst @@ -101,7 +101,7 @@ Linear regression ARDRegression BayesianRidge - GaussianRegressor + GLMRegressor Gaussian process and kernel regression -------------------------------------- diff --git a/skpro/regression/linear/__init__.py b/skpro/regression/linear/__init__.py index 6871be85..edb8efd0 100644 --- a/skpro/regression/linear/__init__.py +++ b/skpro/regression/linear/__init__.py @@ -1,11 +1,11 @@ """Linear regression 
models.""" # copyright: skpro developers, BSD-3-Clause License (see LICENSE file) -from skpro.regression.linear._glm import GaussianRegressor +from skpro.regression.linear._glm import GLMRegressor from skpro.regression.linear._sklearn import ARDRegression, BayesianRidge __all__ = [ "ARDRegression", "BayesianRidge", - "GaussianRegressor", + "GLMRegressor", ] diff --git a/skpro/regression/linear/_glm.py b/skpro/regression/linear/_glm.py index 80388c74..fb9649bc 100644 --- a/skpro/regression/linear/_glm.py +++ b/skpro/regression/linear/_glm.py @@ -6,7 +6,7 @@ from skpro.regression.base import BaseProbaRegressor -class GaussianRegressor(BaseProbaRegressor): +class GLMRegressor(BaseProbaRegressor): """ Fits a generalized linear model with a gaussian link. From c95fec12f9bdd982c9fde130069dfe0790b49253 Mon Sep 17 00:00:00 2001 From: julian fong Date: Thu, 28 Mar 2024 20:07:35 -0400 Subject: [PATCH 18/27] fixed python_dependencies to include statsmodels --- skpro/regression/linear/_glm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/skpro/regression/linear/_glm.py b/skpro/regression/linear/_glm.py index fb9649bc..32ebd96f 100644 --- a/skpro/regression/linear/_glm.py +++ b/skpro/regression/linear/_glm.py @@ -209,7 +209,7 @@ class GLMRegressor(BaseProbaRegressor): "authors": ["julian-fong"], "maintainers": ["julian-fong"], "python_version": None, - "python_dependencies": None, + "python_dependencies": "statsmodels", "capability:multioutput": False, "capability:missing": False, "X_inner_mtype": "pd_DataFrame_Table", From 2a54fa2e6a8a679c5e931cdd935c03f09fca1b2b Mon Sep 17 00:00:00 2001 From: julian fong Date: Fri, 29 Mar 2024 09:29:25 -0400 Subject: [PATCH 19/27] removed some params, fixed overwriting for family param in __init__ --- skpro/regression/linear/_glm.py | 27 --------------------------- 1 file changed, 27 deletions(-) diff --git a/skpro/regression/linear/_glm.py b/skpro/regression/linear/_glm.py index 32ebd96f..dfb50349 100644 --- a/skpro/regression/linear/_glm.py +++ b/skpro/regression/linear/_glm.py @@ -118,32 +118,10 @@ class GLMRegressor(BaseProbaRegressor): Residual degrees of freedom is equal to the number of observation n minus the number of regressors p. - endog_ : pandas DataFrame - Note that endog is a reference to the data so that if data is already - an array and it is changed, then endog changes as well. - - exog_ : pandas DataFrame - Note that exog is a reference to the data so that if data is already - an array and it is changed, then exog changes as well. - - freq_weights_ : ndarray - Note that freq_weights is a reference to the data so that if data - is already an array and it is changed, then freq_weights changes - as well. - - var_weights_ : ndarray - Note that var_weights is a reference to the data so that if - data is already an array and it is changed, then var_weights - changes as well. - iteration_ : int The number of iterations that fit has run. Initialized at 0. Only available after fit is called - family_ : family class instance - he distribution family of the model. Can be any family - in statsmodels.families. Default is Gaussian. - mu_ : ndarray The mean response of the transformed variable. 
mu is the value of the inverse of the link function at lin_pred, where lin_pred is the linear @@ -320,11 +298,6 @@ def _fit(self, X, y): PARAMS_TO_FORWARD = { "df_model_": glm_estimator.df_model, "df_resid_": glm_estimator.df_resid, - "endog_": glm_estimator.endog, - "exog_": glm_estimator.exog, - "freq_weights_": glm_estimator.freq_weights, - "var_weights": glm_estimator.var_weights, - "family_": glm_estimator.family, "mu_": glm_estimator.mu, "n_trials_": glm_estimator.n_trials, "weights_": glm_estimator.weights, From a60cfb5f7a4b37a9eb1865a3e30fe1d34ce44d96 Mon Sep 17 00:00:00 2001 From: julian fong Date: Fri, 29 Mar 2024 11:29:52 -0400 Subject: [PATCH 20/27] fixed bugs with _predict_proba --- skpro/regression/linear/_glm.py | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/skpro/regression/linear/_glm.py b/skpro/regression/linear/_glm.py index dfb50349..8866c620 100644 --- a/skpro/regression/linear/_glm.py +++ b/skpro/regression/linear/_glm.py @@ -373,17 +373,12 @@ def _predict_proba(self, X): """ from skpro.distributions.normal import Normal - index = X.index - y_column = self.y_col - # instead of using the conventional predict() method, we use statsmodels # get_prediction method, which returns a pandas df that contains # the prediction and prediction variance i.e mu and sigma - y_predictions_df = self.glm_fit_.get_predictions(X).summary_frame() - y_mu = pd.DataFrame(y_predictions_df["mean"], index=index, columns=[y_column]) - y_sigma = pd.DataFrame( - y_predictions_df["mean_se"], index=index, columns=[y_column] - ) + y_predictions_df = self.glm_fit_.get_prediction(X).summary_frame() + y_mu = y_predictions_df["mean"].rename("mu").to_frame() + y_sigma = y_predictions_df["mean_se"].rename("sigma").to_frame() params = { "mu": y_mu, "sigma": y_sigma, From 48be6810f33be01f479d747b7a6c992868276ed8 Mon Sep 17 00:00:00 2001 From: julian fong Date: Sat, 30 Mar 2024 01:38:12 -0400 Subject: [PATCH 21/27] fixed bug --- skpro/regression/linear/_glm.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/skpro/regression/linear/_glm.py b/skpro/regression/linear/_glm.py index 8866c620..c063726c 100644 --- a/skpro/regression/linear/_glm.py +++ b/skpro/regression/linear/_glm.py @@ -348,7 +348,7 @@ def _predict(self, X): index = X.index y_column = self.y_col y_pred_series = self.glm_fit_.predict(X) - y_pred = pd.DataFrame(y_pred_series, index=index, columns=[y_column]) + y_pred = pd.DataFrame(y_pred_series, index=index, columns=y_column) return y_pred @@ -376,6 +376,7 @@ def _predict_proba(self, X): # instead of using the conventional predict() method, we use statsmodels # get_prediction method, which returns a pandas df that contains # the prediction and prediction variance i.e mu and sigma + y_column = self.y_col y_predictions_df = self.glm_fit_.get_prediction(X).summary_frame() y_mu = y_predictions_df["mean"].rename("mu").to_frame() y_sigma = y_predictions_df["mean_se"].rename("sigma").to_frame() @@ -383,7 +384,7 @@ def _predict_proba(self, X): "mu": y_mu, "sigma": y_sigma, "index": X.index, - "columns": y_mu.columns, + "columns": y_column, } y_pred = Normal(**params) return y_pred From 52ea9bf86f6cfc8b29275da9a5bf1d3a25bb502d Mon Sep 17 00:00:00 2001 From: julian fong Date: Sat, 30 Mar 2024 09:04:40 -0400 Subject: [PATCH 22/27] update test set params --- skpro/regression/linear/_glm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/skpro/regression/linear/_glm.py b/skpro/regression/linear/_glm.py index c063726c..7a815d02 100644 --- 
a/skpro/regression/linear/_glm.py +++ b/skpro/regression/linear/_glm.py @@ -408,6 +408,6 @@ def get_test_params(cls, parameter_set="default"): `create_test_instance` uses the first (or only) dictionary in `params` """ params1 = {} - params2 = {"add_constant": True} + params2 = {"maxiter": 50} return [params1, params2] From 8c2e1d443d3d0af7e23a119517d23324be55eefa Mon Sep 17 00:00:00 2001 From: julian fong Date: Sat, 30 Mar 2024 15:49:58 -0400 Subject: [PATCH 23/27] wrote code to automatically append const column on inf methods if add_constant is True --- skpro/regression/linear/_glm.py | 38 +++++++++++++++++++++++++++++---- 1 file changed, 34 insertions(+), 4 deletions(-) diff --git a/skpro/regression/linear/_glm.py b/skpro/regression/linear/_glm.py index 7a815d02..f6e7ad26 100644 --- a/skpro/regression/linear/_glm.py +++ b/skpro/regression/linear/_glm.py @@ -345,9 +345,14 @@ def _predict(self, X): ------- y : pandas DataFrame, same length as `X`, with same columns as y in fit """ - index = X.index + if self.add_constant: + X_ = self._prep_x(X) + else: + X_ = X + + index = X_.index y_column = self.y_col - y_pred_series = self.glm_fit_.predict(X) + y_pred_series = self.glm_fit_.predict(X_) y_pred = pd.DataFrame(y_pred_series, index=index, columns=y_column) return y_pred @@ -373,22 +378,47 @@ def _predict_proba(self, X): """ from skpro.distributions.normal import Normal + if self.add_constant: + X_ = self._prep_x(X) + else: + X_ = X + # instead of using the conventional predict() method, we use statsmodels # get_prediction method, which returns a pandas df that contains # the prediction and prediction variance i.e mu and sigma y_column = self.y_col - y_predictions_df = self.glm_fit_.get_prediction(X).summary_frame() + y_predictions_df = self.glm_fit_.get_prediction(X_).summary_frame() y_mu = y_predictions_df["mean"].rename("mu").to_frame() y_sigma = y_predictions_df["mean_se"].rename("sigma").to_frame() params = { "mu": y_mu, "sigma": y_sigma, - "index": X.index, + "index": X_.index, "columns": y_column, } y_pred = Normal(**params) return y_pred + def _prep_x(self, X): + """ + Return a copy of X with an added constant of self.add_constant = True. + + Parameters + ---------- + X : pandas DataFrame + Dataset that the user is trying to do inference on + + Returns + ------- + X.copy : pandas DataFrame + A copy of the input X with an added column 'const' with is an + array of len(X) of 1s + """ + from statsmodels.tools import add_constant + + X_copy = add_constant(X) + return X_copy + @classmethod def get_test_params(cls, parameter_set="default"): """Return testing parameter settings for the estimator. 
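The _prep_x helper introduced in PATCH 23 relies on statsmodels.tools.add_constant to append the intercept column that statsmodels' GLM does not add on its own. Below is a minimal sketch of that behaviour; the feature frame, its values, and the variable names are illustrative and not taken from the patch:

    import pandas as pd
    from statsmodels.tools import add_constant

    # illustrative feature frame without an intercept column
    X = pd.DataFrame({"x1": [0.5, 1.2, 3.4]}, index=[10, 11, 12])

    # add_constant prepends a 'const' column of 1.0 and preserves the index,
    # so the prepared frame can be passed to GLM and to get_prediction as-is
    X_ = add_constant(X)
    print(list(X_.columns))  # ['const', 'x1']

Keeping the statsmodels imports inside __init__ and the private methods, as PATCH 14 does for the Gaussian family, means the soft dependency declared in the "python_dependencies" tag is only required once the estimator is actually constructed or fitted.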
From 05eb9d2c3a49a8e9da1055f75c83c4a20c594c33 Mon Sep 17 00:00:00 2001 From: julian fong Date: Sat, 30 Mar 2024 15:50:40 -0400 Subject: [PATCH 24/27] minor code change --- skpro/regression/linear/_glm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/skpro/regression/linear/_glm.py b/skpro/regression/linear/_glm.py index f6e7ad26..aad9b503 100644 --- a/skpro/regression/linear/_glm.py +++ b/skpro/regression/linear/_glm.py @@ -438,6 +438,6 @@ def get_test_params(cls, parameter_set="default"): `create_test_instance` uses the first (or only) dictionary in `params` """ params1 = {} - params2 = {"maxiter": 50} + params2 = {"add_constant": True} return [params1, params2] From d2e6dc6371b2f2d2b549ae7c6e3f8b381a47aea6 Mon Sep 17 00:00:00 2001 From: julian fong Date: Sat, 30 Mar 2024 16:39:35 -0400 Subject: [PATCH 25/27] improved readability of code, moved if statement into _prep_x --- skpro/regression/linear/_glm.py | 25 ++++++++++--------------- 1 file changed, 10 insertions(+), 15 deletions(-) diff --git a/skpro/regression/linear/_glm.py b/skpro/regression/linear/_glm.py index aad9b503..8d7ea768 100644 --- a/skpro/regression/linear/_glm.py +++ b/skpro/regression/linear/_glm.py @@ -261,16 +261,14 @@ def _fit(self, X, y): self : reference to self """ from statsmodels.genmod.generalized_linear_model import GLM - from statsmodels.tools import add_constant - if self.add_constant: - X = add_constant(X) + X_ = self._prep_x(X) y_col = y.columns glm_estimator = GLM( endog=y, - exog=X, + exog=X_, family=self.family, offset=self.offset, exposure=self.exposure, @@ -345,10 +343,7 @@ def _predict(self, X): ------- y : pandas DataFrame, same length as `X`, with same columns as y in fit """ - if self.add_constant: - X_ = self._prep_x(X) - else: - X_ = X + X_ = self._prep_x(X) index = X_.index y_column = self.y_col @@ -378,10 +373,7 @@ def _predict_proba(self, X): """ from skpro.distributions.normal import Normal - if self.add_constant: - X_ = self._prep_x(X) - else: - X_ = X + X_ = self._prep_x(X) # instead of using the conventional predict() method, we use statsmodels # get_prediction method, which returns a pandas df that contains @@ -410,14 +402,17 @@ def _prep_x(self, X): Returns ------- - X.copy : pandas DataFrame + X_ : pandas DataFrame A copy of the input X with an added column 'const' with is an array of len(X) of 1s """ from statsmodels.tools import add_constant - X_copy = add_constant(X) - return X_copy + if self.add_constant: + X_ = add_constant(X) + return X_ + else: + return X @classmethod def get_test_params(cls, parameter_set="default"): From e6dff2c7bdbc6b4596c02cfa84f9ce4a48e2409f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Franz=20Kir=C3=A1ly?= Date: Sun, 31 Mar 2024 22:30:10 +0200 Subject: [PATCH 26/27] removed params --- skpro/regression/linear/_glm.py | 61 +++++++-------------------------- 1 file changed, 13 insertions(+), 48 deletions(-) diff --git a/skpro/regression/linear/_glm.py b/skpro/regression/linear/_glm.py index 8d7ea768..59db76d6 100644 --- a/skpro/regression/linear/_glm.py +++ b/skpro/regression/linear/_glm.py @@ -22,29 +22,6 @@ class GLMRegressor(BaseProbaRegressor): Parameters ---------- - family : family class instance - To specify the Gaussian link. - See statsmodels.family.family for more information. - - offset : array_like or None - An offset to be included in the model. If provided, must be an array - whose length is the number of rows in exog (x). - - exposure : array_like or None - Log(exposure) will be added to the linear prediction in the model. 
- Exposure is only valid if the log link is used. If provided, it must be - an array with the same length as endog (y). - - freq_weights : array_like - 1d array of frequency weights. The default is None. If None is selected - or a blank value, then the algorithm will replace with an array of 1s - with length equal to the endog. - - var_weights : array_like - 1d array of variance (analytic) weights. The default is None. If None - is selected or a blank value, then the algorithm will replace with an - array of 1s with length equal to the endog. - missing : str Available options are 'none', 'drop' and 'raise'. If 'none', no nan checking is done. If 'drop', any observations with nans are dropped. @@ -57,52 +34,52 @@ class GLMRegressor(BaseProbaRegressor): mean will be calculated as np.dot(exog, start_params). This parameter is used inside the GLM fit() function. - maxiter : int + maxiter : int, optional, default=100 Number of iterations. This parameter is used inside the GLM fit() function. - method : str + method : str, optional, default='IRLS' Default is 'IRLS' for iteratively re-weighted least squares. This parameter is used inside the GLM fit() function. - tol : float + tol : float, optional, default=1e-8 Convergence tolerance. Default is 1e-8. This parameter is used inside the GLM fit() function. - scale : str/float + scale : str/float, optional, default=None scale can be 'X2', 'dev', or a float. The default value is None, which uses X2 for gamma, gaussian and inverse gaussian. X2 is Pearson's chi-square divided by df_resid. The default is 1 for the Bionmial and Poisson families. dev is the deviance divided by df_resid. This parameter is used inside the GLM fit() function. - cov_type : str + cov_type : str, optional, default='nonrobust' The type of parameter estimate covariance matrix to compute. This parameter is used inside the GLM fit() function. - cov_kwds : dict-like + cov_kwds : dict-like, optional, default=None Extra arguments for calculating the covariance of the parameter estimates. This parameter is used inside the GLM fit() function. - use_t : bool + use_t : bool, optional, default=False if True, the Student t-distribution if used for inference. This parameter is used inside the GLM fit() function. - full_output : bool + full_output : bool, optional, default=True Set to True to have all available output in the Results object’s mle_retvals attribute. The output is dependent on the solver. See LikelihoodModelResults notes section for more information. Not used if methhod is IRLS. This parameter is used inside the GLM fit() function. - disp : bool + disp : bool, optional, default=False Set to True to print convergence messages. Not used if method is IRLS. This parameter is used inside the GLM fit() function. - max_start_irls : int + max_start_irls : int, optional, default=3 The number of IRLS iterations used to obtain starting values for gradient optimization. Only relevenat if method is set to something other than "IRLS". This parameter is used inside the GLM fit() function. - add_constant : bool + add_constant : bool, optional, default=False statsmodels does not include an intercept by default. Specify this as True if you would like to add an intercept (floats of 1s) to the dataset X. Default = False. 
Note that when the input is a pandas @@ -196,10 +173,6 @@ class GLMRegressor(BaseProbaRegressor): def __init__( self, - offset=None, - exposure=None, - freq_weights=None, - var_weights=None, missing="none", start_params=None, maxiter=100, @@ -217,11 +190,7 @@ def __init__( super().__init__() from statsmodels.genmod.families.family import Gaussian - self.family = Gaussian() - self.offset = offset - self.exposure = exposure - self.freq_weights = freq_weights - self.var_weights = var_weights + self._family = Gaussian() self.missing = missing self.start_params = start_params self.maxiter = maxiter @@ -269,11 +238,7 @@ def _fit(self, X, y): glm_estimator = GLM( endog=y, exog=X_, - family=self.family, - offset=self.offset, - exposure=self.exposure, - freq_weights=self.freq_weights, - var_weights=self.var_weights, + family=self._family, missing=self.missing, ) From 0bd6f6faff56b349f01ea5a303bfc65778b4074a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Franz=20Kir=C3=A1ly?= Date: Sun, 31 Mar 2024 22:30:23 +0200 Subject: [PATCH 27/27] Update _glm.py --- skpro/regression/linear/_glm.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/skpro/regression/linear/_glm.py b/skpro/regression/linear/_glm.py index 59db76d6..4f6637c6 100644 --- a/skpro/regression/linear/_glm.py +++ b/skpro/regression/linear/_glm.py @@ -13,10 +13,6 @@ class GLMRegressor(BaseProbaRegressor): Direct interface to ``statsmodels.genmod.generalized_linear_model.GLM`` from the ``statsmodels`` package. - statsmodels uses parameters 'exog' and 'endog' to denote the X and y values - respectively and supports two separate definition of weights: frequency - and variance. - For a direct link to statmodels' Generalized Linear Models module see: https://www.statsmodels.org/stable/glm.html#module-reference
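After PATCH 26 and PATCH 27, the constructor keeps the GLM fit() options plus missing and add_constant, and the family is fixed to Gaussian internally. A minimal end-to-end sketch follows, assuming skpro's standard fit / predict / predict_proba regressor interface; the dataset, split, and target column name are illustrative and not part of the patch series:

    from sklearn.datasets import load_diabetes
    from sklearn.model_selection import train_test_split

    from skpro.regression.linear import GLMRegressor

    # toy regression data, purely illustrative
    X, y = load_diabetes(return_X_y=True, as_frame=True)
    y = y.to_frame("target")
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    # add_constant=True routes X through _prep_x, which appends the intercept
    # column that statsmodels' GLM omits by default
    reg = GLMRegressor(add_constant=True)
    reg.fit(X_train, y_train)

    y_pred = reg.predict(X_test)          # point predictions as a pandas DataFrame
    y_proba = reg.predict_proba(X_test)   # skpro Normal distribution, one (mu, sigma) per row

As wired up in PATCH 20, the predictive distribution is built from the "mean" and "mean_se" columns of statsmodels' get_prediction(...).summary_frame(), so the sigma of the returned Normal is the standard error of the predicted mean.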