Merge pull request #179 from dswah/poisson-bug

Poisson bug
dswah · Jun 29, 2018 · 62a2e57 · 62a2e57
2 parents fa70ebf + cac5231
commit 62a2e57
Show file tree

Hide file tree

Showing 4 changed files with 88 additions and 21 deletions.
diff --git a/pygam/distributions.py b/pygam/distributions.py
@@ -374,7 +374,12 @@ def log_pdf(self, y, mu, weights=None):
             weights = np.ones_like(mu)
         # in Poisson regression weights are proportional to the exposure
         # so we want to pump up all our predictions
-        # NOTE: we assume the targets are unchanged
+        # NOTE: we assume the targets are counts, not rate.
+        # ie if observations were scaled to account for exposure, they have
+        # been rescaled before calling this function.
+        # since some samples have higher exposure,
+        # they also need to have higher variance,
+        # we do this by multiplying mu by the weight=exposure
         mu = mu * weights
         return sp.stats.poisson.logpmf(y, mu=mu)
 

diff --git a/pygam/pygam.py b/pygam/pygam.py
@@ -588,7 +588,9 @@ def loglikelihood(self, X, y, weights=None):
         mu = self.predict_mu(X)
 
         if weights is not None:
-            weights = check_array(weights, name='sample weights', verbose=self.verbose)
+            weights = np.array(weights).astype('f').ravel()
+            weights = check_array(weights, name='sample weights',
+                                  ndim=1, verbose=self.verbose)
             check_lengths(y, weights)
         else:
             weights = np.ones_like(y).astype('float64')
@@ -1251,7 +1253,9 @@ def fit(self, X, y, weights=None):
         check_X_y(X, y)
 
         if weights is not None:
-            weights = check_array(weights, name='sample weights', verbose=self.verbose)
+            weights = np.array(weights).astype('f').ravel()
+            weights = check_array(weights, name='sample weights',
+                                  ndim=1, verbose=self.verbose)
             check_lengths(y, weights)
         else:
             weights = np.ones_like(y).astype('float64')
@@ -1303,7 +1307,9 @@ def deviance_residuals(self, X, y, weights=None, scaled=False):
         check_X_y(X, y)
 
         if weights is not None:
-            weights = check_array(weights, name='sample weights', verbose=self.verbose)
+            weights = np.array(weights).astype('f').ravel()
+            weights = check_array(weights, name='sample weights',
+                                  ndim=1, verbose=self.verbose)
             check_lengths(y, weights)
         else:
             weights = np.ones_like(y).astype('float64')
@@ -2013,7 +2019,9 @@ def gridsearch(self, X, y, weights=None, return_scores=False,
         check_X_y(X, y)
 
         if weights is not None:
-            weights = check_array(weights, name='sample weights', verbose=self.verbose)
+            weights = np.array(weights).astype('f').ravel()
+            weights = check_array(weights, name='sample weights',
+                                  ndim=1, verbose=self.verbose)
             check_lengths(y, weights)
         else:
             weights = np.ones_like(y).astype('float64')
@@ -2997,15 +3005,17 @@ def _loglikelihood(self, y, mu, weights=None, rescale_y=True):
         weights : array-like of shape (n,)
             containing sample weights
         rescale_y : boolean, defaul: True
-            whether to scale the targets back up by
+            whether to scale the targets back up.
+            useful when fitting with an exposure, in which case the count observations
+            were scaled into rates. this rescales rates into counts.
 
         Returns
         -------
         log-likelihood : np.array of shape (n,)
             containing log-likelihood scores
         """
         if rescale_y:
-            y = y * weights
+            y = np.round(y * weights).astype('int')
 
         return self.distribution.log_pdf(y=y, mu=mu, weights=weights).sum()
 
@@ -3034,7 +3044,9 @@ def loglikelihood(self, X, y, exposure=None, weights=None):
         mu = self.predict_mu(X)
 
         if weights is not None:
-            weights = check_array(weights, name='sample weights', verbose=self.verbose)
+            weights = np.array(weights).astype('f').ravel()
+            weights = check_array(weights, name='sample weights',
+                                  ndim=1, verbose=self.verbose)
             check_lengths(y, weights)
         else:
             weights = np.ones_like(y).astype('float64')
@@ -3063,23 +3075,33 @@ def _exposure_to_weights(self, y, exposure=None, weights=None):
         y : y normalized by exposure
         weights : array-like shape (n_samples,)
         """
+        y = y.ravel()
 
         if exposure is not None:
-            exposure = np.array(exposure).astype('f')
+            exposure = np.array(exposure).astype('f').ravel()
+            exposure = check_array(exposure, name='sample exposure',
+                                   ndim=1, verbose=self.verbose)
         else:
-            exposure = np.ones_like(y).astype('float64')
+            exposure = np.ones_like(y.ravel()).astype('float64')
+
+        # check data
+        exposure = exposure.ravel()
         check_lengths(y, exposure)
 
         # normalize response
         y = y / exposure
 
         if weights is not None:
-            weights = np.array(weights).astype('f')
+            weights = np.array(weights).astype('f').ravel()
+            weights = check_array(weights, name='sample weights',
+                                  ndim=1, verbose=self.verbose)
         else:
             weights = np.ones_like(y).astype('float64')
         check_lengths(weights, exposure)
 
         # set exposure as the weight
+        # we do this because we have divided our response
+        # so if we make an error of 1 now, we need it to count more heavily.
         weights = weights * exposure
 
         return y, weights

diff --git a/pygam/tests/test_GAM_methods.py b/pygam/tests/test_GAM_methods.py
@@ -490,3 +490,43 @@ def test_pvalue_invariant_to_scale(wage_X_y):
     gamB = LinearGAM(n_splines=10).fit(X, y)
 
     assert np.allclose(gamA.statistics_['p_values'], gamB.statistics_['p_values'])
+
+
+def test_2d_y_still_allow_fitting_in_PoissonGAM(coal_X_y):
+    """
+    regression test.
+
+    there was a bug where we forgot to check the y_array before converting
+    exposure to weights.
+    """
+    X, y = coal_X_y
+    two_d_data = np.ones_like(y).ravel()[:, None]
+
+    # 2d y should cause no problems now
+    gam = PoissonGAM().fit(X, y[:, None])
+    assert gam._is_fitted
+
+    # 2d weghts should cause no problems now
+    gam = PoissonGAM().fit(X, y, weights=two_d_data)
+    assert gam._is_fitted
+
+    # 2d exposure should cause no problems now
+    gam = PoissonGAM().fit(X, y, exposure=two_d_data)
+    assert gam._is_fitted
+
+def test_non_int_exposure_produced_no_inf_in_PoissonGAM_ll(coal_X_y):
+    """
+    regression test.
+
+    there was a bug where we forgot to round the rescaled counts before
+    computing the loglikelihood. since Poisson requires integer observations,
+    small numerical errors caused the pmf to return -inf, which shows up
+    in the loglikelihood computations, AIC, AICc..
+    """
+    X, y = coal_X_y
+
+    rate = 1.2 + np.cos(np.linspace(0, 2. * np.pi, len(y)))
+
+    gam = PoissonGAM().fit(X, y, exposure=rate)
+
+    assert np.isfinite(gam.statistics_['loglikelihood'])
diff --git a/pygam/utils.py b/pygam/utils.py
@@ -179,14 +179,14 @@ def make_2d(array, verbose=True):
     return array
 
 
-def check_array(array, force_2d=False, n_feats=None, n_dims=None,
+def check_array(array, force_2d=False, n_feats=None, ndim=None,
                 min_samples=1, name='Input data', verbose=True):
     """
     tool to perform basic data validation.
     called by check_X and check_y.
 
     ensures that data:
-    - is n_dims dimensional
+    - is ndim dimensional
     - contains float-compatible data-types
     - has at least min_samples
     - has n_feats
@@ -196,11 +196,11 @@ def check_array(array, force_2d=False, n_feats=None, n_dims=None,
     ----------
     array : array-like
     force_2d : boolean, default: False
-        whether to force a 2d array. Setting to True forces n_dims = 2
+        whether to force a 2d array. Setting to True forces ndim = 2
     n_feats : int, default: None
               represents number of features that the array should have.
               not enforced if n_feats is None.
-    n_dims : int default: None
+    ndim : int default: None
         number of dimensions expected in the array
     min_samples : int, default: 1
     name : str, default: 'Input data'
@@ -215,7 +215,7 @@ def check_array(array, force_2d=False, n_feats=None, n_dims=None,
     # make array
     if force_2d:
         array = make_2d(array, verbose=verbose)
-        n_dims = 2
+        ndim = 2
     else:
         array = np.array(array)
 
@@ -234,11 +234,11 @@ def check_array(array, force_2d=False, n_feats=None, n_dims=None,
     if not(np.isfinite(array).all()):
         raise ValueError('{} must not contain Inf nor NaN'.format(name))
 
-    # check n_dims
-    if n_dims is not None:
-        if array.ndim != n_dims:
+    # check ndim
+    if ndim is not None:
+        if array.ndim != ndim:
             raise ValueError('{} must have {} dimensions. '\
-                             'found shape {}'.format(name, n_dims, array.shape))
+                             'found shape {}'.format(name, ndim, array.shape))
 
     # check n_feats
     if n_feats is not None:
@@ -279,7 +279,7 @@ def check_y(y, link, dist, min_samples=1, verbose=True):
     """
     y = np.ravel(y)
 
-    y = check_array(y, force_2d=False, min_samples=min_samples, n_dims=1,
+    y = check_array(y, force_2d=False, min_samples=min_samples, ndim=1,
                     name='y data', verbose=verbose)
 
     warnings.filterwarnings('ignore', 'divide by zero encountered in log')