Skip to content

Commit

Permalink
Merge pull request #179 from dswah/poisson-bug
Browse files Browse the repository at this point in the history
Poisson bug
  • Loading branch information
dswah committed Jun 29, 2018
2 parents fa70ebf + cac5231 commit 62a2e57
Show file tree
Hide file tree
Showing 4 changed files with 88 additions and 21 deletions.
7 changes: 6 additions & 1 deletion pygam/distributions.py
Original file line number Diff line number Diff line change
Expand Up @@ -374,7 +374,12 @@ def log_pdf(self, y, mu, weights=None):
weights = np.ones_like(mu)
# in Poisson regression weights are proportional to the exposure
# so we want to pump up all our predictions
# NOTE: we assume the targets are unchanged
# NOTE: we assume the targets are counts, not rate.
# ie if observations were scaled to account for exposure, they have
# been rescaled before calling this function.
# since some samples have higher exposure,
# they also need to have higher variance,
# we do this by multiplying mu by the weight=exposure
mu = mu * weights
return sp.stats.poisson.logpmf(y, mu=mu)

Expand Down
42 changes: 32 additions & 10 deletions pygam/pygam.py
Original file line number Diff line number Diff line change
Expand Up @@ -588,7 +588,9 @@ def loglikelihood(self, X, y, weights=None):
mu = self.predict_mu(X)

if weights is not None:
weights = check_array(weights, name='sample weights', verbose=self.verbose)
weights = np.array(weights).astype('f').ravel()
weights = check_array(weights, name='sample weights',
ndim=1, verbose=self.verbose)
check_lengths(y, weights)
else:
weights = np.ones_like(y).astype('float64')
Expand Down Expand Up @@ -1251,7 +1253,9 @@ def fit(self, X, y, weights=None):
check_X_y(X, y)

if weights is not None:
weights = check_array(weights, name='sample weights', verbose=self.verbose)
weights = np.array(weights).astype('f').ravel()
weights = check_array(weights, name='sample weights',
ndim=1, verbose=self.verbose)
check_lengths(y, weights)
else:
weights = np.ones_like(y).astype('float64')
Expand Down Expand Up @@ -1303,7 +1307,9 @@ def deviance_residuals(self, X, y, weights=None, scaled=False):
check_X_y(X, y)

if weights is not None:
weights = check_array(weights, name='sample weights', verbose=self.verbose)
weights = np.array(weights).astype('f').ravel()
weights = check_array(weights, name='sample weights',
ndim=1, verbose=self.verbose)
check_lengths(y, weights)
else:
weights = np.ones_like(y).astype('float64')
Expand Down Expand Up @@ -2013,7 +2019,9 @@ def gridsearch(self, X, y, weights=None, return_scores=False,
check_X_y(X, y)

if weights is not None:
weights = check_array(weights, name='sample weights', verbose=self.verbose)
weights = np.array(weights).astype('f').ravel()
weights = check_array(weights, name='sample weights',
ndim=1, verbose=self.verbose)
check_lengths(y, weights)
else:
weights = np.ones_like(y).astype('float64')
Expand Down Expand Up @@ -2997,15 +3005,17 @@ def _loglikelihood(self, y, mu, weights=None, rescale_y=True):
weights : array-like of shape (n,)
containing sample weights
rescale_y : boolean, defaul: True
whether to scale the targets back up by
whether to scale the targets back up.
useful when fitting with an exposure, in which case the count observations
were scaled into rates. this rescales rates into counts.
Returns
-------
log-likelihood : np.array of shape (n,)
containing log-likelihood scores
"""
if rescale_y:
y = y * weights
y = np.round(y * weights).astype('int')

return self.distribution.log_pdf(y=y, mu=mu, weights=weights).sum()

Expand Down Expand Up @@ -3034,7 +3044,9 @@ def loglikelihood(self, X, y, exposure=None, weights=None):
mu = self.predict_mu(X)

if weights is not None:
weights = check_array(weights, name='sample weights', verbose=self.verbose)
weights = np.array(weights).astype('f').ravel()
weights = check_array(weights, name='sample weights',
ndim=1, verbose=self.verbose)
check_lengths(y, weights)
else:
weights = np.ones_like(y).astype('float64')
Expand Down Expand Up @@ -3063,23 +3075,33 @@ def _exposure_to_weights(self, y, exposure=None, weights=None):
y : y normalized by exposure
weights : array-like shape (n_samples,)
"""
y = y.ravel()

if exposure is not None:
exposure = np.array(exposure).astype('f')
exposure = np.array(exposure).astype('f').ravel()
exposure = check_array(exposure, name='sample exposure',
ndim=1, verbose=self.verbose)
else:
exposure = np.ones_like(y).astype('float64')
exposure = np.ones_like(y.ravel()).astype('float64')

# check data
exposure = exposure.ravel()
check_lengths(y, exposure)

# normalize response
y = y / exposure

if weights is not None:
weights = np.array(weights).astype('f')
weights = np.array(weights).astype('f').ravel()
weights = check_array(weights, name='sample weights',
ndim=1, verbose=self.verbose)
else:
weights = np.ones_like(y).astype('float64')
check_lengths(weights, exposure)

# set exposure as the weight
# we do this because we have divided our response
# so if we make an error of 1 now, we need it to count more heavily.
weights = weights * exposure

return y, weights
Expand Down
40 changes: 40 additions & 0 deletions pygam/tests/test_GAM_methods.py
Original file line number Diff line number Diff line change
Expand Up @@ -490,3 +490,43 @@ def test_pvalue_invariant_to_scale(wage_X_y):
gamB = LinearGAM(n_splines=10).fit(X, y)

assert np.allclose(gamA.statistics_['p_values'], gamB.statistics_['p_values'])


def test_2d_y_still_allow_fitting_in_PoissonGAM(coal_X_y):
"""
regression test.
there was a bug where we forgot to check the y_array before converting
exposure to weights.
"""
X, y = coal_X_y
two_d_data = np.ones_like(y).ravel()[:, None]

# 2d y should cause no problems now
gam = PoissonGAM().fit(X, y[:, None])
assert gam._is_fitted

# 2d weghts should cause no problems now
gam = PoissonGAM().fit(X, y, weights=two_d_data)
assert gam._is_fitted

# 2d exposure should cause no problems now
gam = PoissonGAM().fit(X, y, exposure=two_d_data)
assert gam._is_fitted

def test_non_int_exposure_produced_no_inf_in_PoissonGAM_ll(coal_X_y):
"""
regression test.
there was a bug where we forgot to round the rescaled counts before
computing the loglikelihood. since Poisson requires integer observations,
small numerical errors caused the pmf to return -inf, which shows up
in the loglikelihood computations, AIC, AICc..
"""
X, y = coal_X_y

rate = 1.2 + np.cos(np.linspace(0, 2. * np.pi, len(y)))

gam = PoissonGAM().fit(X, y, exposure=rate)

assert np.isfinite(gam.statistics_['loglikelihood'])
20 changes: 10 additions & 10 deletions pygam/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -179,14 +179,14 @@ def make_2d(array, verbose=True):
return array


def check_array(array, force_2d=False, n_feats=None, n_dims=None,
def check_array(array, force_2d=False, n_feats=None, ndim=None,
min_samples=1, name='Input data', verbose=True):
"""
tool to perform basic data validation.
called by check_X and check_y.
ensures that data:
- is n_dims dimensional
- is ndim dimensional
- contains float-compatible data-types
- has at least min_samples
- has n_feats
Expand All @@ -196,11 +196,11 @@ def check_array(array, force_2d=False, n_feats=None, n_dims=None,
----------
array : array-like
force_2d : boolean, default: False
whether to force a 2d array. Setting to True forces n_dims = 2
whether to force a 2d array. Setting to True forces ndim = 2
n_feats : int, default: None
represents number of features that the array should have.
not enforced if n_feats is None.
n_dims : int default: None
ndim : int default: None
number of dimensions expected in the array
min_samples : int, default: 1
name : str, default: 'Input data'
Expand All @@ -215,7 +215,7 @@ def check_array(array, force_2d=False, n_feats=None, n_dims=None,
# make array
if force_2d:
array = make_2d(array, verbose=verbose)
n_dims = 2
ndim = 2
else:
array = np.array(array)

Expand All @@ -234,11 +234,11 @@ def check_array(array, force_2d=False, n_feats=None, n_dims=None,
if not(np.isfinite(array).all()):
raise ValueError('{} must not contain Inf nor NaN'.format(name))

# check n_dims
if n_dims is not None:
if array.ndim != n_dims:
# check ndim
if ndim is not None:
if array.ndim != ndim:
raise ValueError('{} must have {} dimensions. '\
'found shape {}'.format(name, n_dims, array.shape))
'found shape {}'.format(name, ndim, array.shape))

# check n_feats
if n_feats is not None:
Expand Down Expand Up @@ -279,7 +279,7 @@ def check_y(y, link, dist, min_samples=1, verbose=True):
"""
y = np.ravel(y)

y = check_array(y, force_2d=False, min_samples=min_samples, n_dims=1,
y = check_array(y, force_2d=False, min_samples=min_samples, ndim=1,
name='y data', verbose=verbose)

warnings.filterwarnings('ignore', 'divide by zero encountered in log')
Expand Down

0 comments on commit 62a2e57

Please sign in to comment.