Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Bootstrap weights #485

Open
wants to merge 10 commits into
base: main
Choose a base branch
from
5 changes: 3 additions & 2 deletions src/estimagic/estimation/msm_weighting.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,8 @@ def get_moments_cov(
moment_kwargs (dict): Additional keyword arguments for calculate_moments.
bootstrap_kwargs (dict): Additional keyword arguments that govern the
bootstrapping. Allowed arguments are "n_draws", "seed", "n_cores",
"batch_evaluator", "cluster_by" and "error_handling". For details see the
bootstrap function.
"batch_evaluator", "weight_by", "cluster_by" and "error_handling".
For details see the bootstrap function.

Returns:
pandas.DataFrame or numpy.ndarray: The covariance matrix of the moment
Expand All @@ -39,6 +39,7 @@ def get_moments_cov(
"n_draws",
"seed",
"batch_evaluator",
"weight_by",
"cluster_by",
"error_handling",
"existing_result",
Expand Down
5 changes: 4 additions & 1 deletion src/estimagic/inference/bootstrap.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ def bootstrap(
existing_result=None,
outcome_kwargs=None,
n_draws=1_000,
weight_by=None,
cluster_by=None,
seed=None,
n_cores=1,
Expand All @@ -41,6 +42,7 @@ def bootstrap(
n_draws (int): Number of bootstrap samples to draw.
If len(existing_outcomes) >= n_draws, a random subset of existing_outcomes
is used.
weight_by (str): Column name of variable with weights or None.
cluster_by (str): Column name of variable to cluster by or None.
seed (Union[None, int, numpy.random.Generator]): If seed is None or int the
numpy.random.default_rng is used seeded with seed. If seed is already a
Expand All @@ -59,7 +61,7 @@ def bootstrap(

"""
if callable(outcome):
check_inputs(data=data, cluster_by=cluster_by)
check_inputs(data=data, weight_by=weight_by, cluster_by=cluster_by)

if outcome_kwargs is not None:
outcome = functools.partial(outcome, **outcome_kwargs)
Expand All @@ -82,6 +84,7 @@ def bootstrap(
new_outcomes = get_bootstrap_outcomes(
data=data,
outcome=outcome,
weight_by=weight_by,
cluster_by=cluster_by,
rng=rng,
n_draws=n_draws - n_existing,
Expand Down
12 changes: 11 additions & 1 deletion src/estimagic/inference/bootstrap_helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,18 @@


def check_inputs(
data=None, cluster_by=None, ci_method="percentile", ci_level=0.95, skipdata=False
data=None,
weight_by=None,
cluster_by=None,
ci_method="percentile",
ci_level=0.95,
skipdata=False,
):
"""Check validity of inputs.

Args:
data (pd.DataFrame): Dataset.
weight_by (str): Column name of variable with weights.
cluster_by (str): Column name of variable to cluster by.
ci_method (str): Method of choice for computing confidence intervals.
The default is "percentile".
Expand All @@ -21,6 +27,10 @@ def check_inputs(
if not skipdata:
if not isinstance(data, pd.DataFrame) and not isinstance(data, pd.Series):
raise TypeError("Data must be a pandas.DataFrame or pandas.Series.")
elif (weight_by is not None) and (weight_by not in data.columns.tolist()):
raise ValueError(
"Input 'weight_by' must be None or a column name of 'data'."
)
elif (cluster_by is not None) and (cluster_by not in data.columns.tolist()):
raise ValueError(
"Input 'cluster_by' must be None or a column name of 'data'."
Expand Down
5 changes: 4 additions & 1 deletion src/estimagic/inference/bootstrap_outcomes.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
def get_bootstrap_outcomes(
data,
outcome,
weight_by=None,
cluster_by=None,
rng=None,
n_draws=1000,
Expand All @@ -19,6 +20,7 @@ def get_bootstrap_outcomes(
data (pandas.DataFrame): original dataset.
outcome (callable): function of the dataset calculating statistic of interest.
Returns a general pytree (e.g. pandas Series, dict, numpy array, etc.).
weight_by (str): column name of the variable with weights.
cluster_by (str): column name of the variable to cluster by.
rng (numpy.random.Generator): A random number generator.
n_draws (int): number of bootstrap draws.
Expand All @@ -34,12 +36,13 @@ def get_bootstrap_outcomes(
estimates (list): List of pytrees of estimated bootstrap outcomes.

"""
check_inputs(data=data, cluster_by=cluster_by)
check_inputs(data=data, weight_by=weight_by, cluster_by=cluster_by)
batch_evaluator = process_batch_evaluator(batch_evaluator)

indices = get_bootstrap_indices(
data=data,
rng=rng,
weight_by=weight_by,
cluster_by=cluster_by,
n_draws=n_draws,
)
Expand Down
50 changes: 46 additions & 4 deletions src/estimagic/inference/bootstrap_samples.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,13 @@
import pandas as pd


def get_bootstrap_indices(data, rng, cluster_by=None, n_draws=1000):
def get_bootstrap_indices(
data,
rng,
weight_by=None,
cluster_by=None,
n_draws=1000,
):
"""Draw positional indices for the construction of bootstrap samples.

Storing the positional indices instead of the full bootstrap samples saves a lot
Expand All @@ -11,6 +17,7 @@ def get_bootstrap_indices(data, rng, cluster_by=None, n_draws=1000):
Args:
data (pandas.DataFrame): original dataset.
rng (numpy.random.Generator): A random number generator.
weight_by (str): column name of the variable with weights.
cluster_by (str): column name of the variable to cluster by.
n_draws (int): number of draws, only relevant if seeds is None.

Expand All @@ -19,12 +26,16 @@ def get_bootstrap_indices(data, rng, cluster_by=None, n_draws=1000):

"""
n_obs = len(data)
probs = _get_probs_for_bootstrap_indices(data, weight_by, cluster_by)

if cluster_by is None:
bootstrap_indices = list(rng.integers(0, n_obs, size=(n_draws, n_obs)))
bootstrap_indices = list(
rng.choice(n_obs, size=(n_draws, n_obs), replace=True, p=probs)
)
else:
clusters = data[cluster_by].unique()
drawn_clusters = rng.choice(
clusters, size=(n_draws, len(clusters)), replace=True
clusters, size=(n_draws, len(clusters)), replace=True, p=probs
)

bootstrap_indices = _convert_cluster_ids_to_indices(
Expand All @@ -34,6 +45,29 @@ def get_bootstrap_indices(data, rng, cluster_by=None, n_draws=1000):
return bootstrap_indices


def _get_probs_for_bootstrap_indices(data, weight_by, cluster_by):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I prefer the function name _calculate_bootstrap_indices_weights. In that case, the first line of the docstring needs to be updated as well.

"""Calculate probabilities for drawing bootstrap indices.

Args:
data (pandas.DataFrame): original dataset.
weight_by (str): column name of the variable with weights.
cluster_by (str): column name of the variable to cluster by.

Returns:
list: numpy array with probabilities.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Looking at the code, the output will be either None or a pd.Series, and not a list or list of arrays, right?


"""
if weight_by is None:
probs = None
else:
if cluster_by is None:
probs = data[weight_by] / data[weight_by].sum()
else:
cluster_weights = data.groupby(cluster_by, sort=False)[weight_by].sum()
probs = cluster_weights / cluster_weights.sum()
return probs


def _convert_cluster_ids_to_indices(cluster_col, drawn_clusters):
"""Convert the drawn clusters to positional indices of individual observations.

Expand All @@ -48,7 +82,13 @@ def _convert_cluster_ids_to_indices(cluster_col, drawn_clusters):
return bootstrap_indices


def get_bootstrap_samples(data, rng, cluster_by=None, n_draws=1000):
def get_bootstrap_samples(
data,
rng,
weight_by=None,
cluster_by=None,
n_draws=1000,
):
"""Draw bootstrap samples.

If you have memory issues you should use get_bootstrap_indices instead and construct
Expand All @@ -57,6 +97,7 @@ def get_bootstrap_samples(data, rng, cluster_by=None, n_draws=1000):
Args:
data (pandas.DataFrame): original dataset.
rng (numpy.random.Generator): A random number generator.
weight_by (str): weights for the observations.
cluster_by (str): column name of the variable to cluster by.
n_draws (int): number of draws, only relevant if seeds is None.

Expand All @@ -67,6 +108,7 @@ def get_bootstrap_samples(data, rng, cluster_by=None, n_draws=1000):
indices = get_bootstrap_indices(
data=data,
rng=rng,
weight_by=weight_by,
cluster_by=cluster_by,
n_draws=n_draws,
)
Expand Down
9 changes: 9 additions & 0 deletions tests/inference/test_bootstrap_ci.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,15 @@ def test_check_inputs_data():
assert str(error.value) == expected_msg


def test_check_inputs_weight_by(setup):
weights = "this is not a column name of df"
expected = "Input 'weight_by' must be None or a column name of 'data'."

with pytest.raises(ValueError) as error:
check_inputs(data=setup["df"], weight_by=weights)
assert str(error.value) == expected
Comment on lines +92 to +97
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
weights = "this is not a column name of df"
expected = "Input 'weight_by' must be None or a column name of 'data'."
with pytest.raises(ValueError) as error:
check_inputs(data=setup["df"], weight_by=weights)
assert str(error.value) == expected
expected_error_msg = "Input 'weight_by' must be None or a column name of 'data'."
with pytest.raises(ValueError, match=expected_error_msg):
check_inputs(data=setup["df"], weight_by="this is not a column name of df")



def test_check_inputs_cluster_by(setup):
cluster_by = "this is not a column name of df"
expected_msg = "Input 'cluster_by' must be None or a column name of 'data'."
Expand Down
32 changes: 32 additions & 0 deletions tests/inference/test_bootstrap_samples.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ def data():
df = pd.DataFrame()
df["id"] = np.arange(900)
df["hh"] = [3, 1, 2, 0, 0, 2, 5, 4, 5] * 100
df["weights"] = np.ones(900)
return df


Expand All @@ -32,6 +33,37 @@ def test_get_bootstrap_indices_radomization_works_with_clustering(data):
assert set(res[0]) != set(res[1])


def test_get_bootstrap_indices_randomization_works_with_weights(data):
rng = get_rng(seed=12345)
res = get_bootstrap_indices(data, weight_by="weights", n_draws=2, rng=rng)
assert set(res[0]) != set(res[1])


def test_get_bootstrap_indices_randomization_works_with_weights_and_clustering(data):
rng = get_rng(seed=12345)
res = get_bootstrap_indices(
data, weight_by="weights", cluster_by="hh", n_draws=2, rng=rng
)
assert set(res[0]) != set(res[1])


def test_get_bootstrap_indices_randomization_works_with_and_without_weights(data):
rng1 = get_rng(seed=12345)
rng2 = get_rng(seed=12345)
res1 = get_bootstrap_indices(data, n_draws=1, rng=rng1)
res2 = get_bootstrap_indices(data, weight_by="weights", n_draws=1, rng=rng2)
assert not np.array_equal(res1, res2)


def test_get_boostrap_indices_randomization_works_with_extreme_case(data):
rng = get_rng(seed=12345)
weights = np.zeros(900)
weights[0] = 1.0
data["weights"] = weights
res = get_bootstrap_indices(data, weight_by="weights", n_draws=1, rng=rng)
assert len(np.unique(res)) == 1


def test_clustering_leaves_households_intact(data):
rng = get_rng(seed=12345)
indices = get_bootstrap_indices(data, cluster_by="hh", n_draws=1, rng=rng)[0]
Expand Down