optimagic-dev · alanlujan91 · Mar 4, 2024 · Mar 4, 2024 · Mar 4, 2024 · Mar 4, 2024
diff --git a/src/estimagic/estimation/msm_weighting.py b/src/estimagic/estimation/msm_weighting.py
@@ -24,8 +24,8 @@ def get_moments_cov(
         moment_kwargs (dict): Additional keyword arguments for calculate_moments.
         bootstrap_kwargs (dict): Additional keyword arguments that govern the
             bootstrapping. Allowed arguments are "n_draws", "seed", "n_cores",
-            "batch_evaluator", "cluster_by" and "error_handling". For details see the
-            bootstrap function.
+            "batch_evaluator", "weight_by", "cluster_by" and "error_handling".
+            For details see the bootstrap function.
 
     Returns:
         pandas.DataFrame or numpy.ndarray: The covariance matrix of the moment
@@ -39,6 +39,7 @@ def get_moments_cov(
         "n_draws",
         "seed",
         "batch_evaluator",
+        "weight_by",
         "cluster_by",
         "error_handling",
         "existing_result",

diff --git a/src/estimagic/inference/bootstrap.py b/src/estimagic/inference/bootstrap.py
@@ -24,6 +24,7 @@ def bootstrap(
     existing_result=None,
     outcome_kwargs=None,
     n_draws=1_000,
+    weight_by=None,
     cluster_by=None,
     seed=None,
     n_cores=1,
@@ -41,6 +42,7 @@ def bootstrap(
         n_draws (int): Number of bootstrap samples to draw.
             If len(existing_outcomes) >= n_draws, a random subset of existing_outcomes
             is used.
+        weight_by (str): Column name of variable with weights or None.
         cluster_by (str): Column name of variable to cluster by or None.
         seed (Union[None, int, numpy.random.Generator]): If seed is None or int the
             numpy.random.default_rng is used seeded with seed. If seed is already a
@@ -59,7 +61,7 @@ def bootstrap(
 
     """
     if callable(outcome):
-        check_inputs(data=data, cluster_by=cluster_by)
+        check_inputs(data=data, weight_by=weight_by, cluster_by=cluster_by)
 
         if outcome_kwargs is not None:
             outcome = functools.partial(outcome, **outcome_kwargs)
@@ -82,6 +84,7 @@ def bootstrap(
         new_outcomes = get_bootstrap_outcomes(
             data=data,
             outcome=outcome,
+            weight_by=weight_by,
             cluster_by=cluster_by,
             rng=rng,
             n_draws=n_draws - n_existing,

diff --git a/src/estimagic/inference/bootstrap_helpers.py b/src/estimagic/inference/bootstrap_helpers.py
@@ -2,12 +2,18 @@
 
 
 def check_inputs(
-    data=None, cluster_by=None, ci_method="percentile", ci_level=0.95, skipdata=False
+    data=None,
+    weight_by=None,
+    cluster_by=None,
+    ci_method="percentile",
+    ci_level=0.95,
+    skipdata=False,
 ):
     """Check validity of inputs.
 
     Args:
         data (pd.DataFrame): Dataset.
+        weight_by (str): Column name of variable with weights.
         cluster_by (str): Column name of variable to cluster by.
         ci_method (str): Method of choice for computing confidence intervals.
             The default is "percentile".
@@ -21,6 +27,10 @@ def check_inputs(
     if not skipdata:
         if not isinstance(data, pd.DataFrame) and not isinstance(data, pd.Series):
             raise TypeError("Data must be a pandas.DataFrame or pandas.Series.")
+        elif (weight_by is not None) and (weight_by not in data.columns.tolist()):
+            raise ValueError(
+                "Input 'weight_by' must be None or a column name of 'data'."
+            )
         elif (cluster_by is not None) and (cluster_by not in data.columns.tolist()):
             raise ValueError(
                 "Input 'cluster_by' must be None or a column name of 'data'."

diff --git a/src/estimagic/inference/bootstrap_outcomes.py b/src/estimagic/inference/bootstrap_outcomes.py
@@ -6,6 +6,7 @@
 def get_bootstrap_outcomes(
     data,
     outcome,
+    weight_by=None,
     cluster_by=None,
     rng=None,
     n_draws=1000,
@@ -19,6 +20,7 @@ def get_bootstrap_outcomes(
         data (pandas.DataFrame): original dataset.
         outcome (callable): function of the dataset calculating statistic of interest.
             Returns a general pytree (e.g. pandas Series, dict, numpy array, etc.).
+        weight_by (str): column name of the variable with weights.
         cluster_by (str): column name of the variable to cluster by.
         rng (numpy.random.Generator): A random number generator.
         n_draws (int): number of bootstrap draws.
@@ -34,12 +36,13 @@ def get_bootstrap_outcomes(
         estimates (list):  List of pytrees of estimated bootstrap outcomes.
 
     """
-    check_inputs(data=data, cluster_by=cluster_by)
+    check_inputs(data=data, weight_by=weight_by, cluster_by=cluster_by)
     batch_evaluator = process_batch_evaluator(batch_evaluator)
 
     indices = get_bootstrap_indices(
         data=data,
         rng=rng,
+        weight_by=weight_by,
         cluster_by=cluster_by,
         n_draws=n_draws,
     )

diff --git a/src/estimagic/inference/bootstrap_samples.py b/src/estimagic/inference/bootstrap_samples.py
@@ -2,7 +2,13 @@
 import pandas as pd
 
 
-def get_bootstrap_indices(data, rng, cluster_by=None, n_draws=1000):
+def get_bootstrap_indices(
+    data,
+    rng,
+    weight_by=None,
+    cluster_by=None,
+    n_draws=1000,
+):
     """Draw positional indices for the construction of bootstrap samples.
 
     Storing the positional indices instead of the full bootstrap samples saves a lot
@@ -11,6 +17,7 @@ def get_bootstrap_indices(data, rng, cluster_by=None, n_draws=1000):
     Args:
         data (pandas.DataFrame): original dataset.
         rng (numpy.random.Generator): A random number generator.
+        weight_by (str): column name of the variable with weights.
         cluster_by (str): column name of the variable to cluster by.
         n_draws (int): number of draws, only relevant if seeds is None.
 
@@ -19,12 +26,16 @@ def get_bootstrap_indices(data, rng, cluster_by=None, n_draws=1000):
 
     """
     n_obs = len(data)
+    probs = _get_probs_for_bootstrap_indices(data, weight_by, cluster_by)
+
     if cluster_by is None:
-        bootstrap_indices = list(rng.integers(0, n_obs, size=(n_draws, n_obs)))
+        bootstrap_indices = list(
+            rng.choice(n_obs, size=(n_draws, n_obs), replace=True, p=probs)
+        )
     else:
         clusters = data[cluster_by].unique()
         drawn_clusters = rng.choice(
-            clusters, size=(n_draws, len(clusters)), replace=True
+            clusters, size=(n_draws, len(clusters)), replace=True, p=probs
         )
 
         bootstrap_indices = _convert_cluster_ids_to_indices(
@@ -34,6 +45,29 @@ def get_bootstrap_indices(data, rng, cluster_by=None, n_draws=1000):
     return bootstrap_indices
 
 
+def _get_probs_for_bootstrap_indices(data, weight_by, cluster_by):
+    """Calculate probabilities for drawing bootstrap indices.
+
+    Args:
+        data (pandas.DataFrame): original dataset.
+        weight_by (str): column name of the variable with weights.
+        cluster_by (str): column name of the variable to cluster by.
+
+    Returns:
+        pandas.Series or None: Sampling probabilities; None if weight_by is None.
+
+    """
+    if weight_by is None:
+        probs = None
+    else:
+        if cluster_by is None:
+            probs = data[weight_by] / data[weight_by].sum()
+        else:
+            cluster_weights = data.groupby(cluster_by, sort=False)[weight_by].sum()
+            probs = cluster_weights / cluster_weights.sum()
+    return probs
+
+
 def _convert_cluster_ids_to_indices(cluster_col, drawn_clusters):
     """Convert the drawn clusters to positional indices of individual observations.
 
@@ -48,7 +82,13 @@ def _convert_cluster_ids_to_indices(cluster_col, drawn_clusters):
     return bootstrap_indices
 
 
-def get_bootstrap_samples(data, rng, cluster_by=None, n_draws=1000):
+def get_bootstrap_samples(
+    data,
+    rng,
+    weight_by=None,
+    cluster_by=None,
+    n_draws=1000,
+):
     """Draw bootstrap samples.
 
     If you have memory issues you should use get_bootstrap_indices instead and construct
@@ -57,6 +97,7 @@ def get_bootstrap_samples(data, rng, cluster_by=None, n_draws=1000):
     Args:
         data (pandas.DataFrame): original dataset.
         rng (numpy.random.Generator): A random number generator.
+        weight_by (str): column name of the variable with weights.
         cluster_by (str): column name of the variable to cluster by.
         n_draws (int): number of draws, only relevant if seeds is None.
 
@@ -67,6 +108,7 @@ def get_bootstrap_samples(data, rng, cluster_by=None, n_draws=1000):
     indices = get_bootstrap_indices(
         data=data,
         rng=rng,
+        weight_by=weight_by,
         cluster_by=cluster_by,
         n_draws=n_draws,
     )

diff --git a/tests/inference/test_bootstrap_ci.py b/tests/inference/test_bootstrap_ci.py
@@ -88,6 +88,13 @@ def test_check_inputs_data():
     assert str(error.value) == expected_msg
 
 
+def test_check_inputs_weight_by(setup):
+    expected_error_msg = "Input 'weight_by' must be None or a column name of 'data'."
+
+    with pytest.raises(ValueError, match=expected_error_msg):
+        check_inputs(data=setup["df"], weight_by="this is not a column name of df")
+
+
 def test_check_inputs_cluster_by(setup):
     cluster_by = "this is not a column name of df"
     expected_msg = "Input 'cluster_by' must be None or a column name of 'data'."

diff --git a/tests/inference/test_bootstrap_samples.py b/tests/inference/test_bootstrap_samples.py
@@ -17,6 +17,7 @@ def data():
     df = pd.DataFrame()
     df["id"] = np.arange(900)
     df["hh"] = [3, 1, 2, 0, 0, 2, 5, 4, 5] * 100
+    df["weights"] = np.ones(900)
     return df
 
 
@@ -32,6 +33,37 @@ def test_get_bootstrap_indices_radomization_works_with_clustering(data):
     assert set(res[0]) != set(res[1])
 
 
+def test_get_bootstrap_indices_randomization_works_with_weights(data):
+    rng = get_rng(seed=12345)
+    res = get_bootstrap_indices(data, weight_by="weights", n_draws=2, rng=rng)
+    assert set(res[0]) != set(res[1])
+
+
+def test_get_bootstrap_indices_randomization_works_with_weights_and_clustering(data):
+    rng = get_rng(seed=12345)
+    res = get_bootstrap_indices(
+        data, weight_by="weights", cluster_by="hh", n_draws=2, rng=rng
+    )
+    assert set(res[0]) != set(res[1])
+
+
+def test_get_bootstrap_indices_randomization_works_with_and_without_weights(data):
+    rng1 = get_rng(seed=12345)
+    rng2 = get_rng(seed=12345)
+    res1 = get_bootstrap_indices(data, n_draws=1, rng=rng1)
+    res2 = get_bootstrap_indices(data, weight_by="weights", n_draws=1, rng=rng2)
+    assert not np.array_equal(res1, res2)
+
+
+def test_get_boostrap_indices_randomization_works_with_extreme_case(data):
+    rng = get_rng(seed=12345)
+    weights = np.zeros(900)
+    weights[0] = 1.0
+    data["weights"] = weights
+    res = get_bootstrap_indices(data, weight_by="weights", n_draws=1, rng=rng)
+    assert len(np.unique(res)) == 1
+
+
 def test_clustering_leaves_households_intact(data):
     rng = get_rng(seed=12345)
     indices = get_bootstrap_indices(data, cluster_by="hh", n_draws=1, rng=rng)[0]