diff --git a/book/cate_and_policy/policy_learning.ipynb b/book/cate_and_policy/policy_learning.ipynb index 7d25a78..49dc57f 100644 --- a/book/cate_and_policy/policy_learning.ipynb +++ b/book/cate_and_policy/policy_learning.ipynb @@ -32,11 +32,1328 @@ " async>\n", "" ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "from matplotlib.patches import Rectangle\n", + "import matplotlib.patches as mpatches" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Set random seed for reproducibility\n", + "np.random.seed(42)\n", + "\n", + "# Generate data\n", + "n = 1000\n", + "p = 4\n", + "X = np.random.uniform(0, 1, (n, p))\n", + "W = np.random.binomial(1, 0.5, n) # Independent from X and Y\n", + "Y = 0.5 * (X[:, 0] - 0.5) + (X[:, 1] - 0.5) * W + 0.1 * np.random.randn(n)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Normalize Y for plotting\n", + "y_norm = 1 - (Y - Y.min()) / (Y.max() - Y.min())\n", + "\n", + "# First plot: All data points\n", + "fig1, ax1 = plt.subplots(1, 1, figsize=(8, 6))\n", + "for i in range(n):\n", + " if W[i] == 1:\n", + " ax1.scatter(X[i, 0], X[i, 1], marker='o', s=100, \n", + " c=[y_norm[i]], cmap='gray', vmin=0, vmax=1, \n", + " edgecolors='black', linewidths=1)\n", + " else:\n", + " ax1.scatter(X[i, 0], X[i, 1], marker='D', s=80, \n", + " c=[y_norm[i]], cmap='gray', vmin=0, vmax=1,\n", + " edgecolors='black', linewidths=1)\n", + "ax1.set_xlabel('X1', fontsize=12)\n", + "ax1.set_ylabel('X2', fontsize=12)\n", + "ax1.set_title('All Data Points (○: Treated, ◇: Untreated)', fontsize=14)\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Second plot: Separated by treatment\n", + "fig2, (ax2, ax3) = plt.subplots(1, 2, figsize=(14, 6))\n", + "\n", + "# Untreated group\n", + "untreated_idx = W == 0\n", + "ax2.scatter(X[untreated_idx, 0], X[untreated_idx, 1], marker='D', s=80, \n", + " c=y_norm[untreated_idx], cmap='gray', vmin=0, vmax=1,\n", + " edgecolors='black', linewidths=1)\n", + "ax2.set_xlabel('X1', fontsize=12)\n", + "ax2.set_ylabel('X2', fontsize=12)\n", + "ax2.set_title('Untreated', fontsize=14)\n", + "\n", + "# Treated group\n", + "treated_idx = W == 1\n", + "ax3.scatter(X[treated_idx, 0], X[treated_idx, 1], marker='o', s=100, \n", + " c=y_norm[treated_idx], cmap='gray', vmin=0, vmax=1,\n", + " edgecolors='black', linewidths=1)\n", + "ax3.set_xlabel('X1', fontsize=12)\n", + "ax3.set_ylabel('X2', fontsize=12)\n", + "ax3.set_title('Treated', fontsize=14)\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Third plot: Policy regions\n", + "fig3, ax4 = plt.subplots(1, 1, figsize=(8, 6))\n", + "\n", + "# Define colors with transparency\n", + "col1 = (0.9960938, 0.7539062, 0.0273438, 0.35) # Yellow-ish\n", + "col2 = (0.250980, 0.690196, 0.650980, 0.35) # Teal-ish\n", + "\n", + "# Draw policy regions\n", + "rect1 = Rectangle((-0.1, -0.1), 0.6, 1.2, linewidth=0, \n", + " edgecolor='none', facecolor=col1, hatch='///')\n", + "rect2 = Rectangle((0.5, -0.1), 0.6, 0.6, linewidth=0, \n", + " edgecolor='none', facecolor=col1, hatch='///')\n", + "rect3 = Rectangle((0.5, 0.5), 0.6, 0.6, linewidth=0, \n", + " edgecolor='none', facecolor=col2, hatch='///')\n", + "ax4.add_patch(rect1)\n", + "ax4.add_patch(rect2)\n", + "ax4.add_patch(rect3)\n", + "\n", + "# Plot data points\n", + "for i in range(n):\n", + " if W[i] == 1:\n", + " ax4.scatter(X[i, 0], X[i, 1], marker='o', s=100, \n", + " c=[y_norm[i]], cmap='gray', vmin=0, vmax=1, \n", + " edgecolors='black', linewidths=1)\n", + " else:\n", + " ax4.scatter(X[i, 0], X[i, 1], marker='D', s=80, \n", + " c=[y_norm[i]], cmap='gray', vmin=0, vmax=1,\n", + " edgecolors='black', linewidths=1)\n", + "\n", + "# Add text labels\n", + "ax4.text(0.75, 0.75, 'TREAT (A)', fontsize=16, ha='center', va='center')\n", + "ax4.text(0.25, 0.25, 'DO NOT TREAT (A^C)', fontsize=16, ha='left', va='center')\n", + "ax4.set_xlabel('X1', fontsize=12)\n", + "ax4.set_ylabel('X2', fontsize=12)\n", + "ax4.set_xlim(-0.1, 1.1)\n", + "ax4.set_ylim(-0.1, 1.1)\n", + "ax4.set_title('Policy Regions', fontsize=14)\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Policy Evaluation Methods\n", + "print(\"=\" * 60)\n", + "print(\"POLICY EVALUATION RESULTS\")\n", + "print(\"=\" * 60)\n", + "\n", + "# Method 1: Value of policy A (only valid in randomized setting)\n", + "A = (X[:, 0] > 0.5) & (X[:, 1] > 0.5)\n", + "value_estimate = np.mean(Y[A & (W == 1)]) * np.mean(A) + \\\n", + " np.mean(Y[~A & (W == 0)]) * np.mean(~A)\n", + "value_stderr = np.sqrt(\n", + " np.var(Y[A & (W == 1)]) / np.sum(A & (W == 1)) * np.mean(A)**2 + \n", + " np.var(Y[~A & (W == 0)]) / np.sum(~A & (W == 0)) * np.mean(~A)**2\n", + ")\n", + "print(f\"\\nMethod 1: Value of Policy A\")\n", + "print(f\"Value estimate: {value_estimate:.6f}\")\n", + "print(f\"Std. Error: {value_stderr:.6f}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Method 2: Value of fixed treatment proportion (p=0.75)\n", + "p_treat = 0.75\n", + "value_estimate2 = p_treat * np.mean(Y[W == 1]) + (1 - p_treat) * np.mean(Y[W == 0])\n", + "value_stderr2 = np.sqrt(\n", + " np.var(Y[W == 1]) / np.sum(W == 1) * p_treat**2 + \n", + " np.var(Y[W == 0]) / np.sum(W == 0) * (1 - p_treat)**2\n", + ")\n", + "print(f\"\\nMethod 2: Value of Fixed Treatment Proportion (p={p_treat})\")\n", + "print(f\"Value estimate: {value_estimate2:.6f}\")\n", + "print(f\"Std. Error: {value_stderr2:.6f}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Method 3: Treatment effect within policy region A\n", + "diff_estimate = (np.mean(Y[A & (W == 1)]) - np.mean(Y[A & (W == 0)])) * np.mean(A)\n", + "diff_stderr = np.sqrt(\n", + " np.var(Y[A & (W == 1)]) / np.sum(A & (W == 1)) + \n", + " np.var(Y[A & (W == 0)]) / np.sum(A & (W == 0))\n", + ") * np.mean(A)\n", + "print(f\"\\nMethod 3: Treatment Effect within Policy Region A\")\n", + "print(f\"Difference estimate: {diff_estimate:.6f}\")\n", + "print(f\"Std. Error: {diff_stderr:.6f}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Method 4: Optimal policy difference\n", + "diff_estimate2 = (np.mean(Y[A & (W == 1)]) - np.mean(Y[A & (W == 0)])) * np.mean(A) / 2 + \\\n", + " (np.mean(Y[~A & (W == 0)]) - np.mean(Y[~A & (W == 1)])) * np.mean(~A) / 2\n", + "diff_stderr2 = np.sqrt(\n", + " (np.mean(A) / 2)**2 * (\n", + " np.var(Y[A & (W == 1)]) / np.sum(A & (W == 1)) + \n", + " np.var(Y[A & (W == 0)]) / np.sum(A & (W == 0))\n", + " ) + \n", + " (np.mean(~A) / 2)**2 * (\n", + " np.var(Y[~A & (W == 1)]) / np.sum(~A & (W == 1)) + \n", + " np.var(Y[~A & (W == 0)]) / np.sum(~A & (W == 0))\n", + " )\n", + ")\n", + "print(f\"\\nMethod 4: Optimal Policy Difference\")\n", + "print(f\"Difference estimate: {diff_estimate2:.6f}\")\n", + "print(f\"Std. Error: {diff_stderr2:.6f}\")\n", + "\n", + "print(\"\\n\" + \"=\" * 60)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Additional analysis: Treatment effect heterogeneity\n", + "print(\"\\nADDITIONAL ANALYSIS\")\n", + "print(\"=\" * 60)\n", + "\n", + "# Calculate treatment effects by region\n", + "te_in_A = np.mean(Y[A & (W == 1)]) - np.mean(Y[A & (W == 0)])\n", + "te_out_A = np.mean(Y[~A & (W == 1)]) - np.mean(Y[~A & (W == 0)])\n", + "\n", + "print(f\"\\nTreatment Effect Heterogeneity:\")\n", + "print(f\"Treatment effect in region A: {te_in_A:.6f}\")\n", + "print(f\"Treatment effect outside region A: {te_out_A:.6f}\")\n", + "print(f\"Difference in treatment effects: {te_in_A - te_out_A:.6f}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Summary statistics\n", + "print(f\"\\nSummary Statistics:\")\n", + "print(f\"Proportion in region A: {np.mean(A):.3f}\")\n", + "print(f\"Proportion treated: {np.mean(W):.3f}\")\n", + "print(f\"Mean outcome (treated): {np.mean(Y[W == 1]):.6f}\")\n", + "print(f\"Mean outcome (untreated): {np.mean(Y[W == 0]):.6f}\")\n", + "print(f\"Overall treatment effect: {np.mean(Y[W == 1]) - np.mean(Y[W == 0]):.6f}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Generate observational data\n", + "np.random.seed(123)\n", + "n = 1000\n", + "p = 4\n", + "X = np.random.uniform(0, 1, (n, p))\n", + "e = 1 / (1 + np.exp(-2*(X[:, 0] - 0.5) - 2*(X[:, 1] - 0.5))) # not observed by analyst\n", + "W = np.random.binomial(1, e, n)\n", + "Y = 0.5 * (X[:, 0] - 0.5) + (X[:, 1] - 0.5) * W + 0.1 * np.random.randn(n)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "y_norm = (Y - Y.min()) / (Y.max() - Y.min())\n", + "\n", + "# Plot by treatment status\n", + "fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))\n", + "\n", + "# Untreated\n", + "untreated_idx = W == 0\n", + "for i in np.where(untreated_idx)[0]:\n", + " ax1.scatter(X[i, 0], X[i, 1], marker='D', s=80, \n", + " c=[y_norm[i]], cmap='gray', vmin=0, vmax=1,\n", + " edgecolors='black', linewidths=1)\n", + "ax1.set_xlabel('X1')\n", + "ax1.set_ylabel('X2')\n", + "ax1.set_title('Untreated')\n", + "\n", + "# Treated\n", + "treated_idx = W == 1\n", + "for i in np.where(treated_idx)[0]:\n", + " ax2.scatter(X[i, 0], X[i, 1], marker='o', s=100, \n", + " c=[y_norm[i]], cmap='gray', vmin=0, vmax=1,\n", + " edgecolors='black', linewidths=1)\n", + "ax2.set_xlabel('X1')\n", + "ax2.set_ylabel('X2')\n", + "ax2.set_title('Treated')\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier\n", + "from sklearn.model_selection import KFold" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "class CausalForest:\n", + " \"\"\"\n", + " Simplified Causal Forest implementation to match grf package behavior\n", + " \"\"\"\n", + " def __init__(self, n_estimators=2000, max_features='sqrt', min_samples_leaf=5, \n", + " honest=True, W_hat=None):\n", + " self.n_estimators = n_estimators\n", + " self.max_features = max_features\n", + " self.min_samples_leaf = min_samples_leaf\n", + " self.honest = honest\n", + " self.W_hat_fixed = W_hat\n", + " \n", + " def fit(self, X, Y, W):\n", + " self.X = X\n", + " self.Y = Y\n", + " self.W = W\n", + " n = len(Y)\n", + " \n", + " # If W.hat is provided (randomized setting), use it\n", + " if self.W_hat_fixed is not None:\n", + " self.W_hat = np.full(n, self.W_hat_fixed)\n", + " else:\n", + " # Estimate propensity score\n", + " ps_model = RandomForestClassifier(\n", + " n_estimators=self.n_estimators//2,\n", + " max_features=self.max_features,\n", + " min_samples_leaf=self.min_samples_leaf,\n", + " random_state=42\n", + " )\n", + " ps_model.fit(X, W)\n", + " self.W_hat = ps_model.predict_proba(X)[:, 1]\n", + " # Clip to avoid division issues\n", + " self.W_hat = np.clip(self.W_hat, 0.01, 0.99)\n", + " \n", + " # Estimate outcome model\n", + " outcome_model = RandomForestRegressor(\n", + " n_estimators=self.n_estimators//2,\n", + " max_features=self.max_features,\n", + " min_samples_leaf=self.min_samples_leaf,\n", + " random_state=42\n", + " )\n", + " outcome_model.fit(X, Y)\n", + " self.Y_hat = outcome_model.predict(X)\n", + " \n", + " # Estimate treatment effects using T-learner\n", + " # Model for treated\n", + " model_1 = RandomForestRegressor(\n", + " n_estimators=self.n_estimators//2,\n", + " max_features=self.max_features,\n", + " min_samples_leaf=self.min_samples_leaf,\n", + " random_state=42\n", + " )\n", + " if np.sum(W == 1) > 0:\n", + " model_1.fit(X[W == 1], Y[W == 1])\n", + " self.mu_1 = model_1.predict(X)\n", + " else:\n", + " self.mu_1 = np.zeros(n)\n", + " \n", + " # Model for control\n", + " model_0 = RandomForestRegressor(\n", + " n_estimators=self.n_estimators//2,\n", + " max_features=self.max_features,\n", + " min_samples_leaf=self.min_samples_leaf,\n", + " random_state=42\n", + " )\n", + " if np.sum(W == 0) > 0:\n", + " model_0.fit(X[W == 0], Y[W == 0])\n", + " self.mu_0 = model_0.predict(X)\n", + " else:\n", + " self.mu_0 = np.zeros(n)\n", + " \n", + " # Treatment effect\n", + " self.tau_hat = self.mu_1 - self.mu_0\n", + " \n", + " return self\n", + " \n", + " def predict(self):\n", + " return {'predictions': self.tau_hat}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Fit causal forest\n", + "print(\"\\nFitting causal forest...\")\n", + "forest = CausalForest()\n", + "forest.fit(X, Y, W)\n", + "\n", + "# Get predictions\n", + "tau_hat = forest.predict()['predictions']\n", + "\n", + "# Estimate outcome models\n", + "mu_hat_1 = forest.Y_hat + (1 - forest.W_hat) * tau_hat\n", + "mu_hat_0 = forest.Y_hat - forest.W_hat * tau_hat\n", + "\n", + "# Compute AIPW scores\n", + "gamma_hat_1 = mu_hat_1 + W/forest.W_hat * (Y - mu_hat_1)\n", + "gamma_hat_0 = mu_hat_0 + (1-W)/(1-forest.W_hat) * (Y - mu_hat_0)\n", + "\n", + "print(\"Causal forest fitted successfully.\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# POLICY EVALUATION WITH AIPW\n", + "print(\"\\n--- Policy A (X1 > 0.5 & X2 > 0.5) with AIPW ---\")\n", + "pi = (X[:, 0] > 0.5) & (X[:, 1] > 0.5)\n", + "gamma_hat_pi = pi * gamma_hat_1 + (1 - pi) * gamma_hat_0\n", + "value_estimate = np.mean(gamma_hat_pi)\n", + "value_stderr = np.std(gamma_hat_pi) / np.sqrt(len(gamma_hat_pi))\n", + "print(f\"Value estimate: {value_estimate:.10f} Std. Error: {value_stderr:.10f}\")\n", + "\n", + "print(\"\\n--- Random Policy (p=0.75) with AIPW ---\")\n", + "pi_random = 0.75\n", + "gamma_hat_pi = pi_random * gamma_hat_1 + (1 - pi_random) * gamma_hat_0\n", + "value_estimate = np.mean(gamma_hat_pi)\n", + "value_stderr = np.std(gamma_hat_pi) / np.sqrt(len(gamma_hat_pi))\n", + "print(f\"Value estimate: {value_estimate:.10f} Std. Error: {value_stderr:.10f}\")\n", + "print(\"\\n--- Difference: Policy A vs Never Treat ---\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# AIPW scores for Policy A\n", + "pi = (X[:, 0] > 0.5) & (X[:, 1] > 0.5)\n", + "gamma_hat_pi = pi * gamma_hat_1 + (1 - pi) * gamma_hat_0\n", + "\n", + "# AIPW scores for Never Treat\n", + "pi_never = 0\n", + "gamma_hat_pi_never = pi_never * gamma_hat_1 + (1 - pi_never) * gamma_hat_0\n", + "\n", + "# Difference\n", + "diff_scores = gamma_hat_pi - gamma_hat_pi_never\n", + "diff_estimate = np.mean(diff_scores)\n", + "diff_stderr = np.std(diff_scores) / np.sqrt(len(diff_scores))\n", + "print(f\"diff estimate: {diff_estimate:.10f} Std. Error: {diff_stderr:.10f}\")\n", + "\n", + "print(\"\\n\" + \"=\" * 70) \n", + "print(\"ANALYSIS COMPLETE\")\n", + "print(\"=\" * 70)" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "======================================================================\n", + "WELFARE SURVEY POLICY EVALUATION\n", + "======================================================================\n", + "\n" + ] + } + ], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "import warnings\n", + "warnings.filterwarnings('ignore')\n", + "\n", + "# Set random seed for reproducibility\n", + "np.random.seed(42)\n", + "\n", + "print(\"=\" * 70)\n", + "print(\"WELFARE SURVEY POLICY EVALUATION\")\n", + "print(\"=\" * 70)\n", + "print()" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Loading data...\n", + "Data loaded: 29726 observations\n", + "\n" + ] + } + ], + "source": [ + "# ==============================================================================\n", + "# STEP 1: LOAD AND PREPARE DATA\n", + "# ==============================================================================\n", + "\n", + "print(\"Loading data...\")\n", + "# Read in data\n", + "url = \"https://docs.google.com/uc?id=1AQva5-vDlgBcM_Tv9yrO8yMYRfQJgqo_&export=download\"\n", + "data = pd.read_csv(url)\n", + "n = len(data)\n", + "\n", + "# NOTE: We'll invert treatment and control, compared to previous chapters\n", + "data['w'] = 1 - data['w']\n", + "\n", + "# Treatment is the wording of the question:\n", + "# 'does the gov't spend too much on 'assistance to the poor' (control: 0)\n", + "# 'does the gov't spend too much on \"welfare\"?' (treatment: 1)\n", + "treatment = 'w'\n", + "\n", + "# Outcome: 1 for 'yes', 0 for 'no'\n", + "outcome = 'y'\n", + "\n", + "# Additional covariates\n", + "covariates = ['age', 'polviews', 'income', 'educ', 'marital', 'sex']\n", + "\n", + "print(f\"Data loaded: {n} observations\")\n", + "print()" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "======================================================================\n", + "SIMPLE MEAN-BASED ESTIMATION (RCT only)\n", + "======================================================================\n", + "Value estimate: 0.3457179812 Std. Error: 0.0038947280\n", + "\n" + ] + } + ], + "source": [ + "# ==============================================================================\n", + "# STEP 2: SIMPLE MEAN-BASED ESTIMATION (Only valid in randomized setting)\n", + "# ==============================================================================\n", + "\n", + "print(\"=\" * 70)\n", + "print(\"SIMPLE MEAN-BASED ESTIMATION (RCT only)\")\n", + "print(\"=\" * 70)\n", + "\n", + "# Extract variables\n", + "X = data[covariates]\n", + "Y = data[outcome].values\n", + "W = data[treatment].values\n", + "\n", + "# Define policy: treat if polviews <= 4 OR age > 50\n", + "pi = (X['polviews'] <= 4) | (X['age'] > 50)\n", + "A = pi.values == 1\n", + "\n", + "# Calculate value estimate\n", + "value_estimate = np.mean(Y[A & (W==1)]) * np.mean(A) + \\\n", + " np.mean(Y[~A & (W==0)]) * np.mean(~A)\n", + "\n", + "# Calculate standard error\n", + "value_stderr = np.sqrt(\n", + " np.var(Y[A & (W==1)]) / np.sum(A & (W==1)) * np.mean(A)**2 + \n", + " np.var(Y[~A & (W==0)]) / np.sum(~A & (W==0)) * np.mean(~A)**2\n", + ")\n", + "\n", + "print(f\"Value estimate: {value_estimate:.10f} Std. Error: {value_stderr:.10f}\")\n", + "print()" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "======================================================================\n", + "CAUSAL FOREST WITH AIPW\n", + "======================================================================\n", + "Using scikit-learn for Random Forest\n", + "\n", + "Fitting causal forest (randomized setting with W.hat=0.5)...\n", + "Causal forest fitted successfully.\n", + "\n" + ] + } + ], + "source": [ + "# ==============================================================================\n", + "# STEP 3: CAUSAL FOREST WITH AIPW\n", + "# ==============================================================================\n", + "\n", + "print(\"=\" * 70)\n", + "print(\"CAUSAL FOREST WITH AIPW\")\n", + "print(\"=\" * 70)\n", + "\n", + "# Create model matrix (design matrix with intercept)\n", + "# This mimics R's model.matrix() function\n", + "X_design = pd.get_dummies(data[covariates], drop_first=False)\n", + "# Add intercept\n", + "X_design.insert(0, 'intercept', 1)\n", + "X_design = X_design.values\n", + "\n", + "Y = data[outcome].values\n", + "W = data[treatment].values\n", + "\n", + "# Try to use sklearn if available\n", + "try:\n", + " from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier\n", + " use_sklearn = True\n", + " print(\"Using scikit-learn for Random Forest\")\n", + "except ImportError:\n", + " use_sklearn = False\n", + " print(\"scikit-learn not found. Using simplified implementation.\")\n", + " print(\"For exact replication of R results, install scikit-learn: pip install scikit-learn\")\n", + "\n", + "# Causal Forest Implementation\n", + "if use_sklearn:\n", + " class CausalForest:\n", + " \"\"\"Causal Forest implementation matching grf package behavior\"\"\"\n", + " \n", + " def __init__(self, n_estimators=2000, max_features=None, min_samples_leaf=5, \n", + " W_hat=None, honest=True):\n", + " self.n_estimators = n_estimators\n", + " # Match grf default: sqrt of number of features\n", + " self.max_features = max_features if max_features else 'sqrt'\n", + " self.min_samples_leaf = min_samples_leaf\n", + " self.W_hat_fixed = W_hat\n", + " self.honest = honest\n", + " \n", + " def fit(self, X, Y, W):\n", + " n = len(Y)\n", + " \n", + " # If W.hat is provided (randomized setting), use it\n", + " if self.W_hat_fixed is not None:\n", + " self.W_hat = np.full(n, self.W_hat_fixed)\n", + " else:\n", + " # Estimate propensity score\n", + " ps_model = RandomForestClassifier(\n", + " n_estimators=500,\n", + " max_features=self.max_features,\n", + " min_samples_leaf=self.min_samples_leaf,\n", + " random_state=42,\n", + " n_jobs=-1\n", + " )\n", + " ps_model.fit(X, W)\n", + " self.W_hat = ps_model.predict_proba(X)[:, 1]\n", + " self.W_hat = np.clip(self.W_hat, 0.01, 0.99)\n", + " \n", + " # Estimate outcome model\n", + " outcome_model = RandomForestRegressor(\n", + " n_estimators=500,\n", + " max_features=self.max_features,\n", + " min_samples_leaf=self.min_samples_leaf,\n", + " random_state=42,\n", + " n_jobs=-1\n", + " )\n", + " outcome_model.fit(X, Y)\n", + " self.Y_hat = outcome_model.predict(X)\n", + " \n", + " # T-learner for treatment effects\n", + " model_1 = RandomForestRegressor(\n", + " n_estimators=1000,\n", + " max_features=self.max_features,\n", + " min_samples_leaf=self.min_samples_leaf,\n", + " random_state=42,\n", + " n_jobs=-1\n", + " )\n", + " model_0 = RandomForestRegressor(\n", + " n_estimators=1000,\n", + " max_features=self.max_features,\n", + " min_samples_leaf=self.min_samples_leaf,\n", + " random_state=42,\n", + " n_jobs=-1\n", + " )\n", + " \n", + " # Fit separate models for treated and control\n", + " if np.sum(W == 1) > 0:\n", + " model_1.fit(X[W == 1], Y[W == 1])\n", + " self.mu_1 = model_1.predict(X)\n", + " else:\n", + " self.mu_1 = np.zeros(n)\n", + " \n", + " if np.sum(W == 0) > 0:\n", + " model_0.fit(X[W == 0], Y[W == 0])\n", + " self.mu_0 = model_0.predict(X)\n", + " else:\n", + " self.mu_0 = np.zeros(n)\n", + " \n", + " # Treatment effect\n", + " self.tau_hat = self.mu_1 - self.mu_0\n", + " \n", + " return self\n", + " \n", + " def predict(self):\n", + " return {'predictions': self.tau_hat}\n", + "else:\n", + " # Simplified implementation without sklearn\n", + " class CausalForest:\n", + " def __init__(self, n_estimators=100, W_hat=None, **kwargs):\n", + " self.n_estimators = min(n_estimators, 100) # Limit for speed\n", + " self.W_hat_fixed = W_hat\n", + " \n", + " def fit(self, X, Y, W):\n", + " n = len(Y)\n", + " \n", + " # Fixed propensity score for RCT\n", + " if self.W_hat_fixed is not None:\n", + " self.W_hat = np.full(n, self.W_hat_fixed)\n", + " else:\n", + " # Simple estimate: empirical proportion\n", + " self.W_hat = np.full(n, np.mean(W))\n", + " \n", + " # Simple outcome model: just use means\n", + " self.Y_hat = np.full(n, np.mean(Y))\n", + " \n", + " # Simple treatment effect estimation\n", + " if np.sum(W == 1) > 0:\n", + " self.mu_1 = np.full(n, np.mean(Y[W == 1]))\n", + " else:\n", + " self.mu_1 = np.full(n, np.mean(Y))\n", + " \n", + " if np.sum(W == 0) > 0:\n", + " self.mu_0 = np.full(n, np.mean(Y[W == 0]))\n", + " else:\n", + " self.mu_0 = np.full(n, np.mean(Y))\n", + " \n", + " self.tau_hat = self.mu_1 - self.mu_0\n", + " \n", + " return self\n", + " \n", + " def predict(self):\n", + " return {'predictions': self.tau_hat}\n", + "\n", + "# Estimate a causal forest\n", + "# Important: Using W.hat=0.5 for randomized setting\n", + "print(\"\\nFitting causal forest (randomized setting with W.hat=0.5)...\")\n", + "forest = CausalForest(n_estimators=2000 if use_sklearn else 100, W_hat=0.5)\n", + "forest.fit(X_design, Y, W)\n", + "\n", + "# Get predictions\n", + "tau_hat = forest.predict()['predictions']\n", + "\n", + "# Estimate outcome models for treated and control\n", + "mu_hat_1 = forest.Y_hat + (1 - forest.W_hat) * tau_hat # E[Y|X,W=1]\n", + "mu_hat_0 = forest.Y_hat - forest.W_hat * tau_hat # E[Y|X,W=0]\n", + "\n", + "# Compute AIPW scores\n", + "gamma_hat_1 = mu_hat_1 + W / forest.W_hat * (Y - mu_hat_1)\n", + "gamma_hat_0 = mu_hat_0 + (1 - W) / (1 - forest.W_hat) * (Y - mu_hat_0)\n", + "\n", + "print(\"Causal forest fitted successfully.\")\n", + "print()" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "======================================================================\n", + "POLICY EVALUATION WITH AIPW\n", + "======================================================================\n", + "Value estimate: 0.3376925438 Std. Error: 0.0034256847\n", + "\n" + ] + } + ], + "source": [ + "# ==============================================================================\n", + "# STEP 4: POLICY EVALUATION WITH AIPW\n", + "# ==============================================================================\n", + "\n", + "print(\"=\" * 70)\n", + "print(\"POLICY EVALUATION WITH AIPW\")\n", + "print(\"=\" * 70)\n", + "\n", + "# Use the same policy as before\n", + "pi = (data['polviews'] <= 4) | (data['age'] > 50)\n", + "pi = pi.values\n", + "\n", + "# Calculate AIPW score for the policy\n", + "gamma_hat_pi = pi * gamma_hat_1 + (1 - pi) * gamma_hat_0\n", + "value_estimate = np.mean(gamma_hat_pi)\n", + "value_stderr = np.std(gamma_hat_pi) / np.sqrt(len(gamma_hat_pi))\n", + "\n", + "print(f\"Value estimate: {value_estimate:.10f} Std. Error: {value_stderr:.10f}\")\n", + "print()\n" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "======================================================================\n", + "POLICY COMPARISON\n", + "======================================================================\n", + "Difference estimate: 0.0721973740 Std. Error: 0.0022069738\n", + "\n" + ] + } + ], + "source": [ + "# ==============================================================================\n", + "# STEP 5: POLICY COMPARISON\n", + "# ==============================================================================\n", + "\n", + "print(\"=\" * 70)\n", + "print(\"POLICY COMPARISON\")\n", + "print(\"=\" * 70)\n", + "\n", + "# Compare with 50% random treatment policy\n", + "pi_2 = 0.5\n", + "\n", + "# AIPW scores for each policy\n", + "gamma_hat_pi_1 = pi * gamma_hat_1 + (1 - pi) * gamma_hat_0 # Original policy\n", + "gamma_hat_pi_2 = pi_2 * gamma_hat_1 + (1 - pi_2) * gamma_hat_0 # 50% random\n", + "\n", + "# Difference\n", + "gamma_hat_pi_diff = gamma_hat_pi_1 - gamma_hat_pi_2\n", + "diff_estimate = np.mean(gamma_hat_pi_diff)\n", + "diff_stderr = np.std(gamma_hat_pi_diff) / np.sqrt(len(gamma_hat_pi_diff))\n", + "\n", + "print(f\"Difference estimate: {diff_estimate:.10f} Std. Error: {diff_stderr:.10f}\")\n", + "print()\n" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "======================================================================\n", + "ADDITIONAL INFORMATION\n", + "======================================================================\n", + "\n", + "Sample size: 29726\n", + "Treatment rate: 0.465\n", + "Outcome rate (overall): 0.253\n", + "Outcome rate (treated): 0.438\n", + "Outcome rate (control): 0.092\n", + "\n", + "Policy characteristics:\n", + "Proportion assigned to treatment by policy: 0.773\n", + "Number assigned to treatment: 22988\n", + "Number assigned to control: 6738\n", + "\n", + "Treatment effects by policy group:\n", + "Treatment effect in policy group: 0.3279\n", + "Treatment effect outside policy group: 0.4069\n", + "Overall treatment effect: 0.3460\n", + "\n", + "======================================================================\n", + "ANALYSIS COMPLETE\n", + "======================================================================\n" + ] + } + ], + "source": [ + "# ==============================================================================\n", + "# ADDITIONAL SUMMARY STATISTICS\n", + "# ==============================================================================\n", + "\n", + "print(\"=\" * 70)\n", + "print(\"ADDITIONAL INFORMATION\")\n", + "print(\"=\" * 70)\n", + "\n", + "print(f\"\\nSample size: {n}\")\n", + "print(f\"Treatment rate: {np.mean(W):.3f}\")\n", + "print(f\"Outcome rate (overall): {np.mean(Y):.3f}\")\n", + "print(f\"Outcome rate (treated): {np.mean(Y[W==1]):.3f}\")\n", + "print(f\"Outcome rate (control): {np.mean(Y[W==0]):.3f}\")\n", + "\n", + "print(f\"\\nPolicy characteristics:\")\n", + "print(f\"Proportion assigned to treatment by policy: {np.mean(pi):.3f}\")\n", + "print(f\"Number assigned to treatment: {np.sum(pi)}\")\n", + "print(f\"Number assigned to control: {np.sum(~pi)}\")\n", + "\n", + "# Check treatment effect heterogeneity\n", + "print(f\"\\nTreatment effects by policy group:\")\n", + "if np.sum(pi & (W==1)) > 0 and np.sum(pi & (W==0)) > 0:\n", + " te_policy = np.mean(Y[pi & (W==1)]) - np.mean(Y[pi & (W==0)])\n", + " print(f\"Treatment effect in policy group: {te_policy:.4f}\")\n", + "if np.sum(~pi & (W==1)) > 0 and np.sum(~pi & (W==0)) > 0:\n", + " te_no_policy = np.mean(Y[~pi & (W==1)]) - np.mean(Y[~pi & (W==0)])\n", + " print(f\"Treatment effect outside policy group: {te_no_policy:.4f}\")\n", + "\n", + "overall_te = np.mean(Y[W==1]) - np.mean(Y[W==0])\n", + "print(f\"Overall treatment effect: {overall_te:.4f}\")\n", + "\n", + "print(\"\\n\" + \"=\" * 70)\n", + "print(\"ANALYSIS COMPLETE\")\n", + "print(\"=\" * 70)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "\"\"\"\n", + "================================================================================\n", + "R vs Python 구현 차이\n", + "================================================================================\n", + "\n", + "1. AIPW 정책 가치 (Value Estimate)\n", + "- R 코드 (V2): 0.3459015997 (약 34.6%) -> R의 추정치가 단순 평균에 더 가깝게 나옴\n", + "- Python 코드 (V2): 0.3376925438 (약 33.8%)\n", + "\n", + "2. AIPW 정책 비교 (Difference Estimate)\n", + "- R 코드 (V2): 0.0806035592 (약 8.1%p)\n", + "- Python 코드 (V2): 0.0721973740 (약 7.2%p)\n", + "\n", + "차이가 발생하는 이유:\n", + "----------------------------\n", + "1. 알고리즘 차이\n", + " - R(grf): Honest splitting, debiasing, CATE 전용 트리 알고리즘 사용\n", + " - Python(scikit-learn): 일반 RandomForest 기반, T-learner 사용, debiasing 없음\n", + "\n", + "2. 패키지 한계\n", + " - grf (R): 인과추론 전용 패키지\n", + " - scikit-learn / econml (Python): 일반 ML 기반, 구현 방식 상이\n", + "\n", + "================================================================================\n", + "\"\"\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "import warnings\n", + "warnings.filterwarnings('ignore')\n", + "\n", + "# Set random seed for reproducibility\n", + "np.random.seed(42)\n", + "\n", + "print(\"=\" * 70)\n", + "print(\"FRAMING RCT POLICY EVALUATION\")\n", + "print(\"=\" * 70)\n", + "print()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# ==============================================================================\n", + "# STEP 1: LOAD AND PREPARE DATA\n", + "# ==============================================================================\n", + "\n", + "print(\"Loading data...\")\n", + "# Read in data - 파일 경로\n", + "data = pd.read_csv(\"C:/Pythwd/data_framing.csv\") # 실제 파일명\n", + "n = len(data)\n", + "\n", + "# 변수명\n", + "treatment = 'group' # 실제 처치 변수 컬럼명\n", + "outcome = 'wta' # 실제 결과 변수 컬럼명\n", + "\n", + "# 공변량 리스트\n", + "covariates = ['gender', 'age', 'income', 'eco', 'norm', 'edu', 'family'] # 실제 컬럼명\n", + "\n", + "print(f\"Data loaded: {n} observations\")\n", + "print()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# ==============================================================================\n", + "# STEP 2: SIMPLE MEAN-BASED ESTIMATION (Only valid in randomized setting)\n", + "# ==============================================================================\n", + "\n", + "print(\"=\" * 70)\n", + "print(\"SIMPLE MEAN-BASED ESTIMATION (RCT only)\")\n", + "print(\"=\" * 70)\n", + "\n", + "# Extract variables\n", + "X = data[covariates]\n", + "Y = data[outcome].values\n", + "W = data[treatment].values\n", + "\n", + "# 정책 정의 변경 (Loss Framing을 적용할 대상)\n", + "# 나이가 40 이상 AND 가족 수가 3 이상인 사람에게 Loss Framing 적용\n", + "pi = (data['age'] >= 40) & (data['family'] >= 3)\n", + "A = pi.values == 1\n", + "\n", + "# Calculate value estimate\n", + "value_estimate = np.mean(Y[A & (W==1)]) * np.mean(A) + \\\n", + " np.mean(Y[~A & (W==0)]) * np.mean(~A)\n", + "\n", + "# Calculate standard error\n", + "value_stderr = np.sqrt(\n", + " np.var(Y[A & (W==1)]) / np.sum(A & (W==1)) * np.mean(A)**2 + \n", + " np.var(Y[~A & (W==0)]) / np.sum(~A & (W==0)) * np.mean(~A)**2\n", + ")\n", + "\n", + "print(f\"Value estimate: {value_estimate:.10f} Std. Error: {value_stderr:.10f}\")\n", + "print()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# ==============================================================================\n", + "# STEP 3: CAUSAL FOREST WITH AIPW\n", + "# ==============================================================================\n", + "\n", + "print(\"=\" * 70)\n", + "print(\"CAUSAL FOREST WITH AIPW\")\n", + "print(\"=\" * 70)\n", + "\n", + "# Create model matrix (design matrix with intercept)\n", + "X_design = pd.get_dummies(data[covariates], drop_first=False)\n", + "# Add intercept\n", + "X_design.insert(0, 'intercept', 1)\n", + "X_design = X_design.values\n", + "\n", + "Y = data[outcome].values\n", + "W = data[treatment].values\n", + "\n", + "# Try to use sklearn if available\n", + "try:\n", + " from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier\n", + " use_sklearn = True\n", + " print(\"Using scikit-learn for Random Forest\")\n", + "except ImportError:\n", + " use_sklearn = False\n", + " print(\"scikit-learn not found. Using simplified implementation.\")\n", + " print(\"For exact replication of R results, install scikit-learn: pip install scikit-learn\")\n", + "\n", + "# Causal Forest Implementation\n", + "if use_sklearn:\n", + " class CausalForest:\n", + " \"\"\"Causal Forest implementation matching grf package behavior\"\"\"\n", + " \n", + " def __init__(self, n_estimators=2000, max_features=None, min_samples_leaf=5, \n", + " W_hat=None, honest=True):\n", + " self.n_estimators = n_estimators\n", + " self.max_features = max_features if max_features else 'sqrt'\n", + " self.min_samples_leaf = min_samples_leaf\n", + " self.W_hat_fixed = W_hat\n", + " self.honest = honest\n", + " \n", + " def fit(self, X, Y, W):\n", + " n = len(Y)\n", + " \n", + " # If W.hat is provided (randomized setting), use it\n", + " if self.W_hat_fixed is not None:\n", + " self.W_hat = np.full(n, self.W_hat_fixed)\n", + " else:\n", + " # Estimate propensity score\n", + " ps_model = RandomForestClassifier(\n", + " n_estimators=500,\n", + " max_features=self.max_features,\n", + " min_samples_leaf=self.min_samples_leaf,\n", + " random_state=42,\n", + " n_jobs=-1\n", + " )\n", + " ps_model.fit(X, W)\n", + " self.W_hat = ps_model.predict_proba(X)[:, 1]\n", + " self.W_hat = np.clip(self.W_hat, 0.01, 0.99)\n", + " \n", + " # Estimate outcome model\n", + " outcome_model = RandomForestRegressor(\n", + " n_estimators=500,\n", + " max_features=self.max_features,\n", + " min_samples_leaf=self.min_samples_leaf,\n", + " random_state=42,\n", + " n_jobs=-1\n", + " )\n", + " outcome_model.fit(X, Y)\n", + " self.Y_hat = outcome_model.predict(X)\n", + " \n", + " # T-learner for treatment effects\n", + " model_1 = RandomForestRegressor(\n", + " n_estimators=1000,\n", + " max_features=self.max_features,\n", + " min_samples_leaf=self.min_samples_leaf,\n", + " random_state=42,\n", + " n_jobs=-1\n", + " )\n", + " model_0 = RandomForestRegressor(\n", + " n_estimators=1000,\n", + " max_features=self.max_features,\n", + " min_samples_leaf=self.min_samples_leaf,\n", + " random_state=42,\n", + " n_jobs=-1\n", + " )\n", + " \n", + " # Fit separate models for treated and control\n", + " if np.sum(W == 1) > 0:\n", + " model_1.fit(X[W == 1], Y[W == 1])\n", + " self.mu_1 = model_1.predict(X)\n", + " else:\n", + " self.mu_1 = np.zeros(n)\n", + " \n", + " if np.sum(W == 0) > 0:\n", + " model_0.fit(X[W == 0], Y[W == 0])\n", + " self.mu_0 = model_0.predict(X)\n", + " else:\n", + " self.mu_0 = np.zeros(n)\n", + " \n", + " # Treatment effect\n", + " self.tau_hat = self.mu_1 - self.mu_0\n", + " \n", + " return self\n", + " \n", + " def predict(self):\n", + " return {'predictions': self.tau_hat}\n", + "else:\n", + " # Simplified implementation without sklearn\n", + " class CausalForest:\n", + " def __init__(self, n_estimators=100, W_hat=None, **kwargs):\n", + " self.n_estimators = min(n_estimators, 100)\n", + " self.W_hat_fixed = W_hat\n", + " \n", + " def fit(self, X, Y, W):\n", + " n = len(Y)\n", + " \n", + " if self.W_hat_fixed is not None:\n", + " self.W_hat = np.full(n, self.W_hat_fixed)\n", + " else:\n", + " self.W_hat = np.full(n, np.mean(W))\n", + " \n", + " self.Y_hat = np.full(n, np.mean(Y))\n", + " \n", + " if np.sum(W == 1) > 0:\n", + " self.mu_1 = np.full(n, np.mean(Y[W == 1]))\n", + " else:\n", + " self.mu_1 = np.full(n, np.mean(Y))\n", + " \n", + " if np.sum(W == 0) > 0:\n", + " self.mu_0 = np.full(n, np.mean(Y[W == 0]))\n", + " else:\n", + " self.mu_0 = np.full(n, np.mean(Y))\n", + " \n", + " self.tau_hat = self.mu_1 - self.mu_0\n", + " \n", + " return self\n", + " \n", + " def predict(self):\n", + " return {'predictions': self.tau_hat}\n", + "\n", + "# Estimate a causal forest\n", + "print(\"\\nFitting causal forest (randomized setting with W.hat=0.5)...\")\n", + "forest = CausalForest(n_estimators=2000 if use_sklearn else 100, W_hat=0.5)\n", + "forest.fit(X_design, Y, W)\n", + "\n", + "# Get predictions\n", + "tau_hat = forest.predict()['predictions']\n", + "\n", + "# Estimate outcome models for treated and control\n", + "mu_hat_1 = forest.Y_hat + (1 - forest.W_hat) * tau_hat # E[Y|X,W=1]\n", + "mu_hat_0 = forest.Y_hat - forest.W_hat * tau_hat # E[Y|X,W=0]\n", + "\n", + "# Compute AIPW scores\n", + "gamma_hat_1 = mu_hat_1 + W / forest.W_hat * (Y - mu_hat_1)\n", + "gamma_hat_0 = mu_hat_0 + (1 - W) / (1 - forest.W_hat) * (Y - mu_hat_0)\n", + "\n", + "print(\"Causal forest fitted successfully.\")\n", + "print()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# ==============================================================================\n", + "# STEP 4: POLICY EVALUATION WITH AIPW\n", + "# ==============================================================================\n", + "\n", + "print(\"=\" * 70)\n", + "print(\"POLICY EVALUATION WITH AIPW\")\n", + "print(\"=\" * 70)\n", + "\n", + "# 정책 정의 동일하게 반영\n", + "pi = (data['age'] >= 40) & (data['family'] >= 3)\n", + "pi = pi.values\n", + "\n", + "# AIPW value estimation\n", + "gamma_hat_pi = pi * gamma_hat_1 + (1 - pi) * gamma_hat_0\n", + "value_estimate = np.mean(gamma_hat_pi)\n", + "value_stderr = np.std(gamma_hat_pi) / np.sqrt(len(gamma_hat_pi))\n", + "\n", + "print(f\"Value estimate: {value_estimate:.10f} Std. Error: {value_stderr:.10f}\")\n", + "print()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# ==============================================================================\n", + "# STEP 5: POLICY COMPARISON\n", + "# ==============================================================================\n", + "\n", + "print(\"=\" * 70)\n", + "print(\"POLICY COMPARISON\")\n", + "print(\"=\" * 70)\n", + "\n", + "# 비교 대상 정책: 무작위 50% Loss Framing\n", + "pi_2 = 0.5\n", + "\n", + "# 동일한 정책 정의 사용\n", + "pi = (data['age'] >= 40) & (data['family'] >= 3)\n", + "pi = pi.values\n", + "\n", + "gamma_hat_pi_1 = pi * gamma_hat_1 + (1 - pi) * gamma_hat_0 # 정책 기반\n", + "gamma_hat_pi_2 = pi_2 * gamma_hat_1 + (1 - pi_2) * gamma_hat_0 # 50% 무작위\n", + "\n", + "gamma_hat_pi_diff = gamma_hat_pi_1 - gamma_hat_pi_2\n", + "diff_estimate = np.mean(gamma_hat_pi_diff)\n", + "diff_stderr = np.std(gamma_hat_pi_diff) / np.sqrt(len(gamma_hat_pi_diff))\n", + "\n", + "print(f\"Difference estimate: {diff_estimate:.10f} Std. Error: {diff_stderr:.10f}\")\n", + "print()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# ==============================================================================\n", + "# STEP 6: ADDITIONAL SUMMARY STATISTICS\n", + "# ==============================================================================\n", + "\n", + "print(\"=\" * 70)\n", + "print(\"ADDITIONAL INFORMATION\")\n", + "print(\"=\" * 70)\n", + "\n", + "print(f\"\\nSample size: {n}\")\n", + "print(f\"Treatment rate: {np.mean(W):.3f}\")\n", + "print(f\"Outcome rate (overall): {np.mean(Y):.3f}\")\n", + "print(f\"Outcome rate (Loss Framing): {np.mean(Y[W==1]):.3f}\")\n", + "print(f\"Outcome rate (Gain Framing): {np.mean(Y[W==0]):.3f}\")\n", + "\n", + "print(f\"\\nPolicy characteristics:\")\n", + "print(f\"Proportion assigned to Loss Framing by policy: {np.mean(pi):.3f}\")\n", + "print(f\"Number assigned to Loss Framing: {np.sum(pi)}\")\n", + "print(f\"Number assigned to Gain Framing: {np.sum(~pi)}\")\n", + "\n", + "# Framing effect heterogeneity\n", + "print(f\"\\nFraming effects by policy group:\")\n", + "if np.sum(pi & (W==1)) > 0 and np.sum(pi & (W==0)) > 0:\n", + " te_policy = np.mean(Y[pi & (W==1)]) - np.mean(Y[pi & (W==0)])\n", + " print(f\"Framing effect in Loss-recommended group: {te_policy:.4f}\")\n", + "if np.sum(~pi & (W==1)) > 0 and np.sum(~pi & (W==0)) > 0:\n", + " te_no_policy = np.mean(Y[~pi & (W==1)]) - np.mean(Y[~pi & (W==0)])\n", + " print(f\"Framing effect in Gain-recommended group: {te_no_policy:.4f}\")\n", + "\n", + "overall_te = np.mean(Y[W==1]) - np.mean(Y[W==0])\n", + "print(f\"Overall framing effect (Loss - Gain): {overall_te:.4f}\")\n", + "\n", + "print(\"\\n\" + \"=\" * 70)\n", + "print(\"ANALYSIS COMPLETE\")\n", + "print(\"=\" * 70)" + ] } ], "metadata": { "kernelspec": { - "display_name": "base", + "display_name": "Python 3", "language": "python", "name": "python3" }, @@ -50,7 +1367,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.18" + "version": "3.11.9" } }, "nbformat": 4,