Skip to content

maxleungtszchun/Replication-of-Petersen-2009-Simulation

Folders and files

NameName
Last commit message
Last commit date

Latest commit

 

History

3 Commits
 
 
 
 
 
 
 
 

Repository files navigation

Replication of Petersen (2009) Simulation

knitr::opts_chunk$set(echo = TRUE)
library(parallel)
library(foreach)
library(doParallel)
library(lmtest)
library(sandwich)
library(plm)
library(knitr)
library(tidyverse)
set.seed(5, kind = "L'Ecuyer-CMRG")

Petersen (2009) is an influential paper guiding researchers picking correct standard errors when using linear panel models (for the case without Omitted Variable Bias (OVB)). A brief summary of the paper’s results can be found in my study note of linear panel model (Section 7.1).

Here is my R script replicating the paper’s key results. It can be executed in both Windows and MacOS. However, it will be run much faster in MacOS by using multiple CPUs due to the use of doParallel package. To reduce running time, only 500 simulations instead of 5000 simulations executed in the paper are performed.

gen_long_data <- function (N, T, var_gamma_i, var_mu_i, fe = "individual") {
  d <- tibble(.rows = N) %>% mutate(i = seq_len(N))
  d["gamma_i"] <- rnorm(N, mean = 0, sd = sqrt(var_gamma_i))
  d["mu_i"] <- rnorm(N, mean = 0, sd = sqrt(var_mu_i))

  gen_wide_data_i_eff <- function (df) {
    var_eta_it <- 4 - var_gamma_i
    var_upsilon_it <- 1 - var_mu_i
    for (t in seq_len(T)) {
      df[str_c("eta_i", t)] <- rnorm(N, mean = 0, sd = sqrt(var_eta_it))
      df[str_c("varepsilon_i", t)] <- df[["gamma_i"]] + df[[str_c("eta_i", t)]]
      df[str_c("upsilon_i", t)] <- rnorm(N, mean = 0, sd = sqrt(var_upsilon_it))
      df[str_c("x_i", t)] <- df[["mu_i"]] + df[[str_c("upsilon_i", t)]]
      df[str_c("y_i", t)] <- df[[str_c("x_i", t)]] * 1 + df[[str_c("varepsilon_i", t)]]
    }
    return(df)
  }

  gen_wide_data_t_eff <- function (df) {
    var_delta_t <- var_gamma_i
    var_zeta_t <- var_mu_i

    var_eta_it <- 4 - var_delta_t
    var_upsilon_it <- 1 - var_zeta_t
    for (t in seq_len(T)) {
      df[str_c("eta_i", t)] <- rnorm(N, mean = 0, sd = sqrt(var_eta_it))
      df[str_c("varepsilon_i", t)] <- rnorm(1, mean = 0, sd = sqrt(var_delta_t)) + df[[str_c("eta_i", t)]]
      df[str_c("upsilon_i", t)] <- rnorm(N, mean = 0, sd = sqrt(var_upsilon_it))
      df[str_c("x_i", t)] <- rnorm(1, mean = 0, sd = sqrt(var_zeta_t)) + df[[str_c("upsilon_i", t)]]
      df[str_c("y_i", t)] <- df[[str_c("x_i", t)]] * 1 + df[[str_c("varepsilon_i", t)]]
    }
    return(df)
  }

  gen_wide_data_it_eff <- function (df) {
    var_delta_t <- var_gamma_i
    var_zeta_t <- var_mu_i

    var_eta_it <- var_gamma_i
    var_upsilon_it <- var_mu_i
    for (t in seq_len(T)) {
      df[str_c("eta_i", t)] <- rnorm(N, mean = 0, sd = sqrt(var_eta_it))
      df[str_c("varepsilon_i", t)] <-
        df[["gamma_i"]] + rnorm(1, mean = 0, sd = sqrt(var_delta_t)) + df[[str_c("eta_i", t)]]
      df[str_c("upsilon_i", t)] <- rnorm(N, mean = 0, sd = sqrt(var_upsilon_it))
      df[str_c("x_i", t)] <-
        df[["mu_i"]] + rnorm(1, mean = 0, sd = sqrt(var_zeta_t)) + df[[str_c("upsilon_i", t)]]
      df[str_c("y_i", t)] <- df[[str_c("x_i", t)]] * 1 + df[[str_c("varepsilon_i", t)]]
    }
    return(df)
  }

  wide_data <-
    switch(
      fe,
      individual = gen_wide_data_i_eff(d),
      time = gen_wide_data_t_eff(d),
      both = gen_wide_data_it_eff(d),
      stop("only individual or time or both fe")
    )

  long_data <-
    wide_data %>%
    pivot_longer(
      cols = starts_with(c("y_i","x_i", "varepsilon_i", "eta_i", "upsilon_i")),
      names_to = c(".value", "t"),
      names_sep = "_i"
    )

  return(long_data)
}

simulation <- function (N, T, var_gamma_i, var_mu_i, sim_num, fe = "individual") {
  get_results <- function () {
    long_data <-
      gen_long_data(
        N = N,
        T = T,
        var_gamma_i = var_gamma_i,
        var_mu_i = var_mu_i,
        fe = fe
      ) %>%
      mutate(t = as.numeric(t))
    long_data_plm <- pdata.frame(long_data, index = c("i", "t"))

    pooled_ols <- plm(y ~ x + 0, model = "pooling", data = long_data_plm)
    ols <- lm(y ~ x + 0, data = long_data) # for calculating White's S.E. later by using sandwich::vcovHC()
    pooled_ols_coef_default_se <- coeftest(pooled_ols)
    clustered_i_var <- plm::vcovHC(pooled_ols, type = "HC1", cluster = "group")
    pooled_ols_coef_clustered_se_i <- coeftest(pooled_ols, vcov = clustered_i_var)
    clustered_t_var <- plm::vcovHC(pooled_ols, type = "HC1", cluster = "time")
    pooled_ols_coef_clustered_se_t <- coeftest(pooled_ols, vcov = clustered_t_var)
    # plm::vcovHC() =/= sandwich::vcovHC()
    clustered_both_var <- clustered_i_var + clustered_t_var - sandwich::vcovHC(ols, type = "HC1")
    pooled_ols_coef_clustered_se_both <- coeftest(pooled_ols, vcov = clustered_both_var)

    pooled_ols_coef_clustered_se <-
      switch(
        fe,
        individual = pooled_ols_coef_clustered_se_i,
        time = pooled_ols_coef_clustered_se_t,
        both = pooled_ols_coef_clustered_se_both,
        stop("only individual or time or both fe")
      )

    cs_betas <- list()
    for (j in seq_len(T)) {
      cs_betas[[j]] <-
        long_data %>%
        filter(t == j) %>%
        lm(y ~ x + 0, data = .) %>%
        summary(.) %>%
        .$coefficients %>%
        .[1,"Estimate"]
    }
    cs_betas <- as.numeric(cs_betas)
    fm_beta <- mean(cs_betas, na.rm = TRUE)
    fm_se <- sqrt(var(cs_betas, na.rm = TRUE) / T)

    return(list(
      "default" = pooled_ols_coef_default_se,
      "clustered" = pooled_ols_coef_clustered_se,
      "fm_beta" = fm_beta,
      "fm_se" = fm_se,
      "fm_t" = (fm_beta - 1) / fm_se
    ))
  }

  if (Sys.info()["sysname"] == "Windows") {
    results <- list()
    for (i in seq_len(sim_num)) {
      results[[i]] <- get_results()
    }
  } else {
    # using all CPUs in non-Windows OS e.g. MacOS
    registerDoParallel(detectCores())
    results <-
      foreach (i = seq_len(sim_num)) %dopar% {
        get_results()
      }
    stopImplicitCluster()
  }

  return(results)
}

get_sim_table <- function (N, T, var_gamma_i, var_mu_i, sim_num, fe = "individual") {
  sim_data <-
    simulation(
      N = N,
      T = T,
      var_gamma_i = var_gamma_i,
      var_mu_i = var_mu_i,
      sim_num = sim_num,
      fe = fe
    )

  t_default <- map_dbl(sim_data, ~ .x$default[1,"t value"] - 1 / .x$default[1,"Std. Error"])
  t_cluster <- map_dbl(sim_data, ~ .x$clustered[1,"t value"] - 1 / .x$clustered[1,"Std. Error"])
  t_fm <- map_dbl(sim_data, ~ .x$fm_t)

  per_sig_t_ols <- mean(as.numeric(abs(t_default) > 2.58), na.rm = TRUE)
  per_sig_t_c <- mean(as.numeric(abs(t_cluster) > 2.58), na.rm = TRUE)
  per_sig_t_fm <- mean(as.numeric(abs(t_fm) > 2.58), na.rm = TRUE)

  if (var_gamma_i == 2 & var_mu_i == 0.5) {
    plotHist <- function (d, title) {
      hist(d, prob = TRUE, xlim = c(-5, 5), ylim = c(0, 1), main = title, col = "orange", "border" = "white")
    }
    plotHist(t_default, "Density of simulated t with default standard error")
    plotHist(t_cluster, "Density of simulated t with clustered standard error")
    plotHist(t_fm, "Density of simulated t with Fama-Macbeth standard error")
  }

  simulated_pooled_ols_betas <- map_dbl(sim_data, ~ .x$default[ ,"Estimate"])
  return(list(
    "avg_beta_ols" = mean(simulated_pooled_ols_betas, na.rm = TRUE),
    "std_beta_ols" = sd(simulated_pooled_ols_betas, na.rm = TRUE),
    "avg_se_ols" = mean(map_dbl(sim_data, ~ .x$default[ ,"Std. Error"]), na.rm = TRUE),
    "per_sig_t_ols" = per_sig_t_ols,
    "avg_se_c" = mean(map_dbl(sim_data, ~ .x$cluster[ ,"Std. Error"]), na.rm = TRUE),
    "per_sig_t_c" = per_sig_t_c,
    "avg_beta_fm" = mean(map_dbl(sim_data, ~ .x$fm_beta), na.rm = TRUE),
    "std_beta_fm" = sd(map_dbl(sim_data, ~ .x$fm_beta), na.rm = TRUE),
    "avg_se_fm" = mean(map_dbl(sim_data, ~ .x$fm_se), na.rm = TRUE),
    "per_sig_t_fm" = per_sig_t_fm
  ))
}

# Var(varepsilon_it) = 4
# Var(x_it) = 1
sim_num <- 500
N <- 500
T <- 10
var_gamma_var_mu_list <-
  list(
    list(0, 0),
    list(0, 0.25),
    list(0, 0.5),
    list(0, 0.75),
    list(1, 0),
    list(1, 0.25),
    list(1, 0.5),
    list(1, 0.75),
    list(2, 0),
    list(2, 0.25),
    list(2, 0.5),
    list(2, 0.75),
    list(3, 0),
    list(3, 0.25),
    list(3, 0.5),
    list(3, 0.75)
  )

start_time <- Sys.time()
sim_table_i <-
  var_gamma_var_mu_list %>%
  map( ~ get_sim_table(N = N, T = T, var_gamma_i = .x[[1]],
                       var_mu_i = .x[[2]], sim_num = sim_num)) %>%
  do.call(bind_rows, .) %>%
  t()

colnames(sim_table_i) <- var_gamma_var_mu_list
kable(sim_table_i, format = "simple", caption = "Only Individual Fixed Effect")

Only Individual Fixed Effect

list(0, 0) list(0, 0.25) list(0, 0.5) list(0, 0.75) list(1, 0) list(1, 0.25) list(1, 0.5) list(1, 0.75) list(2, 0) list(2, 0.25) list(2, 0.5) list(2, 0.75) list(3, 0) list(3, 0.25) list(3, 0.5) list(3, 0.75)
avg_beta_ols 0.9999291 0.9995866 0.9997824 0.9999791 0.9991195 1.0003254 1.0012801 1.0021393 0.9991312 1.0006163 1.0018140 1.0028820 0.9992020 1.0008652 1.0022242 1.0034282
std_beta_ols 0.0285109 0.0280292 0.0281000 0.0285018 0.0283401 0.0367740 0.0422690 0.0467749 0.0281704 0.0423512 0.0516759 0.0592588 0.0281024 0.0470339 0.0593996 0.0694035
avg_se_ols 0.0283031 0.0283100 0.0283379 0.0283698 0.0282668 0.0282895 0.0282978 0.0283130 0.0282454 0.0282807 0.0282874 0.0283009 0.0282216 0.0282739 0.0282791 0.0282910
per_sig_t_ols 0.0020000 0.0120000 0.0120000 0.0120000 0.0080000 0.0460000 0.0780000 0.1240000 0.0100000 0.0880000 0.1560000 0.2280000 0.0080000 0.1260000 0.2320000 0.3040000
avg_se_c 0.0282244 0.0282858 0.0283324 0.0283652 0.0281755 0.0352350 0.0410593 0.0461880 0.0281488 0.0410663 0.0507524 0.0589098 0.0281247 0.0461638 0.0588659 0.0693281
per_sig_t_c 0.0040000 0.0120000 0.0100000 0.0180000 0.0100000 0.0120000 0.0100000 0.0120000 0.0080000 0.0160000 0.0120000 0.0100000 0.0060000 0.0080000 0.0080000 0.0080000
avg_beta_fm 0.9998982 0.9996348 0.9998447 1.0000274 0.9990867 1.0002552 1.0012440 1.0021378 0.9990996 1.0005623 1.0017885 1.0028835 0.9991739 1.0008311 1.0022111 1.0034327
std_beta_fm 0.0285680 0.0280354 0.0281115 0.0285241 0.0283751 0.0368012 0.0422958 0.0468072 0.0281999 0.0423862 0.0517259 0.0593111 0.0281386 0.0470771 0.0594673 0.0694679
avg_se_fm 0.0272252 0.0274720 0.0277030 0.0278883 0.0270076 0.0260943 0.0251176 0.0241653 0.0270049 0.0253624 0.0233805 0.0212648 0.0271049 0.0246264 0.0215344 0.0179261
per_sig_t_fm 0.0340000 0.0160000 0.0220000 0.0280000 0.0280000 0.1000000 0.1400000 0.2220000 0.0180000 0.1380000 0.2820000 0.3840000 0.0180000 0.1940000 0.3760000 0.5200000
# for time fixed effect,
# var_gamma_i in get_sim_table() is actually var_delta_t, and
# var_mu_i in get_sim_table() is actually var_zeta_t
sim_table_t <-
  var_gamma_var_mu_list %>%
  map( ~ get_sim_table(N = N, T = T, var_gamma_i = .x[[1]],
                       var_mu_i = .x[[2]], sim_num = sim_num, fe = "time")) %>%
  do.call(bind_rows, .) %>%
  t()

colnames(sim_table_t) <- var_gamma_var_mu_list
kable(sim_table_t, format = "simple", caption = "Only Time Fixed Effect")

Only Time Fixed Effect

list(0, 0) list(0, 0.25) list(0, 0.5) list(0, 0.75) list(1, 0) list(1, 0.25) list(1, 0.5) list(1, 0.75) list(2, 0) list(2, 0.25) list(2, 0.5) list(2, 0.75) list(3, 0) list(3, 0.25) list(3, 0.5) list(3, 0.75)
avg_beta_ols 0.9999291 1.0012365 1.0013543 1.0013210 0.9987830 1.0141492 1.0190271 1.0226802 0.9985202 1.0195067 1.0264324 1.0316380 0.9983583 1.0235349 1.0320363 1.0384397
std_beta_ols 0.0285109 0.0299101 0.0306535 0.0314519 0.0283706 0.1485192 0.2071813 0.2599469 0.0278140 0.2081587 0.2917997 0.3667642 0.0271388 0.2541608 0.3569387 0.4489400
avg_se_ols 0.0283031 0.0284491 0.0288185 0.0294875 0.0282321 0.0281379 0.0283100 0.0287603 0.0280546 0.0278666 0.0279585 0.0283182 0.0277899 0.0275023 0.0275096 0.0277719
per_sig_t_ols 0.0020000 0.0100000 0.0100000 0.0100000 0.0060000 0.6400000 0.7280000 0.7800000 0.0100000 0.7260000 0.8060000 0.8400000 0.0080000 0.7880000 0.8440000 0.8660000
avg_se_c 0.0258505 0.0257176 0.0255487 0.0254029 0.0259109 0.1347283 0.1848355 0.2274532 0.0258553 0.1884824 0.2599701 0.3205851 0.0255778 0.2299699 0.3177989 0.3921891
per_sig_t_c 0.0380000 0.0400000 0.0460000 0.0620000 0.0380000 0.0440000 0.0520000 0.0700000 0.0340000 0.0420000 0.0560000 0.0660000 0.0240000 0.0340000 0.0480000 0.0620000
avg_beta_fm 0.9998982 1.0012481 1.0014645 1.0016270 0.9988956 1.0079709 1.0060311 1.0018340 0.9986174 1.0108680 1.0084250 1.0029442 0.9984335 1.0130245 1.0102448 1.0038537
std_beta_fm 0.0285680 0.0306572 0.0336908 0.0403699 0.0283730 0.1281308 0.1782373 0.2515866 0.0278434 0.1786821 0.2498522 0.3532767 0.0271906 0.2177488 0.3050279 0.4314840
avg_se_fm 0.0272252 0.0284584 0.0314351 0.0381028 0.0273694 0.1241828 0.1712766 0.2394546 0.0273269 0.1730625 0.2400866 0.3364972 0.0270501 0.2108919 0.2931724 0.4112550
per_sig_t_fm 0.0340000 0.0240000 0.0260000 0.0300000 0.0300000 0.0240000 0.0260000 0.0280000 0.0280000 0.0180000 0.0220000 0.0260000 0.0200000 0.0180000 0.0220000 0.0260000
sim_table_both <-
  list(list(4/3, 1/3)) %>%
  map( ~ get_sim_table(N = 100, T = 100, var_gamma_i = .x[[1]],
                       var_mu_i = .x[[2]], sim_num = sim_num, fe = "both")) %>%
  do.call(bind_rows, .) %>%
  t()
kable(sim_table_both, format = "simple", caption = "Both Individual and Time Fixed Effect")

Both Individual and Time Fixed Effect

avg_beta_ols 1.0074608
std_beta_ols 0.1035387
avg_se_ols 0.0200694
per_sig_t_ols 0.6100000
avg_se_c 0.0947216
per_sig_t_c 0.0200000
avg_beta_fm 1.0066888
std_beta_fm 0.1023621
avg_se_fm 0.0553240
per_sig_t_fm 0.1620000
Sys.time() - start_time

Time difference of 2.686733 mins

About

Replication of Petersen (2009) Simulation

Resources

Stars

Watchers

Forks

Releases

No releases published

Packages

No packages published

Languages