IQSS
diff --git a/‎.Rbuildignore
+4 b/‎.Rbuildignore
+4
diff --git a/‎.gitignore
+1 b/‎.gitignore
+1
diff --git a/‎DESCRIPTION
+24 b/‎DESCRIPTION
+24
diff --git a/‎NAMESPACE
+11 b/‎NAMESPACE
+11
diff --git a/‎R/aux_functions.R
+147 b/‎R/aux_functions.R
+147
diff --git a/‎R/eepd-package.R
+7 b/‎R/eepd-package.R
+7
diff --git a/‎R/eepd_boot.R
+134 b/‎R/eepd_boot.R
+134
@@ -0,0 +1,4 @@
+^_dev$
+^.*\.Rproj$
+^\.Rproj\.user$
+^README\.Rmd$
@@ -47,3 +47,4 @@ po/*~
 
 # RStudio Connect folder
 rsconnect/
+inst/doc
@@ -0,0 +1,24 @@
+Package: eepd
+Title: What the Package Does (One Line, Title Case)
+Version: 0.0.0.9000
+Authors@R: 
+    person("Noah", "Greifer", email = "[email protected]", role = c("aut", "cre"),
+           comment = c(ORCID = "0000-0003-3067-7154"))
+Description: What the package does (one paragraph).
+License: GPL (>= 2)
+Imports:
+  stats,
+  marginaleffects (>= 0.14.0),
+  pbapply (>= 1.7-2),
+  fwb (>= 0.2.0),
+  chk (>= 0.9.0)
+Encoding: UTF-8
+Roxygen: list(markdown = TRUE)
+RoxygenNote: 7.3.0
+Depends: 
+    R (>= 2.10)
+LazyData: true
+Suggests: 
+    knitr,
+    rmarkdown
+VignetteBuilder: knitr
@@ -0,0 +1,11 @@
+# Generated by roxygen2: do not edit by hand
+
+S3method(c,eepd_models)
+S3method(plot,eepd_sim)
+S3method(print,eepd_models)
+S3method(print,eepd_sim)
+export(eepd_boot)
+export(eepd_fit)
+export(eepd_mod)
+export(eepd_sim)
+import(ggplot2)
@@ -0,0 +1,147 @@
+# Fast mean of `x`, optionally weighted by `w`. When `subset` is supplied it is
+# used to index `x` (and `w`, if given) before the mean is computed.
+.wtd_mean <- function(x, w = NULL, subset = NULL) {
+    # Apply the subset up front so a single code path computes the mean
+    if (!is.null(subset)) {
+        x <- x[subset]
+        if (!is.null(w)) {
+            w <- w[subset]
+        }
+    }
+
+    # Unweighted case
+    if (is.null(w)) {
+        return(sum(x) / length(x))
+    }
+
+    # Weighted case
+    sum(x * w) / sum(w)
+}
+
+# Fast standard deviation of `x`, optionally weighted by `w`. When `subset` is
+# supplied it is used to index `x` (and `w`, if given) first. The unweighted
+# version divides by `length(x) - 1`; the weighted version scales the weighted
+# sum of squares by sum(w) / (sum(w)^2 - sum(w^2)).
+.wtd_sd <- function(x, w = NULL, subset = NULL) {
+    # Apply the subset up front so a single code path computes the SD
+    if (!is.null(subset)) {
+        x <- x[subset]
+        if (!is.null(w)) {
+            w <- w[subset]
+        }
+    }
+
+    # Unweighted case
+    if (is.null(w)) {
+        centered <- x - .wtd_mean(x)
+        return(sqrt(sum(centered^2)/(length(x) - 1)))
+    }
+
+    # Weighted case
+    total_w <- sum(w)
+    centered <- x - .wtd_mean(x, w)
+    sqrt((total_w / (total_w^2 - sum(w^2))) * sum(w * centered^2))
+}
+
+# Assembles a list of square matrices (e.g., vcovs) into one large square matrix
+# with the inputs laid out along the diagonal, in order, and 0s everywhere else.
+# Used to create a large covariance matrix.
+.block_diagonal <- function(matlist) {
+    total_dim <- sum(unlist(lapply(matlist, ncol)))
+
+    combined <- matrix(0, nrow = total_dim, ncol = total_dim)
+
+    # `offset` is the number of rows/columns already filled by earlier blocks
+    offset <- 0
+    for (i in seq_along(matlist)) {
+        block <- matlist[[i]]
+        block_size <- ncol(block)
+        pos <- offset + seq_len(block_size)
+        combined[pos, pos] <- block
+        offset <- offset + block_size
+    }
+
+    combined
+}
+
+# Checks if a given family specification is okay. Always returns a single
+# TRUE/FALSE rather than erroring on a bad specification.
+#
+# `family` may be:
+#   * a string naming a family function (looked up two frames above this
+#     function), or one of the negative binomial aliases, which are accepted
+#     without lookup;
+#   * a family function, which is called with no arguments;
+#   * an already-constructed family object.
+# The resulting object is okay if it is a list with a non-NULL `family` entry
+# and function-valued `variance` and `linkinv` entries.
+.okay_family <- function(family) {
+    if (is.character(family)) {
+        if (length(family) != 1 || anyNA(family) || !nzchar(family)) return(FALSE)
+        if (family %in% c("negbin", "negative.binomial", "Negative Binomial")) return(TRUE)
+
+        # Resolve the environment once so exists() and get() search the same place
+        lookup_env <- parent.frame(2)
+
+        # An unknown name is simply not an okay family; don't let get() error
+        if (!exists(family, mode = "function", envir = lookup_env)) return(FALSE)
+        family <- get(family, mode = "function", envir = lookup_env)
+    }
+    if (is.function(family)) {
+        # Functions that cannot be called without arguments are not families
+        family <- tryCatch(family(), error = function(e) NULL)
+    }
+
+    # is.list() guards against `$` erroring on atomic inputs
+    is.list(family) && !is.null(family$family) &&
+        is.function(family$variance) && is.function(family$linkinv)
+}
+
+#Joint covariance matrix of coefficients across multiple models. Requires same units in all models,
+#use nonzero weights to subset (e.g., weights of 1 for present and 1e-8 for absent). Should give
+#same results as M-estimation (HC0 vcov).
+#
+#`fits` is a list of fitted model objects. Each must work with coef(), nobs(),
+#sandwich::estfun(), and .bread(). The result is a square matrix whose dimension is the
+#total number of non-NA coefficients across all fits, with the models' coefficients
+#laid out in the order the fits are supplied.
+#
+#NOTE(review): sandwich is called via `::` here and in .bread(); confirm it is listed
+#in the package's Imports, since a missing declaration would fail at run time or in R CMD check.
+vcovSUEST <- function(fits) {
+    #Number of estimable (non-NA) coefficients in each model
+    coef_lengths <- lengths(lapply(fits, function(f) na.omit(coef(f))))
+    #Offsets: l[i] is the number of coefficients belonging to models before model i
+    l <- c(0, cumsum(coef_lengths))
+    #Row/column positions of each model's coefficients in the joint matrix
+    coef_inds <- lapply(seq_along(fits), function(i) seq_len(coef_lengths[i]) + l[i])
+    
+    #One (coefficients x observations) matrix per model: bread %*% t(estfun) / n.
+    #NA entries are zeroed so they contribute nothing to the cross-products below
+    inf_func <- lapply(fits, function(f) { 
+        b <- .bread(f)
+        ef <- sandwich::estfun(f)
+        inf <- tcrossprod(b, ef)/nobs(f)
+        inf[is.na(inf)] <- 0
+        inf
+    })
+    
+    #VCOV matrix to be returned
+    V <- matrix(NA_real_, nrow = sum(coef_lengths), ncol = sum(coef_lengths))
+    
+    for (i in seq_along(fits)) {
+        ind_i <- coef_inds[[i]]
+        
+        #Usual within-model HC0 vcov
+        V[ind_i, ind_i] <- tcrossprod(inf_func[[i]], inf_func[[i]])
+        
+        #Only models after i are visited; the mirrored block is filled by transposing
+        for (j in seq_along(fits)[-seq_len(i)]) {
+            ind_j <- coef_inds[[j]]
+            
+            #between-model vcov components
+            V[ind_i, ind_j] <- tcrossprod(inf_func[[i]], inf_func[[j]])
+            V[ind_j, ind_i] <- t(V[ind_i, ind_j])
+        }
+    }
+    
+    V
+}
+
+# Quickly computes the bread matrix of a fitted model. Objects whose first class
+# is neither "lm" nor "glm" are handed to sandwich::bread(). For lm/glm fits
+# the bread is built directly from the stored QR decomposition.
+.bread <- function(x) {
+    model_class <- class(x)[1]
+
+    if (!model_class %in% c("lm", "glm")) {
+        return(sandwich::bread(x))
+    }
+
+    rank <- x$rank
+
+    # Nothing was estimated: return an empty matrix
+    if (rank == 0) {
+        return(matrix(NA_real_, 0L, 0L))
+    }
+
+    qr_obj <- x$qr
+    keep <- seq_len(rank)
+
+    # Names of the coefficients that were actually estimated, in pivot order
+    est_names <- names(x$coefficients[qr_obj$pivot[keep]])
+
+    # Unscaled covariance from the triangular factor of the QR decomposition
+    bread_mat <- chol2inv(qr_obj$qr[keep, keep, drop = FALSE])
+    dimnames(bread_mat) <- list(est_names, est_names)
+
+    # Scale by rank + residual df
+    bread_mat <- bread_mat * (rank + x$df.residual)
+
+    # Families for which no dispersion rescaling is applied
+    fixed_dispersion <- c("poisson", "binomial", "Negative Binomial")
+
+    if (model_class == "glm" && !substr(x$family$family, 1L, 17L) %in% fixed_dispersion) {
+        work_w <- weights(x, "working")
+        weighted_res <- as.vector(residuals(x, "working")) * work_w
+        bread_mat <- bread_mat * (sum(weighted_res^2, na.rm = TRUE) / sum(work_w, na.rm = TRUE))
+    }
+
+    bread_mat
+}
+
+
+
@@ -0,0 +1,7 @@
+#' @keywords internal
+"_PACKAGE"
+
+## usethis namespace: start
+#' @import ggplot2
+## usethis namespace: end
+NULL
@@ -0,0 +1,134 @@
+#' @title Bootstrap estimation of ATTs
+#' 
+#' @description `eepd_boot()` bootstraps the selection of optimal models and estimation of ATTs done by [eepd_sim()] in order to account for uncertainty in sampling from the population. Bootstrapping is done by [fwb::fwb()], which uses the fractional weighted bootstrap or the traditional bootstrap.
+#' 
+#' @inheritParams eepd_fit
+#' @inheritParams fwb::fwb
+#' @param models either an `eepd_models` object (the output of a call to [eepd_mod()]) or an `eepd_fits` object (the output of a call to [eepd_fit()]). If the latter, the arguments `data`, `group_var`, `unit_var`, `time_var`, `val_times`, and `post_time` should be left empty as they will be extracted from the supplied object.
+#' @param nboot the number of bootstrap iterations to use; default is 999. More is better but takes longer.
+#' @param boot_type string; the type of bootstrap to perform. See the `wtype` argument of [fwb::fwb()] for allowable options. The default is `"exp"`, which requests the fractional weighted bootstrap using weights drawn from an Exp(1) distribution. `"multinom"` requests the usual bootstrap, which can fail when key observations required to fit certain models happen not to be selected into a given bootstrap sample.
+#' @param nsim the number of simulation iterations to perform in each bootstrap sample. Default is 200. More is better but takes longer.
+#' 
+#' @details
+#' When a unit variable is available, it is passed to [fwb::fwb()] as the `cluster` argument. When no unit variable is supplied (which is only allowed when no model requires one), no clustering is requested.
+#' 
+#' @returns
+#' An `eepd_boot` object, which inherits from `fwb`, containing the estimated ATTs in each bootstrap iteration. See [fwb::fwb()] for details. `summary()`, `plot()`, and `print()` methods are available; see [fwb::summary.fwb()] and [fwb::plot.fwb()] for details.
+#' 
+#' @seealso [eepd_fit()]; [eepd_sim()]; [fwb::fwb()]
+#' 
+#' @examples 
+#' data("ptpdata")
+#' 
+#' # Combination of 8 models: 2 baseline formulas,
+#' # 2 families, 2 lags
+#' models <- eepd_mod(list(crude_rate ~ 1,
+#'                         crude_rate ~ year),
+#'                    log = c(FALSE, TRUE))
+#' models
+#' 
+#' # Fit the models to data; unit_var must be supplied for
+#' # fixed effects
+#' cl <- parallel::detectCores()
+#' boot_out <- eepd_boot(models, data = ptpdata,
+#'                       nboot = 99, nsim = 100,
+#'                       group_var = "group",
+#'                       time_var = "year",
+#'                       val_times = 1999:2003,
+#'                       post_time = 2008,
+#'                       unit_var = "state",
+#'                       cl = cl, verbose = TRUE)
+#' 
+#' summary(boot_out, ci.type = "perc")
+#' 
+#' @export 
+eepd_boot <- function(models, data, nboot = 999, boot_type = getOption("fwb_wtype", "exp"), nsim = 200,
+                      group_var, unit_var, time_var,
+                      val_times, post_time, cl = NULL, verbose = FALSE) {
+    # Argument checks
+    chk::chk_not_missing(models, "`models`")
+    
+    if (inherits(models, "eepd_models")) {
+        chk::chk_not_missing(data, "`data`")
+        chk::chk_data(data)
+        
+        # Process and order dataset
+        data <- as.data.frame(data)
+        
+        if (is.null(rownames(data))) {
+            rownames(data) <- seq_len(nrow(data))
+        }
+        
+        # Grouping variable: must have exactly two values, recoded to "0"/"1"
+        chk::chk_not_missing(group_var, "`group_var`")
+        chk::chk_string(group_var)
+        chk::chk_subset(group_var, names(data))
+        if (length(unique(data[[group_var]])) != 2) {
+            chk::err("the grouping variable must have exactly 2 unique values")
+        }
+        data[[group_var]] <- factor(data[[group_var]])
+        levels(data[[group_var]]) <- c("0", "1")
+        
+        chk::chk_not_missing(time_var, "`time_var`")
+        chk::chk_string(time_var)
+        chk::chk_subset(time_var, names(data))
+        
+        # The unit variable is only required when some model uses differencing,
+        # lags, or unit fixed effects; in that case sort by unit and time
+        if (any(vapply(models, function(m) m$diff_k > 0 || m$lag > 0 || m$fixef, logical(1L)))) {
+            chk::chk_not_missing(unit_var, "`unit_var`")
+            chk::chk_string(unit_var)
+            chk::chk_subset(unit_var, names(data))
+            data[[unit_var]] <- factor(data[[unit_var]])
+            
+            data <- data[order(data[[unit_var]], data[[time_var]]),, drop = FALSE]
+        }
+        
+        chk::chk_not_missing(val_times, "`val_times`")
+        chk::chk_numeric(val_times)
+        chk::chk_subset(val_times, data[[time_var]])
+        
+        chk::chk_not_missing(post_time, "`post_time`")
+        chk::chk_number(post_time)
+        chk::chk_subset(post_time, data[[time_var]])
+        chk::chk_gt(post_time, val_times)
+    }
+    else if (inherits(models, "eepd_fits")) {
+        # Everything is taken from the fitted object, so these must not be supplied
+        chk::chk_missing(data, "`data`")
+        chk::chk_missing(group_var, "`group_var`")
+        chk::chk_missing(time_var, "`time_var`")
+        chk::chk_missing(unit_var, "`unit_var`")
+        chk::chk_missing(val_times, "`val_times`")
+        chk::chk_missing(post_time, "`post_time`")
+        
+        data <- models$data
+        group_var <- attr(models, "group_var")
+        time_var <- attr(models, "time_var")
+        unit_var <- attr(models, "unit_var")
+        val_times <- models$val_times
+        post_time <- models$post_time
+        models <- models$models
+    }
+    else {
+        chk::err("`models` must be an `eepd_models` object (the output of a call to `eepd_mod()`) or an `eepd_fits` object (the output of a call to `eepd_fit()`)")
+    }
+    
+    chk::chk_flag(verbose)
+    
+    # Cluster on the unit variable only when one is available. `unit_var` may
+    # legitimately be missing (no model required it) or NULL (not stored on an
+    # `eepd_fits` object); indexing `data` with it in those cases would error.
+    # NOTE(review): a missing `unit_var` is still forwarded as-is to
+    # .fit_models_internal() below -- confirm that helper tolerates it.
+    cluster <- NULL
+    if (!missing(unit_var) && !is.null(unit_var)) {
+        cluster <- data[[unit_var]]
+    }
+    
+    # Statistic computed in each bootstrap sample: refit all models using the
+    # bootstrap weights, rerun the simulation, and average the resulting ATTs
+    .boot_fun <- function(.data, .weights) {
+        
+        fits <- .fit_models_internal(models, .data, .weights, group_var, unit_var, time_var,
+                                     val_times, post_time)
+
+        # All weights equal to 1 is treated as the original (unresampled)
+        # sample; there eepd_sim()'s own default `nsim` is used instead of the
+        # user-supplied one. NOTE(review): confirm this asymmetry is intended.
+        if (all(.weights == 1)) {
+            sim_est <- eepd_sim(fits)
+        }
+        else {
+            sim_est <- eepd_sim(fits, nsim)
+        }
+        
+        c(ATT = unname(mean(sim_est$atts)))
+    }
+    
+    boot_out <- fwb::fwb(data, .boot_fun, R = nboot, cluster = cluster,
+                         wtype = boot_type, verbose = verbose, cl = cl)
+    
+    class(boot_out) <- c("eepd_boot", class(boot_out))
+    
+    boot_out
+}
Original file line number	Diff line number	Diff line change
`@@ -47,3 +47,4 @@ po/*~`
`47`	`47`
`48`	`48`	`# RStudio Connect folder`
`49`	`49`	`rsconnect/`
	`50`	`+inst/doc`