-
Notifications
You must be signed in to change notification settings - Fork 48
/
Copy pathsimulation.R
239 lines (232 loc) · 7.88 KB
/
simulation.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
#' Simulate Data from Structural Equation Model
#'
#' Interprets the input graph as a structural equation model, generates random path
#' coefficients, and simulates data from the model. This is a very bare-bones
#' function and probably not very useful
#' except for quick validation purposes (e.g. checking that an implied vanishing
#' tetrad truly vanishes in simulated data). For more elaborate simulation studies, please
#' use the lavaan package or similar facilities in other packages.
#'
#' @param x the input graph, a DAG (which may contain bidirected edges).
#' @param N number of samples to generate.
#' @param b.lower lower bound for random path coefficients, applied if \code{b.default=NULL}.
#' @param b.upper upper bound for path coefficients.
#' @param b.default default path coefficient applied to arrows for which no coefficient is
#' defined in the model syntax.
#' @param eps residual variance (only meaningful if \code{standardized=FALSE}).
#' @param empirical logical. If true, the empirical covariance matrix will be equal to the
#' population covariance matrix.
#' @param standardized logical. If true, a standardized population covariance matrix
#' is generated (all variables have variance 1).
#' @param verbose logical. If true, prints the generated population covariance matrix.
#'
#' @return Returns a data frame containing \code{N} values for each variable in \code{x}.
#'
#' @details Data are generated in the following manner.
#' Each directed arrow is assigned a path coefficient that can be given using the attribute
#' "beta" in the model syntax (see the examples). All coefficients not set in this manner are
#' set to the \code{b.default} argument, or if that is not given, are chosen uniformly
#' at random from the interval given by \code{b.lower} and \code{b.upper} (inclusive; set
#' both parameters to the same value for constant path coefficients). Each bidirected
#' arrow a <-> b is replaced by a substructure a <- L -> b, where L is an exogenous latent
#' variable. Path coefficients on such substructures are set to \code{sqrt(x)}, where
#' \code{x} is again chosen at random from the given interval; if \code{x} is negative,
#' one path coefficient is set to \code{-sqrt(x)} and the other to \code{sqrt(x)}. All
#' residual variances are set to \code{eps}.
#'
#' If \code{standardized=TRUE}, all path coefficients are interpreted as standardized coefficients.
#' But not all standardized coefficients are compatible with all graph structures.
#' For instance, the graph structure z <- x -> y -> z is incompatible with standardized
#' coefficients of 0.9, since this would imply that the variance of z must be larger than
#' 1. For large graphs with many parallel paths, it can be very difficult to find coefficients
#' that work.
#'
#'
#' @examples
#' ## Simulate data with pre-defined path coefficients of -.6
#' g <- dagitty('dag{z -> x [beta=-.6] x <- y [beta=-.6] }')
#' x <- simulateSEM( g )
#' cov(x)
#'
#'
#' @export
simulateSEM <- function( x, b.default=NULL, b.lower=-.6, b.upper=.6, eps=1, N=500, standardized=TRUE,
empirical=FALSE, verbose=FALSE ){
if( !requireNamespace( "MASS", quietly=TRUE ) ){
stop("This function requires the 'MASS' package!")
}
.supportsTypes( x, c("dag") )
x <- as.dagitty( x )
e <- .edgeAttributes( x, "beta" )
e$a <- as.double(as.character(e$a))
b.not.set <- is.na(e$a)
if( is.null( b.default ) ){
e$a[b.not.set] <- runif(sum(b.not.set),b.lower,b.upper)
} else {
e$a[b.not.set] <- b.default
}
v <- .vertexAttributes( x, "eps" )
v$a <- as.double(as.character(v$a))
v.not.set <- is.na(v$a)
v$a[v.not.set] <- eps
ovars <- names(x)
lats <- c()
nV <- length(ovars)
nL <- sum(e$e=="<->")
if( nrow(e) > 0 ){
vars <- paste0("v",ovars)
if( nL > 0 ){
lats <- paste0("l",seq_len(nL))
} else {
lats <- c()
}
Beta <- matrix( 0, nrow=nV+nL, ncol=nV+nL )
rownames(Beta) <- colnames(Beta) <- c(vars,lats)
cL <- 1
for( i in seq_len( nrow(e) ) ){
b <- e$a[i]
if( e$e[i] == "<->" ){
lV <- paste0("l",cL)
lb <- sqrt(abs(b))
Beta[lV,paste0("v",e$v[i])] <- lb
if( b < 0 ){
Beta[lV,paste0("v",e$w[i])] <- -lb
} else {
Beta[lV,paste0("v",e$w[i])] <- lb
}
cL <- cL + 1
} else if( e$e[i] == "->" ){
Beta[paste0("v",e$v[i]),paste0("v",e$w[i])] <- b
}
}
L <- (diag( 1, nV+nL ) - Beta)
Li <- MASS::ginv( L )
if( standardized == TRUE ){
Phi <- MASS::ginv( t(Li)^2 ) %*% rep(eps,nrow(Beta))
Phi <- diag( c(Phi), nV+nL )
} else {
veps <- rep( eps, nV+nL )
veps[1:nV] <- v$a[match(ovars,v$v)]
Phi <- diag( veps, nV+nL )
}
Sigma <- t(Li) %*% Phi %*% Li
} else {
if( standardized == TRUE ){
Sigma <- diag(eps,nV+nL)
} else {
veps <- rep( eps, nV+nL )
veps[1:nV] <- v$a[match(ovars,v$v)]
Sigma <- diag( veps, nV+nL )
}
}
if( verbose ){
SigmaC <- Sigma
colnames( SigmaC ) <- rownames( SigmaC ) <- c(ovars,lats)
print( veps )
print( SigmaC )
}
r <- MASS::mvrnorm( N, rep(0,nV+nL), Sigma, empirical=empirical )[,1:nV,drop=FALSE]
colnames(r) <- ovars
r <- as.data.frame(r)
r[,setdiff(ovars,latents(x)),drop=FALSE]
}
#' Simulate Binary Data from DAG Structure
#'
#' Interprets input DAG as a structural description of a logistic
#' model in which each variable is binary and its log-odds ratio is
#' a linear combination of its parent values.
#'
#' @param x the input graph, a DAG (which may contain bidirected edges).
#' @param N number of samples to generate.
#' @param b.lower lower bound for random path coefficients, applied if \code{b.default=NULL}.
#' @param b.upper upper bound for path coefficients.
#' @param b.default default path coefficient applied to arrows for which no coefficient is
#' defined in the model syntax.
#' @param eps base log-odds ratio.
#' @param verbose logical. If true, prints the order in which the data are generated (which
#' should be a topological order).
#'
#' @export
simulateLogistic <- function( x, b.default=NULL,
b.lower=-.6, b.upper=.6, eps=0, N=500,
verbose=FALSE ){
.supportsTypes( x, c("dag") )
x <- as.dagitty( x )
e <- .edgeAttributes( x, "beta" )
e$a <- as.double(as.character(e$a))
b.not.set <- is.na(e$a)
if( is.null( b.default ) ){
e$a[b.not.set] <- runif(sum(b.not.set),b.lower,b.upper)
} else {
e$a[b.not.set] <- b.default
}
v <- .vertexAttributes( x, "eps" )
v$a <- as.double(as.character(v$a))
v.not.set <- is.na(v$a)
v$a[v.not.set] <- eps
v$v <- paste0("v",v$v)
ovars <- names(x)
nV <- length(ovars)
nL <- sum(e$e=="<->")
lats <- c()
vars <- paste0("v",ovars)
if( nrow(e) > 0 ){
if( nL > 0 ){
lats <- paste0("l",seq_len(nL))
}
Beta <- matrix( 0, nrow=nV+nL, ncol=nV+nL )
rownames(Beta) <- colnames(Beta) <- c(vars,lats)
cL <- 1
for( i in seq_len( nrow(e) ) ){
b <- e$a[i]
if( e$e[i] == "<->" ){
lV <- paste0("l",cL)
lb <- sqrt(abs(b))
Beta[lV,paste0("v",e$v[i])] <- lb
if( b < 0 ){
Beta[lV,paste0("v",e$w[i])] <- -lb
} else {
Beta[lV,paste0("v",e$w[i])] <- lb
}
cL <- cL + 1
} else if( e$e[i] == "->" ){
Beta[paste0("v",e$v[i]),paste0("v",e$w[i])] <- b
}
}
} else {
Beta <- diag(1,nV)
colnames(Beta) <- vars
}
if( verbose ){
print(Beta)
}
rootNodes <- which( Beta != 0 )
r <- matrix( 0, ncol=ncol(Beta), nrow=N )
colnames(r) <- colnames(Beta)
roots <- !(colnames(Beta) %in% paste0("v",unique(e$w)))
for( i in colnames(Beta)[roots] ){
if( verbose ){
print(paste("Generating",i))
}
r[,i] <- 2*rbinom( N, 1, .odds2p( v$a[v$v==i] ) )-1
}
tord <- unlist(topologicalOrdering( x )[ovars])
if( verbose ){
print("topological ordering: ")
print(tord)
}
for( i in which(!roots)[order(tord[!roots])] ){
cn <- colnames(Beta)[i]
if( verbose ){
print(paste("Generating",cn))
}
p <- .odds2p( r %*% Beta[,i] + v$a[v$v==cn] )
r[,cn] <- 2*rbinom( N, 1, p )-1
}
r <- r[,seq(1,nV),drop=FALSE]
colnames(r) <- ovars
r <- as.data.frame(r)
r <- r[,setdiff(ovars,latents(x)),drop=FALSE]
r[] <- lapply(r,function(x) factor(x,levels=c(-1,1)))
r
}