-
Notifications
You must be signed in to change notification settings - Fork 82
/
rstan_MixedModelBetaRegression.R
178 lines (138 loc) · 5.83 KB
/
rstan_MixedModelBetaRegression.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
####################################################################################
### Mixed model for a beta-distributed response, fit with Stan via rstan.        ###
### See ?betareg::GasolineYield for a description of the data. A JAGS version    ###
### of the same model lives in jags_MixedModelBetaRegression.R. Settings below   ###
### are tuned for demonstration rather than optimal results.                     ###
####################################################################################

#### Get the data --------------------------------------------------------------
data(GasolineYield, package = 'betareg')
hist(GasolineYield$yield)

#### mgcv comparison fit -------------------------------------------------------
# mgcv is about as close as one gets to an out-of-the-box mixed model with a
# beta response in R (gamlss is an option, but its results look problematic).
# Random slopes are unavailable through gamm4/gamm with a beta family because
# those route through lme4/nlme. The random slopes make little difference here,
# so this random-intercept fit is still a useful sanity check against the Stan
# estimates; drop the random slopes from the Stan model for a direct comparison
# and the estimates are practically identical. The JAGS version linked above
# allows direct comparison as well.
library(mgcv)
modgam <- gam(
  yield ~ scale(temp, scale = FALSE) + s(batch, bs = 're'),
  data   = GasolineYield,
  family = betar(link = "logit")
)
# summary(modgam)
#######################
### Stan model code ###
#######################
# Mixed-effects beta regression: random intercepts and random slopes (for
# centered temperature) by batch. Both random effects use a non-centered
# ("Matt trick") parameterization: standard-normal latent gammas scaled by
# their respective SDs in the transformed parameters block, which improves
# sampling speed and convergence (see comment inside the model block).
# The linear predictor is mapped through inv_logit to the beta mean yhat,
# then converted to the distribution's shape parameters A = mu*phi and
# B = (1-mu)*phi, where phi is the dispersion.
stanmodelcode ='
data {
int<lower=0> N; // number of observations
int<lower=1> L; // number of batches
vector[N] yield; // response
int<lower=1,upper=L> id[N]; // batch
vector[N] temp; // temperature
}
transformed data {
vector[N] tempCen;
tempCen = temp - mean(temp); // centered explanatory variable
}
parameters {
real Intercept; // "fixed" effects
real betaTemp;
real<lower=0> phi; // dispersion parameter
real<lower=0> sd_int; // sd for ints
real<lower=0> sd_beta; // sd for temp
vector[L] gammaIntercept; // individual effects
vector[L] gammaTemp; // individual effects
}
transformed parameters{
vector<lower=0>[N] A; // parameter for beta distn
vector<lower=0>[N] B; // parameter for beta distn
vector<lower=0,upper=1>[N] yhat; // transformed linear predictor
vector[L] IntRE;
vector[L] SlopeRE;
for (l in 1:L){
IntRE[l] = gammaIntercept[l]*sd_int;
SlopeRE[l] = gammaTemp[l]*sd_beta ;
}
// model calculations
for(n in 1:N) {
yhat[n] = inv_logit((IntRE[id[n]] + Intercept) + (SlopeRE[id[n]] + betaTemp) * tempCen[n]);
}
A = yhat * phi;
B = (1.0-yhat) * phi;
}
model {
// priors
Intercept ~ normal(0, 10);
betaTemp ~ normal(0, 1);
sd_int ~ cauchy(0, 2.5);
sd_beta ~ cauchy(0, 2.5);
phi ~ cauchy(0, 5);
// matt trick used for following;
// else slower and convergence issues
gammaIntercept ~ normal(0, 1); // random intercepts for each batch
gammaTemp ~ normal(0, 1); // random slopes for each batch
// likelihood
yield ~ beta(A, B);
}
'
# NOTE(review): the array declaration `int id[N]` is the pre-2.26 Stan syntax;
# recent rstan/Stan versions require `array[N] int id;` — confirm against the
# installed rstan version before changing the string.
################
### Test run ###
################
### Data set up
library(rstan)

# Data list passed to Stan. The number of batches L is derived from the factor
# levels rather than hard-coded (original hard-coded L=10), so the setup
# generalizes if the data change. The original also built a model matrix
# (data.frame of intercept + temp) that was never used; it has been removed.
dat <- list(
  N     = nrow(GasolineYield),
  L     = length(levels(GasolineYield$batch)),   # number of batches (10 here)
  yield = GasolineYield$yield,
  temp  = GasolineYield$temp,
  id    = as.numeric(GasolineYield$batch)
)

### Run the model and check diagnostics
# Short run to compile the model and sanity-check sampling before the full run.
test = stan(model_code = stanmodelcode, model_name = "example",
            init=0, # initializing unbounded at zero or setting the range for random init values proved useful
            # init_r=.01,
            data = dat, iter = 2000, warmup=200, thin=1, chains = 2,
            pars = c('Intercept','betaTemp','IntRE', 'SlopeRE', 'phi', 'sd_int', 'sd_beta'), verbose = F, refresh=2000/4)

# diagnostics: step-size/mass-matrix adaptation info and per-iteration sampler
# parameters (divergences, treedepth, etc.)
ainfo = get_adaptation_info(test)
cat(ainfo[[1]])
samplerpar = get_sampler_params(test)[[1]]
summary(samplerpar)
shinystan::launch_shinystan(test)

# summary
print(test, digits_summary=4)
################
### Full run ###
################
### set iterations etc.
iters   = 22000
wu      = 2000
thin    = 20
nchains = 4

### setup and run in parallel
# `fit = test` reuses the compiled model from the test run, and `cores = 4`
# runs the chains in parallel and returns a single combined stanfit object.
# NOTE: chains = nchains is passed explicitly; the original relied on stan()'s
# default of 4 chains, which happened to match nchains.
fit2 = stan(model_code = stanmodelcode, model_name = "mixedreg",
            init=0, # see comment in test run
            # init_r=.01,
            fit = test,
            data = dat, iter = iters, warmup=wu, thin=thin,
            chains = nchains, cores = 4,
            verbose = T, refresh=iters/4)

# BUG FIX: the original then did `fit2 = sflist2stanfit(parfit)`, but `parfit`
# is never defined anywhere in the script, so that line errored and clobbered
# fit2. sflist2stanfit() is only needed when chains are run manually in
# separate stan() calls; with cores = 4 above the chains are already combined.

### examine some diagnostics
ainfo = get_adaptation_info(fit2)
cat(ainfo[[1]])
samplerpar = get_sampler_params(fit2)[[1]]
summary(samplerpar)

### examine model
print(fit2, pars = c('Intercept','betaTemp','IntRE', 'SlopeRE', 'phi', 'sd_int', 'sd_beta', 'lp__'),
      digits=4, probs = c(.025, .5, 0.975))
shinystan::launch_shinystan(fit2)
### Compare to mgcv ------------------------------------------------------------
# Side-by-side check of the batch random intercepts: mgcv's estimated s(batch)
# coefficients vs. the posterior means of IntRE from the Stan fit.
summary(modgam)
gam.vcomp(modgam)
gamIntRE  <- modgam$coeff[grep('batch', names(modgam$coef))]
stanIntRE <- colMeans(extract(fit2, par='IntRE')[[1]])
cbind(gamIntRE, stanIntRE)