#Lab2.R
#Generate Kery's wallcreeper data
set.seed(123)
n <- 16 #number of years
a <- 40 #intercept
b <- -1.5 #slope
sigma2 <- 25 #residual variance
years <- 1:16 #years
eps <- rnorm(n,mean=0,sd=sqrt(sigma2))
prcnt <- a + b*years + eps
junk <- rnorm(16,3,2) #also generate a junk predictor variable
KW <- as.data.frame(cbind(prcnt,years,junk))
#plot it
plot(KW$years,KW$prcnt,xlab = "Year",las=1,ylab="% occupied",xaxt='n',pch=19)
years <- 1990:2005 #actual calendar years, used only to label the x-axis
axis(side = 1, labels = years, at = 1:16)
###################################################
# #
# SOLVING FOR BETA-HAT AND Y-HAT #
# #
###################################################
#this is our design matrix
d.mat <- cbind(rep(1,n),KW$years)
#solve for b-hat the hard way
#note that this is not the ideal way to solve for b-hat numerically;
#it can behave poorly with messier (e.g., nearly collinear) data
(inv.xtx <- solve(t(d.mat)%*%d.mat))
(beta.hat <- inv.xtx%*%t(d.mat)%*%KW$prcnt)
(y.hat <- d.mat%*%beta.hat)
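#(optional) solving the normal equations explicitly, as above, can be numerically
#unstable; a QR-based least-squares solve is a safer sketch of the same calculation
(beta.hat.qr <- qr.solve(d.mat, KW$prcnt)) #should match beta.hat up to rounding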
#and let's just get the hat matrix
#hat matrix is n*n dimensional
hat.matrix <- d.mat%*%inv.xtx%*%t(d.mat)
#we can also use our hat matrix to get y.hat (and some other stuff later)
(y.hat <- hat.matrix%*%KW$prcnt)
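#optional sanity checks on the hat matrix: its trace equals the number of columns
#of the design matrix, and its diagonal holds the leverages (each between 1/n and 1)
sum(diag(hat.matrix)) #should be 2
range(diag(hat.matrix))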
#plot the regression line and y.hat
abline(a = beta.hat[1], b = beta.hat[2],col="red")
points(KW$years,y.hat,col="red",pch=19)
#we can also do this the easy way
(summary(mod <- lm(prcnt ~ years,data = KW)))
#and plot it again - it will be the same
plot(KW$years,KW$prcnt,xlab = "Year",las=1,ylab="% occupied",xaxt='n',pch=19)
years <- 1990:2005 #calendar years for the x-axis labels
axis(side = 1, labels = years, at = 1:16)
abline(mod,col="red")
points(KW$years,y.hat,col="red",pch=19)
#and a third way to solve this
#this is useful to know later when you want to customize a DM
(summary(mod.v2 <- lm(KW$prcnt ~ -1 + d.mat)))
#What are the residuals?
(resid <- KW$prcnt-y.hat)
#or
(resid <- KW$prcnt-predict(mod,type="response"))
#or
resid <- residuals(mod)
#plot the residuals - are there any patterns?
plot(x = KW$years, y = resid,xlab ="Year", ylab = "Residuals")
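#an additional (optional) residual check: a normal Q-Q plot; points falling roughly
#on the line are consistent with the normality assumption
qqnorm(resid)
qqline(resid)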
###################################################
# #
# TESTS #
# #
###################################################
#get Total, Residual, and Model SS
(TotSS <- t(KW$prcnt - mean(KW$prcnt))%*%(KW$prcnt-mean(KW$prcnt)) )
(ResSS <- t(KW$prcnt-y.hat) %*% (KW$prcnt-y.hat))
#note that we can also get ResSS with our hat matrix
#and note that diag(16) is a 16*16 identity matrix
(ResSS <- t(KW$prcnt)%*%(diag(16) - hat.matrix) %*% KW$prcnt)
(ModSS <- TotSS - ResSS)
#Mean SS
(M.ModSS <- ModSS/1) #df = ncol(X)-1
(M.ResSS <- ResSS/14) #df = df(total)-df(model)
(M.TotSS <- TotSS/15) #df = n-1
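#optional cross-check: the anova() table for mod reports these same quantities
#(Sum Sq for years = ModSS, Sum Sq for Residuals = ResSS, and the Mean Sq values above)
anova(mod)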
#and finally our F statistic and associated p-value
(f.stat <- (TotSS-ResSS)/(15-14)/(ResSS/14) )
pf(f.stat,15-14,14,lower.tail = F)
#Here's an example to demonstrate that RSS will always decline with increasing model complexity
#even if not by much
mod <- lm(prcnt ~ years, data = KW)
(RSS.mod <- anova(mod)$'Sum Sq' [2])
#this is a "junk" model because I added a random variable to it
mod.junk <- lm(prcnt ~ years + junk, data = KW)
(RSS.junk.mod <- anova(mod.junk)$'Sum Sq' [3])
#the RSS decreases by about 7.5% in this realization
(RSS.mod - RSS.junk.mod)/RSS.mod
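#although RSS always decreases, criteria that penalize complexity need not; as an
#optional check, adjusted R-squared and AIC will usually not favor mod.junk here
c(adj.r2.mod = summary(mod)$adj.r.squared, adj.r2.junk = summary(mod.junk)$adj.r.squared)
AIC(mod, mod.junk)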
#how do we get the sampling (residual) standard error from the RSS?
#we can also see this by looking at summary(mod)
(sigma <- sqrt( anova(mod)$'Sum Sq'[2]/(n-2) ) )
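#quick check: this should match the residual standard error reported by summary(mod)
summary(mod)$sigma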
#F-test
#here we are testing against TotSS (i.e., testing all predictors)
#the F statistic is always positive
#this is for model (mod)
( F.stat <- ( (TotSS - ResSS)/(2-1) )/ ( ResSS / (16-2)) )
pf(F.stat,2-1,16-2,lower.tail = F)
#this is the same as we get from summary(mod)
summary(mod)
#here we are testing just one predictor
anova(mod,mod.junk) #null hypothesis is that beta(junk) = 0 | b0, b1 in model
#note that this is equivalent to p we would get from t-test
summary(mod.junk)
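#optional check of that equivalence: with a single added predictor, F = t^2
anova(mod, mod.junk)$F[2]
summary(mod.junk)$coefficients[3,3]^2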
#Here is a model with just the mean (intercept)
mod.mean <- lm(prcnt ~ 1, data = KW)
#so we can test a pair of predictors jointly (H0: b(years) = b(junk) = 0)
#the p-value tells us to retain the pair even though one of them is "junk",
#because the other one is useful
anova(mod.junk,mod.mean)
#and we can test a model subspace, such as H0: b(years) = b(junk)
mod.equal <- lm(prcnt ~ I(years + junk), data = KW)
anova(mod.equal,mod.junk)
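#a related trick (sketch): test a specific coefficient value, e.g. H0: b(years) = -1,
#by moving the hypothesized term into an offset and comparing nested models
mod.fix <- lm(prcnt ~ junk + offset(-1*years), data = KW)
anova(mod.fix, mod.junk)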
###################################################
# #
# PERMUTATION TESTS #
# #
###################################################
#we can test the entire model, i.e., H0: b1 = b2 = 0
nreps <- 100000
fstats <- numeric(nreps)
for(i in 1:nreps){
  lmods <- lm(sample(prcnt) ~ years + junk, data = KW)
  # lmods <- update(mod.junk, sample(prcnt) ~ .) #this is equivalent
  fstats[i] <- summary(lmods)$fstatistic[1]
}
mean(fstats > summary(mod.junk)$fstatistic[1])
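#optional visual: the permutation null distribution of F with the observed F marked
hist(fstats, breaks = 50, main = "Permutation F statistics", xlab = "F")
abline(v = summary(mod.junk)$fstatistic[1], col = "red")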
#we can test one predictor, e.g., H0: b2 = 0 | b1 in model
nreps <- 100000
tstats <- numeric(nreps)
for(i in 1:nreps){
  lmods <- lm(prcnt ~ years + sample(junk), data = KW)
  tstats[i] <- summary(lmods)$coefficients[3,3]
}
mean(abs(tstats) > abs(summary(mod.junk)$coefficients[3,3]))
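#for comparison, the parametric p-value for the junk coefficient (from the t-test)
#should be in the same ballpark as the permutation p-value above
summary(mod.junk)$coefficients[3,4]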
###################################################
# #
# CONFIDENCE INTERVALS on Beta #
# #
###################################################
summary(mod.junk)
( crit.t <- qt(0.975,16-3) )
summary(mod.junk)$coefficients[,1] - summary(mod.junk)$coefficients[,2] * crit.t
summary(mod.junk)$coefficients[,1] + summary(mod.junk)$coefficients[,2] * crit.t
#or, for a shortcut
confint(mod.junk)
#relationship to t-value
( p.value <- summary( mod.junk)$coefficients[3,4] )
confint(mod.junk, level = 1-p.value)[3,]
#Bootstrap CIs
nb <- 1000
coefmat <- matrix(NA,nb,3)
resids <- residuals(mod.junk)
#note: preds <- model.matrix(mod.junk)%*%coef(mod.junk) is equivalent to the fitted() call below
preds <- fitted(mod.junk)
for(i in 1:nb){
  boot <- preds + sample(resids, replace = TRUE)
  bmod <- update(mod.junk, boot ~ .)
  coefmat[i,] <- coef(bmod)
}
colnames(coefmat) <- c("Intercept","Years","Junk")
coefmat <- data.frame(coefmat)
apply(coefmat,2,function(x) quantile(x,probs = c(0.025,0.975)))
#above should look very similar to confint(mod.junk)
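#an alternative bootstrap (sketch): resample rows (cases) rather than residuals,
#which relaxes the constant-variance assumption implicit in resampling residuals
coefmat2 <- matrix(NA, nb, 3)
for(i in 1:nb){
  rows <- sample(1:16, replace = TRUE)
  coefmat2[i,] <- coef(lm(prcnt ~ years + junk, data = KW[rows,]))
}
apply(coefmat2, 2, function(x) quantile(x, probs = c(0.025, 0.975)))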
###################################################
# #
# CONFIDENCE INTERVALS on y-hat #
# #
###################################################
#CI
summary( mod <- lm(prcnt ~ years, data = KW) ) #predictions below use the simple model
x <- model.matrix(mod)
# get the median year
(x0 <- apply(x,2,median))
#what is the expected % occupied in the median year?
(y0 <- sum(x0*coef(mod)))
#we can also use this command
predict(mod, newdata = data.frame(t(x0)))
#this command to get a prediction interval
predict(mod, newdata = data.frame(t(x0)), interval = "prediction")
#to get this from the equation
tcrit <- qt(0.975, df = (16-2))
sigma <- summary(mod)$sigma
y0 - tcrit*sigma*sqrt(1 + t(x0) %*% inv.xtx %*% x0)
y0 + tcrit*sigma*sqrt(1 + t(x0) %*% inv.xtx %*% x0)
#this command to get a confidence interval on the prediction
predict(mod, newdata = data.frame(t(x0)), interval = "confidence")
#and from equation
y0 - tcrit*sigma*sqrt(t(x0) %*% inv.xtx %*% x0)
y0 + tcrit*sigma*sqrt(t(x0) %*% inv.xtx %*% x0)
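#optional sketch: confidence and prediction bands across all years for the simple model
new.x <- data.frame(years = 1:16)
cint <- predict(mod, newdata = new.x, interval = "confidence")
pint <- predict(mod, newdata = new.x, interval = "prediction")
matplot(new.x$years, cbind(cint, pint[,2:3]), type = "l", lty = c(1,2,2,3,3),
        col = c("black","red","red","blue","blue"), xlab = "Year", ylab = "% occupied")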
###################################################
# #
# What is the SE on beta-hat? #
# #
###################################################
#generate 1000 realizations of a data set
reps <- 1000
store <- matrix(NA,nrow=2,ncol=reps)
rownames(store) <- c("beta","se.beta")
for(i in 1:reps){
  n <- 16 #number of years
  a <- 40 #intercept
  b <- -1.5 #slope
  sigma2 <- 25 #residual variance
  years <- 1:16 #years
  eps <- rnorm(n,mean=0,sd=sqrt(sigma2))
  prcnt <- a + b*years + eps
  KW <- as.data.frame(cbind(prcnt,years))
  #fit a model and store the estimate and the se of the coefficient
  mod <- lm(prcnt ~ years, KW)
  store[1,i] <- summary(mod)$coefficients[2,1]
  store[2,i] <- summary(mod)$coefficients[2,2]
}
#the standard deviation of the slope estimates across realizations should closely match
#the mean of the estimated standard errors
#the standard error is a measure of how variable beta-hat would be across repeated realizations of the dataset
sd(store[1,])
mean(store[2,])
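#optional visual: the sampling distribution of the slope estimates, which should be
#roughly centered on the true slope of -1.5
hist(store[1,], main = "Sampling distribution of beta-hat", xlab = "slope estimate")
abline(v = -1.5, col = "red")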