-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdata_analysis_3.R
267 lines (189 loc) · 9.53 KB
/
data_analysis_3.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
# $$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$
# Data Analysis with R #
# Khagendra Adhikari #
# 2016 #
# #
# $$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$
# Use packages
library(MASS)
library(ggplot2)
library(lmtest)
library(boot)
library(caret)
library(DAAG)
# birthwt {MAAS}
# Risk Factors Associated with Low Infant Birth Weight.
# ************************************************************************************
## The birthwt data frame has 189 rows and 10 columns.
## The data were collected at Baystate Medical Center, Springfield, Mass during 1986.
# ************************************************************************************
# structure of dataset birthwt.
str(birthwt)
# low: indicator of birth weight less than 2.5 kg.
#
# age: mother's age in years.
#
# low: mother's weight in pounds at last menstrual period.
#
# race: mother's race (1 = white, 2 = black, 3 = other).
#
# smoke: smoking status during pregnancy.
#
# ptl: number of previous premature labours.
#
# ht: history of hypertension.
#
# ui: presence of uterine irritability.
#
# ftv: number of physician visits during the first trimester.
#
# bwt: birth weight in grams.
# &&&&&&&&&&&&&&&&&&&&&&&&&&&&&*********************************&&&&&&&&&&&&&&&&&&&&&
# Ans.1 #
# &&&&&&&&&&&&&&&&&&&&&&&&&&&&&*********************************&&&&&&&&&&&&&&&&&&&&&
attach(birthwt)
# Attach dataset birthwt, so the objects in the database can be accessed
# by simply giving their names.
(race <- factor(race, labels = c("white", "black", "other")))
# The factor function creates factor variable from numeric variable.
# The levels in variable race has been modified as "white" for 1, "black"
# for 2, and "other" for 3.
(ptd <- factor(ptl > 0))
# Create new logic variable from numeric variable with
# TRUE for ptl > 0 and FALSE otherwise.
table(ftv)
# Display variable ftv in tabulated form. i.e. elements with corresponding frequency.
(ftv <- factor(ftv))
# Convert variable ftv of type numeric to factor.
levels(ftv) # Orignial levels.
(levels(ftv)[-(1:2)] <- "2+")
# New level "2+" to all the levels of ftv except 1st ("0") and 2nd ("1") levels.
# i.e. replace levels 2 or highter with new level "2+".
levels(ftv) # new levels after rename.
table(ftv)
# Display the factor variable ftv in tabulated form.
bw <- data.frame(low = factor(low), age, lwt, race, smoke = (smoke > 0),
ptd, ht = (ht > 0), ui = (ui > 0), ftv)
# reate new data frame bw updaing variable "low" as a factor,
# and variable "smoke", "ht", and "ui" as logic.
detach()
# Detach the attachment birthwt such that dataset birthwt should be
# referenced in order to access the variable. i.e. birthwt$race.
rm(race, ptd, ftv)
# Remove objects in lists (race, ptd, ftv).
# &&&&&&&&&&&&&&&&&&&&&&&&&&&&&*********************************&&&&&&&&&&&&&&&&&&&&&
# Ans.2 #
# &&&&&&&&&&&&&&&&&&&&&&&&&&&&&*********************************&&&&&&&&&&&&&&&&&&&&&
model <- glm(low ~ age * ftv, family = binomial(link = logit), data = bw)
summary(model)
# Yes, the data from this study confirm that ftv is an important
# predicator of low birthweight. The interation effect is significant
# between age of mother and "2+" number of physician visits during
# the first trimester.
# &&&&&&&&&&&&&&&&&&&&&&&&&&&&&*********************************&&&&&&&&&&&&&&&&&&&&&
# Ans.3 #
# &&&&&&&&&&&&&&&&&&&&&&&&&&&&&*********************************&&&&&&&&&&&&&&&&&&&&&
ftv.level <- levels(bw$ftv)
est.prob.age20 <- predict.glm(model,
newdata = data.frame(age = 20, ftv = ftv.level),
type = "response", se.fit=TRUE)
est.prob.age30 <- predict.glm(model,
newdata = data.frame(age = 30, ftv = ftv.level),
type = "response", se.fit=TRUE)
# Estimated probability of low birthweight for 20 and 30 years mothers
# for each ftv-catagories
(est.prob.low.bw <- data.frame(ftv = ftv.level, est.prob.age20$fit, est.prob.age30$fit))
# Fitted values vs. age plot for each ftv-grups.
ggplot(bw, aes(bw$age, model$fitted.values, colour = ftv))+
geom_point()+
labs(x = "Mother's age", y = "Probability of low birthweight")
# @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
# Interpretation based on the plot: #
# @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
## The mothers, who had no physician visit (ftv = 0), have a chance of having
## uderweight baby is low but increase with the mother's age.
## The mothers, who had one physician visit (ftv = 1), have a chance of having
## uderweight baby is low and also decrease with the mother's age.
## The young mothers, who had 2+ physician visits (ftv = 2+), have a higher chance of
## having uderweight baby and but decrease sharply with the mother's age.
# &&&&&&&&&&&&&&&&&&&&&&&&&&&&&*********************************&&&&&&&&&&&&&&&&&&&&&
# Ans.4 #
# &&&&&&&&&&&&&&&&&&&&&&&&&&&&&*********************************&&&&&&&&&&&&&&&&&&&&&
p30 <- est.prob.age30
lower.limit <- p30$fit - 1.96*p30$se.fit
upper.limit <- p30$fit + 1.96*p30$se.fit
# 95% cofidence interval of PI for 30 years old women in each ftv group.
(conf.interval <- data.frame( ftv.level, p30$fit, lower.limit, upper.limit))
# The probabilty can't be zero.
# Set the lower.limit to zero if it is negative.
conf.interval$lower.limit[conf.interval$lower.limit < 0] <- 0.0
conf.interval
# &&&&&&&&&&&&&&&&&&&&&&&&&&&&&*********************************&&&&&&&&&&&&&&&&&&&&&
# Ans.5 #
# &&&&&&&&&&&&&&&&&&&&&&&&&&&&&*********************************&&&&&&&&&&&&&&&&&&&&&
# *****************************???????????????????????????*************************
## The likelihood ratio test (LR-test) can be performed either using lrtest()
## from lmtest package or using anova() function from base R. The reduced
## model has less log likelihood but it is necessary to test whether the observed
## difference is significan or not?. Here, the null hypothesis is that the
## reduced model is true.
# ********************************************************************************
# %%%%%%%%%
# Ans.5.a #*************************************************************************
# %%%%%%%%%
# model.a includes all explanatory variables from datset bw.
model.a <- glm(low ~ ., family = binomial(link = logit), data = bw)
# %%%%%%%%%
# Ans.5.b #**************************************************************************
# %%%%%%%%%
# model.b include interaction effect to model.a.
model.b <- update(model.a,~.+age:ftv)
# compare two models uisng anova. We use chi-squared test as our
# models are binomial fits.
anova(model.a, model.b, test = "Chisq")
# Alternative method gives the same result.
lrtest(model.a, model.b)
# Here, p value shows that the model.b does singnificantly improve the model fit.
# %%%%%%%%%
# Ans.5.c #***************************************************************************
# %%%%%%%%%
summary(model.a)
# From summary of model.a, we can see that explanatory variables
# "lwt", "race", "ptd", and "ht" had a significant efffect
# using singificance level 0.05.
model.c <- glm(low ~ lwt + race+ ptd + ht,
family = binomial(link = logit), data = bw)
# compare model.c with model.a using anova.
anova(model.c, model.a, test = "Chisq")
# compare model.c with model.a using anova.
anova(model.c, model.b, test = "Chisq")
# According to LR-test, model.b is preferred.
AIC(model.a, model.b, model.c)
# According to AIC, model.b with smalest AIC is the best.
# The residuals also suggest model.b but they are not signigicant enough
# to select one model over other. i.e ther are of same order.
# %%%%%%%%%
# Ans.5.d #****************************************************************************
# %%%%%%%%%
model.d <- stepAIC(model.a, scope = ~. ^2)
# The final model with smallest AIC is
# Step: AIC=207.94
# low ~ age + lwt + race + smoke + ptd + ht + ui + ftv + age:ftv + smoke:ui
# Based on AIC value, we can conclude that this is the optimal model.
optimal.m <- glm(low ~ age + lwt + race + smoke + ptd + ht + ui + ftv + age:ftv + smoke:ui,
family = binomial(link = logit), data = bw)
# %%%%%%%%%
# Ans.5.e #******************************************************************************
# %%%%%%%%%
# The prediction accuracy based on 10-fold cross validation is
CVbinary(optimal.m, print.details = FALSE)$acc.cv
# Average of 10 prediction accuracy rates based on 5-fold cross validation.
acc.cv.list <- NULL
for (j in 1:10)
{
acc.cv.list[j] <- CVbinary(optimal.m, nfolds = 5,
print.details = FALSE)$acc.cv
}
ave.predict.acc <- mean(acc.cv.list)
paste("Average predication accuracy is", round(mean(ave.predict.acc)*100, 1), "%.")
# @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@