# pcr_pls_poly4.R
library(pls)
# merComplete (the data set) and fit.poly4 (the degree-4 polynomial model
# specification) are defined in earlier scripts of this project.
attach(merComplete)
#################
###### PCR ######
set.seed(2)
# Use the pcr() function:
# - scale = TRUE standardizes each predictor before the components are computed
# - validation = "CV" uses cross-validation to choose the number of components M
pcr.fit = pcr(fit.poly4, data = merComplete, scale = TRUE, validation = "CV")
# Data: X dimension: 79576 29
#       Y dimension: 79576 1
# By default, pcr() reports the cross-validated RMSEP (root mean squared
# error of prediction) for each number of components, together with the
# percentage of variance explained:
# - the "X" row: variance explained in the predictors (100% once all 29
#   components are used)
# - the response row: variance explained in y (90.71% overall)
summary(pcr.fit)
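# A small aside (not in the original script): explvar() from pls returns the
# per-component percentage of X variance explained, so the cumulative figures
# in the summary() table can also be obtained directly:
cumsum(explvar(pcr.fit))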
# Plot the cross-validation scores; val.type = "MSEP" plots the
# cross-validated MSE instead of the RMSE.
dev.new()
validationplot(pcr.fit, val.type = "MSEP", legendpos = "topright")
which.min(MSEP(pcr.fit)$val[1,,][-1]) # drop the intercept-only model, then pick the M with the lowest CV error
# The smallest CV error occurs at M = 29, i.e. with all the components:
# PCR offers no dimensionality reduction on this data set.
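# A quick sanity check (not in the original script): MSEP() returns both the
# plain CV estimate and the bias-corrected "adjCV" estimate; the chosen M
# should be stable across the two.
msep <- MSEP(pcr.fit)$val # array: estimate x response x model size
which.min(msep["CV", 1, ][-1])
which.min(msep["adjCV", 1, ][-1])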
# Now perform PCR on the training data and evaluate its test set performance:
set.seed(1)
x = model.matrix(fit.poly4, data = merComplete)[, -1]
y = merComplete$overall
train = sample(1:nrow(x), nrow(x)/2) # random half of the rows as the training set
test = (-train)
y.test = y[test]
pcr.fit = pcr(fit.poly4, data = merComplete, subset = train, scale = TRUE,
              validation = "CV")
dev.new()
# Plot the CV MSE and RMSE curves
validationplot(pcr.fit, val.type = "MSEP", legendpos = "topright")
minPCR <- which.min(MSEP(pcr.fit)$val[1,,][-1]); minPCR # M = 29 has the lowest CV error
dev.new()
plot(RMSEP(pcr.fit), legendpos = "topright")
# Compute the test MSE:
pcr.pred = predict(pcr.fit, x[test, ], ncomp = minPCR)
mean((pcr.pred - y.test)^2) # --> 1.093318
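# A small extension (not in the original script): rather than evaluating a
# single M, compute the test MSE for every number of components to see how
# flat the curve is near M = 29.
pcr.test.mse <- sapply(1:minPCR, function(m)
  mean((predict(pcr.fit, x[test, ], ncomp = m) - y.test)^2))
dev.new()
plot(pcr.test.mse, type = "b", xlab = "Number of components", ylab = "Test MSE")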
# This test MSE is competitive with the results obtained using ridge
# regression and the lasso.
# Finally, fit PCR on the full data set with M = 29:
pcr.fit = pcr(y ~ x, scale = TRUE, ncomp = minPCR)
summary(pcr.fit)
dev.new()
# No CV was requested for this final fit, so validationplot() falls back to
# the training MSEP curve.
validationplot(pcr.fit, val.type = "MSEP", legendpos = "topright")
dev.new()
# Plot the observed y values against those predicted by the model
plot(pcr.fit, ncomp = 29, asp = 1, line = TRUE)
coef(pcr.fit) # get the regression coefficients
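# A small extension (not in the original script): coefplot() from pls draws
# the same coefficients as a curve over the predictors, which is easier to
# scan than the raw coef() output.
dev.new()
coefplot(pcr.fit, ncomp = minPCR)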
######################
######### PLS ########
set.seed(1)
# Use the plsr() function:
# PLS is similar to PCR, but its components are chosen to explain not only
# the predictors but also the response y.
# - scale = TRUE standardizes each predictor
# - validation = "CV" uses cross-validation to choose the number of components M
pls.fit = plsr(fit.poly4, data = merComplete, scale = TRUE,
               validation = "CV")
summary(pls.fit)
dev.new()
validationplot(pls.fit, val.type = "MSEP")
which.min(MSEP(pls.fit)$val[1,,][-1]) # M = 12
# Now perform PLS on the training data and evaluate its test set performance:
set.seed(1)
pls.fit = plsr(fit.poly4, data = merComplete, subset = train,
               scale = TRUE, validation = "CV")
dev.new()
validationplot(pls.fit, val.type = "MSEP")
which.min(MSEP(pls.fit)$val[1,,][-1]) # M = 10
pls.pred = predict(pls.fit, x[test, ], ncomp = 11)
mean((pls.pred - y.test)^2) # --> 1.093311
pls.pred = predict(pls.fit, x[test, ], ncomp = 10)
mean((pls.pred - y.test)^2) # --> 1.093311
pls.pred = predict(pls.fit, x[test, ], ncomp = 9)
mean((pls.pred - y.test)^2) # --> 1.093249
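# A compact alternative (not in the original script): predict() accepts a
# vector of ncomp values and returns a 3-d array, so the three test MSEs
# above can be computed in one call:
pls.pred.all = predict(pls.fit, x[test, ], ncomp = 9:11)
apply(pls.pred.all, 3, function(p) mean((p - y.test)^2))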
# The test MSE is comparable to (slightly higher than) the test MSE obtained
# using ridge regression, the lasso, and PCR.
# Finally, fit PLS on the full data set with all M = 29 components to inspect
# the variance explained:
pls.fit = plsr(fit.poly4, data = merComplete, scale = TRUE, ncomp = 29)
summary(pls.fit)
# Final result: about 6 components already explain roughly the same variance
# in y as all 29 components.
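# A quick check of that claim (not in the original script): R2() reports the
# training R^2 as M grows; the curve should flatten after roughly 6 components.
R2(pls.fit, estimate = "train")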