# regularization_poly2.R
# Linear regression - multiple regressor, with regularization
###### REGULARIZATION #######
library(glmnet)
attach(merComplete)
x = model.matrix(fit.poly2.complete, merComplete)[,-1] # [,-1] drops the intercept column
y = merComplete$overall
# The model.matrix() function is particularly useful for creating x; not only does it produce a matrix
# corresponding to the 15 predictors but it also automatically transforms any qualitative
# variables into dummy variables (recommended).
# The latter property is important because glmnet() can only take numerical, quantitative inputs.
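# A minimal sketch (on a hypothetical toy data frame, not merComplete) of how
# model.matrix() expands a factor into the dummy columns that glmnet() needs:
toy = data.frame(score=c(1,2,3), grp=factor(c("a","b","a")))
model.matrix(~score+grp, toy)[,-1] # numeric matrix; column "grpb" is the dummy for grp=="b"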
########################
######### Ridge ########
grid=10^seq(10,-2,length=100) # Lambda values grid (from 10^10 to 10^-2)
# Inputs of glmnet: regressors matrix, dependent variable and alpha value:
# alpha = 0 (ridge regression), alpha = 1 (lasso)
ridge.mod=glmnet(x,y,alpha=0,lambda=grid)
dim(coef(ridge.mod)) # 16 coefficients, 100 lambda values
# We expect the coefficient estimates to be much smaller, in terms of l2 norm, when a large value of lambda is used, as
# compared to when a small value is used.
# grid[50]
ridge.mod$lambda[50] # lambda = 11497.57
coef(ridge.mod)[,50] # corresponding coefficients
sqrt(sum(coef(ridge.mod)[-1,50]^2)) # l2 norm
# grid[60]
ridge.mod$lambda[60] # lambda = 705.48
coef(ridge.mod)[,60] # corresponding coefficients
sqrt(sum(coef(ridge.mod)[-1,60]^2)) # l2 norm for lambda[60] > l2 for lambda[50]
# As lambda decreases the coefficients grow in magnitude; as lambda increases they shrink towards zero
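# Sketch of the same comparison over the whole grid: one l2 norm per lambda,
# plotted against log(lambda) (uses only objects already defined above)
betas = as.matrix(coef(ridge.mod))[-1,] # coefficient matrix without the intercept row
l2.norms = sqrt(colSums(betas^2)) # one l2 norm per value of lambda
plot(log(ridge.mod$lambda), l2.norms, type="l", xlab="log(lambda)", ylab="l2 norm of coefficients")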
predict(ridge.mod,s=50,type="coefficients")[1:16,] # s = lambda value; returns the coefficient estimates at lambda = 50
# Validation approach to estimate test error
set.seed(1)
train=sample(1:nrow(x), nrow(x)/2) # a common way to draw a random train/test split
test=(-train)
y.test=y[test]
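# Quick sanity check on the split: test=(-train) uses negative indexing,
# so x[test,] and y[test] are exactly the observations not sampled for training
length(train); length(y.test); nrow(x)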
# Fit a ridge regression model on the training set, and evaluate its MSE on the test set, using lambda (s) = 4.
ridge.mod=glmnet(x[train,],y[train],alpha=0,lambda=grid,thresh=1e-12)
ridge.pred=predict(ridge.mod,s=4,newx=x[test,]) # Note the use of the predict() function for a test set
mean((ridge.pred-y.test)^2) # test MSE = 1.62354
# Two predictions with different (arbitrary) values of lambda:
# lambda -> +Inf (here 10^10): the coefficients are shrunk towards zero
ridge.pred=predict(ridge.mod,s=1e10,newx=x[test,])
mean((ridge.pred-y.test)^2) # 11.85287, essentially the intercept-only (null) model
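# The same benchmark computed directly: predicting the training-set mean for every
# test observation (an intercept-only model) gives essentially the same test MSE
mean((mean(y[train])-y.test)^2)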
# lambda -> 0: ridge regression reduces to ordinary least squares
ridge.pred=predict(ridge.mod,s=0,newx=x[test,],exact=T,x=x[train,],y=y[train])
mean((ridge.pred-y.test)^2) # 1.095796
# Same result as with lm(); exact=T makes predict() refit the model at the requested lambda instead of interpolating over the lambda grid
# Comparison of the results between lm() and glmnet() when lambda = 0:
lm(y~x, subset=train)
predict(ridge.mod,s=0,exact=T,type="coefficients",x=x[train,],y=y[train])[1:16,]
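# Sketch: place the two sets of estimates side by side; with lambda = 0 they should
# agree up to glmnet's numerical tolerance (thresh), assuming lm() returns all 16
# coefficients (i.e. no rank deficiency in the training subset)
lm.coef = coef(lm(y~x, subset=train))
glmnet.coef = predict(ridge.mod,s=0,exact=T,type="coefficients",x=x[train,],y=y[train])[1:16,]
cbind(lm=lm.coef, glmnet=glmnet.coef)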
# Here plain least squares (lambda = 0) gives a slightly lower test MSE than ridge regression with the arbitrary lambda = 4
# Instead of arbitrary values, we now use cross-validation to choose lambda:
set.seed(1)
cv.out=cv.glmnet(x[train,],y[train],alpha=0)
dev.new()
plot(cv.out)
bestlam=cv.out$lambda.min
bestlam # the best lambda is 0.3087768
log(bestlam) # log value of previous lambda is -1.175137
# Test-set predictions using the best value of lambda
ridge.pred=predict(ridge.mod,s=bestlam ,newx=x[test,])
mean((ridge.pred-y.test)^2) # MSE = 1.11337
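# A sketch of a more conservative alternative: cv.glmnet() also reports lambda.1se,
# the largest lambda whose CV error is within one standard error of the minimum
cv.out$lambda.1se
ridge.pred.1se = predict(ridge.mod,s=cv.out$lambda.1se,newx=x[test,])
mean((ridge.pred.1se-y.test)^2)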
# Prediction of the coefficients with the best value of lambda
out=glmnet(x,y,alpha=0)
predict(out,type="coefficients",s=bestlam)[1:16,]
# As expected, none of the coefficients is zero
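# Sketch: confirm numerically that ridge shrinks the coefficients but never zeroes them out
ridge.coef = predict(out,type="coefficients",s=bestlam)[1:16,]
sum(ridge.coef==0) # expected to be 0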
# Figure showing how the coefficients vary with lambda
dev.new()
plot(out,label = T, xvar = "lambda")
#####################
####### LASSO #######
# Use the argument alpha = 1 to perform Lasso
lasso.mod = glmnet(x[train,], y[train], alpha=1, lambda=grid)
dev.new()
plot(lasso.mod,label = T) # L1 norm
# Figure showing how the coefficients vary with lambda
dev.new()
plot(lasso.mod,label = T, xvar = "lambda")
# Perform cross-validation to estimate the lambda that minimizes the test MSE
set.seed(1)
cv.out=cv.glmnet(x[train,],y[train],alpha=1)
dev.new()
plot(cv.out)
bestlam=cv.out$lambda.min; print(bestlam);print(log(bestlam)) # the best lambda is 0.005032292
lasso.pred=predict(lasso.mod,s=bestlam ,newx=x[test,])
mean((lasso.pred-y.test)^2) # 1.097244, slightly larger than ridge
# Comparison between lm() and the lasso model with lambda = 0
lasso.pred=predict(lasso.mod,s=0,newx=x[test,],exact=T,x=x[train,],y=y[train])
mean((lasso.pred-y.test)^2) # 1.09579
out=glmnet(x,y,alpha=1,lambda=grid)
lasso.coef=predict(out,type="coefficients",s=bestlam)[1:16,]
lasso.coef
lasso.coef[lasso.coef!=0]
cat("Number of coefficients equal to 0:",sum(lasso.coef==0),"\n")
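# Sketch: the predictors the lasso actually keeps (nonzero coefficients, intercept
# excluded), i.e. the variables it effectively selects
selected = names(lasso.coef)[lasso.coef!=0]
setdiff(selected, "(Intercept)")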