Two examples are considered here, both based on the simulation studies in the original lasso paper (Tibshirani, 1996). In Example 1, we simulate 20 data sets, each consisting of 250 observations, from the following model
\[
y=\beta^Tx+\epsilon, \quad \beta=(3,1.5,0,0,2,0,0,0)^T, \quad \epsilon\sim N(0,3^2).
\] The covariates \(x\) are generated from a multivariate normal distribution with zero mean and covariance matrix \(\Sigma\), where all variances equal \(1\) and all pairwise covariances equal \(0.5\); that is, \(\Sigma=0.5\,I_8+0.5\,J_8\), with \(J_8\) the \(8\times 8\) matrix of ones, which is exactly how the code below constructs it. The function mvrnorm in the MASS
package draws samples from a multivariate normal distribution.
library(MASS) #mvrnorm is in the MASS package
#Example 1: sparse case
sim1=function(n,seed){
  #mean vector of the multivariate normal
  mu=rep(0,8)
  #covariance matrix: 1 on the diagonal, 0.5 off the diagonal
  sigma=diag(0.5,8,8)+matrix(0.5,8,8)
  set.seed(seed)
  x=mvrnorm(n,mu,sigma)
  #true coefficient vector: only 3 of 8 covariates carry signal
  b=c(3,1.5,0,0,2,0,0,0)
  #response = signal + N(0,3^2) random error
  set.seed(seed)
  y=x%*%b+rnorm(n,mean=0,sd=3)
  list(y=y,x=x)
}
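#a quick call with five observations to inspect the returned structure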
sim1(5,1)
## $y
## [,1]
## [1,] 0.42203088
## [2,] -0.01825974
## [3,] -0.74727532
## [4,] -4.61314412
## [5,] 1.44827877
##
## $x
## [,1] [,2] [,3] [,4] [,5] [,6]
## [1,] 0.5018884 0.6813970 0.9802099 1.09255198 -0.1131843 0.1521382
## [2,] -0.3843463 0.1384877 -0.3882324 -0.08221217 0.1880588 -0.1075593
## [3,] -0.4135754 0.5873781 0.5500061 1.34760809 1.0596349 0.4016834
## [4,] -1.3972336 -1.9101098 -1.4405883 -0.90950014 -1.1710604 -1.1790782
## [5,] 0.1940798 -0.6998758 -0.2054007 -0.90832040 0.4636649 -0.8351033
## [,7] [,8]
## [1,] 1.1443119 -0.6805903
## [2,] 0.1328760 -0.5989322
## [3,] 0.4251606 1.0558760
## [4,] -2.3017617 0.7376474
## [5,] 0.5341647 -0.5202558
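As a quick sanity check on the covariance structure (a sketch, not part of the original study), the sample covariance of a large simulated draw should be close to \(\Sigma\), i.e. \(1\) on the diagonal and \(0.5\) elsewhere:
#sample covariance from a large draw should approximate Sigma
round(cov(sim1(10000,2)$x),2)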
We now compare the lasso, ridge regression, and the elastic net on these data.
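All three fits come from glmnet, which for Gaussian responses solves the elastic net problem
\[
\min_{\beta_0,\beta}\;\frac{1}{2n}\sum_{i=1}^{n}\left(y_i-\beta_0-x_i^T\beta\right)^2+\lambda\left[\frac{1-\alpha}{2}\|\beta\|_2^2+\alpha\|\beta\|_1\right],
\]
so \(\alpha=1\) gives the lasso, \(\alpha=0\) gives ridge, and intermediate values give the elastic net. cv.glmnet selects \(\lambda\) by cross-validation, but \(\alpha\) must be searched over a grid, as in the loop below.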
library(glmnet)
n=250 #50 training, 200 test
iter=20 #20 simulated data sets
mse1=matrix(0,iter,3)
colnames(mse1)=c("Lasso","Ridge","ENET")
alpha1=rep(0,iter) #store the optimal alpha of each iteration
for (it in 1:iter){
  data=sim1(n,it)
  x1=data$x[1:50,];y1=data$y[1:50]
  x2=data$x[51:n,];y2=data$y[51:n]
  #fix the CV folds so all alpha values are compared on the same splits
  set.seed(it)
  foldid=sample(rep(1:10,length.out=length(y1)))
  lasso.cv=cv.glmnet(x1,y1,alpha=1,foldid=foldid)
  ridge.cv=cv.glmnet(x1,y1,alpha=0,foldid=foldid)
  #grid search for the elastic net alpha
  alpha=seq(from=0,to=1,by=0.2)
  cvmse=rep(0,length(alpha))
  for (i in seq_along(alpha)){
    fit=cv.glmnet(x1,y1,alpha=alpha[i],foldid=foldid)
    cvmse[i]=min(fit$cvm)
  }
  opt.alpha=alpha[which.min(cvmse)]
  alpha1[it]=opt.alpha
  enet.cv=cv.glmnet(x1,y1,alpha=opt.alpha,foldid=foldid)
  #test-set predictions at the lambda minimizing CV error
  lasso.pred=predict(lasso.cv,newx=x2,s="lambda.min")
  ridge.pred=predict(ridge.cv,newx=x2,s="lambda.min")
  enet.pred=predict(enet.cv,newx=x2,s="lambda.min")
  mse1[it,1]=mean((lasso.pred-y2)^2)
  mse1[it,2]=mean((ridge.pred-y2)^2)
  mse1[it,3]=mean((enet.pred-y2)^2)
}
par(mfrow=c(1,2))
boxplot(mse1,main="MSE")
boxplot(alpha1,main="Optimal alpha")
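The boxplots can be complemented with a simple numeric summary (output omitted, since the exact values depend on the random CV folds):
#average test MSE per method and the distribution of selected alphas
round(colMeans(mse1),2)
table(alpha1)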
We see that in the sparse case the lasso and the elastic net perform very similarly, and both clearly outperform ridge. The optimal \(\alpha\) selected for the elastic net is close to the lasso end (\(\alpha=1\)).
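To see why the lasso does well here, one can inspect the fitted coefficients, for instance from the last iteration's fits (a sketch; since the true \(\beta\) is \((3,1.5,0,0,2,0,0,0)^T\), the lasso should set most noise coefficients exactly to zero, while ridge only shrinks them toward zero):
#coefficients at lambda.min from the final iteration (first row is the intercept)
round(cbind(lasso=drop(as.matrix(coef(lasso.cv,s="lambda.min"))),
            ridge=drop(as.matrix(coef(ridge.cv,s="lambda.min")))),3)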
Example 2 is the same as Example 1, except that \(\beta=(0.85,0.85,0.85,0.85,0.85,0.85,0.85,0.85)^T\), so every covariate carries signal.
#Example 2: dense case
sim2=function(n,seed){
  #mean vector of the multivariate normal
  mu=rep(0,8)
  #covariance matrix: 1 on the diagonal, 0.5 off the diagonal
  sigma=diag(0.5,8,8)+matrix(0.5,8,8)
  set.seed(seed)
  x=mvrnorm(n,mu,sigma)
  #true coefficient vector: all covariates contribute equally
  b=rep(0.85,8)
  #response = signal + N(0,3^2) random error
  set.seed(seed)
  y=x%*%b+rnorm(n,mean=0,sd=3)
  list(y=y,x=x)
}
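Since sim2 differs from sim1 only in the true coefficient vector, the two generators could equally be folded into one function taking b as an argument (a minimal refactoring sketch, not used below):
sim=function(n,seed,b){
  mu=rep(0,8)
  sigma=diag(0.5,8,8)+matrix(0.5,8,8)
  set.seed(seed)
  x=mvrnorm(n,mu,sigma)
  set.seed(seed)
  y=x%*%b+rnorm(n,mean=0,sd=3)
  list(y=y,x=x)
}
#sim(n,seed,c(3,1.5,0,0,2,0,0,0)) reproduces sim1; sim(n,seed,rep(0.85,8)) reproduces sim2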
mse2=matrix(0,iter,3)
colnames(mse2)=c("Lasso","Ridge","ENET")
alpha2=rep(0,iter) #store the optimal alpha of each iteration
for (it in 1:iter){
  data=sim2(n,it)
  x1=data$x[1:50,];y1=data$y[1:50]
  x2=data$x[51:n,];y2=data$y[51:n]
  #fix the CV folds so all alpha values are compared on the same splits
  set.seed(it)
  foldid=sample(rep(1:10,length.out=length(y1)))
  lasso.cv=cv.glmnet(x1,y1,alpha=1,foldid=foldid)
  ridge.cv=cv.glmnet(x1,y1,alpha=0,foldid=foldid)
  #grid search for the elastic net alpha
  alpha=seq(from=0,to=1,by=0.2)
  cvmse=rep(0,length(alpha))
  for (i in seq_along(alpha)){
    fit=cv.glmnet(x1,y1,alpha=alpha[i],foldid=foldid)
    cvmse[i]=min(fit$cvm)
  }
  opt.alpha=alpha[which.min(cvmse)]
  alpha2[it]=opt.alpha
  enet.cv=cv.glmnet(x1,y1,alpha=opt.alpha,foldid=foldid)
  #test-set predictions at the lambda minimizing CV error
  lasso.pred=predict(lasso.cv,newx=x2,s="lambda.min")
  ridge.pred=predict(ridge.cv,newx=x2,s="lambda.min")
  enet.pred=predict(enet.cv,newx=x2,s="lambda.min")
  mse2[it,1]=mean((lasso.pred-y2)^2)
  mse2[it,2]=mean((ridge.pred-y2)^2)
  mse2[it,3]=mean((enet.pred-y2)^2)
}
par(mfrow=c(1,2))
boxplot(mse2,main="MSE")
boxplot(alpha2,main="Optimal alpha")
In the dense case, ridge performs much better than the lasso, and the elastic net performs very similarly to ridge. The optimal \(\alpha\) selected for the elastic net is close to the ridge end (\(\alpha=0\)).
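To contrast the two regimes directly, the selected \(\alpha\) values from both examples can be plotted side by side:
#selected alpha values: near 1 in the sparse case, near 0 in the dense case
par(mfrow=c(1,1))
boxplot(list(sparse=alpha1,dense=alpha2),main="Optimal alpha by example")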