Description of Simulation Model

Two examples are considered here. Both are based on the simulation examples from the original lasso paper (Tibshirani, 1996). In Example 1, we simulate 20 data sets, each consisting of 250 observations from the following model:

\[ y=\beta^Tx+\epsilon, \quad \beta=(3,1.5,0,0,2,0,0,0)^T, \quad \epsilon\sim N(0,3^2). \] The predictors \(x\) are drawn from a multivariate normal distribution with mean zero and covariance matrix \(\Sigma\), in which every variance equals \(1\) and every pairwise covariance equals \(0.5\). The function mvrnorm in the MASS package generates samples from a multivariate normal distribution.

library(MASS) #mvrnorm is in MASS library
#Example 1: sparse case
sim1=function(n,seed){
  #mean of multivariate normal
  mu=rep(0,8)
  #Covariance matrix: 1 on the diagonal, 0.5 off the diagonal
  sigma=diag(0.5,8,8)+matrix(0.5,8,8)
  set.seed(seed)
  x=mvrnorm(n,mu,sigma)
  #true value of beta
  b=c(3,1.5,0,0,2,0,0,0)
  #random error
  set.seed(seed)
  y=x%*%b+rnorm(n,mean=0,sd=3)
  list(y=y,x=x)
}
sim1(5,1) 
## $y
##             [,1]
## [1,]  0.42203088
## [2,] -0.01825974
## [3,] -0.74727532
## [4,] -4.61314412
## [5,]  1.44827877
## 
## $x
##            [,1]       [,2]       [,3]        [,4]       [,5]       [,6]
## [1,]  0.5018884  0.6813970  0.9802099  1.09255198 -0.1131843  0.1521382
## [2,] -0.3843463  0.1384877 -0.3882324 -0.08221217  0.1880588 -0.1075593
## [3,] -0.4135754  0.5873781  0.5500061  1.34760809  1.0596349  0.4016834
## [4,] -1.3972336 -1.9101098 -1.4405883 -0.90950014 -1.1710604 -1.1790782
## [5,]  0.1940798 -0.6998758 -0.2054007 -0.90832040  0.4636649 -0.8351033
##            [,7]       [,8]
## [1,]  1.1443119 -0.6805903
## [2,]  0.1328760 -0.5989322
## [3,]  0.4251606  1.0558760
## [4,] -2.3017617  0.7376474
## [5,]  0.5341647 -0.5202558
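
As a quick sanity check (a small addition, not part of the original write-up, with MASS already loaded above), we can verify that mvrnorm reproduces the intended covariance structure when the sample is large; the sample size and seed below are arbitrary:

Sigma=diag(0.5,8,8)+matrix(0.5,8,8) #1 on the diagonal, 0.5 off the diagonal
set.seed(123)
X=mvrnorm(1e5,rep(0,8),Sigma)
round(cov(X)[1:3,1:3],2) #empirical covariances, close to the 1/0.5 target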

Example 1 (Sparse Case) Results

We compare lasso, ridge, and the elastic net, all fitted with the glmnet package.
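
For reference, glmnet minimizes the elastic-net objective, in which \(\alpha\) interpolates between the ridge penalty (\(\alpha=0\)) and the lasso penalty (\(\alpha=1\)):

\[ \min_{\beta_0,\beta}\; \frac{1}{2n}\sum_{i=1}^{n}\left(y_i-\beta_0-x_i^T\beta\right)^2 + \lambda\left[\frac{1-\alpha}{2}\|\beta\|_2^2 + \alpha\|\beta\|_1\right]. \]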

library(glmnet)

n=250 #50 training, 200 test
iter=20 #20 iterations
mse1=matrix(0,iter,3)
colnames(mse1)=c("Lasso","Ridge","ENET")
alpha1=rep(0,iter) #store optimal alpha

for (it in 1:iter){
  data=sim1(n,it)
  x1=data$x[1:50,];y1=data$y[1:50] #training set
  x2=data$x[51:n,];y2=data$y[51:n] #test set
   
  lasso.cv=cv.glmnet(x1,y1,alpha=1)
  ridge.cv=cv.glmnet(x1,y1,alpha=0)

  #candidate alpha values for the elastic net
  alpha=seq(from=0,to=1,by=0.2)
  cvmse=rep(0,length(alpha))
  for (i in seq_along(alpha)){
    fit=cv.glmnet(x1,y1,alpha=alpha[i])
    cvmse[i]=min(fit$cvm)
  }
  opt.alpha=alpha[which.min(cvmse)]
  alpha1[it]=opt.alpha
  enet.cv=cv.glmnet(x1,y1,alpha=opt.alpha)

  lasso.pred=predict(lasso.cv,newx=x2,s="lambda.min")
  ridge.pred=predict(ridge.cv,newx=x2,s="lambda.min")
  enet.pred=predict(enet.cv,newx=x2,s="lambda.min")

  mse1[it,1]=mean((lasso.pred-y2)^2)
  mse1[it,2]=mean((ridge.pred-y2)^2)
  mse1[it,3]=mean((enet.pred-y2)^2)
}
par(mfrow=c(1,2))
boxplot(mse1,main="MSE")
boxplot(alpha1,main="Optimal alpha")
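
The boxplots can be supplemented with simple numerical summaries (a small addition, not part of the original analysis):

colMeans(mse1) #average test MSE for each method
table(alpha1) #how often each alpha value was selected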

We see that lasso and the elastic net perform very similarly in the sparse case, and both clearly outperform ridge. The optimal \(\alpha\) selected for the elastic net is close to the lasso end (\(\alpha=1\)).
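
Because the true \(\beta\) is sparse, it is also instructive to inspect the fitted coefficients from the last iteration of the loop (a quick check, assuming the fits above are still in the workspace): lasso sets several coefficients exactly to zero, while ridge only shrinks them.

coef(lasso.cv,s="lambda.min") #sparse: several coefficients exactly 0
coef(ridge.cv,s="lambda.min") #dense: all coefficients nonzero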

Example 2 (Dense Case)

Example 2 is the same as Example 1, except that \(\beta=(0.85,0.85,0.85,0.85,0.85,0.85,0.85,0.85)^T\), so all eight predictors contribute to the response.

#Example 2: dense case
sim2=function(n,seed){
  #mean of multivariate normal
  mu=rep(0,8)
  #Covariance matrix: 1 on the diagonal, 0.5 off the diagonal
  sigma=diag(0.5,8,8)+matrix(0.5,8,8)
  set.seed(seed)
  x=mvrnorm(n,mu,sigma)
  #true value of beta
  b=rep(0.85,8)
  #random error
  set.seed(seed)
  y=x%*%b+rnorm(n,mean=0,sd=3)
  list(y=y,x=x)
}
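
Since sim2 differs from sim1 only in the true coefficient vector, the two generators could be folded into a single helper. This refactor is optional and not part of the original code:

#optional refactor: one generator parameterized by the true beta
sim=function(n,seed,b){
  mu=rep(0,8)
  sigma=diag(0.5,8,8)+matrix(0.5,8,8)
  set.seed(seed)
  x=mvrnorm(n,mu,sigma)
  set.seed(seed)
  y=x%*%b+rnorm(n,mean=0,sd=3)
  list(y=y,x=x)
}
#sim1(n,seed) is sim(n,seed,c(3,1.5,0,0,2,0,0,0)); sim2(n,seed) is sim(n,seed,rep(0.85,8))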

mse2=matrix(0,iter,3)
colnames(mse2)=c("Lasso","Ridge","ENET")
alpha2=rep(0,iter) #store optimal alpha
for (it in 1:iter){
  data=sim2(n,it)
  x1=data$x[1:50,];y1=data$y[1:50] #training set
  x2=data$x[51:n,];y2=data$y[51:n] #test set
   
  lasso.cv=cv.glmnet(x1,y1,alpha=1)
  ridge.cv=cv.glmnet(x1,y1,alpha=0)

  #candidate alpha values for the elastic net
  alpha=seq(from=0,to=1,by=0.2)
  cvmse=rep(0,length(alpha))
  for (i in seq_along(alpha)){
    fit=cv.glmnet(x1,y1,alpha=alpha[i])
    cvmse[i]=min(fit$cvm)
  }
  opt.alpha=alpha[which.min(cvmse)]
  alpha2[it]=opt.alpha
  enet.cv=cv.glmnet(x1,y1,alpha=opt.alpha)

  lasso.pred=predict(lasso.cv,newx=x2,s="lambda.min")
  ridge.pred=predict(ridge.cv,newx=x2,s="lambda.min")
  enet.pred=predict(enet.cv,newx=x2,s="lambda.min")

  mse2[it,1]=mean((lasso.pred-y2)^2)
  mse2[it,2]=mean((ridge.pred-y2)^2)
  mse2[it,3]=mean((enet.pred-y2)^2)
}
par(mfrow=c(1,2))
boxplot(mse2,main="MSE")
boxplot(alpha2,main="Optimal alpha")
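
Averaging the test MSEs over the iterations makes the contrast between the two designs explicit (a small summary sketch, assuming mse1 from Example 1 is still available):

rbind(sparse=colMeans(mse1),dense=colMeans(mse2))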

In the dense case, ridge performs much better than lasso, and the elastic net performs very close to ridge. The optimal \(\alpha\) selected for the elastic net is close to the ridge end (\(\alpha=0\)).