### Chapter 3 code ## Bootstrapping #IQR example iter = 5000 iqr.pop = rep(0,iter) for (i in 1:iter){ set.seed(i+1) iqr.pop[i] = IQR(rexp(200,rate=1)) } sd(iqr.pop) iqr.boot = rep(0,iter) for (i in 1:iter){ set.seed(i+1) iqr.boot[i] = IQR(sample(x,size=200,replace=T)) } sd(iqr.boot) #Figure 3.1 par(mfrow=c(1,2)) hist(iqr.pop,xlim=range(iqr.boot,iqr.pop),ylim=c(0,5.5),nclass=100,freq=F,xlab="IQR") lines(density(iqr.pop),col="red",lwd=2) hist(iqr.boot,xlim=range(iqr.boot,iqr.pop),ylim=c(0,5.5),nclass=100,freq=F,xlab="IQR") lines(density(iqr.boot),col="red",lwd=2) c(mean(iqr.pop),sd(iqr.pop)) c(mean(iqr.boot),sd(iqr.boot)) #Linear regression example library(car) attach(Duncan) lr.fit <- lm(prestige ~ income) shapiro.test(lr.fit$resid) summary(lr.fit) #Figure 3.2 par(mfrow=c(1,2)) plot(income,prestige) abline(lr.fit) plot(prestige,lr.fit$resid) abline(h=0) iter = 5000 beta.boot <- rep(0,iter) for (i in 1:iter){ set.seed(i) ind <- sample(1:45,size=45,replace=T) fit <- lm(prestige~income,data=Duncan[ind,]) beta.boot[i] <- fit$coef[2] } #Figure 3.3 hist(beta.boot,nclass=100,freq=F,xlab=expression(hat(beta)[income]), main=expression(paste("Histogram of ",hat(beta)[income]," from bootstrapping",sep=""))) lines(density(beta.boot),col="red",lwd=2) abline(v=lr.fit$coef[2],col="blue",lwd=2) lines(rep(quantile(beta.boot,prob=0.025),2),c(0,0.3),lwd=4,col="red") lines(rep(quantile(beta.boot,prob=0.975),2),c(0,0.3),lwd=4,col="red") sd(beta.boot) quantile(beta.boot,prob=c(0.025,0.975)) confint(lr.fit,"income") ##Elastic net library(MASS) S = matrix(0,8,8) for (i in 1:8){ for (j in 1:8){ S[i,j]=0.5^abs(i-j) } } n = 100 set.seed(1) x = mvrnorm(n,mu=rep(0,8),S) b = c(3,1.5,0,0,2,0,0,0) y = x%*%b+rnorm(n,sd=3) library(glmnet) ridge.fit = glmnet(x,y,alpha=0) lasso.fit = glmnet(x,y,alpha=1) enet.fit = glmnet(x,y,alpha=0.9) set.seed(1) ridge.cv = cv.glmnet(x,y,alpha=0) ridge.cv$lambda.min lasso.cv = cv.glmnet(x,y,alpha=1) lasso.cv$lambda.min set.seed(6) enet.cv = cv.glmnet(x,y,alpha=0.9) enet.cv$lambda.min #Figure 3.4 par(mfrow=c(1,2)) plot(ridge.fit,xvar="lambda",main="Ridge") abline(v=log(ridge.cv$lambda.min),col="red",lty=2) plot(lasso.fit,xvar="lambda",main="Lasso") abline(v=log(lasso.cv$lambda.min),col="red",lty=2) #Figure 3.5 par(mfrow=c(1,2)) plot(enet.fit,xvar="lambda",main="Elastic net") abline(v=log(enet.cv$lambda.min),col="red",lty=2) plot(enet.cv) ##Multiple Additive Regression Trees #CART example library(car) library(rpart) library("partykit") attach(Duncan) set.seed(1) tree.fit = rpart(prestige ~ .,data=Duncan) #Figure 3.6 plot(as.party(tree.fit),type="simple") #MART example library(mvtnorm) library(gbm) n=1000 avg=rep(0,10) sigma=diag(1.5,10)+matrix(1.5,10,10) set.seed(1) x=rmvnorm(n, avg, sigma) sd(100*dmvnorm(x[,3:4],mean=c(0,0),sigma=diag(c(4,2))+matrix(2,2,2))) sd(0.17*x[,1]^2) sd(0.5*x[,2]) f=140*dmvnorm(x[,3:4],mean=c(0,0),sigma=diag(c(4,2))+matrix(2,2,2))+0.17*x[,2]^2+0.5*x[,1] y=f+rnorm(n,mean=0,sd=sd(f)/sqrt(10)) data=as.data.frame(x);data$y=y set.seed(1) fit.mart=gbm(y~.,distribution="gaussian",data=data,n.tree=100,interaction.depth=3, shrinkage=0.01, train.fraction=0.8, bag.fraction=0.5,verbose=F) best.iter <- gbm.perf(fit.mart, plot.it=FALSE, oobag.curve=F, method="OOB") while(fit.mart$n.trees-best.iter<50){ fit.mart <- gbm.more(fit.mart, 100) #do another 100 iterations best.iter <- gbm.perf(fit.mart, plot.it=FALSE, oobag.curve=F, method="OOB") } best.iter fit.mart$n.trees #Figure 3.7 par(mfrow=c(1,2)) temp=summary(fit.mart,n.trees=best.iter,plotit=F) barplot(temp$rel.inf, horiz = TRUE, col = rainbow(10, start = 3/6, end = 4/6), names = paste("X",1:10,sep=""), xlab = "Relative influence",las=2) gbm.perf(fit.mart, plot.it=T, oobag.curve=F, method="OOB") #Figure 3.8 par(mfrow=c(1,2)) temp=plot(fit.mart,i.var=1,n.trees=best.iter,return.grid=T) plot(temp$V1,temp$y,xlab="X1",ylab="Partial dependence",type="l",xlim=c(-4,4)) axis(side=1, at=quantile(x[,1],probs=seq(0.1,0.9,by=0.1)), labels = FALSE, lwd.ticks=2,col.ticks="red") temp=plot(fit.mart,i.var=2,n.trees=best.iter,return.grid=T) plot(temp$V2,temp$y,xlab="X2",ylab="Partial dependence",type="l",xlim=c(-4,4)) axis(side=1, at=quantile(x[,2],probs=seq(0.1,0.9,by=0.1)), labels = FALSE, lwd.ticks=2,col.ticks="red") #Figure 3.9 library(akima) temp1=sort(x[,3]);temp2=x[order(x[,3]),4] temp=.Call("gbm_plot", X = as.double(cbind(temp1,temp2)), cRows = as.integer(length(temp1)),cCols = as.integer(2), n.class = as.integer(1), i.var = as.integer(c(3,4) - 1), n.trees = as.integer(best.iter), initF = as.double(fit.mart$initF), trees = fit.mart$trees, c.splits = fit.mart$c.splits, var.type = as.integer(fit.mart$var.type), PACKAGE = "gbm") temp3<-interp(temp1,temp2,temp,duplicate="mean") filled.contour(temp3,xlab="X1",ylab="X2") #Interaction effect indx=1:10;indx=indx[-4] interaction.x4=rep(0,9) for (j in 1:9){ interaction.x4[j]=interact.gbm(fit.mart,data,i.var=c(indx[j],4),n.trees=best.iter) } ##Generalized Additive Model library(splines) x=(5:105)/10 n=length(x) f=2*sin(x*1.5)+x set.seed(1) y=f+rnorm(n,mean=0,sd=1) y=y-mean(y) dat=data.frame(x=x,y=y) basis=ns(x,knots=1:10) fit1=lm(y~basis-1,data=dat) wt.basis=basis%*%diag(coef(fit1)) y2=x*0.5 dat2=data.frame(x=x,y=y2) fit2=lm(y2~basis-1,data=dat2) wt.basis2=basis%*%diag(coef(fit2)) #Figure 3.10 par(fig=c(0,1,0.6,1)) matplot(x,ns(x,knots=1:10),type="l",xlab="",ylab="Natural-spline basis",axes=F,lwd=2) box() axis(side=2) par(new=T,fig=c(0,1,0,0.6)) matplot(x,wt.basis,xlab="X",ylab="",type="l",ylim=range(wt.basis,fit1$fit),lwd=2) lines(x,fit1$fit,col="red",lwd=3) library(mvtnorm) library(mgcv) n=1000 avg=rep(0,10) sigma=diag(1.5,10)+matrix(1.5,10,10) set.seed(1) x=rmvnorm(n, avg, sigma) f=140*dmvnorm(x[,3:4],mean=c(0,0),sigma=diag(c(4,2))+matrix(2,2,2))+0.17*x[,2]^2+0.5*x[,1] y=f+rnorm(n,mean=0,sd=sd(f)/sqrt(10)) data=as.data.frame(x[,1:4]) names(data)=paste("X",1:4,sep="") data$y=y fit1=gam(y~s(X1)+s(X2)+s(X3)+s(X4),data=data,method="GCV.Cp") fit2=gam(y~s(X1)+s(X2)+s(X3,X4),data=data,method="GCV.Cp") anova(fit1,fit2,test="F") #Figure 3.11 par(mfrow=c(2,2)) plot(fit2,select=1,ylab="f(X1)") plot(fit2,select=2,ylab="f(X2)") vis.gam(fit2,view=c("X3","X4"),plot.type="contour",main="") vis.gam(fit2,view=c("X3","X4"),theta=30,phi=30,plot.type="persp")