###################################################
## Fit a regression tree to medv~lstat in the Boston housing data.
## The tree is plotted as well as a plot of the corresponding step function
## fit to the data.
## The cutpoints from the tree are added to the plot so you can see how
## the tree corresponds to the function.
###################################################
library(tree)
library(MASS)
data(Boston)
attach(Boston)
#--------------------------------------------------
#fit a tree to the Boston data using just lstat.
#first get a big tree using a small value of mindev
temp = tree(medv~lstat,data=Boston,mindev=.0001)
cat('first big tree size: \n')
print(length(unique(temp$where)))

#then prune it down to one with 7 leaves
boston.tree=prune.tree(temp,best=7)
cat('pruned tree size: \n')
print(length(unique(boston.tree$where)))
#--------------------------------------------------
#plot the tree and the fits.
par(mfrow=c(1,2))

#plot the tree
plot(boston.tree,type="uniform")
text(boston.tree,col="blue",label=c("yval"),cex=.8)

#plot the data with the fit
boston.fit = predict(boston.tree) #get training fitted values
plot(lstat,medv,cex=.5,pch=16) #plot data
oo=order(lstat)
lines(lstat[oo],boston.fit[oo],col='red',lwd=3) #step function fit
cvals=c(9.725,4.65,3.325,5.495,16.085,19.9) #cutpoints from the tree
for(i in 1:length(cvals)) abline(v=cvals[i],col='magenta',lty=2) #cutpoints

rm(list=ls())
################################################################################
## Fit a regression tree to medv~dis+lstat from the Boston housing data.
## The tree is plotted as well as the corresponding partition of the two-dimensional
## x=(dis,lstat) space.
################################################################################
library(MASS)
data(Boston)
attach(Boston)
library(tree)
#--------------------------------------------------
df2=Boston[,c(8,13,14)] #pick off dis,lstat,medv
print(names(df2))
#--------------------------------------------------
#big tree
temp = tree(medv~.,df2,mindev=.0001)
cat('first big tree size: \n')
print(length(unique(temp$where)))
#--------------------------------------------------
#then prune it down to one with 7 leaves
boston.tree=prune.tree(temp,best=7)
cat('pruned tree size: \n')
print(length(unique(boston.tree$where)))
#--------------------------------------------------
#plot the tree and the partition in x.
par(mfrow=c(1,2))
plot(boston.tree,type="u")
text(boston.tree,col="blue",label=c("yval"),cex=.8)
partition.tree(boston.tree)

rm(list=ls())
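################################################################################
## Optional aside: the cutpoints above (cvals) were typed in by hand.
## A minimal sketch of reading them off the fitted tree instead, assuming
## the same 7-leaf medv~lstat tree as in the first section:
################################################################################
library(tree)
library(MASS)
data(Boston)
temp = tree(medv~lstat,data=Boston,mindev=.0001)
bt = prune.tree(temp,best=7)
isplit = bt$frame$var != "<leaf>" #rows of the frame that are internal (splitting) nodes
cuts = as.numeric(sub("<","",bt$frame$splits[isplit,"cutleft"])) #strip the leading "<"
print(sort(cuts)) #should reproduce cvals from the first section
rm(list=ls())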
################################################################################
## Fit a regression tree to medv~dis+lstat and then do a perspective
## plot of (x1,x2) vs. y.
################################################################################
library(MASS)
data(Boston)
attach(Boston)
library(tree)
#--------------------------------------------------
df2=Boston[,c(8,13,14)] #pick off dis,lstat,medv
print(names(df2))
#--------------------------------------------------
#big tree
temp = tree(medv~.,df2,mindev=.0001)
cat('first big tree size: \n')
print(length(unique(temp$where)))

#then prune it down to one with 7 leaves
boston.tree=prune.tree(temp,best=7)
cat('pruned tree size: \n')
print(length(unique(boston.tree$where)))
#--------------------------------------------------
#get predictions on a 2d grid
pv=seq(from=.01,to=.99,by=.05)
x1q = quantile(df2$lstat,probs=pv)
x2q = quantile(df2$dis,probs=pv)
xx = expand.grid(x1q,x2q) #matrix with two columns using all combinations of x1q and x2q
dfpred = data.frame(dis=xx[,2],lstat=xx[,1])
lmedpred = predict(boston.tree,dfpred)

#make the perspective plot
par(mfrow=c(1,1))
persp(x1q,x2q,matrix(lmedpred,ncol=length(x2q),byrow=T),
      theta=150,xlab='dis',ylab='lstat',zlab='medv',
      zlim=c(min(df2$medv),1.1*max(df2$medv)))
################################################################################
## Read in the California Housing data.
## Transform some of the variables.
## Divide the data into train, val, and test.
################################################################################
source('cal_setup.txt')
################################################################################
## Fit a regression tree to the California Housing data:
## logMedVal ~ longitude+latitude.
## Plot: the tree and the 2-dim partition.
## Plot: the California map with the fits from the tree coded with colors.
################################################################################
library(tree)
library(maps)
#--------------------------------------------------
#first get a big tree
temp = tree(logMedVal~longitude+latitude,catrain,mindev=.0001)
cat('first big tree size: \n')
print(length(unique(temp$where)))

#then prune it
caltrain.tree = prune.tree(temp,best=50)
cat('pruned tree size: \n')
print(length(unique(caltrain.tree$where)))
#--------------------------------------------------
#plot the tree
par(mfrow=c(1,2))
plot(caltrain.tree,type="u")
text(caltrain.tree,col="blue",label=c("yval"),cex=.8)
partition.tree(caltrain.tree)
#--------------------------------------------------
#map plot
frm = caltrain.tree$frame
wh = caltrain.tree$where
nrf = nrow(frm)
iil = frm[,"var"]=="<leaf>"
iil = (1:nrf)[iil] #indices of leaves in the frame
oo = order(frm[iil,"yval"]) #sort by yval so iil[oo[i]] gives the frame row of the leaf with the ith-smallest yval
map('state', 'california')
nc=length(iil)
colv = heat.colors(nc)[nc:1]
for(i in 1:length(iil)) {
   print(iil[oo[i]])
   iitemp = (wh == iil[oo[i]]) #where refers to rows of the frame
   points(catrain$longitude[iitemp],catrain$latitude[iitemp],col=colv[i])
}
lglabs=as.character(round(exp(frm[iil[oo],"yval"]),0))
print(lglabs)
lseq = seq(from=nc,to=1,by=-2)
print(lseq)
legend("topright",legend=lglabs[lseq],col=colv[lseq],
       cex=1.8,lty=rep(1,nc),lwd=rep(5,nc),bty="n")

rm(list=ls())
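################################################################################
## Optional aside: the map above colors the training points by their leaf.
## A minimal sketch of coloring a regular longitude/latitude grid instead,
## assuming catrain from cal_setup.txt as above (the grid size 100 is arbitrary):
################################################################################
library(tree)
library(maps)
source('cal_setup.txt')
temp = tree(logMedVal~longitude+latitude,catrain,mindev=.0001)
caltree = prune.tree(temp,best=50)
gx = seq(min(catrain$longitude),max(catrain$longitude),length.out=100)
gy = seq(min(catrain$latitude),max(catrain$latitude),length.out=100)
grid = expand.grid(longitude=gx,latitude=gy)
grid$pred = predict(caltree,grid)
map('state','california')
nc = 50
colv = heat.colors(nc)[nc:1]
ci = cut(grid$pred,nc,labels=FALSE) #bin the fitted values into nc color classes
#note: the tree predicts over the whole bounding box, so some colored points
#fall outside the state outline.
points(grid$longitude,grid$latitude,col=colv[ci],pch=15,cex=.3)
rm(list=ls())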
################################################################################
## Fit a classification tree to the penalty data.
## Plot the tree.
################################################################################
library(tree)
source('pen_setup.txt')
#--------------------------------------------------
#simple tree on the hockey data
#first get a big tree
temp = tree(oppcall~.,data=pen,mindev=.0001)
cat('first big tree size: \n')
print(length(unique(temp$where)))

#then prune it down
pen.tree=prune.tree(temp,best=15)
cat('pruned tree size: \n')
print(length(unique(pen.tree$where)))

#plot the tree
par(mfrow=c(1,1))
plot(pen.tree,type="uniform")
options(digits=5)
text(pen.tree,pretty=0,col="blue",label=c("yprob"),cex=.8)
options(digits=7)

rm(list=ls())
################################################################################
## Fit a big tree to medv~lstat in the Boston housing data using rpart instead
## of tree, and use cross-validation.
## Use rpart's plotcp to do the cross-validation.
## Plot: rpart plotcp cross-validation.
## Pick off the min-loss cp value and plot the tree for that cp value as well
## as a bigger cp value (smaller tree) and a smaller cp value (bigger tree).
## Plot: the three trees (from the three cp values) as well as the fitted function.
## Plot: the best tree using rpart.
################################################################################
library(tree)
library(rpart)
library(MASS)
data(Boston)
attach(Boston)
#--------------------------------------------------
#reduce the data frame to just lstat and medv
bdf = Boston[,c(13,14)] #lstat and medv
#--------------------------------------------------
#fit a big tree using rpart.control
big.tree = rpart(medv~lstat,method="anova",data=bdf,
                 control=rpart.control(minsplit=5,cp=.0005))
nbig = length(unique(big.tree$where))
cat('size of big tree: ',nbig,'\n')
#--------------------------------------------------
#look at the cross-validation
par(mfrow=c(1,1))
plotcp(big.tree)
#--------------------------------------------------
#show the fit from some trees
oo=order(bdf$lstat)
bestcp=big.tree$cptable[which.min(big.tree$cptable[,"xerror"]),"CP"]
cat('bestcp: ',bestcp,'\n')
cpvec = c(.0157,bestcp,.004)
par(mfrow=c(3,2))
for(i in 1:3) {
   plot(bdf,pch=16,col='blue',cex=.5)
   ptree = prune(big.tree,cp=cpvec[i])
   pfit = predict(ptree)
   lines(bdf$lstat[oo],pfit[oo],col='red',lwd=2)
   title(paste('alpha = ',round(cpvec[i],3)))
   plot(ptree,uniform=TRUE)
   text(ptree,digits=4)
}
#--------------------------------------------------
#plot the best tree
par(mfrow=c(1,1))
best.tree = prune(big.tree,cp=bestcp)
plot(best.tree,uniform=TRUE,branch=.5,margin=.5)
text(best.tree,digits=4,use.n=TRUE,fancy=TRUE,bg='lightblue')

rm(list=ls())
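################################################################################
## Optional aside: above, cp was chosen by minimum cross-validated error.
## A minimal sketch of the common 1-SE rule instead: take the simplest tree
## whose xerror is within one standard error of the minimum.
################################################################################
library(rpart)
library(MASS)
data(Boston)
big.tree = rpart(medv~lstat,method="anova",data=Boston[,c(13,14)],
                 control=rpart.control(minsplit=5,cp=.0005))
ct = big.tree$cptable
imin = which.min(ct[,"xerror"])
thresh = ct[imin,"xerror"] + ct[imin,"xstd"] #min xerror plus one standard error
i1se = min(which(ct[,"xerror"] <= thresh)) #first (simplest) tree under the threshold
cat('1-SE cp: ',ct[i1se,"CP"],'\n')
tree1se = prune(big.tree,cp=ct[i1se,"CP"])
cat('1-SE tree size: ',length(unique(tree1se$where)),'\n')
rm(list=ls())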
################################################################################
## Random Forests: fit medv~lstat using random forests.
## Plot: oob error estimation.
## Plot: the random forest fit for three different numbers of trees in the forest.
################################################################################
library(randomForest)
library(MASS)
data(Boston)
attach(Boston)
#--------------------------------------------------
#get rf fits for different numbers of trees
#note: to get this to work I had to use the maxnodes parameter of randomForest!!!
set.seed(99)
n = nrow(Boston)
ntreev = c(10,500,5000)
nset = length(ntreev)
fmat = matrix(0,n,nset)
for(i in 1:nset) {
   cat('doing Boston rf: ',i,'\n')
   rffit = randomForest(medv~lstat,data=Boston,ntree=ntreev[i],maxnodes=15)
   fmat[,i] = predict(rffit)
}
#--------------------------------------------------
#plot oob error using the last fitted rffit, which has the largest ntree.
par(mfrow=c(1,1))
plot(rffit)
#--------------------------------------------------
#plot the fits
par(mfrow=c(1,3))
oo = order(Boston$lstat)
for(i in 1:nset) {
   plot(Boston$lstat,Boston$medv,xlab='lstat',ylab='medv')
   lines(Boston$lstat[oo],fmat[oo,i],col=i,lwd=3)
   title(main=paste('bagging ntrees = ',ntreev[i]))
}
#--------------------------------------------------
rm(list=ls())
################################################################################
## Fit medv~lstat, Boston Housing, using boosting.
## Plot: fits for three different values of the number of trees.
################################################################################
library(gbm) #boost package
library(MASS)
data(Boston)
attach(Boston)
#--------------------------------------------------
#fit boosting for various numbers of trees
set.seed(99)
n = nrow(Boston)
ntreev = c(5,20,100)
nset = length(ntreev)
fmat = matrix(0,n,nset)
for(i in 1:nset) {
   cat('doing Boston boost: ',i,'\n')
   boostfit = gbm(medv~lstat,data=Boston,distribution='gaussian',
                  interaction.depth=2,n.trees=ntreev[i],shrinkage=.2)
   fmat[,i] = predict(boostfit,n.trees=ntreev[i])
}
#--------------------------------------------------
#plot the fits
par(mfrow=c(1,3))
oo = order(Boston$lstat)
for(i in 1:nset) {
   plot(Boston$lstat,Boston$medv,xlab='lstat',ylab='medv')
   lines(Boston$lstat[oo],fmat[oo,i],col=i+1,lwd=3,lty=1)
   title(main=paste('boosting, ntree= ',ntreev[i]))
}
#--------------------------------------------------
rm(list=ls())
################################################################################
## Variable Importance:
## Fit boosting and random forests and plot variable importance.
################################################################################
library(gbm) #boost package
library(randomForest)
library(MASS)
data(Boston)
attach(Boston)
#--------------------------------------------------
#fit boosting and plot variable importance
boostfit = gbm(medv~.,data=Boston,distribution='gaussian',
               interaction.depth=2,n.trees=100,shrinkage=.2)
par(mfrow=c(1,1))
p=ncol(Boston)-1
vsum=summary(boostfit,plotit=FALSE) #this will have the variable importance info
row.names(vsum)=NULL #drop variable names from rows.

#plot variable importance
#the package does this automatically, but I did not like the plot
plot(vsum$rel.inf,axes=F,pch=16,col='red')
axis(1,labels=vsum$var,at=1:p)
axis(2)
for(i in 1:p) lines(c(i,i),c(0,vsum$rel.inf[i]),lwd=4,col='blue')
#--------------------------------------------------
#fit a random forest and plot variable importance
rffit = randomForest(medv~.,data=Boston,mtry=3,ntree=500)
varImpPlot(rffit)

rm(list=ls())
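################################################################################
## Optional aside: rather than eyeballing fits for a few tree counts, gbm can
## pick the number of trees itself. A minimal sketch using 5-fold
## cross-validation and gbm.perf:
################################################################################
library(gbm)
library(MASS)
data(Boston)
set.seed(99)
cvboost = gbm(medv~lstat,data=Boston,distribution='gaussian',
              interaction.depth=2,n.trees=500,shrinkage=.2,cv.folds=5)
bestn = gbm.perf(cvboost,method="cv") #plots train/cv error and returns the best n.trees
cat('cv-chosen number of trees: ',bestn,'\n')
rm(list=ls())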
################################################################################
## California Housing data: fit regression trees on train, get loss on validation
## for a vector of cp values.
## Get the cp values from rpart.
## Write the fits (on validation) from the best tree to file (thetreepred.txt).
################################################################################
source('cal_setup.txt')
library(rpart)
#--------------------------------------------------
#get a big tree
big.tree = rpart(logMedVal~.,method="anova",data=catrain,
                 control=rpart.control(minsplit=5,cp=.0001))
nbig = length(unique(big.tree$where))
cat('size of big tree: ',nbig,'\n')
#--------------------------------------------------
#fit on train, predict on val for a vector of cp values.
cpvec = big.tree$cptable[,"CP"] #cp values to try
ntree = length(cpvec) #number of cp values = number of trees fit
iltree = rep(0,ntree) #in-sample loss
oltree = rep(0,ntree) #out-of-sample loss
sztree = rep(0,ntree) #size of each tree
for(i in 1:ntree) {
   if((i %% 10)==0) cat('tree i: ',i,'\n')
   temptree = prune(big.tree,cp=cpvec[i])
   sztree[i] = length(unique(temptree$where))
   iltree[i] = sum((catrain$logMedVal-predict(temptree))^2)
   ofit = predict(temptree,caval)
   oltree[i] = sum((caval$logMedVal-ofit)^2)
}
oltree=sqrt(oltree/nrow(caval)); iltree = sqrt(iltree/nrow(catrain))
#--------------------------------------------------
#plot the losses
rgl = range(c(iltree,oltree))
plot(range(sztree),rgl,type='n',xlab='tree size',ylab='loss')
points(sztree,iltree,pch=15,col='red')
points(sztree,oltree,pch=16,col='blue')
legend("topright",legend=c('in-sample','out-of-sample'),lwd=3,col=c('red','blue'))
#--------------------------------------------------
#write the val predictions
iitree = which.min(oltree)
thetree = prune(big.tree,cp=cpvec[iitree])
thetreepred = predict(thetree,caval)
write(thetreepred,file='thetreepred.txt',ncol=1)
#--------------------------------------------------
rm(list=ls())
################################################################################
## California Housing data: fit boosting for values of
## (i) depth, (ii) number of trees, (iii) lambda = shrinkage.
## Fit on train, get loss on validation.
## Write the fits on validation from the best fit to file thebpred-2.txt.
################################################################################
source('cal_setup.txt')
library(gbm)
#--------------------------------------------------
set.seed(1)
idv = c(4,10)
ntv = c(1000,5000)
lamv=c(.001,.2)
parmb = expand.grid(idv,ntv,lamv)
colnames(parmb) = c('tdepth','ntree','lam')
print(parmb)
nset = nrow(parmb)
olb = rep(0,nset) #out-of-sample loss
ilb = rep(0,nset) #in-sample loss
bfitv = vector('list',nset)
for(i in 1:nset) {
   cat('doing boost ',i,' out of ',nset,'\n')
   tempboost = gbm(logMedVal~.,data=catrain,distribution='gaussian',
                   interaction.depth=parmb[i,1],n.trees=parmb[i,2],shrinkage=parmb[i,3])
   ifit = predict(tempboost,n.trees=parmb[i,2])
   ofit=predict(tempboost,newdata=caval,n.trees=parmb[i,2])
   olb[i] = sum((caval$logMedVal-ofit)^2)
   ilb[i] = sum((catrain$logMedVal-ifit)^2)
   bfitv[[i]]=tempboost
}
ilb = round(sqrt(ilb/nrow(catrain)),3); olb = round(sqrt(olb/nrow(caval)),3)
#--------------------------------------------------
#print the losses
print(cbind(parmb,olb,ilb))
#--------------------------------------------------
#write the val predictions
iib=which.min(olb)
theb = bfitv[[iib]]
thebpred = predict(theb,newdata=caval,n.trees=parmb[iib,2])
write(thebpred,file='thebpred-2.txt',ncol=1)

rm(list=ls())
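################################################################################
## Optional aside: predict.gbm accepts a vector of n.trees, so one fitted run
## gives a whole validation error curve over tree counts. A minimal sketch for
## one configuration from the grid above (depth 4, shrinkage .2; assumes
## catrain/caval from cal_setup.txt):
################################################################################
library(gbm)
source('cal_setup.txt')
set.seed(1)
bfit = gbm(logMedVal~.,data=catrain,distribution='gaussian',
           interaction.depth=4,n.trees=1000,shrinkage=.2)
ntv = seq(50,1000,by=50)
ofit = predict(bfit,newdata=caval,n.trees=ntv) #matrix: one column per n.trees value
orm = sqrt(colMeans((caval$logMedVal-ofit)^2)) #validation rmse at each tree count
plot(ntv,orm,type='b',xlab='number of trees',ylab='validation rmse')
rm(list=ls())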
################################################################################
## California Housing Data: fit random forests on train, get loss on validation
## using values of (i) mtry, (ii) number of trees.
## Write out the fits on validation from the best fit to file therfpred.txt.
################################################################################
source('cal_setup.txt')
library(randomForest)
#--------------------------------------------------
set.seed(1)
p=ncol(catrain)-1
mtryv = c(p,sqrt(p))
ntreev = c(100,500)
parmrf = expand.grid(mtryv,ntreev)
colnames(parmrf)=c('mtry','ntree')
nset = nrow(parmrf)
olrf = rep(0,nset) #out-of-sample loss
ilrf = rep(0,nset) #in-sample loss
rffitv = vector('list',nset)
for(i in 1:nset) {
   cat('doing rf ',i,' out of ',nset,'\n')
   temprf = randomForest(logMedVal~.,data=catrain,mtry=parmrf[i,1],ntree=parmrf[i,2])
   ifit = predict(temprf)
   ofit=predict(temprf,newdata=caval)
   olrf[i] = sum((caval$logMedVal-ofit)^2)
   ilrf[i] = sum((catrain$logMedVal-ifit)^2)
   rffitv[[i]]=temprf
}
ilrf = round(sqrt(ilrf/nrow(catrain)),3); olrf = round(sqrt(olrf/nrow(caval)),3)
#----------------------------------------
#print the losses
print(cbind(parmrf,olrf,ilrf))
#----------------------------------------
#write the val predictions
iirf=which.min(olrf)
therf = rffitv[[iirf]]
therfpred=predict(therf,newdata=caval)
write(therfpred,file='therfpred.txt',ncol=1)

rm(list=ls())
################################################################################
## California Housing Data: fit the best boosting model on (train, val), get the fit on test.
## Get the rmse on test.
## Write file: finbrmse.txt with just the rmse on test.
## Plot: fit vs. y on test.
## Plot: variable importance.
## Plot: partial dependence plots.
## Write file: cal-boost-varimport.txt with the variable importance measures.
################################################################################
source('cal_setup.txt')
library(gbm)
#--------------------------------------------------
#fit on train+val
set.seed(1)
catrainval = rbind(catrain,caval)
ntrees=5000
finb = gbm(logMedVal~.,data=catrainval,distribution='gaussian',
           interaction.depth=4,n.trees=ntrees,shrinkage=.2)
finbpred=predict(finb,newdata=catest,n.trees=ntrees)
#--------------------------------------------------
#plot y vs yhat for the test data and compute the rmse on test.
finbrmse = sqrt(sum((catest$logMedVal-finbpred)^2)/nrow(catest))
cat('finbrmse: ',finbrmse,'\n')
plot(catest$logMedVal,finbpred,xlab='test logMedVal',ylab='boost pred')
abline(0,1,col='red',lwd=2)
#--------------------------------------------------
#plot variable importance
p=ncol(catrain)-1 #want the number of variables for later
vsum=summary(finb) #this will have the variable importance info
row.names(vsum)=NULL #drop variable names from rows.

#write the variable importance table
cat('\\begin{verbatim}\n')
print(vsum)
cat('\\end{verbatim}\n')

#plot variable importance
#the package does this automatically, but I did not like the plot
plot(vsum$rel.inf,axes=F,pch=16,col='red')
axis(1,labels=vsum$var,at=1:p)
axis(2)
for(i in 1:p) lines(c(i,i),c(0,vsum$rel.inf[i]),lwd=4,col='blue')
#--------------------------------------------------
#partial dependence plots
par(mfrow=c(3,3))
nms = names(catrain)[1:9]
for(i in 1:9) plot(finb,i=nms[i])
#--------------------------------------------------
rm(list=ls())
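################################################################################
## Optional aside: plot.gbm can also draw the joint partial dependence of two
## variables. A minimal sketch for longitude and latitude together; it refits
## a smaller boost (1000 trees) than finb above just to keep it quick:
################################################################################
library(gbm)
source('cal_setup.txt')
set.seed(1)
catrainval = rbind(catrain,caval)
smallb = gbm(logMedVal~.,data=catrainval,distribution='gaussian',
             interaction.depth=4,n.trees=1000,shrinkage=.2)
#for two continuous variables plot.gbm returns a lattice levelplot;
#print it explicitly so it also draws when this script is sourced.
print(plot(smallb,i.var=c("longitude","latitude"),n.trees=1000))
rm(list=ls())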
################################################################################
## California Housing data: fit random forests on train+validation using the
## best settings.
## Predict on test.
## Plot: fits vs. test y.
## Plot: variable importance.
## Write: rmse on test to file finrfrmse.txt.
################################################################################
source('cal_setup.txt')
library(randomForest)
#--------------------------------------------------
#fit on train+val
set.seed(1)
catrainval = rbind(catrain,caval)
finrf = randomForest(logMedVal~.,data=catrainval,mtry=3,ntree=500)
finrfpred=predict(finrf,newdata=catest)
#--------------------------------------------------
#plot y vs yhat for the test data
finrfrmse = sqrt(sum((catest$logMedVal-finrfpred)^2)/nrow(catest))
cat('finrfrmse: ',finrfrmse,'\n')
plot(catest$logMedVal,finrfpred,xlab='test logMedVal',ylab='rf pred')
abline(0,1,col='red',lwd=2)
#--------------------------------------------------
#plot variable importance
varImpPlot(finrf)
#--------------------------------------------------
rm(list=ls())
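################################################################################
## Optional aside: randomForest has its own partial dependence plot,
## partialPlot, which could be set against the boosting partial dependence
## plots above. A minimal sketch for longitude, run on a subsample of the data
## to keep it fast (the fit itself is the same finrf as above):
################################################################################
library(randomForest)
source('cal_setup.txt')
set.seed(1)
catrainval = rbind(catrain,caval)
finrf = randomForest(logMedVal~.,data=catrainval,mtry=3,ntree=500)
sub = catrainval[sample(nrow(catrainval),1000),] #partialPlot loops over pred.data, so subsample
partialPlot(finrf,pred.data=sub,x.var="longitude")
rm(list=ls())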