###########################################################
# Airline Example #
###########################################################

Data = read.table("Airline.txt", header=T)
attach(Data)

plot(Passengers, type="l", col=4, lwd=2, xlab="Time", ylab="Passengers")

## monthly dummy variables: column i indicates month i (December is the baseline)
X = matrix(0,144,11)
for(i in 1:11){
  for(j in 0:11){
    X[i+j*12,i] = 1
  }
}
X = data.frame(X)
attach(X)

T = 1:144

## model 1: linear trend on the raw series
model1 = lm(Passengers~T)
sig1 = summary(model1)$sigma

plot(Passengers, type="l", col=4, lwd=2, xlab="Time", ylab="Passengers")
abline(model1$coef[1], model1$coef[2], lwd=2, col=2)
legend(20, 500, "Fitted Values", col=2, lty=1, lwd=2)

plot(model1$res/sig1, type="b", pch=19, col=4, cex=1.5, ylab="std residuals", xlab="Time", cex.lab=1.4)
abline(h=-2, lty=2, lwd=2, col=2)
abline(h=2, lty=2, lwd=2, col=2)

## model 2: linear trend on the log series
model2 = lm(log(Passengers)~T)
summary(model2)
sig2 = summary(model2)$sigma

plot(model2$res/sig2, type="b", pch=19, col=4, cex=1.5, ylab="std residuals", xlab="Time", cex.lab=1.4)
abline(h=-2, lty=2, lwd=2, col=2)
abline(h=2, lty=2, lwd=2, col=2)

plot(log(Passengers), type="l", col=4, lwd=2, xlab="Time", ylab="log(Passengers)")
abline(model2$coef[1], model2$coef[2], lwd=2, col=2)
legend(20, 6, "Fitted Values", col=2, lty=1, lwd=2)

## model 3: linear trend plus monthly dummies
model3 = lm(log(Passengers)~T+X1+X2+X3+X4+X5+X6+X7+X8+X9+X10+X11)
sig3 = summary(model3)$sigma

plot(model3$res/sig3, type="b", pch=19, col=4, cex=1.5, ylab="std residuals", xlab="Time", cex.lab=1.4)
abline(h=-2, lty=2, lwd=2, col=2)
abline(h=2, lty=2, lwd=2, col=2)

plot(model3$res[1:143], model3$res[2:144], col=4, cex.lab=1.5, ylab="e(t)", xlab="e(t-1)", pch=19, lwd=1.5,
     main=paste("corr(e(t),e(t-1))=", round(cor(model3$res[1:143], model3$res[2:144]), 3)))

plot(log(Passengers), type="l", col=4, lwd=2, xlab="Time", ylab="log(Passengers)")
lines(T, model3$fitted, col=2, lty=2, lwd=2)
legend(20, 6, "Fitted Values", col=2, lty=2, lwd=2)

## model 4: add the lagged log series as a regressor (the first observation is lost)
LogP = log(Passengers[2:144])
LogPlag = log(Passengers[1:143])
X = data.frame(X[2:144,])
attach(X)
T = 1:143

model4 = lm(LogP~T+X1+X2+X3+X4+X5+X6+X7+X8+X9+X10+X11+LogPlag)
sig4 = summary(model4)$sigma

plot(model4$res/sig4, type="b", pch=19, col=4, cex=1.5, ylab="std residuals", xlab="Time", cex.lab=1.4)
abline(h=-2, lty=2, lwd=2, col=2)
abline(h=2, lty=2, lwd=2, col=2)

plot(model4$res[1:142], model4$res[2:143], col=4, cex.lab=1.5, ylab="e(t)", xlab="e(t-1)", pch=19, lwd=1.5,
     main=paste("corr(e(t),e(t-1))=", round(cor(model4$res[1:142], model4$res[2:143]), 3)))

plot(log(Passengers), type="l", col=4, lwd=2, xlab="Time", ylab="log(Passengers)")
lines(T, model4$fitted, col=2, lty=2, lwd=2)
legend(20, 6, "Fitted Values", col=2, lty=2, lwd=2)

#################
### NBA data ####
#################

NBA = read.csv("NBAspread.csv")
attach(NBA)
n = nrow(NBA)

par(mfrow=c(1,2))
hist(NBA$spread[favwin==1], col=5, main="", xlab="spread")
hist(NBA$spread[favwin==0], add=TRUE, col=6)
legend("topright", legend=c("favwin=1", "favwin=0"), fill=c(5,6), bty="n")
boxplot(NBA$spread ~ NBA$favwin, col=c(6,5), horizontal=TRUE, ylab="favwin", xlab="spread")

## logistic regression of the favorite winning on the Vegas spread (no intercept)
nbareg = glm(favwin~spread-1, family=binomial)
s = seq(0, 30, length=100)
fit = exp(s*nbareg$coef[1])/(1+exp(s*nbareg$coef[1]))
plot(s, fit, type="l", col=4, lwd=2, ylim=c(0.5,1), xlab="spread", ylab="P(favwin)")

## BIC comparison of three candidate models (n = 553 games)
bic = cbind(
  extractAIC(nbareg, k=log(553)),
  extractAIC(glm(favwin ~ spread, family=binomial), k=log(553)),
  extractAIC(glm(favwin ~ spread + favhome, family=binomial), k=log(553)))[2,]
ebic = exp(-.5*(bic-min(bic)))
round(ebic/sum(ebic), 2)

## predicted win probabilities for this week's spreads
thisweek = c(8,4)
pred = nbareg$coef[1]*thisweek
exp(pred)/(1+exp(pred))
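## (sketch, not in the original script) the BIC comparison above also considered a model
## with an intercept; for reference, its predictions for the same two spreads can be
## obtained with predict() rather than building the logit by hand (nbareg2 is a name
## introduced here)
nbareg2 = glm(favwin ~ spread, family=binomial)
predict(nbareg2, newdata=data.frame(spread=thisweek), type="response")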
### German Credit Scoring Data ###

credit = read.csv("germancredit.csv")
train = 1:800

# build a model assuming you have credit history in there (a good idea)
null = glm(GoodCredit~history3, family=binomial, data=credit[train,])
full = glm(GoodCredit~., family=binomial, data=credit[train,])
reg = step(null, scope=formula(full), direction="forward", k=log(length(train)))

# Now prediction: predict() defaults to returning X'b,
# so ask for predictions of type="response" to get probabilities
predreg = predict(reg, newdata=credit[-train,], type="response")
predfull = predict(full, newdata=credit[-train,], type="response")
prednull = predict(null, newdata=credit[-train,], type="response")

errorreg = credit[-train,1]-(predreg >= .5) # 1 is a false negative, -1 is a false positive
errorfull = credit[-train,1]-(predfull >= .5)
errornull = credit[-train,1]-(prednull >= .5)

# misclassification rates:
mean(abs(errorreg))
mean(abs(errorfull))
mean(abs(errornull))

library(MASS)          ## a library of example datasets
library(class)         ## a library with lots of classification tools
library(glmnet)        ## one of the Lasso libraries
library(tree)          ## library for tree methods
library(randomForest)  ## library for random forests

#### ******* Forensic Glass ****** ####
data(fgl) ## loads the data into R; see help(fgl)
attach(fgl)

par(mfrow=c(2,3))
plot(RI ~ type, data=fgl, col=c(grey(.2),2:6), cex.lab=1.4)
plot(Al ~ type, data=fgl, col=c(grey(.2),2:6), cex.lab=1.4)
plot(Na ~ type, data=fgl, col=c(grey(.2),2:6), cex.lab=1.4)
plot(Mg ~ type, data=fgl, col=c(grey(.2),2:6), cex.lab=1.4)
plot(Ba ~ type, data=fgl, col=c(grey(.2),2:6), cex.lab=1.4)
plot(Si ~ type, data=fgl, col=c(grey(.2),2:6), cex.lab=1.4)

## for illustration, consider the RI x Al plane
## use 200 training points to find nearest neighbors for the remaining 14
train <- sample(1:214, 200)
x <- scale(fgl[,c(4,1)])
gtype <- fgl$type
ng <- length(gtype)
nearest1 <- knn(train=x[train,], test=x[-train,], cl=gtype[train], k=1)
nearest5 <- knn(train=x[train,], test=x[-train,], cl=gtype[train], k=5)
data.frame(gtype[-train], nearest1, nearest5)

## plot them to see how it worked
par(mfrow=c(1,1))
plot(x[train,], col=gtype[train], cex=.8, main="5-nearest neighbor", pch=1)
lixo = (1:ng)[-train]  ## indices of the 14 held-out points
## colour observation 172 by its 5-nn prediction (only meaningful if 172 is held out)
points(x[172,1], x[172,2], pch=18, col=nearest5[(1:14)[lixo==172]], cex=2)
for(i in 1:14){
  points(x[lixo[i],1], x[lixo[i],2], pch=18, col="orange", cex=2)
  points(x[lixo[i],1], x[lixo[i],2], pch=18, col=nearest1[i], cex=2)
}

#######################
## Trees...
#######################

out_size = 30
n = 214
train <- sample(1:n, n-out_size)

treeGlass = tree(type~., fgl, subset=train)
tree.pred = predict(treeGlass, fgl, type="class")
table(tree.pred[-train], type[-train])
1-sum(diag(table(tree.pred[-train], type[-train])))/out_size # classification error rate on the held-out points

par(mfrow=c(1,1))
plot(treeGlass)
text(treeGlass, pretty=0)

## cross-validate over tree size / the cost-complexity parameter k
cv.glass = cv.tree(treeGlass, K=out_size)
par(mfrow=c(1,2))
plot(cv.glass$size, cv.glass$dev, type="b")
plot(cv.glass$k, cv.glass$dev, type="b")
cv.glass$k[which.min(cv.glass$dev)]

## prune at k=14 and compare the full and pruned trees
prune.glass = prune.tree(treeGlass, k=14)
par(mfrow=c(1,2))
plot(treeGlass)
text(treeGlass, pretty=0)
plot(prune.glass)
text(prune.glass, pretty=0)

tree.predNEW = predict(prune.glass, fgl, type="class")
1-sum(diag(table(tree.pred[-train], type[-train])))/out_size
1-sum(diag(table(tree.predNEW[-train], type[-train])))/out_size
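## (sketch, not in the original script) since randomForest is already loaded, the same
## train/test split can also be used to fit a forest on all predictors, for comparison
## with the single tree above; rfGlass and rf.pred are names introduced here
rfGlass = randomForest(type~., data=fgl, subset=train, ntree=500)
rf.pred = predict(rfGlass, newdata=fgl[-train,])
1-sum(diag(table(rf.pred, type[-train])))/out_size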
# Run knn again on this new train/test split, now using all predictors...
x <- scale(fgl[,-10])
nearest1 <- knn(train=x[train,], test=x[-train,], cl=gtype[train], k=1)
nearest5 <- knn(train=x[train,], test=x[-train,], cl=gtype[train], k=5)
1-sum(diag(table(nearest1, type[-train])))/out_size
1-sum(diag(table(nearest5, type[-train])))/out_size

#################################################
###### *** California Housing Data *** ######
## median home values in various census tracts
## lat/long are centroids of the tract
## response value is log(medianhomeval)

ca <- read.csv("CAhousing.csv")
ca$AveBedrms <- ca$totalBedrooms/ca$households
ca$AveRooms <- ca$totalRooms/ca$households
ca$AveOccupancy <- ca$population/ca$households
logMedVal <- log(ca$medianHouseValue)
ca <- ca[,-c(4,5,9)] # drop medianHouseValue (the response) and the raw room totals

## create a full matrix of interactions (only necessary for the linear model)
## the normalization is applied to the main variables before the interactions are formed
XXca <- model.matrix(~.*longitude*latitude, data=data.frame(scale(ca)))[,-1]

## what would a lasso linear model fit look like?
## it likes a pretty complicated model
par(mfrow=c(1,2))
plot(capen <- cv.glmnet(x=XXca, y=logMedVal))
plot(capen$glmnet.fit, xvar="lambda")
round(coef(capen), 2)

## First, let's do it with CART
## no need for interactions; the tree finds them automatically
catree <- tree(logMedVal ~ ., data=ca)
par(mfrow=c(1,1))
plot(catree, col=8, lwd=2)
text(catree)

## Next, with random forest (takes some time to run)
## limit the number of trees and the minimum node size for speed
## add importance=TRUE so that the variable importance information is stored
carf <- randomForest(logMedVal ~ ., data=ca, ntree=250, nodesize=25, importance=TRUE)

## variable importance plot; type=1 plots the % increase in MSE when each variable is permuted
varImpPlot(carf, type=1, pch=21, bg="navy", main='RF variable importance')

## Fitted values
yhatlasso <- predict(capen, XXca)
yhattree <- predict(catree, ca)
yhatrf <- predict(carf, ca) ## takes time! drawback of RFs

## Plotting the fitted values on a map...
## predcol/predmap are not defined in the original script; the definitions below are an
## assumed reconstruction: bin the fitted values into 9 quantile groups and colour by bin
library(maps)
predcol <- heat.colors(9)[9:1]
predmap <- function(y) predcol[cut(y, quantile(y, seq(0,1,length=10)), include.lowest=TRUE)]

par(mfrow=c(1,2)) ## preds
map('state', 'california')
points(ca[,1:2], col=predmap(yhatlasso), pch=20, cex=.5)
mtext("lasso fitted")
map('state', 'california')
points(ca[,1:2], col=predmap(yhatrf), pch=20, cex=.5)
mtext("rf fitted")
legend("topright", title="prediction", bty="n",
       fill=predcol[c(1,4,7,9)], legend=c("20k","100k","400k","1mil"))

## Out-of-sample prediction (takes a while since RF is slow)
## note: what is stored for each method is the root mean squared error on the holdout set
MSE <- list(LASSO=NULL, CART=NULL, RF=NULL)
for(i in 1:10){
  train <- sample(1:nrow(ca), 5000)

  lin <- cv.glmnet(x=XXca[train,], y=logMedVal[train])
  yhat.lin <- predict(lin, XXca[-train,])
  MSE$LASSO <- c(MSE$LASSO, sqrt(mean((logMedVal[-train] - yhat.lin)^2)))

  rt <- tree(logMedVal[train] ~ ., data=ca[train,])
  yhat.rt <- predict(rt, newdata=ca[-train,])
  MSE$CART <- c(MSE$CART, sqrt(mean((logMedVal[-train] - yhat.rt)^2)))

  rf <- randomForest(logMedVal[train] ~ ., data=ca[train,], ntree=250, nodesize=25)
  yhat.rf <- predict(rf, newdata=ca[-train,])
  MSE$RF <- c(MSE$RF, sqrt(mean((logMedVal[-train] - yhat.rf)^2)))

  cat(i) # progress counter
}

par(mfrow=c(1,1))
boxplot(MSE, col="blue", xlab="model", ylab="RMSE")
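## (sketch, not in the original script) average out-of-sample error for each method
## across the 10 train/test splits, to accompany the boxplot
sapply(MSE, mean)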