###########################################################
# Airline Example #
###########################################################

Data = read.table("Airline.txt", header=T)
attach(Data)

plot(Passengers, type="l", col=4, lwd=2, xlab="Time", ylab="Passengers")

## monthly dummy variables: column i indicates month i (December is the baseline)
X = matrix(0,144,11)
for(i in 1:11){
  for(j in 0:11){
    X[i+j*12,i] = 1
  }
}
X = data.frame(X)
attach(X)

T = 1:144

## model 1: linear trend on the raw series
model1 = lm(Passengers~T)
sig1 = summary(model1)$sigma

plot(Passengers, type="l", col=4, lwd=2, xlab="Time", ylab="Passengers")
abline(model1$coef[1], model1$coef[2], lwd=2, col=2)
legend(20, 500, "Fitted Values", col=2, lty=1, lwd=2)

plot(model1$res/sig1, type="b", pch=19, col=4, cex=1.5, ylab="std residuals", xlab="Time", cex.lab=1.4)
abline(h=-2, lty=2, lwd=2, col=2)
abline(h=2, lty=2, lwd=2, col=2)

## model 2: linear trend on the log series
model2 = lm(log(Passengers)~T)
summary(model2)
sig2 = summary(model2)$sigma

plot(model2$res/sig2, type="b", pch=19, col=4, cex=1.5, ylab="std residuals", xlab="Time", cex.lab=1.4)
abline(h=-2, lty=2, lwd=2, col=2)
abline(h=2, lty=2, lwd=2, col=2)

plot(log(Passengers), type="l", col=4, lwd=2, xlab="Time", ylab="log(Passengers)")
abline(model2$coef[1], model2$coef[2], lwd=2, col=2)
legend(20, 6, "Fitted Values", col=2, lty=1, lwd=2)

## model 3: linear trend plus monthly dummies
model3 = lm(log(Passengers)~T+X1+X2+X3+X4+X5+X6+X7+X8+X9+X10+X11)
sig3 = summary(model3)$sigma

plot(model3$res/sig3, type="b", pch=19, col=4, cex=1.5, ylab="std residuals", xlab="Time", cex.lab=1.4)
abline(h=-2, lty=2, lwd=2, col=2)
abline(h=2, lty=2, lwd=2, col=2)

plot(model3$res[1:143], model3$res[2:144], col=4, cex.lab=1.5, ylab="e(t)", xlab="e(t-1)", pch=19, lwd=1.5,
     main=paste("corr(e(t),e(t-1))=", round(cor(model3$res[1:143], model3$res[2:144]), 3)))

plot(log(Passengers), type="l", col=4, lwd=2, xlab="Time", ylab="log(Passengers)")
lines(T, model3$fitted, col=2, lty=2, lwd=2)
legend(20, 6, "Fitted Values", col=2, lty=2, lwd=2)

## model 4: add the lagged log series as a regressor (the first observation is lost)
LogP = log(Passengers[2:144])
LogPlag = log(Passengers[1:143])
X = data.frame(X[2:144,])
attach(X)
T = 1:143

model4 = lm(LogP~T+X1+X2+X3+X4+X5+X6+X7+X8+X9+X10+X11+LogPlag)
sig4 = summary(model4)$sigma

plot(model4$res/sig4, type="b", pch=19, col=4, cex=1.5, ylab="std residuals", xlab="Time", cex.lab=1.4)
abline(h=-2, lty=2, lwd=2, col=2)
abline(h=2, lty=2, lwd=2, col=2)

plot(model4$res[1:142], model4$res[2:143], col=4, cex.lab=1.5, ylab="e(t)", xlab="e(t-1)", pch=19, lwd=1.5,
     main=paste("corr(e(t),e(t-1))=", round(cor(model4$res[1:142], model4$res[2:143]), 3)))

plot(log(Passengers), type="l", col=4, lwd=2, xlab="Time", ylab="log(Passengers)")
lines(T, model4$fitted, col=2, lty=2, lwd=2)
legend(20, 6, "Fitted Values", col=2, lty=2, lwd=2)

#################
### NBA data ####
#################

NBA = read.csv("NBAspread.csv")
attach(NBA)
n = nrow(NBA)

par(mfrow=c(1,2))
hist(NBA$spread[favwin==1], col=5, main="", xlab="spread")
hist(NBA$spread[favwin==0], add=TRUE, col=6)
legend("topright", legend=c("favwin=1", "favwin=0"), fill=c(5,6), bty="n")
boxplot(NBA$spread ~ NBA$favwin, col=c(6,5), horizontal=TRUE, ylab="favwin", xlab="spread")

## logistic regression of the favorite winning on the Vegas spread (no intercept)
nbareg = glm(favwin~spread-1, family=binomial)
s = seq(0, 30, length=100)
fit = exp(s*nbareg$coef[1])/(1+exp(s*nbareg$coef[1]))
plot(s, fit, type="l", col=4, lwd=2, ylim=c(0.5,1), xlab="spread", ylab="P(favwin)")

## BIC comparison of three candidate models (n = 553 games)
bic = cbind(
  extractAIC(nbareg, k=log(553)),
  extractAIC(glm(favwin ~ spread, family=binomial), k=log(553)),
  extractAIC(glm(favwin ~ spread + favhome, family=binomial), k=log(553)))[2,]
ebic = exp(-.5*(bic-min(bic)))
round(ebic/sum(ebic), 2)

## predicted win probabilities for this week's spreads
thisweek = c(8,4)
pred = nbareg$coef[1]*thisweek
exp(pred)/(1+exp(pred))
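## (sketch, not in the original script) the BIC comparison above also considered a model
## with an intercept; for reference, its predictions for the same two spreads can be
## obtained with predict() rather than building the logit by hand (nbareg2 is a name
## introduced here)
nbareg2 = glm(favwin ~ spread, family=binomial)
predict(nbareg2, newdata=data.frame(spread=thisweek), type="response")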
### German Credit Scoring Data ###

credit = read.csv("germancredit.csv")
train = 1:800

# build a model assuming you have credit history in there (a good idea)
null = glm(GoodCredit~history3, family=binomial, data=credit[train,])
full = glm(GoodCredit~., family=binomial, data=credit[train,])
reg = step(null, scope=formula(full), direction="forward", k=log(length(train)))

# Now prediction: predict() defaults to returning X'b,
# so ask for predictions of type="response" to get probabilities
predreg = predict(reg, newdata=credit[-train,], type="response")
predfull = predict(full, newdata=credit[-train,], type="response")
prednull = predict(null, newdata=credit[-train,], type="response")

errorreg = credit[-train,1]-(predreg >= .5) # 1 is a false negative, -1 is a false positive
errorfull = credit[-train,1]-(predfull >= .5)
errornull = credit[-train,1]-(prednull >= .5)

# misclassification rates:
mean(abs(errorreg))
mean(abs(errorfull))
mean(abs(errornull))

library(MASS)          ## a library of example datasets
library(class)         ## a library with lots of classification tools
library(glmnet)        ## one of the Lasso libraries
library(tree)          ## library for tree methods
library(randomForest)  ## library for random forests

#### ******* Forensic Glass ****** ####
data(fgl) ## loads the data into R; see help(fgl)
attach(fgl)

par(mfrow=c(2,3))
plot(RI ~ type, data=fgl, col=c(grey(.2),2:6), cex.lab=1.4)
plot(Al ~ type, data=fgl, col=c(grey(.2),2:6), cex.lab=1.4)
plot(Na ~ type, data=fgl, col=c(grey(.2),2:6), cex.lab=1.4)
plot(Mg ~ type, data=fgl, col=c(grey(.2),2:6), cex.lab=1.4)
plot(Ba ~ type, data=fgl, col=c(grey(.2),2:6), cex.lab=1.4)
plot(Si ~ type, data=fgl, col=c(grey(.2),2:6), cex.lab=1.4)

## for illustration, consider the RI x Al plane
## use 200 training points to find nearest neighbors for the remaining 14
train <- sample(1:214, 200)
x <- scale(fgl[,c(4,1)])
gtype <- fgl$type
ng <- length(gtype)
nearest1 <- knn(train=x[train,], test=x[-train,], cl=gtype[train], k=1)
nearest5 <- knn(train=x[train,], test=x[-train,], cl=gtype[train], k=5)
data.frame(gtype[-train], nearest1, nearest5)

## plot them to see how it worked
par(mfrow=c(1,1))
plot(x[train,], col=gtype[train], cex=.8, main="5-nearest neighbor", pch=1)
lixo = (1:ng)[-train]  ## indices of the 14 held-out points
## colour observation 172 by its 5-nn prediction (only meaningful if 172 is held out)
points(x[172,1], x[172,2], pch=18, col=nearest5[(1:14)[lixo==172]], cex=2)
for(i in 1:14){
  points(x[lixo[i],1], x[lixo[i],2], pch=18, col="orange", cex=2)
  points(x[lixo[i],1], x[lixo[i],2], pch=18, col=nearest1[i], cex=2)
}

#######################
## Trees...
#######################

out_size = 30
n = 214
train <- sample(1:n, n-out_size)

treeGlass = tree(type~., fgl, subset=train)
tree.pred = predict(treeGlass, fgl, type="class")
table(tree.pred[-train], type[-train])
1-sum(diag(table(tree.pred[-train], type[-train])))/out_size # classification error rate on the held-out points

par(mfrow=c(1,1))
plot(treeGlass)
text(treeGlass, pretty=0)

## cross-validate over tree size / the cost-complexity parameter k
cv.glass = cv.tree(treeGlass, K=out_size)
par(mfrow=c(1,2))
plot(cv.glass$size, cv.glass$dev, type="b")
plot(cv.glass$k, cv.glass$dev, type="b")
cv.glass$k[which.min(cv.glass$dev)]

## prune at k=14 and compare the full and pruned trees
prune.glass = prune.tree(treeGlass, k=14)
par(mfrow=c(1,2))
plot(treeGlass)
text(treeGlass, pretty=0)
plot(prune.glass)
text(prune.glass, pretty=0)

tree.predNEW = predict(prune.glass, fgl, type="class")
1-sum(diag(table(tree.pred[-train], type[-train])))/out_size
1-sum(diag(table(tree.predNEW[-train], type[-train])))/out_size
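## (sketch, not in the original script) since randomForest is already loaded, the same
## train/test split can also be used to fit a forest on all predictors, for comparison
## with the single tree above; rfGlass and rf.pred are names introduced here
rfGlass = randomForest(type~., data=fgl, subset=train, ntree=500)
rf.pred = predict(rfGlass, newdata=fgl[-train,])
1-sum(diag(table(rf.pred, type[-train])))/out_size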
# Run knn again on this new train/test split, now using all predictors...
x <- scale(fgl[,-10])
nearest1 <- knn(train=x[train,], test=x[-train,], cl=gtype[train], k=1)
nearest5 <- knn(train=x[train,], test=x[-train,], cl=gtype[train], k=5)
1-sum(diag(table(nearest1, type[-train])))/out_size
1-sum(diag(table(nearest5, type[-train])))/out_size

#################################################
###### *** California Housing Data *** ######
## median home values in various census tracts
## lat/long are centroids of the tract
## response value is log(medianhomeval)

ca <- read.csv("CAhousing.csv")
ca$AveBedrms <- ca$totalBedrooms/ca$households
ca$AveRooms <- ca$totalRooms/ca$households
ca$AveOccupancy <- ca$population/ca$households
logMedVal <- log(ca$medianHouseValue)
ca <- ca[,-c(4,5,9)] # drop medianHouseValue (the response) and the raw room totals

## create a full matrix of interactions (only necessary for the linear model)
## the normalization is applied to the main variables before the interactions are formed
XXca <- model.matrix(~.*longitude*latitude, data=data.frame(scale(ca)))[,-1]

## what would a lasso linear model fit look like?
## it likes a pretty complicated model
par(mfrow=c(1,2))
plot(capen <- cv.glmnet(x=XXca, y=logMedVal))
plot(capen$glmnet.fit, xvar="lambda")
round(coef(capen), 2)

## First, let's do it with CART
## no need for interactions; the tree finds them automatically
catree <- tree(logMedVal ~ ., data=ca)
par(mfrow=c(1,1))
plot(catree, col=8, lwd=2)
text(catree)

## Next, with random forest (takes some time to run)
## limit the number of trees and the minimum node size for speed
## add importance=TRUE so that the variable importance information is stored
carf <- randomForest(logMedVal ~ ., data=ca, ntree=250, nodesize=25, importance=TRUE)

## variable importance plot; type=1 plots the % increase in MSE when each variable is permuted
varImpPlot(carf, type=1, pch=21, bg="navy", main='RF variable importance')

## Fitted values
yhatlasso <- predict(capen, XXca)
yhattree <- predict(catree, ca)
yhatrf <- predict(carf, ca) ## takes time! drawback of RFs

## Plotting the fitted values on a map...
## predcol/predmap are not defined in the original script; the definitions below are an
## assumed reconstruction: bin the fitted values into 9 quantile groups and colour by bin
library(maps)
predcol <- heat.colors(9)[9:1]
predmap <- function(y) predcol[cut(y, quantile(y, seq(0,1,length=10)), include.lowest=TRUE)]

par(mfrow=c(1,2)) ## preds
map('state', 'california')
points(ca[,1:2], col=predmap(yhatlasso), pch=20, cex=.5)
mtext("lasso fitted")
map('state', 'california')
points(ca[,1:2], col=predmap(yhatrf), pch=20, cex=.5)
mtext("rf fitted")
legend("topright", title="prediction", bty="n",
       fill=predcol[c(1,4,7,9)], legend=c("20k","100k","400k","1mil"))

## Out-of-sample prediction (takes a while since RF is slow)
## note: what is stored for each method is the root mean squared error on the holdout set
MSE <- list(LASSO=NULL, CART=NULL, RF=NULL)
for(i in 1:10){
  train <- sample(1:nrow(ca), 5000)

  lin <- cv.glmnet(x=XXca[train,], y=logMedVal[train])
  yhat.lin <- predict(lin, XXca[-train,])
  MSE$LASSO <- c(MSE$LASSO, sqrt(mean((logMedVal[-train] - yhat.lin)^2)))

  rt <- tree(logMedVal[train] ~ ., data=ca[train,])
  yhat.rt <- predict(rt, newdata=ca[-train,])
  MSE$CART <- c(MSE$CART, sqrt(mean((logMedVal[-train] - yhat.rt)^2)))

  rf <- randomForest(logMedVal[train] ~ ., data=ca[train,], ntree=250, nodesize=25)
  yhat.rf <- predict(rf, newdata=ca[-train,])
  MSE$RF <- c(MSE$RF, sqrt(mean((logMedVal[-train] - yhat.rf)^2)))

  cat(i) # progress counter
}

par(mfrow=c(1,1))
boxplot(MSE, col="blue", xlab="model", ylab="RMSE")
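## (sketch, not in the original script) average out-of-sample error for each method
## across the 10 train/test splits, to accompany the boxplot
sapply(MSE, mean)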