rm(list=ls())

#############################################
# Baseball Examples                         #
#############################################

Data = read.table("RunsPerGame.txt",header=T)
attach(Data)

summary(League)


modelAVG = lm(R.G ~ AVG)

modelSLG = lm(R.G ~ SLG)

modelOBP = lm(R.G ~ OBP)

summary(modelAVG)


plot(AVG,R.G,pch=19,cex=1.5,xlab="AVG",ylab = "Runs per Game")
abline(modelAVG$coef[1],modelAVG$coef[2],col=2,lwd=2)

plot(SLG,R.G,pch=19,cex=1.5,xlab="SLG",ylab = "Runs per Game")
abline(modelSLG$coef[1],modelSLG$coef[2],col=2,lwd=2)

plot(OBP,R.G,pch=19,cex=1.5,xlab="OBP",ylab = "Runs per Game")
abline(modelOBP$coef[1],modelOBP$coef[2],col=2,lwd=2)


#############################################
# House Example                             #
#############################################

House = read.table("housedata.txt",header=T)
attach(House)

plot(Size,Price,pch=19)

Housemodel = lm(Price~Size,data=House)

summary(Housemodel)

names(Housemodel)

names(summary(Housemodel))

confint(Housemodel)
confint(Housemodel,level=0.99)


Xfuture <- data.frame(Size=seq(0,8,by=0.01))


Future1 = predict(Housemodel, Xfuture, interval = "prediction",se.fit=T)
Future2 = predict(Housemodel, Xfuture, interval = "prediction",se.fit=T,level=0.99)


plot(Size,Price,xlim=c(0,8),ylim=range(Future1$fit),pch=19,cex.lab=1.3)
abline(lsfit(Size,Price),lwd=2,col=2)
lines(Xfuture$Size,Future1$fit[,2],col=4,lwd=2,lty=2)
lines(Xfuture$Size,Future1$fit[,3],col=4,lwd=2,lty=2)


#############################################
# Salary Data                               #
#############################################

SalaryData = read.table("SalaryData.txt",header=T)

attach(SalaryData)

plot(Gender,Salary,col="red",lwd=1.5,main="Salary Data")

plot(YrHired,Salary,xlab="Year Hired")
points(YrHired[Gender=="Male"],Salary[Gender=="Male"],col="blue",pch=19)
points(YrHired[Gender=="Female"],Salary[Gender=="Female"],col="red",pch=19)

plot(YrsPrior,Salary)
points(YrHired[Gender=="Male"],Salary[Gender=="Male"],col="red",pch=19)
points(YrHired[Gender=="Female"],Salary[Gender=="Female"],col="blue",pch=19)


model1 = lm(Salary~Gender)

model2 = lm(Salary~Gender+YrHired)
plot(YrHired,Salary,xlab="Year Hired")
abline(coef=model2$coef[c(1,3)],col="red",lwd=2)
points(YrHired[Gender=="Female"],Salary[Gender=="Female"],col="red",pch=19)
abline(coef=c(model2$coef[1]+model2$coef[2],model2$coef[3]),col="blue",lwd=2)
points(YrHired[Gender=="Male"],Salary[Gender=="Male"],col="blue",pch=19)


model3 = lm(Salary~Gender*YrHired)
plot(YrHired,Salary,xlab="Year Hired")
abline(coef=model3$coef[c(1,3)],col="red",lwd=2)
points(YrHired[Gender=="Female"],Salary[Gender=="Female"],col="red",pch=19)
abline(coef=c(model3$coef[1]+model3$coef[2],model3$coef[3]+model3$coef[4]),col="blue",lwd=2)
points(YrHired[Gender=="Male"],Salary[Gender=="Male"],col="blue",pch=19)


Exp = 95-YrHired

model4 = lm(Salary~Gender*Exp)


## Another Housing Data
#######################
rm(list=ls())

MidCity = read.csv("MidCity.csv",header=T)
attach(MidCity)

n = dim(MidCity)[1]

dn1 = rep(0,n)
dn1[Nbhd==1]=1

dn2 = rep(0,n)
dn2[Nbhd==2]=1

dn3 = rep(0,n)
dn3[Nbhd==3]=1

BR = rep(0,n)
BR[Brick=="Yes"]=1


Price = Price/1000
SqFt = SqFt/1000


MidCityModel = lm(Price~dn1+dn2+SqFt)

plot(SqFt,Price,xlab="Size")
abline(coef=MidCityModel$coef[c(1,4)],col=2,lwd=2)
points(SqFt[dn3==1],Price[dn3==1],col=2,pch=19)
abline(coef=c(MidCityModel$coef[1]+MidCityModel$coef[2],MidCityModel$coef[4]),col=4,lwd=2)
points(SqFt[dn1==1],Price[dn1==1],col=4,pch=19)
abline(coef=c(MidCityModel$coef[1]+MidCityModel$coef[3],MidCityModel$coef[4]),col=3,lwd=2)
points(SqFt[dn2==1],Price[dn2==1],col=3,pch=19)
legend(1.45,210,c("Nbhd = 1","Nbhd = 2","Nbhd = 3","Just Size"),lty=c(1,1,1,2),lwd=c(2,2,2,2),col=c(4,3,2,1))
abline(lsfit(SqFt,Price),col=1,lwd=2,lty=2)


Nbhd = factor(Nbhd)

model2 = lm(Price~Nbhd + SqFt + Nbhd:Brick)


############################################
# Telemarketing Example
############################################


Tele = read.table("Tele.txt",header=T)
attach(Tele)

Model1 = lm(calls~months)
summary(Model1)

plot(months,Model1$res,pch=19,cex=1.5,xlab="months",ylab="Std. Res")


months2 = months^2
Model2 = lm(calls~months+months2)
summary(Model2)


plot(months,Model2$res,pch=19,cex=1.5,xlab="months",ylab="Std. Res")

plot(months,calls,pch=19,cex=1.5,col=4)
lines(months,Model2$fitted,lwd=2,col=3)
lines(months,Model1$fitted,lwd=2,col=2)

############################################
# OJ Example
############################################
rm(list=ls())

OJ = read.table("OJ.txt",header=T)
attach(OJ)


model = lm(Sales~Price)

par(mfrow=c(1,2))
plot(Price,Sales,pch=19,col=4,main="Fitted Model")
abline(lsfit(Price,Sales),lwd=2,col=2)

plot(Price,model$res,pch=19,col=2,ylab="residuals",main="Residual Plot")

Logmodel = lm(log(Sales)~log(Price))


par(mfrow=c(1,2))
plot(Price,Sales,pch=19,col=4,main="Plug-in Prediction")
lines(sort(Price),exp(Logmodel$fitted[order(Price)]),pch=19,col=2,lwd=2)
lines(sort(Price),exp(Logmodel$fitted[order(Price)] - 2*0.38),pch=19,col=2,lwd=2,lty=2)
lines(sort(Price),exp(Logmodel$fitted[order(Price)] + 2*0.38),pch=19,col=2,lwd=2,lty=2)

plot(log(Price),log(Sales),pch=19,col=4,main="Plug-in Prediction")
abline(lsfit(log(Price),log(Sales)),lwd=2,col=2)
abline(c(Logmodel$coef[1]-2*0.38,Logmodel$coef[2]),lwd=2,col=2,lty=2)
abline(c(Logmodel$coef[1]+2*0.38,Logmodel$coef[2]),lwd=2,col=2,lty=2)


par(mfrow=c(1,1))
# residual plot...
plot(Price,Logmodel$res,pch=19,col=2,ylab="residuals",main="Residual Plot")


###########################################################
# Airline Example                                         #
###########################################################
rm(list=ls())

Data = read.table("Airline.txt",header=T)
attach(Data)

plot(Passengers,type="l",col=4,lwd=2,xlab="Time",ylab="Passengers")

X = matrix(0,144,11)
for(i in 1:11){
  for(j in 0:11){
    
    X[i+j*12,i]=1
  }
}
X = data.frame(X)
attach(X)

T = 1:144

model1 = lm(Passengers~T)
sig1 = summary(model1)$sigma

plot(Passengers,type="l",col=4,lwd=2,xlab="Time",ylab="Passengers")
abline(model1$coef[1],model1$coef[2],lwd=2,col=2)
legend(20,500,"Fitted Values",col=2,lty=1,lwd=2)


plot(model1$res/sig1,type="b",pch=19,col=4,cex=1.5,ylab="std residuals",xlab="Time",cex.lab=1.4)
abline(h=-2,lty=2,lwd=2,col=2)
abline(h=2,lty=2,lwd=2,col=2)


model2 = lm(log(Passengers)~T)
summary(model2)
sig2 = summary(model2)$sigma


plot(model2$res/sig2,type="b",pch=19,col=4,cex=1.5,ylab="std residuals",xlab="Time",cex.lab=1.4)
abline(h=-2,lty=2,lwd=2,col=2)
abline(h=2,lty=2,lwd=2,col=2)

plot(log(Passengers),type="l",col=4,lwd=2,xlab="Time",ylab="log(Passengers)")
abline(model2$coef[1],model2$coef[2],lwd=2,col=2)
legend(20,6,"Fitted Values",col=2,lty=1,lwd=2)


model3 = lm(log(Passengers)~T+X1+X2+X3+X4+X5+X6+X7+X8+X9+X10+X11)
sig3 = summary(model3)$sigma

plot(model3$res/sig3,type="b",pch=19,col=4,cex=1.5,ylab="std residuals",xlab="Time",cex.lab=1.4)
abline(h=-2,lty=2,lwd=2,col=2)
abline(h=2,lty=2,lwd=2,col=2)

plot(model3$res[1:143],model3$res[2:144],col=4,cex.lab=1.5,ylab="e(t)",xlab="e(t-1)",pch=19,lwd=1.5,main=paste("corr(e(t),e(t-1))=",round(cor(model3$res[1:143],model3$res[2:144]),3)))

plot(log(Passengers),type="l",col=4,lwd=2,xlab="Time",ylab="log(Passengers)")
lines(T,model3$fitted,col=2,lty=2,lwd=2)
legend(20,6,"Fitted Values",col=2,lty=2,lwd=2)


LogP = log(Passengers[2:144])
LogPlag = log(Passengers[1:143])

X = data.frame(X[2:144,])
attach(X)

T = 1:143

model4 = lm(LogP~T+X1+X2+X3+X4+X5+X6+X7+X8+X9+X10+X11 + LogPlag)

sig4 = summary(model4)$sigma

plot(model4$res/sig4,type="b",pch=19,col=4,cex=1.5,ylab="std residuals",xlab="Time",cex.lab=1.4)
abline(h=-2,lty=2,lwd=2,col=2)
abline(h=2,lty=2,lwd=2,col=2)

plot(model4$res[1:142],model4$res[2:143],col=4,cex.lab=1.5,ylab="e(t)",xlab="e(t-1)",pch=19,lwd=1.5,main=paste("corr(e(t),e(t-1))=",round(cor(model4$res[1:142],model4$res[2:143]),3)))

plot(LogP,type="l",col=4,lwd=2,xlab="Time",ylab="log(Passengers)")
lines(T,model4$fitted,col=2,lty=2,lwd=2)
legend(20,6,"Fitted Values",col=2,lty=2,lwd=2)


#######################################################
#  California Housing Example (Model Selection)
#######################################################
rm(list=ls())

library(glmnet)


ca <- read.csv("CAhousing.csv")
logMedVal <- log(ca$medianHouseValue)

ca <- ca[,-c(4,5,9)] # lose lmedval and the room totals
n = dim(ca)[1]
tr = sample(1:n,5000)


## create a full matrix of interactions (only necessary for linear model)
## do the normalization only for main variables.
XXca <- model.matrix(~.*longitude*latitude, data=data.frame(scale(ca)))[,-1]
CAdata = data.frame(logMedVal,XXca)


null = lm(logMedVal~1, data=CAdata[tr,])
full = glm(logMedVal~., data=CAdata[tr,])

regForward = step(null, scope=formula(full), direction="forward", k=log(length(tr)))
regBack = step(full, direction="backward", k=log(length(tr)))
regForward = step(null, scope=formula(full), direction="both", k=log(length(tr)))


XXca = scale(XXca)


Lasso.Fit = glmnet(XXca[tr,],logMedVal[tr])
Ridge.Fit = glmnet(XXca[tr,],logMedVal[tr],alpha=0)

par(mfrow=c(1,2))
plot(Lasso.Fit)
plot(Ridge.Fit)

CV.L = cv.glmnet(XXca[tr,], logMedVal[tr],alpha=1)
CV.R = cv.glmnet(XXca[tr,], logMedVal[tr],alpha=0)

LamR = CV.R$lambda.1se
LamL = CV.L$lambda.1se


par(mfrow=c(1,2))
plot(log(CV.R$lambda),sqrt(CV.R$cvm),main="Ridge CV (k=10)",xlab="log(lambda)",ylab = "RMSE",col=4,type="b",cex.lab=1.2)
abline(v=log(LamR),lty=2,col=2,lwd=2)
plot(log(CV.L$lambda),sqrt(CV.L$cvm),main="LASSO CV (k=10)",xlab="log(lambda)",ylab = "RMSE",col=4,type="b",cex.lab=1.2)
abline(v=log(LamL),lty=2,col=2,lwd=2)


coef.R = predict(CV.R,type="coefficients",s=LamR)
coef.L = predict(CV.L,type="coefficients",s=LamL)

par(mfrow=c(1,1))
plot(abs(coef.R[2:20]),abs(coef.L[2:20]),ylim=c(0,1),xlim=c(0,1))
abline(0,1)


### *** European Protein Consumption, in grams/person-day *** ###
##################################################################
rm(list=ls())

food = read.csv("protein.csv", row.names=1) # 1st column is country name
xfood = scale(food) 


### PCA ###
pcfood <- prcomp(xfood, scale=TRUE)
zfood <- predict(pcfood) 


par(mfrow=c(1,2))
plot(xfood[,1:2], type="n")
text(x=xfood[,1], y=xfood[,2], labels=rownames(food))
abline(h=mean(xfood[,1]),lty=2,col=2,lwd=1.5)
abline(v=mean(xfood[,2]),lty=2,col=2,lwd=1.5)
plot(zfood[,1:2], type="n", xlim=c(-4,5))
text(x=zfood[,1], y=zfood[,2], labels=rownames(food))
abline(h=mean(zfood[,1]),lty=2,col=2,lwd=1.5)
abline(v=mean(zfood[,2]),lty=2,col=2,lwd=1.5)


## how do the PCs look?
par(mfrow=c(1,1))
plot(zfood[,1:2], type="n", xlim=c(-4,5))
text(x=zfood[,1], y=zfood[,2], labels=rownames(food))
abline(h=mean(zfood[,1]),lty=2,col=2,lwd=1.5)
abline(v=mean(zfood[,2]),lty=2,col=2,lwd=1.5)


par(mfrow=c(1,1))
plot(pcfood, main="Food Principal Components Variance",xlab="PC")


#######################################################
##### *** rollcall voting *** #####
#######################################################
rm(list=ls())


votes <- read.csv("rollcall-votes.csv")
legis <- read.csv("rollcall-members.csv")

pcavote <- prcomp(votes[,2:1648], scale=TRUE)
plot(pcavote, main="Rollcall-Vote Principle Components",xlab="PC")

names(summary(pcavote))
summary(pcavote)$importance[,1:10]


## the 1st is traditional left-right
loadings <- pcavote$rotation[,1:2]

hist(loadings[,1], main="", xlab="1st Principle Component Vote-Loadings",
     col=8, border=grey(.9))
abline(v=loadings[884,1], col=2)
text(x=loadings[884,1], y=550, "Afford. Health (amdt.)", xpd=TRUE, col=2, font=3)
abline(v=loadings[25,1], col=4)
text(x=loadings[25,1], y=550, "TARP", xpd=TRUE, col=4, font=3)


votepc <- predict(pcavote) 


## What congress person is more extreme?
scores = votepc[,1]
idx = sort(scores, decreasing=TRUE,index.return=T)
scores = scores[idx$ix]


hist(scores, main="", xlab="1st Principle Component Vote-Scores",col=8, border=grey(.9))
abline(v=scores[13], col=4)
text(x=scores[13], y=190,"Royball (CA-34)" , xpd=TRUE, col=4, font=3)
abline(v=scores[256], col=4)
text(x=scores[256], y=190, "Deutch (FL-19)", xpd=TRUE, col=4, font=3)
abline(v=scores[277], col=2)
text(x=scores[277], y=190, "Rosleht (FL-18)", xpd=TRUE, col=2, font=3)
abline(v=scores[435], col=2)
text(x=scores[435], y=190, "Conaway (TX-11)", xpd=TRUE, col=2, font=3)


par(mfrow=c(1,2))
plot(votepc[,1],legis$share, pch=21, bg=(4:2)[legis$party], main="",xlab="PC1",ylab="% Obama")
abline(lsfit(votepc[,1],legis$share),lwd=2)
abline(h=50,col=3,lwd=2,lty=2)
plot(votepc[,1:2], pch=21, bg=(4:2)[legis$party], main="")
abline(v=mean(votepc[,1]),lty=2,col=2,lwd=1.5)


par(mfrow=c(1,2))
plot(votepc[,1],legis$share, pch=21, bg=(4:2)[legis$party], main="",xlab="PC1",ylab="% Obama")
abline(lsfit(votepc[,1],legis$share),lwd=2)
abline(h=50,col=3,lwd=2,lty=2)

plot(votepc[,1],legis$share, pch=21, bg=(4:2)[legis$party], main="",xlab="PC1",ylab="% Obama")
abline(h=50,col=3,lwd=2,lty=2)
points(votepc[legis$flip==1,1],legis$share[legis$flip==1],pch=19,col="magenta")
legend(-38,90,"losers in 2010",pch=19,col="magenta")


summary(lm(legis$share~votepc[,1]))
summary(lm(legis$share~votepc[,1:2]))

#######################################################
### PLS on Roll Call data
#######################################################
library(pls)

votes1 = as.matrix(votes[,2:1648])

plsParty = plsr(legis$share~votes1,scale=T,ncomp=2)
summary(plsParty)


plsReg = lm(legis$share~plsParty$scores)
pcrReg = lm(legis$share~votepc[,1:2])


par(mfrow=c(1,2))
plot(votepc[,1],legis$share,pch=21, bg=(4:2)[legis$party],xlab="PC1",ylab="% Obama")
abline(lsfit(votepc[,1],legis$share),lwd=2)
plot(plsParty$scores[,1],legis$share,pch=21, bg=(4:2)[legis$party],xlab="Comp1 (PLS)",ylab="% Obama")
abline(lsfit(plsParty$scores[,1],legis$share),lwd=2)


par(mfrow=c(1,2))
plot(pcrReg$fitted,legis$share,pch=21, bg=(4:2)[legis$party],xlab="Fitted",ylab="% Obama",main=paste("PCR","(","R^2=",round(summary(pcrReg)$r.s,3),")"))
abline(0,1)
plot(plsReg$fitted,legis$share,pch=21, bg=(4:2)[legis$party],xlab="Fitted",ylab="% Obama",main=paste("PLS","(","R^2=",round(summary(plsReg)$r.s,3),")"))
abline(0,1)


par(mfrow=c(1,2))
plot(votepc[,1:2], pch=21, bg=(4:2)[legis$party], main="PCR")
abline(v=mean(votepc[,1]),lty=2,col=2,lwd=1.5)
abline(h=mean(votepc[,2]),lty=2,col=2,lwd=1.5)

plot(plsParty$scores, pch=21, bg=(4:2)[legis$party], main="PLS")
abline(v=mean(plsParty$scores[,1]),lty=2,col=2,lwd=1.5)
abline(h=mean(plsParty$scores[,2]),lty=2,col=2,lwd=1.5)