###################################################
### Demo for Lecture 9:
### Model search via Cp, AIC, AICc, BIC for SA.dat
###################################################
library(leaps)

###################################################
### Read in data and enumerate all subset models to investigate (via "regsubsets").
### Total number of subset models: 2^11 = 2048 (will only consider 1464).
###################################################
SA<-data.frame(read.table('http://www.math.ttu.edu/~atrindad/stat5371/SA.dat',sep='\t',header=T))
#SA<-data.frame(read.table('/home/atrindad/Teach/stat5371/Data/SA.dat',sep='\t',header=T))  ## author's local copy
print(dim(SA))  ## dimensions
#SA<-SA[sample(seq(1,dim(SA)[1]),150),]  ## if a random subset of the data frame is wanted...
yy<-SA[,12]   ## response (column 12)
xx<-SA[,-12]  ## predictors
# get the "best" subset models...
rleaps<-regsubsets(xx,yy,int=T,nbest=250,nvmax=250,really.big=T,method=c("ex"))
cleaps<-summary(rleaps,matrix=T)
Models<-cleaps$which  ## TRUE/FALSE matrix: which vars are in each model?
## add the empty model (just an intercept) to the model matrix
Models<-rbind(c(T,rep(F,dim(xx)[2])),Models)

###################################################
### Calculate Cp, AIC, AICc, BIC for each model (m) & plot vs. p(m)
###################################################
nn = length(yy)                 ## sample size (n)
tt = apply(cleaps$which,1,sum)  ## model size (p), intercept included
BIC  <- nn*log(cleaps$rss/nn) + tt*log(nn)
AIC  <- nn*log(cleaps$rss/nn) + tt*2
AICc <- AIC + tt*2*(1+tt)/(nn-tt-1)
par(mfrow=c(2,2))
plot(tt,cleaps$cp,main="Cp", xlab="model size (p)")
plot(tt,AIC, main="AIC",  xlab="model size (p)")
plot(tt,AICc,main="AICc", xlab="model size (p)")
plot(tt,BIC, main="BIC",  xlab="model size (p)")
#par(mfrow=c(1,1))

###################################################
### Select only the best model of each subset size & plot AIC-min(AIC), etc.
###################################################
rleaps<-regsubsets(xx,yy,int=T,nbest=1,nvmax=dim(xx)[2]+1,really.big=T,method=c("ex"))  ## nbest=1: the best model of each size
cleaps<-summary(rleaps,matrix=T)  # just the best model of every size
nn = length(yy)                 ## sample size (n)
tt = apply(cleaps$which,1,sum)  ## model size (p)
BIC  <- nn*log(cleaps$rss/nn) + tt*log(nn)
AIC  <- nn*log(cleaps$rss/nn) + tt*2
AICc <- AIC + tt*2*(1+tt)/(nn-tt-1)
par(mfrow=c(1,1))
plot(tt,cleaps$cp-min(cleaps$cp),type="l",main="Criterion - min(Criterion)", xlab="model size (p)")
lines(tt,AIC-min(AIC),col="blue")
lines(tt,AICc-min(AICc),col="green")
lines(tt,BIC-min(BIC),col="red")
legend("topleft", col=c(1,4,3,2), lwd=c(2,2,2,2), lty=c(1,1,1,1), legend=c("Cp","AIC","AICc","BIC"))
abline(h=0,lty=2)

###################################################
### Listing of the best models
###################################################
cpmod  <-cleaps$which[cleaps$cp==min(cleaps$cp),]
aicmod <-cleaps$which[AIC==min(AIC),]
aiccmod<-cleaps$which[AICc==min(AICc),]
bicmod <-cleaps$which[BIC==min(BIC),]
print(cpmod)
print(aicmod)
print(aiccmod)
print(bicmod)
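###################################################
### Added sanity check (not in the original demo): refit the
### BIC-winning model via lm() and compare with R's built-in BIC().
### stats::BIC() includes the Gaussian constant nn*(log(2*pi)+1) and
### counts sigma^2 as one extra parameter, so it exceeds the
### hand-computed value by the same amount for every model;
### the rankings, and hence the selected model, are unchanged.
###################################################
bicfit<-lm(yy~., data=xx[,bicmod[-1]==T,drop=F])  ## predictors flagged TRUE by BIC
print(summary(bicfit))
## BIC(bicfit) calls the function stats::BIC (function lookup skips our
## numeric vector named BIC); adding the constant to min(BIC) reproduces it:
print(c(BIC(bicfit), min(BIC)+nn*(log(2*pi)+1)+log(nn)))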
###################################################
### Repeat model selection on several random subsets of the data.
### Try to discover which variables are most frequently selected.
### Start by choosing a splitting fraction for the data and the
### number of random splits (K) to perform.
### Use frac of the data for training, 1-frac for testing
### (can try other splits like frac=2/3...).
### Omit AICc (practically the same as AIC...).
###################################################
frac<-.5   # fraction of data used for training
K<-1000    # number of random splits
# storage for model sizes, prediction errors, and variable-selection counts
modsizecp <-rep(0,K)
modsizeaic<-rep(0,K)
modsizebic<-rep(0,K)
PEcpK <-rep(0,K)
PEaicK<-rep(0,K)
PEbicK<-rep(0,K)
modselcp <-rep(0,dim(SA)[2]-1)
modselaic<-rep(0,dim(SA)[2]-1)
modselbic<-rep(0,dim(SA)[2]-1)

###################################################
### The main program.
### Loop cycles over splitting the data, model selection & prediction
### on test data several times. The PE vectors store the prediction errors
### associated with the selected models using each of the criteria (Cp, AIC, BIC).
### "modsize": the selected models' sizes.
### "modsel": counts of the number of times each variable is selected.
###################################################
for (kk in (1:K)) {
  # create training and test data
  ii<-sample(seq(1,dim(SA)[1]),round(dim(SA)[1]*frac))
  yy <-SA[ii,12]
  xx <-as.matrix(SA[ii,-12])
  yyt<-SA[-ii,12]
  xxt<-as.matrix(SA[-ii,-12])
  # model selection criteria computed for the top model of each size
  rleaps<-regsubsets(xx,yy,int=T,nbest=1,nvmax=dim(SA)[2],really.big=T,method=c("ex"))
  cleaps<-summary(rleaps,matrix=T)
  tt<-apply(cleaps$which,1,sum)
  BIC<-length(yy)*log(cleaps$rss/length(yy))+tt*log(length(yy))
  AIC<-length(yy)*log(cleaps$rss/length(yy))+tt*2
  # winning models
  cpmod <-cleaps$which[cleaps$cp==min(cleaps$cp),]
  aicmod<-cleaps$which[AIC==min(AIC),]
  bicmod<-cleaps$which[BIC==min(BIC),]
  # winning model parameters ([-1] drops the intercept indicator)
  mmcp <-lm(yy~xx[,cpmod[-1]==T])
  mmaic<-lm(yy~xx[,aicmod[-1]==T])
  mmbic<-lm(yy~xx[,bicmod[-1]==T])
  # prediction errors on test data
  PEcpK[kk] <-sum((yyt-cbind(rep(1,dim(xxt)[1]),xxt[,cpmod[-1]==T])%*%mmcp$coef)^2)/length(yyt)
  PEaicK[kk]<-sum((yyt-cbind(rep(1,dim(xxt)[1]),xxt[,aicmod[-1]==T])%*%mmaic$coef)^2)/length(yyt)
  PEbicK[kk]<-sum((yyt-cbind(rep(1,dim(xxt)[1]),xxt[,bicmod[-1]==T])%*%mmbic$coef)^2)/length(yyt)
  # model sizes of winning models
  modsizecp[kk] <-sum(cpmod)
  modsizeaic[kk]<-sum(aicmod)
  modsizebic[kk]<-sum(bicmod)
  # keep track of which variables were chosen
  modselcp [cpmod[-1]==T] <-modselcp [cpmod[-1]==T]+1
  modselaic[aicmod[-1]==T]<-modselaic[aicmod[-1]==T]+1
  modselbic[bicmod[-1]==T]<-modselbic[bicmod[-1]==T]+1
}  # end main program

# print results
modseltabs<-cbind(names(SA)[-12],modselcp,modselaic,modselbic)
print(modseltabs)

###################################################
### Compare average model size and average prediction error over the random splits.
###################################################
print("mean PE for Cp, AIC and BIC:")
print(c("PEcp=",round(mean(PEcpK),4)," PEaic=",round(mean(PEaicK),4)," PEbic=",round(mean(PEbicK),4)))
print("mean model size for Cp, AIC and BIC:")
print(c("sizecp=",round(mean(modsizecp),4)," sizeaic=",round(mean(modsizeaic),4)," sizebic=",round(mean(modsizebic),4)))
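###################################################
### Added sketch (not in the original demo): since all three criteria
### are evaluated on the same test set within each split, the per-split
### PE differences are paired. A quick paired t-test of BIC vs. AIC
### complements the averages above.
###################################################
print(t.test(PEbicK, PEaicK, paired=TRUE))  ## mean within-split PE difference
print(summary(PEbicK - PEaicK))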
###################################################
### Boxplots of model sizes and prediction errors over the random splits.
### (To see if results vary a lot between random splits...)
###################################################
par(mfrow=c(1,2))
boxplot(as.data.frame(cbind(modsizecp,modsizeaic,modsizebic)))
boxplot(as.data.frame(cbind(PEcpK,PEaicK,PEbicK)))

###################################################
### Since each random split has its own characteristics, we should
### perhaps compare prediction errors (PEs) within each test set...
###################################################
# pairwise comparisons
par(mfrow=c(1,2))
boxplot(as.data.frame(cbind(modsizeaic-modsizecp,modsizebic-modsizecp,modsizebic-modsizeaic)),
        names=c("AIC-Cp","BIC-Cp","BIC-AIC"),main="Model sizes")
abline(h=0,lty=2)
boxplot(as.data.frame(cbind(PEaicK-PEcpK,PEbicK-PEcpK,PEbicK-PEaicK)),
        names=c("AIC-Cp","BIC-Cp","BIC-AIC"),main="PE")
abline(h=0,lty=2)

###################################################
### Scatterplots of PEs on the same test set: Cp vs. AIC, Cp vs. BIC.
###################################################
par(mfrow=c(1,1))
plot(PEcpK,PEaicK,xlab="PE with Cp", ylab="PE with AIC/BIC",pch=1,col=4)
points(PEcpK,PEbicK,pch=2,col=2)
abline(0,1)
legend("topleft", col=c(4,2), pch=c(1,2), legend=c("Cp vs. AIC","Cp vs. BIC"))
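
###################################################
### Added sketch (not in the original demo): tally how often each
### criterion delivers the smallest test-set PE across the K splits.
###################################################
winner<-apply(cbind(PEcpK,PEaicK,PEbicK),1,which.min)  ## 1=Cp, 2=AIC, 3=BIC
print(table(factor(winner,levels=1:3,labels=c("Cp","AIC","BIC")))/K)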