###################################################
### Demo for Lecture 9:
### Model search via Cp, AIC, AICc, BIC for SA.dat
###################################################
library(leaps)

###################################################
### Read in data and enumerate all subset models to investigate (via "regsubsets").
### Total number of subset models: 2^11 = 2048 (will only consider 1464).
###################################################
SA<-data.frame(read.table('http://www.math.ttu.edu/~atrindad/stat5371/SA.dat',sep='\t',header=T))
#SA<-data.frame(read.table('/home/atrindad/Teach/stat5371/Data/SA.dat',sep='\t',header=T))  ## author's local copy
print(dim(SA))  ## dimensions
#SA<-SA[sample(seq(1,dim(SA)[1]),150),]  ## if a random subset of the data frame is wanted...
yy<-SA[,12]   ## response (column 12)
xx<-SA[,-12]  ## predictors
# get the "best" subset models...
rleaps<-regsubsets(xx,yy,int=T,nbest=250,nvmax=250,really.big=T,method=c("ex"))
cleaps<-summary(rleaps,matrix=T)
Models<-cleaps$which  ## TRUE/FALSE matrix: which vars are in each model?
## add the empty model (just an intercept) to the model matrix
Models<-rbind(c(T,rep(F,dim(xx)[2])),Models)

###################################################
### Calculate Cp, AIC, AICc, BIC for each model (m) & plot vs. p(m)
###################################################
nn = length(yy)                 ## sample size (n)
tt = apply(cleaps$which,1,sum)  ## model size (p), intercept included
BIC  <- nn*log(cleaps$rss/nn) + tt*log(nn)
AIC  <- nn*log(cleaps$rss/nn) + tt*2
AICc <- AIC + tt*2*(1+tt)/(nn-tt-1)
par(mfrow=c(2,2))
plot(tt,cleaps$cp,main="Cp", xlab="model size (p)")
plot(tt,AIC, main="AIC",  xlab="model size (p)")
plot(tt,AICc,main="AICc", xlab="model size (p)")
plot(tt,BIC, main="BIC",  xlab="model size (p)")
#par(mfrow=c(1,1))

###################################################
### Select only the best model of each subset size & plot AIC-min(AIC), etc.
###################################################
rleaps<-regsubsets(xx,yy,int=T,nbest=1,nvmax=dim(xx)[2]+1,really.big=T,method=c("ex"))  ## nbest=1: the best model of each size
cleaps<-summary(rleaps,matrix=T)  # just the best model of every size
nn = length(yy)                 ## sample size (n)
tt = apply(cleaps$which,1,sum)  ## model size (p)
BIC  <- nn*log(cleaps$rss/nn) + tt*log(nn)
AIC  <- nn*log(cleaps$rss/nn) + tt*2
AICc <- AIC + tt*2*(1+tt)/(nn-tt-1)
par(mfrow=c(1,1))
plot(tt,cleaps$cp-min(cleaps$cp),type="l",main="Criterion - min(Criterion)", xlab="model size (p)")
lines(tt,AIC-min(AIC),col="blue")
lines(tt,AICc-min(AICc),col="green")
lines(tt,BIC-min(BIC),col="red")
legend("topleft", col=c(1,4,3,2), lwd=c(2,2,2,2), lty=c(1,1,1,1), legend=c("Cp","AIC","AICc","BIC"))
abline(h=0,lty=2)

###################################################
### Listing of the best models
###################################################
cpmod  <-cleaps$which[cleaps$cp==min(cleaps$cp),]
aicmod <-cleaps$which[AIC==min(AIC),]
aiccmod<-cleaps$which[AICc==min(AICc),]
bicmod <-cleaps$which[BIC==min(BIC),]
print(cpmod)
print(aicmod)
print(aiccmod)
print(bicmod)
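###################################################
### Added sanity check (not in the original demo): refit the
### BIC-winning model via lm() and compare with R's built-in BIC().
### stats::BIC() includes the Gaussian constant nn*(log(2*pi)+1) and
### counts sigma^2 as one extra parameter, so it exceeds the
### hand-computed value by the same amount for every model;
### the rankings, and hence the selected model, are unchanged.
###################################################
bicfit<-lm(yy~., data=xx[,bicmod[-1]==T,drop=F])  ## predictors flagged TRUE by BIC
print(summary(bicfit))
## BIC(bicfit) calls the function stats::BIC (function lookup skips our
## numeric vector named BIC); adding the constant to min(BIC) reproduces it:
print(c(BIC(bicfit), min(BIC)+nn*(log(2*pi)+1)+log(nn)))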
###################################################
### Repeat model selection on several random subsets of the data.
### Try to discover which variables are most frequently selected.
### Start by choosing a splitting fraction for the data and the
### number of random splits (K) to perform.
### Use frac of the data for training, 1-frac for testing
### (can try other splits like frac=2/3...).
### Omit AICc (practically the same as AIC...).
###################################################
frac<-.5   # fraction of data used for training
K<-1000    # number of random splits
# storage for model sizes, prediction errors, and variable-selection counts
modsizecp <-rep(0,K)
modsizeaic<-rep(0,K)
modsizebic<-rep(0,K)
PEcpK <-rep(0,K)
PEaicK<-rep(0,K)
PEbicK<-rep(0,K)
modselcp <-rep(0,dim(SA)[2]-1)
modselaic<-rep(0,dim(SA)[2]-1)
modselbic<-rep(0,dim(SA)[2]-1)

###################################################
### The main program.
### Loop cycles over splitting the data, model selection & prediction
### on test data several times. The PE vectors store the prediction errors
### associated with the selected models using each of the criteria (Cp, AIC, BIC).
### "modsize": the selected models' sizes.
### "modsel": counts of the number of times each variable is selected.
###################################################
for (kk in (1:K)) {
  # create training and test data
  ii<-sample(seq(1,dim(SA)[1]),round(dim(SA)[1]*frac))
  yy <-SA[ii,12]
  xx <-as.matrix(SA[ii,-12])
  yyt<-SA[-ii,12]
  xxt<-as.matrix(SA[-ii,-12])
  # model selection criteria computed for the top model of each size
  rleaps<-regsubsets(xx,yy,int=T,nbest=1,nvmax=dim(SA)[2],really.big=T,method=c("ex"))
  cleaps<-summary(rleaps,matrix=T)
  tt<-apply(cleaps$which,1,sum)
  BIC<-length(yy)*log(cleaps$rss/length(yy))+tt*log(length(yy))
  AIC<-length(yy)*log(cleaps$rss/length(yy))+tt*2
  # winning models
  cpmod <-cleaps$which[cleaps$cp==min(cleaps$cp),]
  aicmod<-cleaps$which[AIC==min(AIC),]
  bicmod<-cleaps$which[BIC==min(BIC),]
  # winning model parameters ([-1] drops the intercept indicator)
  mmcp <-lm(yy~xx[,cpmod[-1]==T])
  mmaic<-lm(yy~xx[,aicmod[-1]==T])
  mmbic<-lm(yy~xx[,bicmod[-1]==T])
  # prediction errors on test data
  PEcpK[kk] <-sum((yyt-cbind(rep(1,dim(xxt)[1]),xxt[,cpmod[-1]==T])%*%mmcp$coef)^2)/length(yyt)
  PEaicK[kk]<-sum((yyt-cbind(rep(1,dim(xxt)[1]),xxt[,aicmod[-1]==T])%*%mmaic$coef)^2)/length(yyt)
  PEbicK[kk]<-sum((yyt-cbind(rep(1,dim(xxt)[1]),xxt[,bicmod[-1]==T])%*%mmbic$coef)^2)/length(yyt)
  # model sizes of winning models
  modsizecp[kk] <-sum(cpmod)
  modsizeaic[kk]<-sum(aicmod)
  modsizebic[kk]<-sum(bicmod)
  # keep track of which variables were chosen
  modselcp [cpmod[-1]==T] <-modselcp [cpmod[-1]==T]+1
  modselaic[aicmod[-1]==T]<-modselaic[aicmod[-1]==T]+1
  modselbic[bicmod[-1]==T]<-modselbic[bicmod[-1]==T]+1
}  # end main program

# print results
modseltabs<-cbind(names(SA)[-12],modselcp,modselaic,modselbic)
print(modseltabs)

###################################################
### Compare average model size and average prediction error over the random splits.
###################################################
print("mean PE for Cp, AIC and BIC:")
print(c("PEcp=",round(mean(PEcpK),4)," PEaic=",round(mean(PEaicK),4)," PEbic=",round(mean(PEbicK),4)))
print("mean model size for Cp, AIC and BIC:")
print(c("sizecp=",round(mean(modsizecp),4)," sizeaic=",round(mean(modsizeaic),4)," sizebic=",round(mean(modsizebic),4)))
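###################################################
### Added sketch (not in the original demo): since all three criteria
### are evaluated on the same test set within each split, the per-split
### PE differences are paired. A quick paired t-test of BIC vs. AIC
### complements the averages above.
###################################################
print(t.test(PEbicK, PEaicK, paired=TRUE))  ## mean within-split PE difference
print(summary(PEbicK - PEaicK))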
###################################################
### Boxplots of model sizes and prediction errors over the random splits.
### (To see if results vary a lot between random splits...)
###################################################
par(mfrow=c(1,2))
boxplot(as.data.frame(cbind(modsizecp,modsizeaic,modsizebic)))
boxplot(as.data.frame(cbind(PEcpK,PEaicK,PEbicK)))

###################################################
### Since each random split has its own characteristics, we should
### perhaps compare prediction errors (PEs) within each test set...
###################################################
# pairwise comparisons
par(mfrow=c(1,2))
boxplot(as.data.frame(cbind(modsizeaic-modsizecp,modsizebic-modsizecp,modsizebic-modsizeaic)),
        names=c("AIC-Cp","BIC-Cp","BIC-AIC"),main="Model sizes")
abline(h=0,lty=2)
boxplot(as.data.frame(cbind(PEaicK-PEcpK,PEbicK-PEcpK,PEbicK-PEaicK)),
        names=c("AIC-Cp","BIC-Cp","BIC-AIC"),main="PE")
abline(h=0,lty=2)

###################################################
### Scatterplots of PEs on the same test set: Cp vs. AIC, Cp vs. BIC.
###################################################
par(mfrow=c(1,1))
plot(PEcpK,PEaicK,xlab="PE with Cp", ylab="PE with AIC/BIC",pch=1,col=4)
points(PEcpK,PEbicK,pch=2,col=2)
abline(0,1)
legend("topleft", col=c(4,2), pch=c(1,2), legend=c("Cp vs. AIC","Cp vs. BIC"))
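
###################################################
### Added sketch (not in the original demo): tally how often each
### criterion delivers the smallest test-set PE across the K splits.
###################################################
winner<-apply(cbind(PEcpK,PEaicK,PEbicK),1,which.min)  ## 1=Cp, 2=AIC, 3=BIC
print(table(factor(winner,levels=1:3,labels=c("Cp","AIC","BIC")))/K)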