###################################################
### Demo R code for Lecture 14: CART analysis of SA heart data
###################################################
# Note: We use package "tree"; it is easier to demo, but package
# "rpart" is more commonly used (it has better validation methods).
###################################################

library(tree)

SA <- read.table('http://www.math.ttu.edu/~atrindad/stat5371/SA.dat', sep='\t', header=TRUE)
# Alternatively, read a local copy of the data:
# SA <- read.table('/home/atrindad/Teach/stat5371/Data/SA.dat', sep='\t', header=TRUE)

###################################################
### First try to predict chd (heart disease yes/no).
### This code produces a full tree.
### (Must specify the full model with all required transforms already in place.)
###################################################
tree1 <- tree(as.factor(chd) ~ log(age) + sbp + adiposity + obesity + typea +
                alcohol + as.factor(alcind) + tobacco + as.factor(tobind) +
                as.factor(famhist) + log(ldl),
              data=SA, minsize=5)
par(mfrow=c(1,1))
plot(tree1)
text(tree1)

###################################################
### Select the "best" tree via 10-fold CV.
### Pick the tree with the smallest deviance in the "deviance vs. size" plot.
### (In this case the best tree has size=1, i.e. the root node with no
###  splits, so the pruning step below forces a size of at least 2.)
###################################################
cvtree1 <- cv.tree(tree1, K=10)
plot(cvtree1)
print(cvtree1)

###################################################
### Prune the best tree identified above.
###################################################
mim <- min(cvtree1$dev)                                 # smallest CV deviance
sizesel <- max(2, max(cvtree1$size[cvtree1$dev==mim]))  # force at least one split
ptree1 <- prune.tree(tree1, best=sizesel)
plot(ptree1)
text(ptree1)

###################################################
### In-sample misclassification error rate for the pruned tree.
### The 34% error rate is not so good; it is similar to the logistic model.
###################################################
predtree <- predict(ptree1, newdata=SA, type="class")
print(table(SA$chd, predtree))

###################################################
### Perform B=100 random splits of the data to check model stability,
### variable importance, and prediction performance.
### Use test data of size 50 and training data of size 312-50.
###################################################
B <- 100                        # number of random splits
NbrLeaves  <- matrix(0, B, 1)   # number of leaves in each pruned tree
UsedMatrix <- matrix(0, B, 11)  # which variables each pruned tree used
UsedFirst  <- matrix(0, B, 11)  # which variable each pruned tree split on first
MSEMatrix  <- matrix(0, B, 2)   # test error: pruned (col 1), un-pruned (col 2)
for (bb in (1:B)) {
  iie <- sample(seq(1, dim(SA)[1]), 50)  # indices of the random test set
  trdata <- SA[-iie,]
  tedata <- SA[iie,]
  tree1 <- tree(as.factor(chd) ~ log(age) + sbp + adiposity + obesity + typea +
                  alcohol + as.factor(alcind) + tobacco + as.factor(tobind) +
                  as.factor(famhist) + log(ldl),
                data=trdata, minsize=2)
  cvtree1 <- cv.tree(tree1, K=10)
  mim <- min(cvtree1$dev)
  sizesel <- max(2, max(cvtree1$size[cvtree1$dev==mim]))
  ptree1 <- prune.tree(tree1, best=sizesel)
  NbrLeaves[bb] <- length(unique(ptree1$where))
  # pruned tree predictions
  predtree <- predict(ptree1, newdata=tedata, type="class")
  rlabels <- tedata$chd
  MSEMatrix[bb,1] <- sum(rlabels != predtree) / length(predtree)
  # un-pruned tree predictions
  predtree <- predict(tree1, newdata=tedata, type="class")
  MSEMatrix[bb,2] <- sum(rlabels != predtree) / length(predtree)
  # summary(ptree1)$used is a factor whose level 1 is "<leaf>", hence the "-1"
  used <- as.double(summary(ptree1)$used) - 1
  UsedMatrix[bb, used]   <- UsedMatrix[bb, used] + 1
  UsedFirst[bb, used[1]] <- UsedFirst[bb, used[1]] + 1
}

###################################################
### Mean out-of-sample misclassification error over the 100 (pruned) trees.
###################################################
print(mean(MSEMatrix[,1]))
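###################################################
### Optional check (a minimal sketch): the loop above also stored the
### un-pruned test errors in column 2 of MSEMatrix, so we can compare
### pruned vs. un-pruned performance directly.
###################################################
print(colMeans(MSEMatrix))   # col 1: pruned trees, col 2: un-pruned trees
boxplot(MSEMatrix, names=c("pruned","un-pruned"),
        main="test misclassification error over the B splits")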
###################################################
### Histograms & 5-number summaries of tree sizes & number of leaves.
### 5-number summary: min, 25%, 50%, 75%, max.
###################################################
sizetrees <- apply(UsedMatrix, 1, sum)  # number of distinct variables per tree
print(fivenum(sizetrees))
print(fivenum(NbrLeaves))
par(mfrow=c(1,2))
hist(sizetrees, main="size of pruned trees")
hist(NbrLeaves, main="number of leaves")
par(mfrow=c(1,1))

###################################################
### Frequencies of variables being picked for the trees.
### UsedMatrix: variables used in the tree.
### UsedFirst:  variables picked 1st for the tree.
###################################################
modtab <- apply(UsedMatrix, 2, sum) / B
print(cbind(names(SA)[-10], modtab))    # drop chd (column 10), the response
modfirst <- apply(UsedFirst, 2, sum) / B
print(cbind(names(SA)[-10], modfirst))

###################################################
### Now try to predict ldl (cholesterol level).
### This code produces a full tree.
### (Must specify the full model with all required transforms already in place.)
###################################################
tree1 <- tree(ldl ~ log(age) + sbp + adiposity + obesity + typea + alcohol +
                alcind + tobacco + tobind + as.factor(chd) + as.factor(famhist),
              data=SA, minsize=5)
par(mfrow=c(1,1))
plot(tree1)
text(tree1)

###################################################
### Select the "best" tree via 10-fold CV.
###################################################
cvtree1 <- cv.tree(tree1, K=10)
plot(cvtree1)
print(cvtree1)

###################################################
### Prune the best tree identified above.
###################################################
mim <- min(cvtree1$dev)
sizesel <- max(2, max(cvtree1$size[cvtree1$dev==mim]))
ptree1 <- prune.tree(tree1, best=sizesel)
plot(ptree1)
text(ptree1)

###################################################
### In-sample MSE (the analogue of the misclassification error rate).
###################################################
predtree <- predict(ptree1, newdata=SA)
print(sum((SA$ldl - predtree)^2) / length(predtree))
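###################################################
### For comparison, a minimal sketch of the same regression tree fit
### with package "rpart" (mentioned above as the more common choice).
### rpart() runs its own internal cross-validation; plotcp() shows the
### CV error against the complexity parameter cp, and prune() cuts the
### tree back at a chosen cp value.
###################################################
library(rpart)
rp1 <- rpart(ldl ~ log(age) + sbp + adiposity + obesity + typea + alcohol +
               alcind + tobacco + tobind + as.factor(chd) + as.factor(famhist),
             data=SA, method="anova")
plotcp(rp1)                  # CV error vs. complexity parameter cp
# pick the cp with the smallest cross-validated error
cpsel <- rp1$cptable[which.min(rp1$cptable[,"xerror"]), "CP"]
prp1 <- prune(rp1, cp=cpsel)
# (if the selected cp prunes back to the root, plot() will complain)
plot(prp1); text(prp1)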
###################################################
### Perform B=100 random splits of the data (as before).
###################################################
B <- 100                        # number of random splits
NbrLeaves  <- matrix(0, B, 1)
UsedMatrix <- matrix(0, B, 11)
UsedFirst  <- matrix(0, B, 11)
MSEMatrix  <- matrix(0, B, 2)
for (bb in (1:B)) {
  iie <- sample(seq(1, dim(SA)[1]), 50)  # indices of the random test set
  trdata <- SA[-iie,]
  tedata <- SA[iie,]
  tree1 <- tree(ldl ~ log(age) + sbp + adiposity + obesity + typea + alcohol +
                  as.factor(alcind) + tobacco + as.factor(tobind) +
                  as.factor(chd) + as.factor(famhist),
                data=trdata, minsize=2)
  cvtree1 <- cv.tree(tree1, K=10)
  mim <- min(cvtree1$dev)
  sizesel <- max(2, max(cvtree1$size[cvtree1$dev==mim]))
  ptree1 <- prune.tree(tree1, best=sizesel)
  NbrLeaves[bb] <- length(unique(ptree1$where))
  # pruned tree predictions
  predtree <- predict(ptree1, newdata=tedata)
  rlabels <- tedata$ldl
  MSEMatrix[bb,1] <- sum((rlabels - predtree)^2) / length(predtree)
  # un-pruned tree predictions
  predtree <- predict(tree1, newdata=tedata)
  MSEMatrix[bb,2] <- sum((rlabels - predtree)^2) / length(predtree)
  # summary(ptree1)$used is a factor whose level 1 is "<leaf>", hence the "-1"
  used <- as.double(summary(ptree1)$used) - 1
  UsedMatrix[bb, used]   <- UsedMatrix[bb, used] + 1
  UsedFirst[bb, used[1]] <- UsedFirst[bb, used[1]] + 1
}

###################################################
### Prediction MSE (pMSE) over the 100 (pruned) trees.
###################################################
print(mean(MSEMatrix[,1]))

###################################################
### Histograms & 5-number summaries of tree sizes & number of leaves.
### 5-number summary: min, 25%, 50%, 75%, max.
###################################################
sizetrees <- apply(UsedMatrix, 1, sum)
print(fivenum(sizetrees))
print(fivenum(NbrLeaves))
par(mfrow=c(1,2))
hist(sizetrees, main="size of pruned trees")
hist(NbrLeaves, main="number of leaves")
par(mfrow=c(1,1))

###################################################
### Frequencies of variables being picked for the trees.
### UsedMatrix: variables used in the tree.
### UsedFirst:  variables picked 1st for the tree.
###################################################
modtab <- apply(UsedMatrix, 2, sum) / B
print(cbind(names(SA)[-12], modtab))    # drop ldl (column 12), the response
modfirst <- apply(UsedFirst, 2, sum) / B
print(cbind(names(SA)[-12], modfirst))
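###################################################
### Optional (a minimal sketch): sort the variable-use frequencies in
### decreasing order so the most frequently used variables are easier
### to spot. Assumes, as in the print above, that names(SA)[-12] is
### aligned with the column order of UsedMatrix.
###################################################
ord <- order(modtab, decreasing=TRUE)
print(cbind(names(SA)[-12][ord], modtab[ord]))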