###################################################
### Demo R code for Lecture 14: CART analysis of SA heart data
###################################################
# Note: We use package "tree"; it is easier to demo, but package
# "rpart" is more commonly used (it has better validation methods).
###################################################

library(tree)

SA <- read.table('http://www.math.ttu.edu/~atrindad/stat5371/SA.dat', sep='\t', header=TRUE)
# Alternatively, read a local copy of the data:
# SA <- read.table('/home/atrindad/Teach/stat5371/Data/SA.dat', sep='\t', header=TRUE)

###################################################
### First try to predict chd (heart disease yes/no).
### This code produces a full tree.
### (Must specify the full model with all required transforms already in place.)
###################################################
tree1 <- tree(as.factor(chd) ~ log(age) + sbp + adiposity + obesity + typea +
                alcohol + as.factor(alcind) + tobacco + as.factor(tobind) +
                as.factor(famhist) + log(ldl),
              data=SA, minsize=5)
par(mfrow=c(1,1))
plot(tree1)
text(tree1)

###################################################
### Select the "best" tree via 10-fold CV.
### Pick the tree with the smallest deviance in the "deviance vs. size" plot.
### (In this case the best tree has size=1, i.e. the root node with no
###  splits, so the pruning step below forces a size of at least 2.)
###################################################
cvtree1 <- cv.tree(tree1, K=10)
plot(cvtree1)
print(cvtree1)

###################################################
### Prune the best tree identified above.
###################################################
mim <- min(cvtree1$dev)                                 # smallest CV deviance
sizesel <- max(2, max(cvtree1$size[cvtree1$dev==mim]))  # force at least one split
ptree1 <- prune.tree(tree1, best=sizesel)
plot(ptree1)
text(ptree1)

###################################################
### In-sample misclassification error rate for the pruned tree.
### The 34% error rate is not so good; it is similar to the logistic model.
###################################################
predtree <- predict(ptree1, newdata=SA, type="class")
print(table(SA$chd, predtree))

###################################################
### Perform B=100 random splits of the data to check model stability,
### variable importance, and prediction performance.
### Use test data of size 50 and training data of size 312-50.
###################################################
B <- 100                        # number of random splits
NbrLeaves  <- matrix(0, B, 1)   # number of leaves in each pruned tree
UsedMatrix <- matrix(0, B, 11)  # which variables each pruned tree used
UsedFirst  <- matrix(0, B, 11)  # which variable each pruned tree split on first
MSEMatrix  <- matrix(0, B, 2)   # test error: pruned (col 1), un-pruned (col 2)
for (bb in (1:B)) {
  iie <- sample(seq(1, dim(SA)[1]), 50)  # indices of the random test set
  trdata <- SA[-iie,]
  tedata <- SA[iie,]
  tree1 <- tree(as.factor(chd) ~ log(age) + sbp + adiposity + obesity + typea +
                  alcohol + as.factor(alcind) + tobacco + as.factor(tobind) +
                  as.factor(famhist) + log(ldl),
                data=trdata, minsize=2)
  cvtree1 <- cv.tree(tree1, K=10)
  mim <- min(cvtree1$dev)
  sizesel <- max(2, max(cvtree1$size[cvtree1$dev==mim]))
  ptree1 <- prune.tree(tree1, best=sizesel)
  NbrLeaves[bb] <- length(unique(ptree1$where))
  # pruned tree predictions
  predtree <- predict(ptree1, newdata=tedata, type="class")
  rlabels <- tedata$chd
  MSEMatrix[bb,1] <- sum(rlabels != predtree) / length(predtree)
  # un-pruned tree predictions
  predtree <- predict(tree1, newdata=tedata, type="class")
  MSEMatrix[bb,2] <- sum(rlabels != predtree) / length(predtree)
  # summary(ptree1)$used is a factor whose level 1 is "<leaf>", hence the "-1"
  used <- as.double(summary(ptree1)$used) - 1
  UsedMatrix[bb, used]   <- UsedMatrix[bb, used] + 1
  UsedFirst[bb, used[1]] <- UsedFirst[bb, used[1]] + 1
}

###################################################
### Mean out-of-sample misclassification error over the 100 (pruned) trees.
###################################################
print(mean(MSEMatrix[,1]))
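###################################################
### Optional check (a minimal sketch): the loop above also stored the
### un-pruned test errors in column 2 of MSEMatrix, so we can compare
### pruned vs. un-pruned performance directly.
###################################################
print(colMeans(MSEMatrix))   # col 1: pruned trees, col 2: un-pruned trees
boxplot(MSEMatrix, names=c("pruned","un-pruned"),
        main="test misclassification error over the B splits")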
###################################################
### Histograms & 5-number summaries of tree sizes & number of leaves.
### 5-number summary: min, 25%, 50%, 75%, max.
###################################################
sizetrees <- apply(UsedMatrix, 1, sum)  # number of distinct variables per tree
print(fivenum(sizetrees))
print(fivenum(NbrLeaves))
par(mfrow=c(1,2))
hist(sizetrees, main="size of pruned trees")
hist(NbrLeaves, main="number of leaves")
par(mfrow=c(1,1))

###################################################
### Frequencies of variables being picked for the trees.
### UsedMatrix: variables used in the tree.
### UsedFirst:  variables picked 1st for the tree.
###################################################
modtab <- apply(UsedMatrix, 2, sum) / B
print(cbind(names(SA)[-10], modtab))    # drop chd (column 10), the response
modfirst <- apply(UsedFirst, 2, sum) / B
print(cbind(names(SA)[-10], modfirst))

###################################################
### Now try to predict ldl (cholesterol level).
### This code produces a full tree.
### (Must specify the full model with all required transforms already in place.)
###################################################
tree1 <- tree(ldl ~ log(age) + sbp + adiposity + obesity + typea + alcohol +
                alcind + tobacco + tobind + as.factor(chd) + as.factor(famhist),
              data=SA, minsize=5)
par(mfrow=c(1,1))
plot(tree1)
text(tree1)

###################################################
### Select the "best" tree via 10-fold CV.
###################################################
cvtree1 <- cv.tree(tree1, K=10)
plot(cvtree1)
print(cvtree1)

###################################################
### Prune the best tree identified above.
###################################################
mim <- min(cvtree1$dev)
sizesel <- max(2, max(cvtree1$size[cvtree1$dev==mim]))
ptree1 <- prune.tree(tree1, best=sizesel)
plot(ptree1)
text(ptree1)

###################################################
### In-sample MSE (the analogue of the misclassification error rate).
###################################################
predtree <- predict(ptree1, newdata=SA)
print(sum((SA$ldl - predtree)^2) / length(predtree))
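###################################################
### For comparison, a minimal sketch of the same regression tree fit
### with package "rpart" (mentioned above as the more common choice).
### rpart() runs its own internal cross-validation; plotcp() shows the
### CV error against the complexity parameter cp, and prune() cuts the
### tree back at a chosen cp value.
###################################################
library(rpart)
rp1 <- rpart(ldl ~ log(age) + sbp + adiposity + obesity + typea + alcohol +
               alcind + tobacco + tobind + as.factor(chd) + as.factor(famhist),
             data=SA, method="anova")
plotcp(rp1)                  # CV error vs. complexity parameter cp
# pick the cp with the smallest cross-validated error
cpsel <- rp1$cptable[which.min(rp1$cptable[,"xerror"]), "CP"]
prp1 <- prune(rp1, cp=cpsel)
# (if the selected cp prunes back to the root, plot() will complain)
plot(prp1); text(prp1)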
###################################################
### Perform B=100 random splits of the data (as before).
###################################################
B <- 100                        # number of random splits
NbrLeaves  <- matrix(0, B, 1)
UsedMatrix <- matrix(0, B, 11)
UsedFirst  <- matrix(0, B, 11)
MSEMatrix  <- matrix(0, B, 2)
for (bb in (1:B)) {
  iie <- sample(seq(1, dim(SA)[1]), 50)  # indices of the random test set
  trdata <- SA[-iie,]
  tedata <- SA[iie,]
  tree1 <- tree(ldl ~ log(age) + sbp + adiposity + obesity + typea + alcohol +
                  as.factor(alcind) + tobacco + as.factor(tobind) +
                  as.factor(chd) + as.factor(famhist),
                data=trdata, minsize=2)
  cvtree1 <- cv.tree(tree1, K=10)
  mim <- min(cvtree1$dev)
  sizesel <- max(2, max(cvtree1$size[cvtree1$dev==mim]))
  ptree1 <- prune.tree(tree1, best=sizesel)
  NbrLeaves[bb] <- length(unique(ptree1$where))
  # pruned tree predictions
  predtree <- predict(ptree1, newdata=tedata)
  rlabels <- tedata$ldl
  MSEMatrix[bb,1] <- sum((rlabels - predtree)^2) / length(predtree)
  # un-pruned tree predictions
  predtree <- predict(tree1, newdata=tedata)
  MSEMatrix[bb,2] <- sum((rlabels - predtree)^2) / length(predtree)
  # summary(ptree1)$used is a factor whose level 1 is "<leaf>", hence the "-1"
  used <- as.double(summary(ptree1)$used) - 1
  UsedMatrix[bb, used]   <- UsedMatrix[bb, used] + 1
  UsedFirst[bb, used[1]] <- UsedFirst[bb, used[1]] + 1
}

###################################################
### Prediction MSE (pMSE) over the 100 (pruned) trees.
###################################################
print(mean(MSEMatrix[,1]))

###################################################
### Histograms & 5-number summaries of tree sizes & number of leaves.
### 5-number summary: min, 25%, 50%, 75%, max.
###################################################
sizetrees <- apply(UsedMatrix, 1, sum)
print(fivenum(sizetrees))
print(fivenum(NbrLeaves))
par(mfrow=c(1,2))
hist(sizetrees, main="size of pruned trees")
hist(NbrLeaves, main="number of leaves")
par(mfrow=c(1,1))

###################################################
### Frequencies of variables being picked for the trees.
### UsedMatrix: variables used in the tree.
### UsedFirst:  variables picked 1st for the tree.
###################################################
modtab <- apply(UsedMatrix, 2, sum) / B
print(cbind(names(SA)[-12], modtab))    # drop ldl (column 12), the response
modfirst <- apply(UsedFirst, 2, sum) / B
print(cbind(names(SA)[-12], modfirst))
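###################################################
### Optional (a minimal sketch): sort the variable-use frequencies in
### decreasing order so the most frequently used variables are easier
### to spot. Assumes, as in the print above, that names(SA)[-12] is
### aligned with the column order of UsedMatrix.
###################################################
ord <- order(modtab, decreasing=TRUE)
print(cbind(names(SA)[-12][ord], modtab[ord]))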