################################################### ### Demo for Lecture 11: ### Logistic Regression modeling of SA heart data. ### Now the indicator of heart disease (chd) is the response. ################################################### #line 27 "Lect11.Rnw" library(stats) logit<-function(z) {ll<-log(z/(1-z))} SA<-data.frame(read.table('http://www.math.ttu.edu/~atrindad/stat5371/SA.dat',sep='\t',header=T)) #SA<-data.frame(read.table('/home/atrindad/Teach/stat5371/Data/SA.dat',sep='\t',header=T)) y<-SA$chd ################################################### ### Link function check. ### Cycle through potential transforms, original scale or log, for all ### the vars, eventually decide on log(ldl) & log(age). ### These are plots of the logit link vs. x and vs. log(x), and must be done ### by grouping the x data (here we use default binning from histograms). ################################################### ### first ldl par(mfrow=c(2,2)) xt<-(SA$ldl) hh<-hist(xt,plot=F) x2<-xt for (kk in (1:length(hh$breaks)-1)) { x2[xt<=hh$breaks[kk+1] & xt>hh$breaks[kk]]<-hh$mid[kk] } tt<-table(y,x2) ttt<-logit(tt[2,]/apply(tt,2,sum)) # plot(sort(unique(x2)),ttt, xlab="ldl",ylab="log-odds") gg<-glm(y~xt,'binomial') pp<-predict(gg,response='link',se=T) lines(sort(xt),pp$fit[sort.list(xt)]) lines(sort(xt),(pp$fit-pp$se)[sort.list(xt)],lty=2) lines(sort(xt),(pp$fit+pp$se)[sort.list(xt)],lty=2) # xt<-log(SA$ldl) hh<-hist(xt,plot=F) x2<-xt for (kk in (1:length(hh$breaks)-1)) { x2[xt<=hh$breaks[kk+1] & xt>hh$breaks[kk]]<-hh$mid[kk] } tt<-table(y,x2) ttt<-logit(tt[2,]/apply(tt,2,sum)) # plot(sort(unique(x2)),ttt,xlab="log(ldl)",ylab="log-odds") gg<-glm(y~xt,'binomial') pp<-predict(gg,response='link',se=T) lines(sort(xt),pp$fit[sort.list(xt)]) lines(sort(xt),(pp$fit-pp$se)[sort.list(xt)],lty=2) lines(sort(xt),(pp$fit+pp$se)[sort.list(xt)],lty=2) ### now age xt<-SA$age hh<-hist(xt,plot=F) x2<-xt for (kk in (1:length(hh$breaks)-1)) { x2[xt<=hh$breaks[kk+1] & xt>hh$breaks[kk]]<-hh$mid[kk] } tt<-table(y,x2) ttt<-logit(tt[2,]/apply(tt,2,sum)) # plot(sort(unique(x2)),ttt,xlab="age",ylab="log-odds") gg<-glm(y~xt,'binomial') pp<-predict(gg,response='link',se=T) lines(sort(xt),pp$fit[sort.list(xt)]) lines(sort(xt),(pp$fit-pp$se)[sort.list(xt)],lty=2) lines(sort(xt),(pp$fit+pp$se)[sort.list(xt)],lty=2) # xt<-log(SA$age) hh<-hist(xt,plot=F) x2<-xt for (kk in (1:length(hh$breaks)-1)) { x2[xt<=hh$breaks[kk+1] & xt>hh$breaks[kk]]<-hh$mid[kk] } tt<-table(y,x2) ttt<-logit(tt[2,]/apply(tt,2,sum)) # plot(sort(unique(x2)),ttt,xlab="log(age)",ylab="log-odds") gg<-glm(y~xt,'binomial') pp<-predict(gg,response='link',se=T) lines(sort(xt),pp$fit[sort.list(xt)]) lines(sort(xt),(pp$fit-pp$se)[sort.list(xt)],lty=2) lines(sort(xt),(pp$fit+pp$se)[sort.list(xt)],lty=2) ################################################### ### Fit full model. ### The Chi-square goodness-of-fit test compares the deviance 314.103 to ### Chi-square dist. df=302, p-value = P(Chi-square(302)>314.103) = 0.304. ### We do not reject the fit of the logistic model. ################################################### #line 401 "Lect11.Rnw" gg<-glm(chd~log(ldl)+log(age)+alcohol+tobind+sbp+typea+adiposity+obesity+as.factor(famhist),data=SA,'binomial') print(gs<-summary(gg)) ################################################### ### Diagnostic plots. OK! ################################################### #line 410 "Lect11.Rnw" par(mfrow=c(2,2)) res<-residuals(gg,'pearson') plot(logit(gg$fit),res,xlab="linear predictor",main="Pearson residuals") ll<-loess(res[sort.list(logit(gg$fit))]~sort(logit(gg$fit))) lines(ll$x,ll$fit,col=2) abline(h=0,lty=2) #line 417 "Lect11.Rnw" res<-residuals(gg,'deviance') plot(logit(gg$fit),res,xlab="linear predictor",main="Deviance residuals") ll<-loess(res[sort.list(logit(gg$fit))]~sort(logit(gg$fit))) lines(ll$x,ll$fit,col=2) abline(h=0,lty=2) #line 424 "Lect11.Rnw" hh<-hatvalues(gg) plot(hh,type='h',main="leverage",xlab="index") dd<-cooks.distance(gg) plot(dd,type='h',main="cook's distance",xlab="index") ################################################### ### More diagnostic plots (canned version). ### Top left: pearson residuals again. ### Top right: QQ-plot of pearson res vs. normal dist. (may be relevant for ### e.g. Poisson models that sometimes can be approx by a normal, but ### in general we don't expect to see a good agreement here). ### Bot left: just like in the linear model, a way to check the variance ### of y as a function of fitted value. For binomial models we expect ### var. to be non-const. and max at pi=0.5 corresp. to linear predictor=0. ### Bot right: combines leverage and pearson resids; extremes in either or both ### directions constitutes a large Cook's distance (red dashed lines). ################################################### #line 445 "Lect11.Rnw" par(mfrow=c(2,2)) plot(gg) ################################################### ### More diagnostics: Change in deviance. ### Looks for obs that impact overall measure of goodness-of-fit ### (similar to the change of sigma^2 in regression). ### Here, obs 174 is identifed - with click - as possible outlier. ################################################### par(mfrow=c(1,1)) dev.change<-residuals(gg,'deviance')^2+residuals(gg,'pearson')^2*(hh/(1-hh)) plot(seq(1,dim(SA)[1]),dev.change,type='h',main="Change in deviance") id<-identify(dev.change,pos=T) #line 466 "Lect11.Rnw" #plot(seq(1,dim(SA)[1]),dev.change,type='h') #if (max(id$ind)!=-Inf) { #text(seq(1,dim(SA)[1])[id$ind],dev.change[id$ind],id$ind,pos=id$pos) } ################################################### ### Try a simple backward model selection based on AIC. ### For GLM: AIC = Deviance + 2*p. ################################################### #line 486 "Lect11.Rnw" print(step(gg)) selg<-step(gg,trace=F) print(summary(selg)) ################################################### ### Use selected model to construct a dose-response curve. ################################################### #line 493 "Lect11.Rnw" plot(logit(selg$fit),SA$chd,xlab="linear predictor",ylab="Prob(chd=1)", main="fitted dose-response curve") lines(sort(logit(selg$fit)),selg$fit[sort.list(selg$fit)]) ################################################### ### Check if model can classify patients in terms of heart disease... ### 26% misclassified, not so good, and optimistic, since same data ### is used for fitting as well as classifcation (more in GLM lecture). #################################################### #line 507 "Lect11.Rnw" pp<-predict(selg,type='response') pp[pp<.5]<-0 pp[pp>=.5]<-1 table(pp,SA$chd) print(c("Misclassification error=", round(sum(pp!=SA$chd)/length(SA$chd),3))) ################################################### ### Should we include interactions in the model? ### famhist & log(ldl) is very significant! ################################################### #line 517 "Lect11.Rnw" par(mfrow=c(2,2)) gg<-glm(chd~log(age)*as.factor(famhist),'binomial',data=SA) pp<-predict(gg,type='response') xvec<-seq(min(log(SA$age)),max(log(SA$age)),by=.1) p0<-gg$coef[1]+gg$coef[2]*xvec p1<-gg$coef[1]+gg$coef[2]*xvec+gg$coef[3]+gg$coef[4]*xvec plot(xvec,p0,type='l',ylab='log-odds',xlab='log(age)',ylim=c(min(logit(gg$fit)),max(logit(gg$fit)))) lines(xvec,p1,lty=2) legend("topleft", legend=c("famhist=0","famhist=1"),lty=c(1,2)) # gg<-glm(chd~log(ldl)*as.factor(famhist),'binomial',data=SA) pp<-predict(gg,type='response') xvec<-seq(min(log(SA$ldl)),max(log(SA$ldl)),by=.1) p0<-gg$coef[1]+gg$coef[2]*xvec p1<-gg$coef[1]+gg$coef[2]*xvec+gg$coef[3]+gg$coef[4]*xvec plot(xvec,p0,type='l',ylab='log-odds',xlab='log(ldl)',ylim=c(min(logit(gg$fit)),max(logit(gg$fit)))) lines(xvec,p1,lty=2) # gg<-glm(chd~typea*as.factor(famhist),'binomial',data=SA) pp<-predict(gg,type='response') xvec<-seq(min(SA$typea),max(SA$typea),by=.1) p0<-gg$coef[1]+gg$coef[2]*xvec p1<-gg$coef[1]+gg$coef[2]*xvec+gg$coef[3]+gg$coef[4]*xvec plot(xvec,p0,type='l',ylab='log-odds',xlab='typea',ylim=c(min(logit(gg$fit)),max(logit(gg$fit)))) lines(xvec,p1,lty=2) # gg<-glm(chd~obesity*as.factor(famhist),'binomial',data=SA) pp<-predict(gg,type='response') xvec<-seq(min(SA$obesity),max(SA$obesity),by=.1) p0<-gg$coef[1]+gg$coef[2]*xvec p1<-gg$coef[1]+gg$coef[2]*xvec+gg$coef[3]+gg$coef[4]*xvec plot(xvec,p0,type='l',ylab='log-odds',xlab='obesity',ylim=c(min(logit(gg$fit)),max(logit(gg$fit)))) lines(xvec,p1,lty=2) par(mfrow=c(1,1)) title("Interactions with 'famhist'") ################################################### ### Should we include interactions in the model? ### tobind & obesity looks significant, but is not (large p-value). ################################################### #line 562 "Lect11.Rnw" par(mfrow=c(2,2)) gg<-glm(chd~log(age)*as.factor(tobind),'binomial',data=SA) pp<-predict(gg,type='response') xvec<-seq(min(log(SA$age)),max(log(SA$age)),by=.1) p0<-gg$coef[1]+gg$coef[2]*xvec p1<-gg$coef[1]+gg$coef[2]*xvec+gg$coef[3]+gg$coef[4]*xvec plot(xvec,p0,type='l',ylab='log-odds',xlab='log(age)',ylim=c(min(logit(gg$fit)),max(logit(gg$fit)))) lines(xvec,p1,lty=2) legend("topleft", legend=c("tobind=0","tobind=1"),lty=c(1,2)) # gg<-glm(chd~log(ldl)*as.factor(tobind),'binomial',data=SA) pp<-predict(gg,type='response') xvec<-seq(min(log(SA$ldl)),max(log(SA$ldl)),by=.1) p0<-gg$coef[1]+gg$coef[2]*xvec p1<-gg$coef[1]+gg$coef[2]*xvec+gg$coef[3]+gg$coef[4]*xvec plot(xvec,p0,type='l',ylab='log-odds',xlab='log(ldl)',ylim=c(min(logit(gg$fit)),max(logit(gg$fit)))) lines(xvec,p1,lty=2) # gg<-glm(chd~typea*as.factor(tobind),'binomial',data=SA) pp<-predict(gg,type='response') xvec<-seq(min(SA$typea),max(SA$typea),by=.1) p0<-gg$coef[1]+gg$coef[2]*xvec p1<-gg$coef[1]+gg$coef[2]*xvec+gg$coef[3]+gg$coef[4]*xvec plot(xvec,p0,type='l',ylab='log-odds',xlab='typea',ylim=c(min(logit(gg$fit)),max(logit(gg$fit)))) lines(xvec,p1,lty=2) # gg<-glm(chd~obesity*as.factor(tobind),'binomial',data=SA) pp<-predict(gg,type='response') xvec<-seq(min(SA$obesity),max(SA$obesity),by=.1) p0<-gg$coef[1]+gg$coef[2]*xvec p1<-gg$coef[1]+gg$coef[2]*xvec+gg$coef[3]+gg$coef[4]*xvec plot(xvec,p0,type='l',ylab='log-odds',xlab='obesity',ylim=c(min(logit(gg$fit)),max(logit(gg$fit)))) lines(xvec,p1,lty=2) par(mfrow=c(1,1)) title("Interactions with 'tobind'") ################################################### ### Decision: just include interactions with famhist. ### Backward selection keeps only interaction b/ween famhist & log(ldl)... ### Take a closer look at model summary: ### inclusion of interaction has markedly dropped significance ### of main effect of log(ldl), from p=0.000866 to p=0.48530. ### Suspicion: perhaps interaction has created a collinearity problem, and ### interaction provides little info about chd not already provided by log(ldl)... ################################################### #line 608 "Lect11.Rnw" gg<-glm(chd~as.factor(famhist)*(log(ldl)+log(age)+sbp+adiposity+obesity+alcohol+typea+as.factor(tobind)),'binomial',data=SA) print(summary(gg)) selg<-step(gg,trace=F) print(summary(selg)) par(mfrow=c(2,2)) plot(selg) ################################################### ### Compute correlation matrix for the coefficients: ### correlation between ldl & ldl*famhist = -0.9695 !!! ### With such extreme collinearity, one should probably drop the interaction ### term and revert back to the additive model... ################################################### #line 626 "Lect11.Rnw" ss<-summary(selg) c2<-solve(diag(diag(ss$cov.sc)))^{1/2}%*%ss$cov.sc%*%solve(diag(diag(ss$cov.sc)))^{1/2} print(c2) ################################################### ### More: look for interactions between the numerical variables ### you need to bin the values of one of them first in order to ### plot the log-odds comparisons as above.