###################################################
### Demo for Lecture 11:
### Logistic Regression modeling of SA heart data.
### Now the indicator of heart disease (chd) is the response.
###################################################
#line 27 "Lect11.Rnw"
library(stats)
logit<-function(z) {ll<-log(z/(1-z))}
SA<-data.frame(read.table('http://www.math.ttu.edu/~atrindad/stat5371/SA.dat',sep='\t',header=T))
#SA<-data.frame(read.table('/home/atrindad/Teach/stat5371/Data/SA.dat',sep='\t',header=T)) 
y<-SA$chd


###################################################
### Link function check.
### Cycle through potential transforms, original scale or log, for all
###   the vars, eventually decide on log(ldl) & log(age).
### These are plots of the logit link vs. x and vs. log(x), and must be done 
###   by grouping the x data (here we use default binning from histograms).
###################################################
### first ldl
par(mfrow=c(2,2))
xt<-(SA$ldl)
hh<-hist(xt,plot=F)
x2<-xt
for (kk in (1:length(hh$breaks)-1)) {
  x2[xt<=hh$breaks[kk+1] & xt>hh$breaks[kk]]<-hh$mid[kk] }
tt<-table(y,x2)
ttt<-logit(tt[2,]/apply(tt,2,sum))
#
plot(sort(unique(x2)),ttt, xlab="ldl",ylab="log-odds")
gg<-glm(y~xt,'binomial')
pp<-predict(gg,response='link',se=T)
lines(sort(xt),pp$fit[sort.list(xt)])
lines(sort(xt),(pp$fit-pp$se)[sort.list(xt)],lty=2)
lines(sort(xt),(pp$fit+pp$se)[sort.list(xt)],lty=2)
#
xt<-log(SA$ldl)
hh<-hist(xt,plot=F)
x2<-xt
for (kk in (1:length(hh$breaks)-1)) {
  x2[xt<=hh$breaks[kk+1] & xt>hh$breaks[kk]]<-hh$mid[kk] }
tt<-table(y,x2)
ttt<-logit(tt[2,]/apply(tt,2,sum))
#
plot(sort(unique(x2)),ttt,xlab="log(ldl)",ylab="log-odds")
gg<-glm(y~xt,'binomial')
pp<-predict(gg,response='link',se=T)
lines(sort(xt),pp$fit[sort.list(xt)])
lines(sort(xt),(pp$fit-pp$se)[sort.list(xt)],lty=2)
lines(sort(xt),(pp$fit+pp$se)[sort.list(xt)],lty=2)
### now age
xt<-SA$age
hh<-hist(xt,plot=F)
x2<-xt
for (kk in (1:length(hh$breaks)-1)) {
  x2[xt<=hh$breaks[kk+1] & xt>hh$breaks[kk]]<-hh$mid[kk] }
tt<-table(y,x2)
ttt<-logit(tt[2,]/apply(tt,2,sum))
#
plot(sort(unique(x2)),ttt,xlab="age",ylab="log-odds")
gg<-glm(y~xt,'binomial')
pp<-predict(gg,response='link',se=T)
lines(sort(xt),pp$fit[sort.list(xt)])
lines(sort(xt),(pp$fit-pp$se)[sort.list(xt)],lty=2)
lines(sort(xt),(pp$fit+pp$se)[sort.list(xt)],lty=2)
#
xt<-log(SA$age)
hh<-hist(xt,plot=F)
x2<-xt
for (kk in (1:length(hh$breaks)-1)) {
  x2[xt<=hh$breaks[kk+1] & xt>hh$breaks[kk]]<-hh$mid[kk] }
tt<-table(y,x2)
ttt<-logit(tt[2,]/apply(tt,2,sum))
#
plot(sort(unique(x2)),ttt,xlab="log(age)",ylab="log-odds")
gg<-glm(y~xt,'binomial')
pp<-predict(gg,response='link',se=T)
lines(sort(xt),pp$fit[sort.list(xt)])
lines(sort(xt),(pp$fit-pp$se)[sort.list(xt)],lty=2)
lines(sort(xt),(pp$fit+pp$se)[sort.list(xt)],lty=2)


###################################################
### Fit full model.
### The Chi-square goodness-of-fit test compares the deviance 314.103 to
###  Chi-square dist. df=302, p-value = P(Chi-square(302)>314.103) = 0.304.
###  We do not reject the fit of the logistic model.
###################################################
#line 401 "Lect11.Rnw"
gg<-glm(chd~log(ldl)+log(age)+alcohol+tobind+sbp+typea+adiposity+obesity+as.factor(famhist),data=SA,'binomial')
print(gs<-summary(gg))


###################################################
### Diagnostic plots. OK!
###################################################
#line 410 "Lect11.Rnw"
par(mfrow=c(2,2))
res<-residuals(gg,'pearson')
plot(logit(gg$fit),res,xlab="linear predictor",main="Pearson residuals")
ll<-loess(res[sort.list(logit(gg$fit))]~sort(logit(gg$fit)))
lines(ll$x,ll$fit,col=2)
abline(h=0,lty=2)
#line 417 "Lect11.Rnw"
res<-residuals(gg,'deviance')
plot(logit(gg$fit),res,xlab="linear predictor",main="Deviance residuals")
ll<-loess(res[sort.list(logit(gg$fit))]~sort(logit(gg$fit)))
lines(ll$x,ll$fit,col=2)
abline(h=0,lty=2)
#line 424 "Lect11.Rnw"
hh<-hatvalues(gg)
plot(hh,type='h',main="leverage",xlab="index")
dd<-cooks.distance(gg)
plot(dd,type='h',main="cook's distance",xlab="index")



###################################################
### More diagnostic plots (canned version).
### Top left:  pearson residuals again.
### Top right: QQ-plot of pearson res vs. normal dist. (may be relevant for
###   e.g. Poisson models that sometimes can be approx by a normal, but 
###   in general we don't expect to see a good agreement here).
### Bot left:  just like in the linear model, a way to check the variance
###   of y as a function of fitted value. For binomial models we expect
###   var. to be non-const. and max at pi=0.5 corresp. to linear predictor=0.
### Bot right: combines leverage and pearson resids; extremes in either or both
###   directions constitutes a large Cook's distance (red dashed lines).
###################################################
#line 445 "Lect11.Rnw"
par(mfrow=c(2,2))
plot(gg)



###################################################
### More diagnostics: Change in deviance.
### Looks for obs that impact overall measure of goodness-of-fit
###  (similar to the change of sigma^2 in regression). 
###  Here, obs 174 is identifed - with click - as possible outlier.
###################################################
par(mfrow=c(1,1))
dev.change<-residuals(gg,'deviance')^2+residuals(gg,'pearson')^2*(hh/(1-hh))
plot(seq(1,dim(SA)[1]),dev.change,type='h',main="Change in deviance")
id<-identify(dev.change,pos=T)
#line 466 "Lect11.Rnw"
#plot(seq(1,dim(SA)[1]),dev.change,type='h')
#if (max(id$ind)!=-Inf) {
#text(seq(1,dim(SA)[1])[id$ind],dev.change[id$ind],id$ind,pos=id$pos) }



###################################################
### Try a simple backward model selection based on AIC.
### For GLM: AIC = Deviance + 2*p.
###################################################
#line 486 "Lect11.Rnw"
print(step(gg))
selg<-step(gg,trace=F)
print(summary(selg))


###################################################
### Use selected model to construct a dose-response curve.
###################################################
#line 493 "Lect11.Rnw"
plot(logit(selg$fit),SA$chd,xlab="linear predictor",ylab="Prob(chd=1)",
     main="fitted dose-response curve")
lines(sort(logit(selg$fit)),selg$fit[sort.list(selg$fit)])


###################################################
### Check if model can classify patients in terms of heart disease...
### 26% misclassified, not so good, and optimistic, since same data
### is used for fitting as well as classifcation (more in GLM lecture). 
####################################################
#line 507 "Lect11.Rnw"
pp<-predict(selg,type='response')
pp[pp<.5]<-0
pp[pp>=.5]<-1
table(pp,SA$chd)
print(c("Misclassification error=", round(sum(pp!=SA$chd)/length(SA$chd),3)))


###################################################
### Should we include interactions in the model?
###   famhist & log(ldl) is very significant!
###################################################
#line 517 "Lect11.Rnw"
par(mfrow=c(2,2))
gg<-glm(chd~log(age)*as.factor(famhist),'binomial',data=SA)
pp<-predict(gg,type='response')
xvec<-seq(min(log(SA$age)),max(log(SA$age)),by=.1)
p0<-gg$coef[1]+gg$coef[2]*xvec
p1<-gg$coef[1]+gg$coef[2]*xvec+gg$coef[3]+gg$coef[4]*xvec
plot(xvec,p0,type='l',ylab='log-odds',xlab='log(age)',ylim=c(min(logit(gg$fit)),max(logit(gg$fit))))
lines(xvec,p1,lty=2)
legend("topleft", legend=c("famhist=0","famhist=1"),lty=c(1,2))
#
gg<-glm(chd~log(ldl)*as.factor(famhist),'binomial',data=SA)
pp<-predict(gg,type='response')
xvec<-seq(min(log(SA$ldl)),max(log(SA$ldl)),by=.1)
p0<-gg$coef[1]+gg$coef[2]*xvec
p1<-gg$coef[1]+gg$coef[2]*xvec+gg$coef[3]+gg$coef[4]*xvec
plot(xvec,p0,type='l',ylab='log-odds',xlab='log(ldl)',ylim=c(min(logit(gg$fit)),max(logit(gg$fit))))
lines(xvec,p1,lty=2)
#
gg<-glm(chd~typea*as.factor(famhist),'binomial',data=SA)
pp<-predict(gg,type='response')
xvec<-seq(min(SA$typea),max(SA$typea),by=.1)
p0<-gg$coef[1]+gg$coef[2]*xvec
p1<-gg$coef[1]+gg$coef[2]*xvec+gg$coef[3]+gg$coef[4]*xvec
plot(xvec,p0,type='l',ylab='log-odds',xlab='typea',ylim=c(min(logit(gg$fit)),max(logit(gg$fit))))
lines(xvec,p1,lty=2)
#
gg<-glm(chd~obesity*as.factor(famhist),'binomial',data=SA)
pp<-predict(gg,type='response')
xvec<-seq(min(SA$obesity),max(SA$obesity),by=.1)
p0<-gg$coef[1]+gg$coef[2]*xvec
p1<-gg$coef[1]+gg$coef[2]*xvec+gg$coef[3]+gg$coef[4]*xvec
plot(xvec,p0,type='l',ylab='log-odds',xlab='obesity',ylim=c(min(logit(gg$fit)),max(logit(gg$fit))))
lines(xvec,p1,lty=2)
par(mfrow=c(1,1))
title("Interactions with 'famhist'")


###################################################
### Should we include interactions in the model?
###   tobind & obesity looks significant, but is not (large p-value).
###################################################
#line 562 "Lect11.Rnw"
par(mfrow=c(2,2))
gg<-glm(chd~log(age)*as.factor(tobind),'binomial',data=SA)
pp<-predict(gg,type='response')
xvec<-seq(min(log(SA$age)),max(log(SA$age)),by=.1)
p0<-gg$coef[1]+gg$coef[2]*xvec
p1<-gg$coef[1]+gg$coef[2]*xvec+gg$coef[3]+gg$coef[4]*xvec
plot(xvec,p0,type='l',ylab='log-odds',xlab='log(age)',ylim=c(min(logit(gg$fit)),max(logit(gg$fit))))
lines(xvec,p1,lty=2)
legend("topleft", legend=c("tobind=0","tobind=1"),lty=c(1,2))
#
gg<-glm(chd~log(ldl)*as.factor(tobind),'binomial',data=SA)
pp<-predict(gg,type='response')
xvec<-seq(min(log(SA$ldl)),max(log(SA$ldl)),by=.1)
p0<-gg$coef[1]+gg$coef[2]*xvec
p1<-gg$coef[1]+gg$coef[2]*xvec+gg$coef[3]+gg$coef[4]*xvec
plot(xvec,p0,type='l',ylab='log-odds',xlab='log(ldl)',ylim=c(min(logit(gg$fit)),max(logit(gg$fit))))
lines(xvec,p1,lty=2)
#
gg<-glm(chd~typea*as.factor(tobind),'binomial',data=SA)
pp<-predict(gg,type='response')
xvec<-seq(min(SA$typea),max(SA$typea),by=.1)
p0<-gg$coef[1]+gg$coef[2]*xvec
p1<-gg$coef[1]+gg$coef[2]*xvec+gg$coef[3]+gg$coef[4]*xvec
plot(xvec,p0,type='l',ylab='log-odds',xlab='typea',ylim=c(min(logit(gg$fit)),max(logit(gg$fit))))
lines(xvec,p1,lty=2)
#
gg<-glm(chd~obesity*as.factor(tobind),'binomial',data=SA)
pp<-predict(gg,type='response')
xvec<-seq(min(SA$obesity),max(SA$obesity),by=.1)
p0<-gg$coef[1]+gg$coef[2]*xvec
p1<-gg$coef[1]+gg$coef[2]*xvec+gg$coef[3]+gg$coef[4]*xvec
plot(xvec,p0,type='l',ylab='log-odds',xlab='obesity',ylim=c(min(logit(gg$fit)),max(logit(gg$fit))))
lines(xvec,p1,lty=2)
par(mfrow=c(1,1))
title("Interactions with 'tobind'")


###################################################
### Decision: just include interactions with famhist.
### Backward selection keeps only interaction b/ween famhist & log(ldl)...
### Take a closer look at model summary:
###   inclusion of interaction has markedly dropped significance 
###   of main effect of log(ldl), from p=0.000866 to p=0.48530.
### Suspicion: perhaps interaction has created a collinearity problem, and 
###   interaction provides little info about chd not already provided by log(ldl)... 
###################################################
#line 608 "Lect11.Rnw"
gg<-glm(chd~as.factor(famhist)*(log(ldl)+log(age)+sbp+adiposity+obesity+alcohol+typea+as.factor(tobind)),'binomial',data=SA)
print(summary(gg))
selg<-step(gg,trace=F)
print(summary(selg))
par(mfrow=c(2,2))
plot(selg)


###################################################
### Compute correlation matrix for the coefficients:
###  correlation between ldl & ldl*famhist = -0.9695 !!!
### With such extreme collinearity, one should probably drop the interaction 
###  term and revert back to the additive model...
###################################################
#line 626 "Lect11.Rnw"
ss<-summary(selg)
c2<-solve(diag(diag(ss$cov.sc)))^{1/2}%*%ss$cov.sc%*%solve(diag(diag(ss$cov.sc)))^{1/2}
print(c2)


###################################################
### More: look for interactions between the numerical variables
###   you need to bin the values of one of them first in order to
###   plot the log-odds comparisons as above.