###############################################################################
### Simple Linear Regression: Analysis of Computer Repair Time (lecture notes)
###############################################################################
# enter data: x=# components, y=repair time
x=c(1,2,4,4,4,5,6,6,8,8,9,9,10,10)
y=c(23,29,64,72,80,87,96,105,127,119,145,149,165,154)
# fit model & get coeffs & ANOVA table
repair.fit = lm(y~x)
summary(repair.fit)
anova(repair.fit)
# 95% confidence & prediction intervals for fitted values
predict(repair.fit, interval="confidence")
predict(repair.fit, interval="prediction")
# 95% confidence & prediction intervals at a specified x=3 value
predict(repair.fit, newdata=list(x=3), interval="confidence")
predict(repair.fit, newdata=list(x=3), interval="prediction")
# measures of influence: (dfbetas, dffit, cov-ratio, cooks-dist, hat)
influence.measures(repair.fit)
# built-in diagnostic plot (6 plots total)
par(mfrow=c(2,3))
plot(repair.fit, which=1:6)


#####################################################################
###  Multiple Regression: Peruvian Indian data (lecture notes) 
######################################################################
library(leaps)
library(car)
library(MASS)
library(stats)
library(wle)
### START: inputs for the entire routine
peru <- read.table("Data/peru.txt",header=TRUE)
attach(peru)
# which col of data matrix contains response?
ycoln <- 9
# short identifying x-var names to aid in best subset model selection
xnames <- c("V1","V2","V3","V4","V5","V6","V7","V8","V10","V11")
y <- peru[,ycoln]; n <- length(y); sst <- (n-1)*var(y)
### END: inputs for the entire routine, can now use rest of code as is,
###      except for final model at end
#
### pairwise scatterplots & correlation matrices
pairs(peru)
cor(peru)
### fit full model and check for multicollinearity
fit <- lm(y~.,data=peru[,-ycoln])
summary(fit)
# VIF's
vif(fit)
#
###  Stepwise model selection (not very good, cannot work with p-values)
g <- mle.stepwise(fit, type="Stepwise", f.in=4.0, f.out=4.0)
summary(g)
#
###  Best subsets using leaps package (for small dataframes)
# Adj R^2
g <- leaps(peru[,-ycoln],y,nbest=1,names=xnames,method="adjr2")
k<-g$size-1
plot(k,g$adjr2,xlab="k (number of vars)",ylab="Adjusted R^2")
for (i in 1:dim(g$which)[1]){ print(c(i,xnames[g$which[i,]]))}
# Mallow's Cp
g <- leaps(peru[,-ycoln],y,nbest=1,names=xnames,method="Cp")
k<-g$size-1
plot(k,g$Cp,xlab="k (number of vars)",ylab="Mallow's Cp")
abline(1,1)
for (i in 1:dim(g$which)[1]){ print(c(i,xnames[g$which[i,]]))}
# AICc
g <- leaps(peru[,-ycoln],y,nbest=1,names=xnames,method="r2")
k<-g$size-1
plot(k,n*log((1-g$r2)*sst/n)+2*n*(k+2)/(n-k-3),xlab="k (number of vars)",ylab="AICc")
for (i in 1:dim(g$which)[1]){ print(c(i,xnames[g$which[i,]]))}
#
###  Best subsets using car package (for large dataframes)
g <- regsubsets(peru[,-ycoln],y,method="exhaustive",nvmax=10)
subsets(g,legend=FALSE,statistic="bic")
subsets(g,legend=FALSE,statistic="cp")
abline(1,1)
subsets(g,legend=FALSE,statistic="adjr2")
subsets(g,legend=FALSE,statistic="rss")
#
### Fit 2-var and 5-var models to decide on final model
### p-value: 0.005522, suggests mod5 is better...
mod2 <- lm(Systol ~ Weight + Fraction, data=peru)
mod5 <- lm(Systol ~ Age + Years + Weight + Chin + Fraction, data=peru)
anova(mod2, mod5)
#
### Decide to go with 2-var model & check diagnostics
summary(mod2)
# compute AIC, BIC, and PRESS 
y.hat=fitted(mod2)     
hat.ii=lm.influence(mod2)$hat
#mse=((summary(mod2))$sigma)^2
aic = AIC(mod2)
bic = BIC(mod2)
press = sum(((y - y.hat) / (1 - hat.ii))^2)
# diagnostics
#pdf(file="Plots/PeruDiagnostics.pdf", pointsize=12, paper="a4r",width=0,height=0)
par(mfrow=c(2,2))
plot(mod2, ask=F)
#dev.off()
# an (*) is printed next to each influential case
influence.measures(mod2)