############################################################################### ### Simple Linear Regression: Analysis of Computer Repair Time (lecture notes) ############################################################################### # enter data: x=# components, y=repair time x=c(1,2,4,4,4,5,6,6,8,8,9,9,10,10) y=c(23,29,64,72,80,87,96,105,127,119,145,149,165,154) # fit model & get coeffs & ANOVA table repair.fit = lm(y~x) summary(repair.fit) anova(repair.fit) # 95% confidence & prediction intervals for fitted values predict(repair.fit, interval="confidence") predict(repair.fit, interval="prediction") # 95% confidence & prediction intervals at a specified x=3 value predict(repair.fit, newdata=list(x=3), interval="confidence") predict(repair.fit, newdata=list(x=3), interval="prediction") # measures of influence: (dfbetas, dffit, cov-ratio, cooks-dist, hat) influence.measures(repair.fit) # built-in diagnostic plot (6 plots total) par(mfrow=c(2,3)) plot(repair.fit, which=1:6) ##################################################################### ### Multiple Regression: Peruvian Indian data (lecture notes) ###################################################################### library(leaps) library(car) library(MASS) library(stats) library(wle) ### START: inputs for the entire routine peru <- read.table("Data/peru.txt",header=TRUE) attach(peru) # which col of data matrix contains response? ycoln <- 9 # short identifying x-var names to aid in best subset model selection xnames <- c("V1","V2","V3","V4","V5","V6","V7","V8","V10","V11") y <- peru[,ycoln]; n <- length(y); sst <- (n-1)*var(y) ### END: inputs for the entire routine, can now use rest of code as is, ### except for final model at end # ### pairwise scatterplots & correlation matrices pairs(peru) cor(peru) ### fit full model and check for multicollinearity fit <- lm(y~.,data=peru[,-ycoln]) summary(fit) # VIF's vif(fit) # ### Stepwise model selection (not very good, cannot work with p-values) g <- mle.stepwise(fit, type="Stepwise", f.in=4.0, f.out=4.0) summary(g) # ### Best subsets using leaps package (for small dataframes) # Adj R^2 g <- leaps(peru[,-ycoln],y,nbest=1,names=xnames,method="adjr2") k<-g$size-1 plot(k,g$adjr2,xlab="k (number of vars)",ylab="Adjusted R^2") for (i in 1:dim(g$which)[1]){ print(c(i,xnames[g$which[i,]]))} # Mallow's Cp g <- leaps(peru[,-ycoln],y,nbest=1,names=xnames,method="Cp") k<-g$size-1 plot(k,g$Cp,xlab="k (number of vars)",ylab="Mallow's Cp") abline(1,1) for (i in 1:dim(g$which)[1]){ print(c(i,xnames[g$which[i,]]))} # AICc g <- leaps(peru[,-ycoln],y,nbest=1,names=xnames,method="r2") k<-g$size-1 plot(k,n*log((1-g$r2)*sst/n)+2*n*(k+2)/(n-k-3),xlab="k (number of vars)",ylab="AICc") for (i in 1:dim(g$which)[1]){ print(c(i,xnames[g$which[i,]]))} # ### Best subsets using car package (for large dataframes) g <- regsubsets(peru[,-ycoln],y,method="exhaustive",nvmax=10) subsets(g,legend=FALSE,statistic="bic") subsets(g,legend=FALSE,statistic="cp") abline(1,1) subsets(g,legend=FALSE,statistic="adjr2") subsets(g,legend=FALSE,statistic="rss") # ### Fit 2-var and 5-var models to decide on final model ### p-value: 0.005522, suggests mod5 is better... mod2 <- lm(Systol ~ Weight + Fraction, data=peru) mod5 <- lm(Systol ~ Age + Years + Weight + Chin + Fraction, data=peru) anova(mod2, mod5) # ### Decide to go with 2-var model & check diagnostics summary(mod2) # compute AIC, BIC, and PRESS y.hat=fitted(mod2) hat.ii=lm.influence(mod2)$hat #mse=((summary(mod2))$sigma)^2 aic = AIC(mod2) bic = BIC(mod2) press = sum(((y - y.hat) / (1 - hat.ii))^2) # diagnostics #pdf(file="Plots/PeruDiagnostics.pdf", pointsize=12, paper="a4r",width=0,height=0) par(mfrow=c(2,2)) plot(mod2, ask=F) #dev.off() # an (*) is printed next to each influential case influence.measures(mod2)