################################################################################## ### R code for illustration of CLT for sampling distributions ################################################################################## ### CLT for sample mean (similar to problem 4.87 of Ott & Longnecker) # Define a function taking on sample size (n) & no. of reps (reps) to do the job # of approximating sampling distribution of sample mean when drawing samples from # an exponential distribution of mean 1 (all based on 1000 draws) clt<-function(n,reps,xmax,ymax){ # draw first sample (size n) from an exponential distribution. To draw # from a N(mu,sigma), you would use: sample<-rnorm(n,mean=mu,sd=sigma) sample<-rexp(n) # calculate sample mean smean<-sum(sample)/n # now do this reps-1 more times, each time appending the new mean to smean for (i in 2:reps){ sample<-rexp(n) smean<-c(smean,sum(sample)/n)} # finally, do relative frequency histogram of the sample means hist(smean,freq=F,xlim=c(0,xmax),ylim=c(0,ymax),xlab="Sample Means",main=paste("n =",n)) # plot the central limit theorem normal distribution for comparison xvec<-seq(1-3/sqrt(n),1+3/sqrt(n),3/(100*sqrt(n))) lines(xvec,dnorm(xvec,mean=1,sd=1/sqrt(n))) } # ## Can now use this function, for example, split screen into 6 and apply for various # sample sizes: par(mfrow=c(2,3)) clt(2,1000,3,3) clt(5,1000,3,3) clt(10,1000,3,3) clt(15,1000,3,3) clt(30,1000,3,3) clt(50,1000,3,3) par(mfrow=c(1,1)) ### CLT for sample mean, median and variance (like Example 4.20, ages of pennies) # draw 1000 obs from a gamma dist: mu=4*3=12, var=4*3^2=36 x <- rgamma(1000, shape=4, scale=3) # record mu, med, and sigma^2 for this population consiting of N=1000 values c(mean(x),median(x),var(x)) par(mfrow=c(2,2)) hist(x,breaks=20,main="Circulation ages of 1,000 pennies (years)",col="gray") ## First the sample mean # take random samples of size n from this pop and compute sample mean; # store in vector y; do this 10000 times; do histogram of these 10^4 sample means n <- 10; y <- mean(sample(x, size=n)) for (i in 2:10000) {y <- c(y,mean(sample(x, size=n)))} hist(y,freq=F,xlab="",main=paste("Sampling dist of sample mean: n =",n),xlim=c(0,max(x)),ylim=c(0,0.7)) # repeat for samples of size 20 n <- 20; y <- mean(sample(x, size=n)) for (i in 2:10000) {y <- c(y,mean(sample(x, size=n)))} hist(y,freq=F,xlab="",main=paste("Sampling dist of sample mean: n =",n),xlim=c(0,max(x)),ylim=c(0,0.7)) # repeat for samples of size 100 n <- 100; y <- mean(sample(x, size=n)) for (i in 2:10000) {y <- c(y,mean(sample(x, size=n)))} hist(y,freq=F,xlab="",main=paste("Sampling dist of sample mean: n=",n),xlim=c(0,max(x)),ylim=c(0,0.7)) # ## Now the sample median instead (of mean) # write a function and apply it for n=10, 20, 100 samp.dist.md <- function(n){ y <- median(sample(x, size=n)) for (i in 2:10000) {y <- c(y,median(sample(x, size=n)))} hist(y,freq=F,xlab="",main=paste("Sampling dist of sample median: n =",n),xlim=c(0,max(x)),ylim=c(0,0.7)) } hist(x,breaks=20,main="Circulation ages of 1,000 pennies (years)",col="gray") samp.dist.md(10) samp.dist.md(20) samp.dist.md(100) # ## Now repeat for sample standard deviation # write a function and apply it for n=10, 20, 100 samp.dist.sd <- function(n){ y <- sqrt(var(sample(x, size=n))) for (i in 2:10000) {y <- c(y,sqrt(var(sample(x, size=n))))} hist(y,freq=F,xlab="",main=paste("Sampling dist of sample standard deviation: n =",n),xlim=c(0,15),ylim=c(0,0.8)) } hist(x,breaks=20,main="Circulation ages of 1,000 pennies (years)",col="gray") samp.dist.sd(10) samp.dist.sd(20) samp.dist.sd(100) par(mfrow=c(1,1))