##################################################################################
### R code for illustration of CLT for sampling distributions
##################################################################################

### CLT for sample mean (similar to problem 4.87 of Ott & Longnecker) 
# Define a function taking on sample size (n) & no. of reps (reps) to do the job
# of approximating sampling distribution of sample mean when drawing samples from 
# an exponential distribution of mean 1 (all based on 1000 draws)
clt<-function(n,reps,xmax,ymax){
# draw first sample (size n) from an exponential distribution. To draw
# from a N(mu,sigma), you would use: sample<-rnorm(n,mean=mu,sd=sigma) 
 sample<-rexp(n)
# calculate sample mean
 smean<-sum(sample)/n
# now do this reps-1 more times, each time appending the new mean to smean
 for (i in 2:reps){ 
  sample<-rexp(n)
  smean<-c(smean,sum(sample)/n)}
# finally, do relative frequency histogram of the sample means
 hist(smean,freq=F,xlim=c(0,xmax),ylim=c(0,ymax),xlab="Sample Means",main=paste("n =",n))
# plot the central limit theorem normal distribution for comparison
 xvec<-seq(1-3/sqrt(n),1+3/sqrt(n),3/(100*sqrt(n)))
 lines(xvec,dnorm(xvec,mean=1,sd=1/sqrt(n)))
}
#
## Can now use this function, for example, split screen into 6 and apply for various
# sample sizes:
par(mfrow=c(2,3))
clt(2,1000,3,3)
clt(5,1000,3,3)
clt(10,1000,3,3)
clt(15,1000,3,3)
clt(30,1000,3,3)
clt(50,1000,3,3)
par(mfrow=c(1,1))


### CLT for sample mean, median and variance (like Example 4.20, ages of pennies)
# draw 1000 obs from a gamma dist: mu=4*3=12, var=4*3^2=36
x <- rgamma(1000, shape=4, scale=3)
# record mu, med, and sigma^2 for this population consiting of N=1000 values
c(mean(x),median(x),var(x))
par(mfrow=c(2,2))
hist(x,breaks=20,main="Circulation ages of 1,000 pennies (years)",col="gray")

## First the sample mean
# take random samples of size n from this pop and compute sample mean;
#  store in vector y; do this 10000 times; do histogram of these 10^4 sample means
n <- 10; y <- mean(sample(x, size=n))
for (i in 2:10000) {y <- c(y,mean(sample(x, size=n)))}
hist(y,freq=F,xlab="",main=paste("Sampling dist of sample mean: n =",n),xlim=c(0,max(x)),ylim=c(0,0.7))
# repeat for samples of size 20
n <- 20; y <- mean(sample(x, size=n))
for (i in 2:10000) {y <- c(y,mean(sample(x, size=n)))}
hist(y,freq=F,xlab="",main=paste("Sampling dist of sample mean: n =",n),xlim=c(0,max(x)),ylim=c(0,0.7))
# repeat for samples of size 100
n <- 100; y <- mean(sample(x, size=n))
for (i in 2:10000) {y <- c(y,mean(sample(x, size=n)))}
hist(y,freq=F,xlab="",main=paste("Sampling dist of sample mean: n=",n),xlim=c(0,max(x)),ylim=c(0,0.7))
#
## Now the sample median instead (of mean)
# write a function and apply it for n=10, 20, 100
samp.dist.md <- function(n){
  y <- median(sample(x, size=n))
  for (i in 2:10000) {y <- c(y,median(sample(x, size=n)))}
  hist(y,freq=F,xlab="",main=paste("Sampling dist of sample median: n =",n),xlim=c(0,max(x)),ylim=c(0,0.7))
}
hist(x,breaks=20,main="Circulation ages of 1,000 pennies (years)",col="gray")
samp.dist.md(10)
samp.dist.md(20)
samp.dist.md(100)
#
## Now repeat for sample standard deviation
# write a function and apply it for n=10, 20, 100
samp.dist.sd <- function(n){
  y <- sqrt(var(sample(x, size=n)))
  for (i in 2:10000) {y <- c(y,sqrt(var(sample(x, size=n))))}
  hist(y,freq=F,xlab="",main=paste("Sampling dist of sample standard deviation: n =",n),xlim=c(0,15),ylim=c(0,0.8))
}
hist(x,breaks=20,main="Circulation ages of 1,000 pennies (years)",col="gray")
samp.dist.sd(10)
samp.dist.sd(20)
samp.dist.sd(100)
par(mfrow=c(1,1))