Announcements

Context: central limit theorem applies to means (sums)

# Box model: four tickets, drawn from n times with replacement
box <- c(1, 1, 1, 5)
n <- 100
X <- sample(box, size = n, replace = TRUE)
# Parameters of the box itself (the population)
true_mu <- mean(box)
# var() uses the (N - 1) divisor; multiplying by (N - 1) / N, where
# N = length(box) is the number of tickets, gives the population variance
true_sd <- sqrt(var(box) * (length(box) - 1) / length(box))
c(true_mu, true_sd)
## [1] 2.000000 1.732051
# Estimates of the same quantities computed from the n draws
c(mean(X), sd(X))
## [1] 1.92000 1.69181
# Standard error of the sample mean: population SD divided by sqrt(n)
true_sd / sqrt(n)
## [1] 0.1732051

Larger sample size

# Same experiment with a sample 100 times larger
n <- 10000
X <- sample(box, size = n, replace = TRUE)
# Estimates of the box mean and SD from the n draws
c(mean(X), sd(X))
## [1] 1.990000 1.726325
# Standard error of the sample mean: population SD divided by sqrt(n)
true_sd / sqrt(n)
## [1] 0.01732051

Using the central limit theorem

\[ \bar X_n \overset{\text{approx.}}{\sim} N\left(E[X], \frac{\text{Var}(X)}{n}\right) \quad \text{for large } n \]

# Simulate the sampling distribution of the mean: 10000 replications, each the
# mean of n draws (with replacement) from the box
n <- 50
df <- data.frame(
  Xbar = replicate(10000, mean(sample(box, n, replace = TRUE)))
)
# Overlay the normal density with mean mean(box) and standard deviation
# true_sd / sqrt(n). The sd argument is named explicitly so it cannot be
# matched to the wrong dnorm() parameter.
ggplot(df, aes(x = Xbar)) +
  geom_bar(alpha = .8, stat = "density", position = "identity") +
  ylab("Sample dist. of the mean") + theme_tufte() +
  stat_function(
    fun = dnorm,
    args = list(mean = mean(box), sd = true_sd / sqrt(n))
  ) +
  ggtitle("Central limit theorem for box example")

# Filter to Hawaiian Airlines flights, drop rows with a missing air_time, and
# "pull" the air_time variable out of the data frame as a plain vector
X <- flights %>%
  filter(carrier == "HA", !is.na(air_time)) %>%
  pull(air_time)
# Sample size = number of filtered flights kept in X
# (nrow(flights) would count the whole, unfiltered table)
n <- length(X)
n
## [1] 342
# Central limit theorem: sample mean of air_time variable with this sample size
# is approximately normal with mean and standard deviation given by
mu <- mean(X)
s <- sd(X)
se <- s / sqrt(n) # standard error of the sample mean
c(mu, se)
## [1] 623.087719   1.118724

# (a) normal approximation to P(sample mean <= 620)
pnorm(620, mean = mu, sd = se)
## [1] 0.002889733
# (b) normal approximation to P(sample mean <= 625)
pnorm(625, mean = mu, sd = se)
## [1] 0.9563062
# (c) P(620 < sample mean <= 625), as a difference of the two CDF values
pnorm(625, mean = mu, sd = se) - pnorm(620, mean = mu, sd = se)
## [1] 0.9534164
# (d) P(sample mean > 625), computed two equivalent ways:
# complement of the CDF, and the upper tail directly
1 - pnorm(625, mean = mu, sd = se)
## [1] 0.04369384
pnorm(625, mean = mu, sd = se, lower.tail = FALSE)
## [1] 0.04369384

Revisiting the standard error vs standard deviation

# Standard deviation of the individual observations in X (the spread of single
# values, as opposed to the standard error sd(X) / sqrt(n) of their mean)
sd(X)
## [1] 20.68882
# Histogram of air_time over all flights; ggplot() + geom_histogram() replaces
# the deprecated qplot() shortcut. Missing air_time values are dropped by
# stat_bin with a warning.
ggplot(flights, aes(x = air_time)) +
  geom_histogram(bins = 80) +
  theme_tufte()
## Warning: Removed 9430 rows containing non-finite values (stat_bin).

The t-distribution

Flaw of averages