8 Rethinking: Chapter 7

Ulysses’ Compass

by Richard McElreath, building on the Summaries by Solomon Kurz and Jake Thompson.

Figure 8.1: Between Scylla and Charybdis by Adolf Hirémy-Hirschl (1910). Source: https://commons.wikimedia.org/wiki/File:Adolf_Hiremy-Hirschl_-_Zwischen_Skylla_und_Charybdis.jpg

8.1 The Problem with Parameters

library(rethinking)
library(tidyverse)  # tibble / dplyr / ggplot2 verbs used throughout
data_brainsize <- tibble(
  species = c("afarensis", "africanus", "habilis", "boisei", "rudolfensis", "ergaster", "sapiens"), 
  brain_size = c(438, 452, 612, 521, 752, 871, 1350), 
  mass = c(37.0, 35.5, 34.5, 41.5, 55.5, 61.0, 53.5)) %>% 
  mutate(brain_size_scl = brain_size/ max(brain_size),
         mass_std = standardize(mass))

data_brainsize %>% 
  ggplot(aes(x = mass, y = brain_size)) +
  geom_point(shape = 21, color = clr2, fill = fll2, size = 3) +
  ggrepel::geom_text_repel(aes(label = species),
                           force = 30, min.segment.length = unit(.1, "npc"),
                           family = fnt_sel, fontface = "italic") +
  coord_fixed(ratio = .03)

8.1.1 The burial of R2

\[ R^{2} = \frac{var(outcome) - var(residuals)}{var(outcome)} = 1 - \frac{var(residuals)}{var(outcome)} \]
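As a quick illustration of the definition, \(R^2\) can also be computed by hand from an ordinary least-squares fit of the hominin data defined above (a minimal sketch using base R's lm(); the Bayesian version below uses rethinking's var2() instead of var(), so the values will differ slightly):

fit_ols <- lm(brain_size ~ mass, data = data_brainsize)
1 - var(residuals(fit_ols)) / var(data_brainsize$brain_size)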

8.1.1.1 Linear Model

\[ \begin{array}{rclr} b_{i} & \sim & Normal( \mu_i, \sigma) & \textrm{[likelihood]}\\ \mu_i & = & \alpha + \beta_{m} m_{i} & \textrm{[linear model]}\\ \alpha & \sim & Normal(0.5, 1) & \textrm{[$\alpha$ prior]}\\ \beta_{m} & \sim & Normal(0, 10) & \textrm{[$\beta_{m}$ prior]}\\ \sigma & \sim & Log-Normal(0, 1) & \textrm{[$\sigma$ prior]} \end{array} \]

model_brain_size <- quap(
  flist = alist(
    brain_size_scl ~ dnorm(mu, exp(log_sigma)),
    mu <- alpha + beta_m * mass_std,
    alpha ~ dnorm(0.5, 1),
    beta_m ~ dnorm(0, 10),
    log_sigma ~ dnorm( 0, 1 )
  ),
  data = data_brainsize
)

precis(model_brain_size) %>% 
  knit_precis()
| param     | mean  | sd   | 5.5%  | 94.5% |
|-----------|-------|------|-------|-------|
| alpha     | 0.53  | 0.07 | 0.42  | 0.64  |
| beta_m    | 0.17  | 0.07 | 0.05  | 0.29  |
| log_sigma | -1.71 | 0.29 | -2.18 | -1.24 |

extract_r2 <- function(quap_fit, decimals = 5){
  # posterior predictive mean brain size per species
  data <- sim(quap_fit) %>% 
    as_tibble() %>% 
    set_names(nm = data_brainsize$species) %>% 
    summarise(across(everything(), mean, .names = "{.col}")) %>% 
    pivot_longer(everything(), names_to = "species", values_to = "mean_brainsize") %>% 
    mutate(brain_size_scl = data_brainsize$brain_size_scl,
           diff = mean_brainsize - brain_size_scl)

  # R^2 = 1 - var(residuals) / var(outcome), with rethinking's var2()
  round(1 - var2(data$diff) / var2(data$brain_size_scl), digits = decimals)
}

set.seed(12)
extract_r2(model_brain_size)
#> [1] 0.47746

8.1.2 Higher order polynomials

\[ \begin{array}{rclcr} b_i & {\sim} & Normal(\mu_i, \sigma) & &\textrm{[likelihood]}\\ \mu_i & = & \alpha + \beta_{1} m_{i} + \beta_{2} m_{i}^2 & &\textrm{[linear model]}\\ \alpha & \sim & Normal(0.5, 1) & & \textrm{[$\alpha$ prior]}\\ \beta_{j} & \sim & Normal(0, 10) & \textrm{for}~j = 1..2 & \textrm{[$\beta$ prior]}\\ \sigma & \sim & Log-Normal(0, 1) & &\textrm{[$\sigma$ prior]} \end{array} \]

model_brain_size2 <- quap(
  flist = alist(
    brain_size_scl ~ dnorm(mu, exp(log_sigma)),
    mu <- alpha + 
      beta_m[1] * mass_std + 
      beta_m[2] * mass_std ^ 2,
    alpha ~ dnorm(0.5, 1),
    beta_m ~ dnorm(0, 10),
    log_sigma ~ dnorm( 0, 1 )
  ),
  data = data_brainsize,
  start = list(beta_m = rep(0, 2))
)

model_brain_size3 <- quap(
  flist = alist(
    brain_size_scl ~ dnorm(mu, exp(log_sigma)),
    mu <- alpha + 
      beta_m[1] * mass_std + 
      beta_m[2] * mass_std ^ 2 + 
      beta_m[3] * mass_std ^ 3,
    alpha ~ dnorm(0.5, 1),
    beta_m ~ dnorm(0, 10),
    log_sigma ~ dnorm( 0, 1 )
  ),
  data = data_brainsize,
  start = list(beta_m = rep(0, 3))
)

model_brain_size4 <- quap(
  flist = alist(
    brain_size_scl ~ dnorm(mu, exp(log_sigma)),
    mu <- alpha + 
      beta_m[1] * mass_std + 
      beta_m[2] * mass_std ^ 2 + 
      beta_m[3] * mass_std ^ 3 + 
      beta_m[4] * mass_std ^ 4,
    alpha ~ dnorm(0.5, 1),
    beta_m ~ dnorm(0, 10),
    log_sigma ~ dnorm( 0, 1 )
  ),
  data = data_brainsize,
  start = list(beta_m = rep(0, 4))
)

model_brain_size5 <- quap(
  flist = alist(
    brain_size_scl ~ dnorm(mu, exp(log_sigma)),
    mu <- alpha + 
      beta_m[1] * mass_std + 
      beta_m[2] * mass_std ^ 2 + 
      beta_m[3] * mass_std ^ 3 + 
      beta_m[4] * mass_std ^ 4 + 
      beta_m[5] * mass_std ^ 5,
    alpha ~ dnorm(0.5, 1),
    beta_m ~ dnorm(0, 10),
    log_sigma ~ dnorm( 0, 1 )
  ),
  data = data_brainsize,
  start = list(beta_m = rep(0, 5))
)

model_brain_size6 <- quap(
  flist = alist(
    brain_size_scl ~ dnorm(mu, exp(log_sigma)),
    mu <- alpha + 
      beta_m[1] * mass_std + 
      beta_m[2] * mass_std ^ 2 + 
      beta_m[3] * mass_std ^ 3 + 
      beta_m[4] * mass_std ^ 4 + 
      beta_m[5] * mass_std ^ 5 + 
      beta_m[6] * mass_std ^ 6,
    alpha ~ dnorm(0.5, 1),
    beta_m ~ dnorm(0, 10),
    log_sigma ~ dnorm( 0, 1 )
  ),
  data = data_brainsize,
  start = list(beta_m = rep(0, 6))
)
mass_seq <- seq( from = min(data_brainsize$mass_std) - .15,
                 to = max(data_brainsize$mass_std) + .15,
                 length.out = 101)

plot_poly <- function(mod, ylim){
  model_posterior_samples <- extract.samples(mod) %>% 
    as.data.frame() %>% 
    as_tibble()
  
  model_posterior_prediction_samples <- link(mod, data = tibble(mass_std = mass_seq)) %>% 
    as_tibble() %>% 
    set_names(nm = mass_seq) %>% 
    pivot_longer(cols = everything(), names_to = "mass_std", values_to = "brain_size_scl") %>% 
    mutate(mass_std = as.numeric(mass_std)) 
  
  model_posterior_prediction_pi <- model_posterior_prediction_samples %>% 
    group_by(mass_std) %>% 
    summarise(mean = mean(brain_size_scl),
              PI_lower = PI(brain_size_scl)[1],
              PI_upper = PI(brain_size_scl)[2]) %>% 
    ungroup()
  
  p <- ggplot(mapping = aes(x = mass_std * sd(data_brainsize$mass) + mean(data_brainsize$mass))) +
    geom_smooth(data = model_posterior_prediction_pi, stat = "identity",
                aes(y = mean * max(data_brainsize$brain_size),
                    ymin = PI_lower  * max(data_brainsize$brain_size),
                    ymax = PI_upper * max(data_brainsize$brain_size)),
                color = clr2, fill = fll2, size = .2) +
    geom_point(data = data_brainsize, aes(y = brain_size_scl * max(data_brainsize$brain_size)),
               color = rgb(0,0,0,.5), size = 1) +
    labs(x = "mass",
         y = "brain_size",
         title = glue("*R<sup>2</sup>:* {extract_r2(mod, decimals = 2)}")) +
    coord_cartesian(ylim = ylim) +
    theme(plot.title = element_markdown())
  
  if(identical(mod, model_brain_size6)) {
    p <- p +
    geom_hline(yintercept = 0, color = clr_dark, linetype = 3 )
  }
  
  p
}

list(model_brain_size, model_brain_size2,model_brain_size3,
     model_brain_size4, model_brain_size5, model_brain_size6) %>% 
  purrr::map2(.y = list(c(420, 1400), c(420, 1400), c(420, 1400),
                        c(300, 1950), c(300, 1950), c(-400, 1500)),
              plot_poly) %>% 
  wrap_plots() +
  plot_annotation(tag_levels = "a")

8.1.3 Underfitting

Leave-one-out (LOO): drop each data point in turn, refit the model, and watch how strongly the fitted curve reacts.

model_loo <- function(idx = 0, mod_degree = 1){
  data <- data_brainsize[-idx, ]
  
  if(mod_degree == 1){
    current_mod <- quap(
      flist = alist(
        brain_size_scl ~ dnorm(mu, exp(log_sigma)),
        mu <- alpha + 
          beta_m[1] * mass_std,
        alpha ~ dnorm(0.5, 1),
        beta_m ~ dnorm(0, 10),
        log_sigma ~ dnorm( 0, 1 )
      ),
      data = data,
      start = list(beta_m = rep(0, 1))
    )
  } else if(mod_degree == 4) {
    current_mod <- quap(
      flist = alist(
        brain_size_scl ~ dnorm(mu, exp(log_sigma)),
        mu <- alpha + 
          beta_m[1] * mass_std + 
          beta_m[2] * mass_std ^ 2 + 
          beta_m[3] * mass_std ^ 3 + 
          beta_m[4] * mass_std ^ 4,
        alpha ~ dnorm(0.5, 1),
        beta_m ~ dnorm(0, 10),
        log_sigma ~ dnorm( 0, 1 )
      ),
      data = data,
      start = list(beta_m = rep(0, 4))
    )
  } else { stop("`mod_degree` needs to be either 1 or 4") }
  
  model_posterior_prediction_samples <- link(current_mod, data = tibble(mass_std = mass_seq)) %>% 
    as_tibble() %>% 
    set_names(nm = mass_seq) %>% 
    pivot_longer(cols = everything(), names_to = "mass_std", values_to = "brain_size_scl") %>% 
    mutate(mass_std = as.numeric(mass_std)) 
  
  model_posterior_prediction_pi <- model_posterior_prediction_samples %>% 
    group_by(mass_std) %>% 
    summarise(mean = mean(brain_size_scl)) %>% 
    ungroup() %>% 
    mutate(idx = idx, mod_degree = mod_degree)
  
  model_posterior_prediction_pi
}

cross_df(list(idx = seq_along(data_brainsize$species),
              mod_degree = c(1, 4))) %>% 
  pmap_dfr(model_loo) %>% 
  ggplot(mapping = aes(x = mass_std * sd(data_brainsize$mass) + mean(data_brainsize$mass))) +
    geom_line(aes(y = mean * max(data_brainsize$brain_size),
                    group = factor(idx),
                  color = idx)) +
    geom_point(data = data_brainsize %>% mutate(idx = row_number()),
               aes(y = brain_size_scl * max(data_brainsize$brain_size),
                   fill = idx, color = after_scale(clr_darken(fill))),
               size = 2, shape = 21) +
    labs(x = "mass",
         y = "brain_size") +
  facet_wrap(mod_degree ~ ., labeller = label_both) +
  scale_color_gradientn(colors = c(clr0dd, clr0, clr2), guide = "none") +
  scale_fill_gradientn(colors = c(clr0dd, clr0, clr2), guide = "none") +
  coord_cartesian(ylim = c(0, 2e3)) +
  theme(plot.title = element_markdown())

8.2 Entropy and Accuracy

8.2.1 Entropy

Definition of Information Entropy

\[ H(p) = - E~\textrm{log}(p_{i}) = - \sum_{i = 1}^n p_{i}~\textrm{log}(p_{i}) \]

or verbally:

The uncertainty contained in a probability distribution is the average log-probability of an event.

which fulfills the requirements:

  • uncertainty should be continuous
  • uncertainty should increase with the number of possible events
  • uncertainty should be additive (checked numerically below)
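
The additivity requirement can be checked numerically: for two independent events, the entropy of the joint distribution equals the sum of the marginal entropies (a minimal sketch with made-up probabilities; both final lines print the same value):

p_rain <- c(.3, .7)
p_hot <- c(.4, .6)
entropy <- function(p){ -sum(p * log(p)) }
# the four joint rain/heat probabilities, assuming independence
p_joint <- as.vector(outer(p_rain, p_hot))
entropy(p_joint)
entropy(p_rain) + entropy(p_hot)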

Example for \(p_1 = 0.3\) and \(p_2 = 0.7\):

\[ H(p) = - \big( p_{1} \textrm{log}(p_{1}) + p_{2} \textrm{log}(p_{2}) \big) \approx 0.61 \]

p <- c( .3, .7 )
- sum( p * log(p) )
#> [1] 0.6108643

Compared to Abu Dhabi (“it hardly ever rains”)

p <- c( .01, .99 )
- sum( p * log(p) )
#> [1] 0.05600153

Entropy increases with the dimensionality of the prediction problem (e.g. predicting 🌧 / 🌨 / ☀️):

p <- c( .15, .15, .7 )
- sum( p * log(p) )
#> [1] 0.8188085

8.2.2 Accuracy

Divergence: The additional uncertainty induced by using probabilities from one distribution to describe another distribution.

The Kullback-Leibler Divergence (KL):

\[ D_{KL}(p, q) = \sum_{i} p_{i} \big( \textrm{log}(p_{i}) - \textrm{log}(q_{i})\big) = \sum_{i} p_{i} \textrm{log}\left(\frac{p_{i}}{q_{i}}\right) \]

tibble(p1 = .3,
       p2 = .7,
       q1 = seq(from = .01, to = .99, by = .01),
       q2 = 1 - q1,
       d_kl = p1 * log(p1 / q1) + p2 * log(p2 / q2)) %>% 
  ggplot(aes(x = q1, y = d_kl)) +
  geom_line(color = clr2) +
  geom_vline(xintercept = .3, color = clr_dark, linetype = 3)

8.2.3 Estimating Divergence

Log-probability score to compare the predictive accuracy of different models:

\[ S(q) = \sum_{i} \textrm{log} (q_{i}) \]

where \(i\) indexes each case and \(q_{i}\) is the likelihood for each case.

A rescaled equivalent is the deviance:

\[ D(q) = -2 \sum_{i} \textrm{log} (q_{i}) \]

and its Bayesian version, the log-pointwise-predictive density:

\[ lppd(y, \Theta) = \sum_{i} \textrm{log} \frac{1}{S} \sum_{s} p (y_{i} | \Theta_{s}) \]

where \(S\) is the number of samples and \(\Theta_{s}\) is the s-th set of sampled parameter values in the posterior distribution.

# lppd <- function (fit, ...) {
#     ll <- sim(fit, ll = TRUE, ...)
#     n <- ncol(ll)
#     ns <- nrow(ll)
#     f <- function(i) log_sum_exp(ll[, i]) - log(ns)
#     lppd <- sapply(1:n, f)
#     return(lppd)
# }
set.seed(1)
lppd(model_brain_size, n = 1e4)
#> [1]  0.6098668  0.6483438  0.5496093  0.6234934  0.4648143  0.4347605 -0.8444632

8.2.4 Scoring the right data

tibble(model_degree = 1:6,
       model = list(model_brain_size, model_brain_size2,
                    model_brain_size3, model_brain_size4,
                    model_brain_size5, model_brain_size6)) %>% 
  mutate(log_prob_score = map_dbl(model, .f = function(mod){sum(lppd(mod))}))
#> # A tibble: 6 × 3
#>   model_degree model  log_prob_score
#>          <int> <list>          <dbl>
#> 1            1 <map>            2.42
#> 2            2 <map>            2.65
#> 3            3 <map>            3.69
#> 4            4 <map>            5.32
#> 5            5 <map>           14.1 
#> 6            6 <map>           39.6
n_cores <- 8

run_sim <- function(k, n_samples, n_sim = 1e3, b_sigma = 100){
  mcreplicate(n_sim, sim_train_test(N = n_samples, k = k, b_sigma = b_sigma),
              mc.cores = n_cores) %>% 
    t() %>% 
    as_tibble() %>% 
    summarise(mean_p = mean(V1),  # V1: in-sample (training) deviance from sim_train_test()
              mean_q = mean(V2),  # V2: out-of-sample (test) deviance
              sd_p = sd(V1),
              sd_q = sd(V2)) %>% 
    mutate(k = k, n_samples = n_samples, b_sigma = b_sigma)
}

tictoc::tic()
data_sim <- cross_df(list(k = 1:5,
                          n_samples = c(20, 100))) %>% 
  pmap_dfr(run_sim)
tictoc::toc()
write_rds(data_sim, "data/rethinking_c6_data_sim.Rds")

tictoc::tic()
data_sim_var_beta <- crossing(k = 1:5,
         n_samples = c(20, 100),
         b_sigma = c(1, 0.5, 0.2)) %>% 
  pmap_dfr(run_sim)
tictoc::toc()
write_rds(data_sim_var_beta, "data/rethinking_c6_data_sim_var_beta.Rds")
data_sim <- read_rds("data/rethinking_c6_data_sim.Rds")
x_dodge <- .3
data_sim %>% 
  ggplot() +
  geom_pointrange(aes(x = k - .5 * x_dodge,
                      ymin = mean_p - sd_p, y = mean_p, ymax = mean_p + sd_p,
                      color = "train", fill = after_scale(clr_lighten(color))),
                  shape = 21) +
    geom_pointrange(aes(x = k + .5 * x_dodge,
                      ymin = mean_q - sd_q, y = mean_q, ymax = mean_q + sd_q,
                      color = "test", fill = after_scale(clr_lighten(color))),
                  shape = 21) +
  scale_color_manual("", values = c(train = clr0dd, test = clr2)) +
  facet_wrap(n_samples ~ ., scales = "free", labeller = label_both) +
  labs(x = "number of parameters", y = "deviance") +
  theme(legend.position = "bottom")

p_curves <- ggplot() +
  stat_function(fun = function(x){dnorm(x = x, mean = 0, sd = .2)},
                color = clr0dd, linetype = 1, xlim = c(-3, 3), n = 301) +
  stat_function(fun = function(x){dnorm(x = x, mean = 0, sd = .5)},
                color = clr0dd, linetype = 2, xlim = c(-3, 3), n = 301) +
  stat_function(fun = function(x){dnorm(x = x, mean = 0, sd = 1)},
                color = clr0dd, linetype = 3, xlim = c(-3, 3), n = 501) +
  labs(x = "parameter value", y = "density")

data_sim_var_beta <- read_rds("data/rethinking_c6_data_sim_var_beta.Rds")

p_lines <- data_sim_var_beta %>%
  dplyr::select(mean_p, mean_q,k:b_sigma) %>% 
  pivot_longer(cols = mean_p:mean_q, names_to = "set", values_to = "mean") %>% 
  ggplot() +
  geom_line(aes(x = k,
                y = mean,
                linetype = factor(b_sigma),
                color = set)) +
  geom_point(data = data_sim %>%
               dplyr::select(mean_p, mean_q,k:b_sigma) %>% 
               pivot_longer(cols = mean_p:mean_q, names_to = "set", values_to = "mean"),
             aes(x = k,
                 y = mean,
                 color = set,
                 fill = after_scale(clr_lighten(color))),
             shape = 21, size = .9) +
  scale_color_manual("", 
                     values = c(mean_p = clr0dd, mean_q = clr2),
                     labels = c(mean_p = "train", mean_q = "test")) +
  scale_linetype_manual("beta prior width",
                        values = c(`1` = 3, `0.5` = 2, `0.2` = 1)) +
  facet_wrap(n_samples ~ ., scales = "free", labeller = label_both) +
  labs(x = "number of parameters", y = "deviance") +
  theme(legend.position = "bottom")

p_curves + p_lines + 
  plot_annotation(tag_levels = "a") +
  plot_layout(widths = c(.5, 1), guides = "collect") &
  theme(legend.position = "bottom")

8.3 Golem taming: regularization

8.3.1 Cross-validation

  • leave-one-out cross-validation (LOOCV): dropping one data point in each fold (resulting in \(n\) test sets, so \(n\) refits of the model and \(n\) posterior distributions)
  • Pareto-smoothed importance sampling cross-validation (PSIS): approximates LOOCV by sampling from the original posterior while taking the importance/weight of each data point into account (provides feedback about its own reliability, no model re-fitting necessary)

\[ s_{PSIS} = \sqrt{N~\textrm{var}(\textrm{psis}_{i})} \]
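
This standard error can be reproduced from the pointwise PSIS values of a fitted model (a sketch using model_brain_size from above; PSIS() with pointwise = TRUE is also used further below):

psis_pointwise <- PSIS(model_brain_size, pointwise = TRUE)$PSIS
# N * var of the pointwise values, square-rooted, as in the formula above
sqrt(length(psis_pointwise) * var(psis_pointwise))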

The importance sampling estimate of out-of-sample lppd:

\[ lppd_{IS} = \sum_{i=1}^{N} \textrm{log} \frac{\sum_{s=1}^{S} r(\theta_{s})~p(y_{i}|\theta_{s})}{\sum_{s=1}^{S} r(\theta_{s})} \]

The Pareto distribution

\[ p(r | u, \sigma, k) = \sigma^{-1} \big(1 + k (r - u) \sigma ^{-1}\big)^{-\frac{1}{k}-1} \]
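
The dpareto() call below is assumed to implement the standard Pareto (Type I) density with an xmin/alpha parameterisation; a hand-rolled equivalent under that assumption:

# assumed density: alpha * xmin^alpha / x^(alpha + 1) for x >= xmin, else 0
dpareto_manual <- function(x, xmin = 1, alpha = 1){
  ifelse(x < xmin, 0, alpha * xmin ^ alpha / x ^ (alpha + 1))
}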

ggplot() +
  stat_function(fun = function(x){dpareto(x = x, xmin = 1, alpha = 1)},aes(color = "1"), xlim = c(0, 5), n = 201)+
  stat_function(fun = function(x){dpareto(x = x, xmin = 1, alpha = .7)},aes(color = "0.7"), xlim = c(0, 5), n = 201)+
  stat_function(fun = function(x){dpareto(x = x, xmin = 1, alpha = .3)},aes(color = "0.3"), xlim = c(0, 5), n = 201)+
  stat_function(fun = function(x){dpareto(x = x, xmin = 1, alpha = .1)},aes(color = "0.1"), xlim = c(0, 5),  n = 201) +
  scale_color_manual("alpha", values = c(`1` = clr0dd, `0.7` = clr1, `0.3` = clr2, `0.1` = clr3)) +
  coord_cartesian(ylim = c(-.1, 15), expand = 0) +
  theme(legend.position = "bottom")

8.3.2 Information Criteria

8.3.2.1 Akaike information criterion

Only for legacy reasons

\[ AIC = D_{train} + 2p = -2 lppd + 2p \]

AIC is an approximation that depends on:

  • flat priors (or priors overwhelmed by the likelihood)
  • the posterior distribution being approximately Gaussian
  • the sample size \(N\) being much greater than the number of parameters \(k\)
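
Under these conditions, AIC can be read off directly from the lppd of a fitted quap model (a sketch for model_brain_size, counting its three free parameters by hand):

p_free <- 3  # alpha, beta_m, log_sigma
-2 * sum(lppd(model_brain_size)) + 2 * p_free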

8.3.2.2 Widely Applicable Information Criterion

\[ WAIC(y, \Theta) = -2 (lppd - \underbrace{\sum_{i} \textrm{var}_{\theta}~\textrm{log}~p(y_{i}|\theta)}_{\textrm{penalty term}}) \]

WAIC calculations

data(cars)

set.seed(94)
model_cars <- quap(
  flist = alist(
    dist ~ dnorm(mu, sigma),
    mu <- alpha + beta_speed * speed,
    alpha ~ dnorm(0, 100),
    beta_speed ~ dnorm(0, 10),
    sigma ~ dexp(1)
  ),
  data = cars
)

set.seed(94)
n_samples <- 1e3
n_cases <-  nrow(cars)

cars_posterior_predictive_samples <- extract.samples(model_cars, n =  n_samples) %>% 
  as_tibble()

# log-likelihood matrix: rows = observations, columns = posterior samples
logprob <- sapply(1:n_samples,
                  function(idx){ 
  mu <- cars_posterior_predictive_samples$alpha[idx] + cars_posterior_predictive_samples$beta_speed[idx] * cars$speed
  dnorm( cars$dist, mean = mu, sd = cars_posterior_predictive_samples$sigma[idx], log = TRUE)})

# lppd per observation: log of the average likelihood across samples
lppd <- sapply(1:n_cases, function(i){log_sum_exp(logprob[i,]) - log(n_samples)})
# penalty per observation: variance of the log-likelihood across samples
pWAIC <- sapply(1:n_cases, function(i){var(logprob[i,])})

-2 * (sum(lppd) - sum(pWAIC))
#> [1] 423.3127
waic_vec <- -2 * (lppd - pWAIC)
sqrt(n_cases * var(waic_vec))  
#> [1] 17.81271
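
As a cross-check, the packaged estimator should land close to the by-hand values (up to Monte Carlo noise, since WAIC() draws fresh posterior samples):

set.seed(94)
WAIC(model_cars)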

8.3.3 Comparing CV, PSIS and WAIC

make_sim <- function(n, k, b_sigma) {
  
  r <- mcreplicate(n_sim, 
                   sim_train_test(N = n,
                                  k = k,
                                  b_sigma = b_sigma,
                                  WAIC = TRUE,
                                  LOOCV = TRUE, 
                                  LOOIC = TRUE),
                   mc.cores = n_cores)
  
  # the row indices pick the deviance and error entries out of the
  # output matrix returned by sim_train_test() with WAIC/LOOCV/LOOIC enabled
  t <- tibble(
      deviance_os = mean(unlist(r[2, ])),
      deviance_w = mean(unlist(r[3, ])),
      deviance_p = mean(unlist(r[11, ])),
      deviance_c = mean(unlist(r[19, ])),
      error_w = mean(unlist(r[7, ])),
      error_p = mean(unlist(r[15, ])),
      error_c = mean(unlist(r[20, ]))
      )
  
  return(t)
}

n_sim <- 1e3
n_cores <- 8

tictoc::tic()
data_sim_scores <- crossing(n = c(20, 100),
                            k = 1:5,
                            b_sigma = c(0.5, 100)) %>%
  mutate(sim = pmap(list(n, k, b_sigma), make_sim)) %>% 
  unnest(sim)
tictoc::toc()
# 119984.727 sec elapsed

write_rds(data_sim_scores, "data/rethinking_c6_data_sim_scores.Rds")
data_sim_scores <- read_rds("data/rethinking_c6_data_sim_scores.Rds")

data_sim_scores %>% 
  pivot_longer(deviance_w:deviance_c) %>% 
  mutate(criteria = ifelse(name == "deviance_w", "WAIC",
                           ifelse(name == "deviance_p", "PSIS", "CV"))) %>% 
  ggplot(aes(x = k)) +
  geom_line(aes(y = value, color = criteria)) +
  geom_point(aes(y = deviance_os, shape = factor(b_sigma)), size = 1.5) +
  scale_shape_manual("sigma", values = c(19, 1)) +
  scale_color_manual(values = c(clr0dd, clr1, clr2)) +
  labs(x = "number of parameters (k)",
       y = "average deviance") +
  facet_grid(n ~ b_sigma, scales = "free_y", labeller = label_both)+
  theme(legend.position = "bottom")

8.4 Model comparison

8.4.1 Model mis-selection

chapter6_models <- read_rds("envs/chapter6_models.rds")

set.seed(11)
WAIC(chapter6_models$model_fungus_post_treatment)
#>       WAIC      lppd  penalty  std_err
#> 1 361.4511 -177.1724 3.553198 14.17033
set.seed(77)
comp_waic <- compare(chapter6_models$model_fungus_post_treatment,
        chapter6_models$model_fungus_no_treatment,
        chapter6_models$model_fungus_only_treatment,
        func = WAIC) %>% 
  as_tibble_rn() %>% 
  mutate(model = str_remove(param,".*\\$"),
         mod = model %>%  purrr::map(.f = function(m){chapter6_models[[m]]}),
         deviance = mod %>% 
          purrr::map_dbl(.f = rethinking::deviance),
        model = str_remove(model,"model_fungus_")) %>% 
  dplyr::select(-param)

comp_waic %>% 
  dplyr::select(-mod) %>%
  dplyr::select(model, everything())
#> # A tibble: 3 × 8
#>   model           WAIC    SE dWAIC   dSE pWAIC   weight deviance
#>   <chr>          <dbl> <dbl> <dbl> <dbl> <dbl>    <dbl>    <dbl>
#> 1 post_treatment  361.  14.2   0    NA    3.57 1.00e+ 0     354.
#> 2 only_treatment  403.  11.3  41.3  10.5  2.65 1.08e- 9     397.
#> 3 no_treatment    406.  11.8  44.7  12.2  1.70 1.98e-10     402.

WAIC and PSIS result in similar values:

compare(chapter6_models$model_fungus_post_treatment,
        chapter6_models$model_fungus_no_treatment,
        chapter6_models$model_fungus_only_treatment,
        func = PSIS) %>% 
  as_tibble_rn()
#> # A tibble: 3 × 7
#>    PSIS    SE dPSIS   dSE pPSIS   weight param                                  
#>   <dbl> <dbl> <dbl> <dbl> <dbl>    <dbl> <chr>                                  
#> 1  362.  14.3   0    NA    3.93 1.00e+ 0 chapter6_models$model_fungus_post_trea…
#> 2  403.  11.3  40.6  10.4  2.69 1.50e- 9 chapter6_models$model_fungus_only_trea…
#> 3  406.  11.7  43.9  12.2  1.63 2.99e-10 chapter6_models$model_fungus_no_treatm…
tibble(post_treatment = WAIC(chapter6_models$model_fungus_post_treatment, pointwise = TRUE)$WAIC,
       only_treatment = WAIC(chapter6_models$model_fungus_only_treatment, pointwise = TRUE)$WAIC,
       no_treatment = WAIC(chapter6_models$model_fungus_no_treatment, pointwise = TRUE)$WAIC) %>% 
  mutate(model_difference_post_only = post_treatment - only_treatment,
         model_difference_no_only = no_treatment - only_treatment) %>% 
  summarise(`post-only` = sqrt(n()[[1]] * var(model_difference_post_only)),
            `no-only` = sqrt(n()[[1]] * var(model_difference_no_only))) %>% 
  pivot_longer(everything(),
               names_to = "comparison",
               values_to = "se_of_model_difference")
#> # A tibble: 2 × 2
#>   comparison se_of_model_difference
#>   <chr>                       <dbl>
#> 1 post-only                   10.5 
#> 2 no-only                      4.90

\(\rightarrow\) compare these values to the dSE column of the WAIC table above (comp_waic$dSE[[2]]).

Estimating the interval of the model difference for a z-score of \(\sim\) 2.6 (99%):

comp_waic$dWAIC[[2]] + c(-1, 1) * comp_waic$dSE[[2]] * 2.6
#> [1] 14.12572 68.47753
library(tidybayes)
comp_waic %>% 
  ggplot() +
  geom_vline(xintercept = comp_waic$WAIC[[1]], color = clr_dark, linetype = 3) +
  geom_pointinterval(aes(y = model, x = WAIC, xmin = WAIC - SE, xmax = WAIC + SE,
                      color = "WAIC", fill = after_scale(clr_lighten(color))), 
                     shape = 21, size = 2.5) +
  geom_point(aes(y = model, x = deviance, color = "WAIC")) +
  geom_pointinterval(data = comp_waic %>% filter(row_number() > 1),
                     aes(y = as.numeric(factor(model)) + .25, 
                         x = WAIC, xmin = WAIC - dSE, xmax = WAIC + dSE,
                         color = "dSE"), 
                     shape = 17, size = .9) +
  scale_color_manual(values = c(WAIC = clr2, dSE = clr0dd)) +
  theme(legend.position = "bottom")

set.seed(93)
dse_matrix <- compare(chapter6_models$model_fungus_post_treatment,
        chapter6_models$model_fungus_no_treatment,
        chapter6_models$model_fungus_only_treatment)@dSE

dse_matrix %>%
  round(digits = 2) %>% 
  as_tibble() %>% 
  set_names(., nm =  names(.) %>% str_remove(pattern = ".*model_fungus_")) %>% 
  # label rows from the matrix itself, so they match the (symmetric) columns
  mutate(` ` = colnames(dse_matrix) %>% str_remove(".*model_fungus_")) %>% 
  dplyr::select(` `, everything()) %>% 
  knitr::kable()

|                | post_treatment | no_treatment | only_treatment |
|----------------|----------------|--------------|----------------|
| post_treatment | NA             | 12.22        | 10.49          |
| no_treatment   | 12.22          | NA           | 4.86           |
| only_treatment | 10.49          | 4.86         | NA             |

Weight of a model (last column of compare(), the relative support for each model):

\[ w_{i} = \frac{\textrm{exp}(-0.5\Delta_{i})}{\Sigma_{j}\textrm{exp}(-0.5\Delta_{j})} \]

These weights are important for model averaging.
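
The weights can be recomputed from the dWAIC column of the comparison table above (a quick check against the weight column reported by compare()):

delta_waic <- comp_waic$dWAIC
round(exp(-.5 * delta_waic) / sum(exp(-.5 * delta_waic)), digits = 3)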

8.4.2 Outliers and other illusions

chapter5_models <- read_rds("envs/chapter5_models.rds")

set.seed(24071847)
compare(chapter5_models$model_age,
        chapter5_models$model_marriage,
        chapter5_models$model_multiple) %>% 
  as_tibble_rn()
#> # A tibble: 3 × 7
#>    WAIC    SE dWAIC    dSE pWAIC   weight param                         
#>   <dbl> <dbl> <dbl>  <dbl> <dbl>    <dbl> <chr>                         
#> 1  127.  14.2  0    NA      4.41 0.679    chapter5_models$model_age     
#> 2  129.  14.4  1.51  0.883  5.48 0.320    chapter5_models$model_multiple
#> 3  141.  11.0 13.5  10.4    3.71 0.000805 chapter5_models$model_marriage
psis_k <- tibble(waic_penalty = (function(){set.seed(23); WAIC(chapter5_models$model_multiple, pointwise = TRUE)$penalty})(),
       psis_k = (function(){set.seed(23); PSIS(chapter5_models$model_multiple, pointwise = TRUE)$k})(),
       location = chapter5_models$data_waffle$Loc)

p_psis_k <- psis_k %>% 
  ggplot(aes(x = psis_k ,y = waic_penalty)) +
  geom_vline(xintercept = .5, color = clr_dark, linetype = 3) +
  geom_point(shape = 21, size = 2, color = clr2, fill = fll2) +
  geom_text(data = psis_k %>%  filter(location %in% c("ME", "ID")),
            aes(x = psis_k - .15, label = location))

p_dens <- ggplot() +
  stat_function(fun = function(x){dnorm(x = x, sd = .8)}, xlim = c(-4, 4), n = 201, aes(color = "gaussian"), linetype = 3)+
  stat_function(fun = function(x){dstudent(x = x, nu = 2, sigma = .55)}, xlim = c(-4, 4), n = 201, aes(color = "student t")) +
  labs(y = "density", x = "value")

p_logdens <- ggplot() +
  stat_function(fun = function(x){-log(dnorm(x = x, sd = .8))}, xlim = c(-4, 4), n = 201, aes(color = "gaussian"), linetype = 3)+
  stat_function(fun = function(x){-log(dstudent(x = x, nu = 2, sigma = .55))}, xlim = c(-4, 4), n = 201, aes(color = "student t")) +
  labs(y = "- log density", x = "value")


p_psis_k + p_dens + p_logdens +
  plot_layout(guides = "collect") &
  scale_color_manual("distribution", values = c(gaussian = clr0dd, `student t` = clr2)) &
  theme(legend.position = "bottom")

chapter5_models$model_multiple
#> 
#> Quadratic approximate posterior distribution
#> 
#> Formula:
#> divorce_std ~ dnorm(mu, sigma)
#> mu <- alpha + beta_M * marriage_std + beta_A * median_age_std
#> alpha ~ dnorm(0, 0.2)
#> beta_A ~ dnorm(0, 0.5)
#> beta_M ~ dnorm(0, 0.5)
#> sigma ~ dexp(1)
#> 
#> Posterior means:
#>         alpha        beta_A        beta_M         sigma 
#> -2.484974e-08 -6.135134e-01 -6.538068e-02  7.851183e-01 
#> 
#> Log-likelihood: -59.24
model_multiple_student <- quap(
  flist = alist(
    divorce_std ~ dstudent(2, mu, sigma),
    mu <- alpha + beta_M * marriage_std + beta_A * median_age_std,
    alpha ~ dnorm(0, 0.2),
    beta_A ~ dnorm(0, 0.5),
    beta_M ~ dnorm(0, 0.5),
    sigma ~ dexp(1)
  ),
  data = chapter5_models$data_waffle
)
PSIS(chapter5_models$model_multiple)
#>       PSIS      lppd  penalty  std_err
#> 1 130.2264 -65.11318 6.373124 15.15747

With the Student-t distribution as the likelihood, \(k\) is reduced because there is more mass in the tails of the distribution (thus, Idaho is less surprising).

PSIS(model_multiple_student)
#>       PSIS      lppd  penalty  std_err
#> 1 133.7128 -66.85638 6.890912 11.90639
precis(chapter5_models$model_multiple) %>% 
  knit_precis()
| param  | mean  | sd   | 5.5%  | 94.5% |
|--------|-------|------|-------|-------|
| alpha  | 0.00  | 0.10 | -0.16 | 0.16  |
| beta_A | -0.61 | 0.15 | -0.85 | -0.37 |
| beta_M | -0.07 | 0.15 | -0.31 | 0.18  |
| sigma  | 0.79  | 0.08 | 0.66  | 0.91  |

precis(model_multiple_student) %>% 
  knit_precis()
| param  | mean  | sd   | 5.5%  | 94.5% |
|--------|-------|------|-------|-------|
| alpha  | 0.02  | 0.10 | -0.14 | 0.17  |
| beta_A | -0.70 | 0.13 | -0.91 | -0.49 |
| beta_M | 0.00  | 0.21 | -0.33 | 0.34  |
| sigma  | 0.55  | 0.08 | 0.42  | 0.68  |

Also, as the influence of Idaho is reduced, the estimate of beta_A moves farther from zero (from -0.61 to -0.70) in the updated model.

library(rlang)
chapter7_models <- env(
  data_brainsize = data_brainsize,
  model_brain_size = model_brain_size,
  model_brain_size2 = model_brain_size2,
  model_brain_size3 = model_brain_size3,
  model_brain_size4 = model_brain_size4,
  model_brain_size5 = model_brain_size5,
  model_brain_size6 = model_brain_size6,
  cars = cars,
  model_cars = model_cars
)

8.5 Homework

E1

Criteria defining information entropy

  1. The measure of uncertainty should be continuous (so there should be no sudden shifts in uncertainty in response to minor changes in the underlying probabilities)
  2. The measure of uncertainty should increase with the number of possible events (higher uncertainties are expected as the dimensionality of the underlying possibility increases)
  3. The measure of uncertainty should be additive (the sum of the uncertainty of all sub-events should result exactly in total uncertainty)

E2

Recall the definition of information entropy:

\[ H(p) = - \sum_{i=1}^{n} p_i \textrm{log}(p_i) = -\big( p_1 \textrm{log}(p_1) + p_2 \textrm{log}(p_2) \big) \]

p_coin <- c(tails = .3, heads = .7)
-sum(p_coin * log(p_coin))
#> [1] 0.6108643

E3

p_die <- c(`1` = .2, `2` = .25, `3` = .25, `4` = .3)
-sum(p_die * log(p_die))
#> [1] 1.376227

E4

using L’Hôpital’s rule for the limit \(\textrm{lim}_{p_{i}\rightarrow 0^{+}}~p_{i}~\textrm{log}(p_i) = 0\):

p_die <- c(`1` = 1/3, `2` = 1/3, `3` = 1/3, `4` = 0)
-sum(c(p_die[1:3] * log(p_die[1:3]), 0))
#> [1] 1.098612

M1

\[ \begin{array}{rcl} AIC & = & D_{train} + 2p\\ & = & -2 lppd + 2p \\ &&\\ WAIC(y, \Theta) & = &-2 (lppd - \underbrace{\sum_{i} \textrm{var}_{\theta}~\textrm{log}~p(y_{i}|\theta)}_{\textrm{penalty term}})\\ & = & -2 lppd + 2 \sum_{i} \textrm{var}_{\theta}~\textrm{log}~p(y_{i}|\theta)\\ \end{array} \]

\(AIC = WAIC\) if:

\[ \begin{array}{rcl} p & = & \sum_{i} \textrm{var}_{\theta}~\textrm{log}~p(y_{i}|\theta)\\ \end{array} \]

We can expect the more general criterion (WAIC) to match the more specific one (AIC), if:

  • the priors are flat (or overwhelmed by the likelihood)
  • the posterior distribution is approximately Gaussian
  • the number of samples (\(n\)) is much larger than the number of parameters (\(k\))
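
A quick numerical check with the cars example from above: under the nearly flat priors used there, the by-hand WAIC penalty should come out close to the number of free parameters:

sum(pWAIC)  # compare to the three free parameters (alpha, beta_speed, sigma)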

M2

Both model selection and model comparison list the WAIC of a suite of models. In model selection, however, all models except the best-performing one are discarded, while in model comparison the distribution of the criterion and the relative performance of the models can be used to investigate subtle differences between them. This provides much more context for the subsequent inferences.

M3

The magnitude of an information criterion depends on the number of observations; models fitted to different data sets are thus not comparable:

sim_waic <- tibble(sample_size = rep(c(100, 500, 1e3), each = 100)) %>% 
  mutate(data = purrr::map(
    sample_size,
    .f = function(sample_size){
      tibble(x = rnorm(n = sample_size),
             y = rnorm(n = sample_size, .5 + .75 * x)) %>% 
        mutate(across(everything(), standardize))
    }),
    model = purrr::map(
      data,
      function(data){
        quap(
          flist = alist(
            y ~ dnorm(mu, sigma),
            mu <- alpha + beta * x,
            alpha ~ dnorm(0, .2),
            beta ~ dnorm(0, .5),
            sigma ~ dexp(1)
          ), 
          data = data,
          start = list(alpha = 0, beta = 0, sigma = .2))
      }
    ),
    lppd = map_dbl(model, ~sum(rethinking::lppd(.x))),
    information_criterion = purrr::map(
      model,
      function(model){
        tibble(WAIC = rethinking::WAIC(model)$WAIC,
               PSIS = suppressMessages(rethinking::PSIS(model)$PSIS))
      }
    )) %>% 
  unnest(information_criterion)
sim_waic %>% 
  dplyr::select(sample_size, lppd:PSIS) %>% 
  pivot_longer(cols = lppd:PSIS,
               names_to = "information_criterion") %>% 
  ggplot(aes(x = value, y = factor(sample_size), color = information_criterion)) +
  stat_slab(slab_type = "pdf",
            aes(fill = after_scale(clr_alpha(color))), 
            size = .5,
            trim = FALSE, n = 301) +
  scale_color_manual(values = c(clr0dd, clr2, clr3),
                     guide = "none") +
  labs(y = "sample_size") +
  facet_wrap(information_criterion ~ .,
             scales = "free") +
  theme(axis.title.x = element_blank())

M4

A more concentrated prior allows less variation in the posterior (and thus a smaller variance of the log-likelihood across posterior samples). Given the formula for \(WAIC\), where the effective number of parameters (\(p_{WAIC}\)) is represented by the penalty term \(\sum_{i} \textrm{var}_{\theta}~\textrm{log}~p(y_{i}|\theta)\):

\[ WAIC(y, \Theta) = -2 (lppd - \underbrace{\sum_{i} \textrm{var}_{\theta}~\textrm{log}~p(y_{i}|\theta)}_{\textrm{penalty term}}) \]

a smaller \(\textrm{var}_{\theta}\) will also decrease the entire \(p_{WAIC}\).

library(cli)
quap_prior_var <- function(df, p_idx) {
  if(p_idx == prior_widths[[1]]){
  mod <- quap(
    flist = alist(
      y ~ dnorm(mu, sigma),
      mu <- alpha + beta_1 * x1 + beta_2 * x2 + beta_3 * x3,
      alpha ~ dnorm(0, .2),
      c(beta_1, beta_2, beta_3) ~ dnorm(0, prior_widths[[1]]),
      sigma ~ dexp(1)
    ), 
    data = df,
    start = list(alpha = 0, beta_1 = 0, beta_2 = 0,
                 beta_3 = 0, sigma = .2)   
  )
  } else if(p_idx == prior_widths[[2]]){
  mod <- quap(
    flist = alist(
      y ~ dnorm(mu, sigma),
      mu <- alpha + beta_1 * x1 + beta_2 * x2 + beta_3 * x3,
      alpha ~ dnorm(0, .2),
      c(beta_1, beta_2, beta_3) ~ dnorm(0, prior_widths[[2]]),
      sigma ~ dexp(1)
    ), 
    data = df,
    start = list(alpha = 0, beta_1 = 0, beta_2 = 0,
                 beta_3 = 0, sigma = .2))
    } else if(p_idx == prior_widths[[3]]){
  mod <- quap(
    flist = alist(
      y ~ dnorm(mu, sigma),
      mu <- alpha + beta_1 * x1 + beta_2 * x2 + beta_3 * x3,
      alpha ~ dnorm(0, .2),
      c(beta_1, beta_2, beta_3) ~ dnorm(0, prior_widths[[3]]),
      sigma ~ dexp(1)
    ), 
    data = df,
    start = list(alpha = 0, beta_1 = 0, beta_2 = 0,
                 beta_3 = 0, sigma = .2))
    } else if(p_idx == prior_widths[[4]]){
  mod <- quap(
    flist = alist(
      y ~ dnorm(mu, sigma),
      mu <- alpha + beta_1 * x1 + beta_2 * x2 + beta_3 * x3,
      alpha ~ dnorm(0, .2),
      c(beta_1, beta_2, beta_3) ~ dnorm(0, prior_widths[[4]]),
      sigma ~ dexp(1)
    ), 
    data = df,
    start = list(alpha = 0, beta_1 = 0, beta_2 = 0,
                 beta_3 = 0, sigma = .2)   
  )
    } else { 
      stop(glue::glue("p_idx needs to be one of ({prior_widths[[1]]}, {prior_widths[[2]]}, {prior_widths[[3]]} or {prior_widths[[4]]})"))
      }
  cli_progress_update(.envir = .GlobalEnv)
  
  tibble(model = list(mod)) %>% 
    mutate(lppd = sum(rethinking::lppd(mod)),
           WAIC = rethinking::WAIC(mod)$WAIC,
           PSIS = suppressMessages(rethinking::PSIS(mod)$PSIS))
}

set.seed(42)
prior_widths <- c(.1, .32, 1, 3.2)
n <- 75
cli_progress_bar("Simulate | Prior Width", total = n * length(prior_widths))
prior_sim <- tibble(prior_sd = rep(prior_widths, each = n)) %>%
  mutate(
    sample_data = purrr::map(1:n(),
                             function(x, prior_sd) {
                               n <- 20
                               tibble(x1 = rnorm(n = n),
                                      x2 = rnorm(n = n),
                                      x3 = rnorm(n = n)) %>%
                                 mutate(y = rnorm(n = n, mean = 0.3 + 0.8 * x1 +
                                                    0.6 * x2 + 1.2 * x3),
                                        across(everything(), standardize))
                             }),
    mod = map2(sample_data, prior_sd, quap_prior_var)) %>% 
  unnest(mod) %>% 
  mutate(p_waic = .5 * (WAIC + 2 * lppd)) # invert WAIC = -2 * (lppd - p) to recover the penalty p

prior_sim %>%
  ggplot(aes(x = p_waic, y = factor(prior_sd))) +
   stat_slab(slab_type = "pdf",
            fill = fll0,
            color = clr0dd,
            size = .5,
            adjust = .5,
            trim = FALSE, 
            n = 501) +
  labs(y = "prior_sd")

M5

Informative priors prevent overfitting because they prevent the model from simply encoding the data.

M6

Overly informative priors result in underfitting because the likelihood has only a marginal influence on the posterior. As a result, the posterior simply resembles the prior.

H1

data(Laffer)
data_laffer <- Laffer %>% 
  as_tibble() %>% 
  mutate(dplyr::across(everything(), standardize, .names = "{.col}_std"),
         tax_rate_std_sq = tax_rate_std ^ 2)

model_laffer_0 <- quap(
  flist = alist(
    tax_revenue_std ~ dnorm(mu, sigma),
    mu <- alpha + 0 * tax_rate_std,  # intercept-only, but keeps tax_rate_std available for link()
    alpha ~ dnorm(0, .2),
    sigma ~ dexp(1)
  ),
  data = data_laffer
)


tax_seq <- seq(from = min(data_laffer$tax_rate_std) - .05 * diff(range(data_laffer$tax_rate_std)),
               to = max(data_laffer$tax_rate_std) + .05 * diff(range(data_laffer$tax_rate_std)),
               length.out = 101)

tax_model_0_posterior_prediction_samples <- link(model_laffer_0,
                                               data = data.frame(tax_rate_std = tax_seq)) %>% 
  as_tibble() %>% 
  set_names(nm = tax_seq) %>% 
  pivot_longer(cols = everything(), names_to = "tax_rate_std", values_to = "tax_revenue_std") %>% 
  mutate(tax_rate_std = as.numeric(tax_rate_std),
         tax_revenue = tax_revenue_std * sd(data_laffer$tax_revenue) + mean(data_laffer$tax_revenue),
         tax_rate = tax_rate_std * sd(data_laffer$tax_rate) + mean(data_laffer$tax_rate)) 

tax_model_0_posterior_prediction_pi <- tax_model_0_posterior_prediction_samples %>% 
  group_by(tax_rate) %>% 
  summarise(mean = mean(tax_revenue_std),
            PI_lower = PI(tax_revenue_std)[1],
            PI_upper = PI(tax_revenue_std)[2]) %>% 
  ungroup()

model_laffer_1 <- quap(
  flist = alist(
    tax_revenue_std ~ dnorm(mu, sigma),
    mu <- alpha + beta_1 * tax_rate_std,
    alpha ~ dnorm(0, .2),
    beta_1 ~ dnorm(0, .5),
    sigma ~ dexp(1)
  ),
  data = data_laffer
)

tax_model_1_posterior_prediction_samples <- link(model_laffer_1,
                                               data = data.frame(tax_rate_std = tax_seq)) %>% 
  as_tibble() %>% 
  set_names(nm = tax_seq) %>% 
  pivot_longer(cols = everything(), names_to = "tax_rate_std", values_to = "tax_revenue_std") %>% 
  mutate(tax_rate_std = as.numeric(tax_rate_std),
         tax_revenue = tax_revenue_std * sd(data_laffer$tax_revenue) + mean(data_laffer$tax_revenue),
         tax_rate = tax_rate_std * sd(data_laffer$tax_rate) + mean(data_laffer$tax_rate)) 

tax_model_1_posterior_prediction_pi <- tax_model_1_posterior_prediction_samples %>% 
  group_by(tax_rate) %>% 
  summarise(mean = mean(tax_revenue_std),
            PI_lower = PI(tax_revenue_std)[1],
            PI_upper = PI(tax_revenue_std)[2]) %>% 
  ungroup()

model_laffer_2 <- quap(
  flist = alist(
    tax_revenue_std ~ dnorm(mu, sigma),
    mu <- alpha + beta_1 * tax_rate_std + beta_2 * tax_rate_std ^ 2,
    alpha ~ dnorm(0, .2),
    c(beta_1, beta_2) ~ dnorm(0, .5),
    sigma ~ dexp(1)
  ),
  data = data_laffer
)

tax_model_2_posterior_prediction_samples <- link(model_laffer_2,
                                               data = data.frame(tax_rate_std = tax_seq)) %>% 
  as_tibble() %>% 
  set_names(nm = tax_seq) %>% 
  pivot_longer(cols = everything(), names_to = "tax_rate_std", values_to = "tax_revenue_std") %>% 
  mutate(tax_rate_std = as.numeric(tax_rate_std),
         tax_revenue = tax_revenue_std * sd(data_laffer$tax_revenue) + mean(data_laffer$tax_revenue),
         tax_rate = tax_rate_std * sd(data_laffer$tax_rate) + mean(data_laffer$tax_rate)) 

tax_model_2_posterior_prediction_pi <- tax_model_2_posterior_prediction_samples %>% 
  group_by(tax_rate) %>% 
  summarise(mean = mean(tax_revenue_std),
            PI_lower = PI(tax_revenue_std)[1],
            PI_upper = PI(tax_revenue_std)[2]) %>% 
  ungroup() 

n_knots <- 5
knot_list <- quantile(data_laffer$tax_rate_std, probs = seq(0, 1, length.out = n_knots))
library(splines)
b_spline_laffer <- bs(data_laffer$tax_rate_std,
                      knots = knot_list[-c(1, n_knots)],
                      degree = 3,
                      intercept = TRUE)

model_laffer_3 <- quap(
  flist = alist(
    tax_revenue_std ~ dnorm(mu, sigma),
    mu <- alpha + B %*% w,
    alpha ~ dnorm(0, .5),
    w ~ dnorm(0, .5),
    sigma ~ dexp(1)
  ),
  data = list(tax_revenue_std = data_laffer$tax_revenue_std,
              B = b_spline_laffer),
  start = list(w = rep(0, ncol(b_spline_laffer)))
)

b_spline_tax <- bs(tax_seq,
                   knots = knot_list[-c(1, n_knots)],
                   degree = 3,
                   intercept = TRUE)

tax_model_3_posterior_prediction_samples <- link(model_laffer_3, data = list(B = b_spline_tax)) %>% 
  as_tibble() %>% 
  set_names(nm = tax_seq) %>% 
  pivot_longer(cols = everything(), names_to = "tax_rate_std", values_to = "tax_revenue_std") %>% 
  mutate(tax_rate_std = as.numeric(tax_rate_std),
         tax_revenue = tax_revenue_std * sd(data_laffer$tax_revenue) + mean(data_laffer$tax_revenue),
         tax_rate = tax_rate_std * sd(data_laffer$tax_rate) + mean(data_laffer$tax_rate)) 

tax_model_3_posterior_prediction_pi <- tax_model_3_posterior_prediction_samples %>% 
  group_by(tax_rate) %>% 
  summarise(mean = mean(tax_revenue_std),
            PI_lower = PI(tax_revenue_std)[1],
            PI_upper = PI(tax_revenue_std)[2]) %>% 
  ungroup() 



rethinking::compare(model_laffer_0,
                    model_laffer_1,
                    model_laffer_2,
                    model_laffer_3,
                    func = WAIC) %>% 
  knit_precis(param_name = "model")
| model          | WAIC  | SE    | dWAIC | dSE  | pWAIC | weight |
|----------------|-------|-------|-------|------|-------|--------|
| model_laffer_1 | 89.18 | 22.74 | 0.00  | NA   | 6.14  | 0.33   |
| model_laffer_2 | 89.46 | 25.26 | 0.28  | 3.03 | 7.35  | 0.29   |
| model_laffer_0 | 90.05 | 21.21 | 0.87  | 3.44 | 5.19  | 0.22   |
| model_laffer_3 | 90.67 | 23.47 | 1.49  | 2.49 | 7.93  | 0.16   |

First of all, all models share a substantial part of the weight within the model comparison, making it hard to opt for one in particular. Although the quadratic model is essentially tied with the linear one at the top of the table, the data fall almost exclusively inside the rising part of the parabola; the situation for the spline is similar.

There might be a saturation effect after an initial increase, but since even the flat model (model_laffer_0) receives a substantial share of the weight (\(\sim\) 22 %), the other models might just be overfitting the outliers while tax revenue and tax rate might actually be independent.

H2

model_laffer_0_student <- quap(
  flist = alist(
    tax_revenue_std ~ dstudent(2, mu, sigma),
    mu <- alpha + 0 * tax_rate_std,
    alpha ~ dnorm(0, 0.2),
    sigma ~ dexp(1)
  ),
  data = data_laffer
)

tax_model_0_student_posterior_prediction_samples <- link(model_laffer_0_student,
                                               data = data.frame(tax_rate_std = tax_seq)) %>% 
  as_tibble() %>% 
  set_names(nm = tax_seq) %>% 
  pivot_longer(cols = everything(), names_to = "tax_rate_std", values_to = "tax_revenue_std") %>% 
  mutate(tax_rate_std = as.numeric(tax_rate_std),
         tax_revenue = tax_revenue_std * sd(data_laffer$tax_revenue) + mean(data_laffer$tax_revenue),
         tax_rate = tax_rate_std * sd(data_laffer$tax_rate) + mean(data_laffer$tax_rate)) 

tax_model_0_student_posterior_prediction_pi <- tax_model_0_student_posterior_prediction_samples %>% 
  group_by(tax_rate) %>% 
  summarise(mean = mean(tax_revenue_std),
            PI_lower = PI(tax_revenue_std)[1],
            PI_upper = PI(tax_revenue_std)[2]) %>% 
  ungroup()

model_laffer_1_student <- quap(
  flist = alist(
    tax_revenue_std ~ dstudent(2, mu, sigma),
    mu <- alpha + beta_1 * tax_rate_std,
    alpha ~ dnorm(0, 0.2),
    beta_1 ~ dnorm(0, .5),
    sigma ~ dexp(1)
  ),
  data = data_laffer
)

tax_model_1_student_posterior_prediction_samples <- link(model_laffer_1_student,
                                               data = data.frame(tax_rate_std = tax_seq)) %>% 
  as_tibble() %>% 
  set_names(nm = tax_seq) %>% 
  pivot_longer(cols = everything(), names_to = "tax_rate_std", values_to = "tax_revenue_std") %>% 
  mutate(tax_rate_std = as.numeric(tax_rate_std),
         tax_revenue = tax_revenue_std * sd(data_laffer$tax_revenue) + mean(data_laffer$tax_revenue),
         tax_rate = tax_rate_std * sd(data_laffer$tax_rate) + mean(data_laffer$tax_rate)) 

tax_model_1_student_posterior_prediction_pi <- tax_model_1_student_posterior_prediction_samples %>% 
  group_by(tax_rate) %>% 
  summarise(mean = mean(tax_revenue_std),
            PI_lower = PI(tax_revenue_std)[1],
            PI_upper = PI(tax_revenue_std)[2]) %>% 
  ungroup()

model_laffer_2_student <- quap(
  flist = alist(
    tax_revenue_std ~ dstudent(2, mu, sigma),
    mu <- alpha + beta_1 * tax_rate_std + beta_2 * tax_rate_std ^ 2,
    alpha ~ dnorm(0, 0.2),
    c(beta_1, beta_2) ~ dnorm(0, .5),
    sigma ~ dexp(1)
  ),
  data = data_laffer
)

tax_model_2_student_posterior_prediction_samples <- link(model_laffer_2_student,
                                               data = data.frame(tax_rate_std = tax_seq)) %>% 
  as_tibble() %>% 
  set_names(nm = tax_seq) %>% 
  pivot_longer(cols = everything(), names_to = "tax_rate_std", values_to = "tax_revenue_std") %>% 
  mutate(tax_rate_std = as.numeric(tax_rate_std),
         tax_revenue = tax_revenue_std * sd(data_laffer$tax_revenue) + mean(data_laffer$tax_revenue),
         tax_rate = tax_rate_std * sd(data_laffer$tax_rate) + mean(data_laffer$tax_rate)) 

tax_model_2_student_posterior_prediction_pi <- tax_model_2_student_posterior_prediction_samples %>% 
  group_by(tax_rate) %>% 
  summarise(mean = mean(tax_revenue_std),
            PI_lower = PI(tax_revenue_std)[1],
            PI_upper = PI(tax_revenue_std)[2]) %>% 
  ungroup() 

model_laffer_3_student <- quap(
  flist = alist(
    tax_revenue_std ~ dstudent(2, mu, sigma),
    mu <- alpha + B %*% w,
    alpha ~ dnorm(0, .5),
    w ~ dnorm(0, .5),
    sigma ~ dexp(1)
  ),
  data = list(tax_revenue_std = data_laffer$tax_revenue_std,
              B = b_spline_laffer),
  start = list(w = rep(0, ncol(b_spline_laffer)))
)

tax_model_3_student_posterior_prediction_samples <- link(model_laffer_3_student, 
                                                         data = list(B = b_spline_tax)) %>% 
  as_tibble() %>% 
  set_names(nm = tax_seq) %>% 
  pivot_longer(cols = everything(), names_to = "tax_rate_std", values_to = "tax_revenue_std") %>% 
  mutate(tax_rate_std = as.numeric(tax_rate_std),
         tax_revenue = tax_revenue_std * sd(data_laffer$tax_revenue) + mean(data_laffer$tax_revenue),
         tax_rate = tax_rate_std * sd(data_laffer$tax_rate) + mean(data_laffer$tax_rate)) 

tax_model_3_student_posterior_prediction_pi <- tax_model_3_student_posterior_prediction_samples %>% 
  group_by(tax_rate) %>% 
  summarise(mean = mean(tax_revenue_std),
            PI_lower = PI(tax_revenue_std)[1],
            PI_upper = PI(tax_revenue_std)[2]) %>% 
  ungroup()
get_psis_and_waic <- function(model, model_type,  m_idx){
  PSIS(model, pointwise = TRUE) %>% 
    as_tibble() %>% 
    dplyr::select(PSIS = PSIS, lppd_psis = lppd, `psis-k` = k) %>% 
    set_names(nm = names(.) %>% str_c(., "_",model_type,"_m",m_idx)) %>% 
    bind_cols(WAIC(model, pointwise = TRUE) %>% 
    as_tibble() %>% 
          dplyr::select(WAIC = WAIC, lppd_waic = lppd, `waic-penalty` = penalty) %>% 
    set_names(nm = names(.) %>% str_c(., "_",model_type,"_m",m_idx)))
}

data_laffer_waic <- tibble(model = list(model_laffer_0, model_laffer_1, model_laffer_2, model_laffer_3,
                                        model_laffer_0_student, model_laffer_1_student,
                                        model_laffer_2_student, model_laffer_3_student),
       model_type = rep(c("norm", "student"), each = 4),
       m_idx = c(0:3, 0:3)) %>% 
  pmap(get_psis_and_waic) %>% 
  reduce(bind_cols, .init = data_laffer) %>% 
  mutate(rn = row_number())
p0 <- ggplot(mapping = aes(x = tax_rate)) +
  geom_smooth(data = tax_model_0_posterior_prediction_pi,
              stat = "identity",
              aes(y = mean, ymin = PI_lower, ymax = PI_upper,
                  color = "normal", fill = after_scale(clr_alpha(color))),
              size = .2) +
  geom_point(data = data_laffer_waic,
             aes(y = tax_revenue_std, size = WAIC_norm_m0),
             color = clr_dark, fill = fll0, shape = 21) +
  geom_text(data = data_laffer_waic %>%
              filter(rn %in% c(1, 11, 12)), 
            aes(y = tax_revenue_std, label = rn), family = fnt_sel) +
  labs(subtitle = "0: flat")

p1 <- ggplot(mapping = aes(x = tax_rate)) +
  geom_smooth(data = tax_model_1_posterior_prediction_pi,
              stat = "identity",
              aes(y = mean, ymin = PI_lower, ymax = PI_upper,
                  color = "normal", fill = after_scale(clr_alpha(color))),
              size = .2) +
    geom_point(data = data_laffer_waic,
             aes(y = tax_revenue_std, size = WAIC_norm_m1),
             color = clr_dark, fill = fll0, shape = 21) +
  labs(subtitle = "1: linear")

p2 <- ggplot(mapping = aes(x = tax_rate)) +
  geom_smooth(data = tax_model_2_posterior_prediction_pi,
              stat = "identity",
              aes(y = mean, ymin = PI_lower, ymax = PI_upper,
                  color = "normal", fill = after_scale(clr_alpha(color))),
              size = .2) +
  geom_point(data = data_laffer_waic,
             aes(y = tax_revenue_std, size = WAIC_norm_m2),
             color = clr_dark, fill = fll0, shape = 21) +
    labs(subtitle = "2: quadratic")

p3 <- ggplot(mapping = aes(x = tax_rate)) +
  geom_smooth(data = tax_model_3_posterior_prediction_pi,
              stat = "identity",
              aes(y = mean, ymin = PI_lower, ymax = PI_upper,
                  color = "normal", fill = after_scale(clr_alpha(color))),
              size = .2) +
    geom_point(data = data_laffer_waic,
             aes(y = tax_revenue_std, size = WAIC_norm_m3),
             color = clr_dark, fill = fll0, shape = 21) +
  labs(subtitle = "3: spline")

p0s <- ggplot(mapping = aes(x = tax_rate)) +
  geom_smooth(data = tax_model_0_student_posterior_prediction_pi,
              stat = "identity",
              aes(y = mean, ymin = PI_lower, ymax = PI_upper,
                  color = "student", fill = after_scale(clr_alpha(color))),
              size = .2) +
  geom_point(data = data_laffer_waic,
             aes(y = tax_revenue_std, size = WAIC_student_m0),
             color = clr_dark, fill = fll0, shape = 21) +
  labs(subtitle = "0: flat")

p1s <- ggplot(mapping = aes(x = tax_rate)) +
  geom_smooth(data = tax_model_1_student_posterior_prediction_pi,
              stat = "identity",
              aes(y = mean, ymin = PI_lower, ymax = PI_upper,
                  color = "student", fill = after_scale(clr_alpha(color))),
              size = .2) +
    geom_point(data = data_laffer_waic,
             aes(y = tax_revenue_std, size = WAIC_student_m1),
             color = clr_dark, fill = fll0, shape = 21) +
  labs(subtitle = "1: linear")

p2s <- ggplot(mapping = aes(x = tax_rate)) +
  geom_smooth(data = tax_model_2_student_posterior_prediction_pi,
              stat = "identity",
              aes(y = mean, ymin = PI_lower, ymax = PI_upper,
                  color = "student", fill = after_scale(clr_alpha(color))),
              size = .2) +
  geom_point(data = data_laffer_waic,
             aes(y = tax_revenue_std, size = WAIC_student_m2),
             color = clr_dark, fill = fll0, shape = 21) +
    labs(subtitle = "2: quadratic")

p3s <- ggplot(mapping = aes(x = tax_rate)) +
  geom_smooth(data = tax_model_3_student_posterior_prediction_pi,
              stat = "identity",
              aes(y = mean, ymin = PI_lower, ymax = PI_upper,
                  color = "student", fill = after_scale(clr_alpha(color))),
              size = .2) +
    geom_point(data = data_laffer_waic,
             aes(y = tax_revenue_std, size = WAIC_student_m3),
             color = clr_dark, fill = fll0, shape = 21) +
  labs(subtitle = "3: spline")


waic_range <- data_laffer_waic %>%
  pivot_longer(cols = starts_with("WAIC")) %>% 
  .$value %>%
  range()

p0 + p1 + p2 + p3  +
  p0s + p1s + p2s + p3s +
  plot_layout(nrow = 2, guides = "collect")  &
  theme(plot.subtitle = element_text(hjust = .5), legend.position = "bottom") &
  scale_size_continuous("WAIC",
                        breaks = c(5,10,15,20),
                        range =  c(.5, 7), limits = waic_range) &
  scale_color_manual("Prior Type", values = c(normal = clr0d, student = clr1)) &
  labs(y = "tax_revenue")

data_laffer_waic %>% 
  dplyr::select(c(starts_with("waic",ignore.case = FALSE),
                  starts_with("psis",ignore.case = FALSE))) %>% 
  mutate(rn = row_number()) %>% 
  pivot_longer(-rn) %>% 
  separate(name, into = c("statistic", "model_type", "model"), sep = "_") %>% 
  pivot_wider(names_from = "statistic", values_from = "value") %>% 
  ggplot(aes(x = `psis-k`, y = `waic-penalty`)) +
  geom_vline(xintercept = .5, linetype = 3, color = clr_dark) +
  geom_point(aes(color = model_type, fill = after_scale(clr_alpha(color))),
             shape = 21, size = 1.5) +
  ggrepel::geom_text_repel(data = . %>% filter(rn %in% c(1, 11, 12)), 
                           aes(label = rn), family = fnt_sel) +
  facet_grid(model_type ~ model, scales = "free")+
  scale_color_manual("Prior Type", values = c(normal = clr0d, student = clr1)) +
  theme(legend.position = "bottom",
        panel.background = element_rect(fill = "transparent", color = clr0d))

H3

data_birds <- tibble(
  island = 1:3,
  species_a = c(.2, .8, .05),
  species_b = c(.2, .1, .15),
  species_c = c(.2, .05, .7),
  species_d = c(.2, .025, .05),
  species_e = c(.2, .025, .05)
)

data_birds %>% 
  group_by(island) %>% 
  mutate(total = sum(across(starts_with("species"))),
         entropy = -sum(across(starts_with("species"),
                              .fns = ~(function(x){x * log(x)})(.x)))) %>% 
  ungroup()
#> # A tibble: 3 × 8
#>   island species_a species_b species_c species_d species_e total entropy
#>    <int>     <dbl>     <dbl>     <dbl>     <dbl>     <dbl> <dbl>   <dbl>
#> 1      1      0.2       0.2       0.2      0.2       0.2       1   1.61 
#> 2      2      0.8       0.1       0.05     0.025     0.025     1   0.743
#> 3      3      0.05      0.15      0.7      0.05      0.05      1   0.984

The entropy of island 1 is highest, as all bird species there are equally common. In contrast, island 2 has the lowest entropy, which corresponds to the most uneven bird distribution.

data_birds_compact <- tibble(island = 1:3,
                          birds = list(data_birds[1, 2:6] %>% unlist(),
                                       data_birds[2, 2:6] %>% unlist(),
                                       data_birds[3, 2:6] %>% unlist()))


kl_div <- function(p,q){
  sum(p * log(p/q))
}

kl_birds <- function(i1, i2){
  kl_div(p = data_birds_compact[i1,]$birds[[1]],
         q = data_birds_compact[i2,]$birds[[1]])
}

cross_df(list(i1 = 1:3, i2 = 1:3 )) %>% 
  mutate(kl_divergence = map2_dbl(i1, i2,.f = kl_birds)) %>% 
  ggplot(aes(x = i1, y = i2, fill = kl_divergence)) +
  geom_tile() +
  geom_text(aes(label = round(kl_divergence, digits = 2)),
            color = "white", family = fnt_sel) +
  scale_fill_gradientn(colours = c(clr0d, clr1)) +
  coord_equal()

Islands 1 and 3 predict each other reasonably well; these are the islands with the highest entropy to begin with, and they are thus not easily surprised. Using island 1 as the predictor (i2 / q) generally produces the lowest KL divergences.

H4

dagify(M ~ A + H,
       coords = tibble(name = c("M", "A", "H"),
                     x = c(.5, 0, 1),
                     y = c(0, 1, 1))) %>%
  fortify() %>% 
  mutate(stage = if_else(name == "M", "response",
                         if_else(name %in% c("A", "H"),
                                 "predictor", "confounds"))) %>% 
  plot_dag(clr_in = clr3) + 
  scale_y_continuous(limits = c(-.05, 1.05)) +
  scale_x_continuous(limits = c(-.05, 1.05)) +
  coord_fixed(ratio = .6)

chapter6_models$model_happy
#> 
#> Quadratic approximate posterior distribution
#> 
#> Formula:
#> happiness ~ dnorm(mu, sigma)
#> mu <- alpha + beta_age * age_trans
#> alpha ~ dnorm(0, 1)
#> beta_age ~ dnorm(0, 2)
#> sigma ~ dexp(1)
#> 
#> Posterior means:
#>         alpha      beta_age         sigma 
#>  1.028282e-07 -1.313332e-07  1.210334e+00 
#> 
#> Log-likelihood: -1623.32
rethinking::compare(chapter6_models$model_happy,
                    chapter6_models$model_happy_married,
                    func = WAIC) %>% 
  knit_precis(param_name = "model")
model WAIC SE dWAIC dSE pWAIC weight
chapter6_models$model_happy_married 2817.31 41.49 0.00 NA 3.74 1
chapter6_models$model_happy 3252.08 28.37 434.77 38.87 2.41 0

The model comparison very strongly favors the model that includes marriage status as a predictor of happiness (model_happy_married). This is because conditioning on marriage opens the collider and introduces a spurious association between age and happiness. That association improves prediction inside the small world despite having no causal justification.
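
A minimal collider sketch with simulated (hypothetical) data: x and y are independent, but selecting on their common effect c induces an association that a predictive model can exploit.

set.seed(42)
n_sim <- 1e4
collider_sim <- tibble(x = rnorm(n_sim),
                       y = rnorm(n_sim)) %>% 
  mutate(c = rbinom(n_sim, size = 1, prob = inv_logit(x + y)))

cor(collider_sim$x, collider_sim$y)            # ~ 0: marginally independent
with(filter(collider_sim, c == 1), cor(x, y))  # noticeably negative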

H5

data(foxes)

data_fox <- foxes %>%
  as_tibble() %>% 
  drop_na(everything()) %>%
  mutate(across(-group, standardize,
                .names = "{str_to_lower(.col)}_std"))

fox_weight_range <- tibble(weight = c(2.2, 14),
                           weight_std = (weight - mean(data_fox$weight))/ sd(data_fox$weight))

dag_fox <- dagify(
  W ~ F + G,
  G ~ F,
  F ~ A,
  exposure = "A",
  outcome = "W",
  coords = tibble(name = c("W", "F", "G", "A"),
                  x = c(.5, 0, 1, .5),
                  y = c(0, .5, .5, 1)))

dag_fox %>%
  fortify() %>% 
  mutate(stage = if_else(name == "W", "response",
                         if_else(name %in% c("A", "F", "G"),
                                 "predictor", "confounds"))) %>% 
  plot_dag(clr_in = clr2) + 
  scale_y_continuous(limits = c(-.1, 1.1)) +
  coord_fixed(ratio = .6)

model_fox_1 <- quap(
  flist = alist(
    weight_std ~ dnorm(mu, sigma),
    mu <- alpha + beta_food * avgfood_std + beta_groupsize * groupsize_std + beta_area * area_std,
    alpha ~ dnorm(0,.2),
    c(beta_food, beta_groupsize, beta_area) ~ dnorm(0, .5),
    sigma ~ dexp(1)
  ),
  data = data_fox
)

model_fox_2 <- quap(
  flist = alist(
    weight_std ~ dnorm(mu, sigma),
    mu <- alpha + beta_food * avgfood_std + beta_groupsize * groupsize_std,
    alpha ~ dnorm(0,.2),
    c(beta_food, beta_groupsize) ~ dnorm(0, .5),
    sigma ~ dexp(1)
  ),
  data = data_fox
)

model_fox_3 <- quap(
  flist = alist(
    weight_std ~ dnorm(mu, sigma),
    mu <- alpha + beta_groupsize * groupsize_std + beta_area * area_std,
    alpha ~ dnorm(0,.2),
    c(beta_groupsize, beta_area) ~ dnorm(0, .5),
    sigma ~ dexp(1)
  ),
  data = data_fox
)

model_fox_4 <- quap(
  flist = alist(
    weight_std ~ dnorm(mu, sigma),
    mu <- alpha + beta_food * avgfood_std,
    alpha ~ dnorm(0,.2),
    beta_food ~ dnorm(0, .5),
    sigma ~ dexp(1)
  ),
  data = data_fox
)

model_fox_5 <- quap(
  flist = alist(
    weight_std ~ dnorm(mu, sigma),
    mu <- alpha + beta_area * area_std,
    alpha ~ dnorm(0,.2),
    beta_area ~ dnorm(0, .5),
    sigma ~ dexp(1)
  ),
  data = data_fox
)

compare(model_fox_1,
        model_fox_2,
        model_fox_3,
        model_fox_4,
        model_fox_5,
        func = WAIC) %>% 
  knit_precis(param_name = "model")
model WAIC SE dWAIC dSE pWAIC weight
model_fox_1 323.46 16.32 0.00 NA 4.96 0.39
model_fox_2 323.77 16.09 0.31 3.62 3.67 0.34
model_fox_3 324.26 15.79 0.80 2.96 3.89 0.26
model_fox_4 333.54 13.85 10.08 7.22 2.46 0.00
model_fox_5 333.89 13.80 10.43 7.27 2.72 0.00

There are two groups of models (A: model_fox_1-3 and B: model_fox_4-5). The models in group A close all the backdoor paths into \(W\). Within group B, conditioning on A or on F is essentially equivalent, because F is a descendant of A and lies between A and the rest of the DAG.
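
Since dag_fox is a {dagitty} object (via dagify()), this reading can be checked programmatically; a sketch, assuming the {dagitty} package is available, where the empty set means no adjustment is needed for the total effect of A on W:

library(dagitty)

adjustmentSets(dag_fox, exposure = "A", outcome = "W", effect = "total")
#>  {}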

8.6 {brms} section

8.6.1 The problem with parameters

8.6.1.1 More parameters (almost) always improve fit.

brms_c7_model_brain_size <- brm(
  data = data_brainsize,
  family = gaussian,
  brain_size_scl ~ 1 + mass_std,
  prior = c(prior(normal(0.5, 1), class = Intercept),
            prior(normal(0, 10), class = b),
            prior(lognormal(0, 1), class = sigma)),
  iter = 2000, warmup = 1000,
  chains = 4, cores = 4,
  seed = 42,
  file = "brms/brms_c7_model_brain_size"
)

mixedup::summarise_model(brms_c7_model_brain_size)
#>     Group Effect Variance   SD SD_2.5 SD_97.5 Var_prop
#>  Residual            0.07 0.27   0.13    0.60     1.00
#>       Term Value   SE Lower_2.5 Upper_97.5
#>  Intercept  0.53 0.11      0.32       0.76
#>   mass_std  0.17 0.12     -0.07       0.41
brms_R2_is_bad <- function(brm_fit, seed = 7, ...) {
  
  set.seed(seed)
  p <- brms:::predict.brmsfit(brm_fit, summary = F, ...)
  r <- apply(p, 2, mean) - data_brainsize$brain_size_scl
  1 - rethinking::var2(r) / rethinking::var2(data_brainsize$brain_size_scl)
}

brms_R2_is_bad(brms_c7_model_brain_size)
#> [1] 0.4873914
brms_c7_model_brain_size2 <- update(
  brms_c7_model_brain_size,
  newdata = data_brainsize, 
  formula = brain_size_scl ~ 1 + mass_std + I(mass_std^2),
  iter = 2000, warmup = 1000,
  chains = 4, cores = 4,
  seed = 42,
  file = "brms/brms_c7_model_brain_size2")

brms_c7_model_brain_size3 <- update(
  brms_c7_model_brain_size,
  newdata = data_brainsize, 
  formula = brain_size_scl ~ 1 + mass_std + I(mass_std^2) + I(mass_std^3),
  iter = 2000, warmup = 1000,
  chains = 4, cores = 4,
  seed = 42,
  control = list(adapt_delta = .995),
  file = "brms/brms_c7_model_brain_size3")

brms_c7_model_brain_size4 <- update(
  brms_c7_model_brain_size,
  newdata = data_brainsize, 
  formula = brain_size_scl ~ 1 + mass_std + I(mass_std^2) + I(mass_std^3) + I(mass_std^4),
  iter = 2000, warmup = 1000,
  chains = 4, cores = 4,
  seed = 42,
  control = list(adapt_delta = .9995, max_treedepth = 15),
  file = "brms/brms_c7_model_brain_size4")

brms_c7_model_brain_size5 <- update(
  brms_c7_model_brain_size,
  newdata = data_brainsize, 
  formula = brain_size_scl ~ 1 + mass_std + I(mass_std^2) + I(mass_std^3) + I(mass_std^5),
  iter = 2000, warmup = 1000,
  chains = 4, cores = 4,
  seed = 42,
  control = list(adapt_delta = .99995, max_treedepth = 15),
  file = "brms/brms_c7_model_brain_size5")

Defining a custom response distribution (setting \(\sigma\) to a constant value, in line with the sixth-order polynomial model):

custom_normal <- custom_family(
  "custom_normal", dpars = "mu",
  links = "identity",
  type = "real"
)

stan_funs  <- "real custom_normal_lpdf(real y, real mu) {
  return normal_lpdf(y | mu, 0.001);
}
real custom_normal_rng(real mu) {
  return normal_rng(mu, 0.001);
}
" 

stanvars <- stanvar(scode = stan_funs, block = "functions")
brms_c7_model_brain_size6 <- brm(
  data = data_brainsize, 
  family = custom_normal,
  brain_size_scl ~ 1 + mass_std + I(mass_std^2) + I(mass_std^3) + I(mass_std^4) + 
    I(mass_std^5) + I(mass_std^6),
  prior = c(prior(normal(0.5, 1), class = Intercept),
            prior(normal(0, 10), class = b)),
  iter = 2000, warmup = 1000, chains = 4, cores = 4,
  seed = 42,
  stanvars = stanvars,
  control = list(max_treedepth = 15),
  file = "brms/brms_c7_model_brain_size6")

Defining custom functions so that post-processing works for the special case of the last model (which uses family = custom_normal):

expose_functions(brms_c7_model_brain_size6, vectorize = TRUE)

posterior_epred_custom_normal <- function(prep) {
  mu <- prep$dpars$mu
  mu 
}

posterior_predict_custom_normal <- function(i, prep, ...) {
  # draws for observation i from the fixed-sigma normal
  mu <- prep$dpars$mu[, i]
  custom_normal_rng(mu)
}

log_lik_custom_normal <- function(i, prep) {
  # pointwise log-likelihood for observation i
  mu <- prep$dpars$mu[, i]
  y <- prep$data$Y[i]
  custom_normal_lpdf(y, mu)
}
make_r2_figure <- function(brms_fit, ylim = range(data_brainsize$brain_size_scl)) {
  
  # compute the R2
  r2 <- brms_R2_is_bad(brms_fit)
  
  # define the new data 
  nd <- tibble(mass_std = seq(from = -2, to = 2, length.out = 200))
  
  # simulate and wrangle
  fitted(brms_fit, newdata = nd, probs = c(.055, .945)) %>% 
    data.frame() %>% 
    bind_cols(nd) %>% 
    
    # plot!  
    ggplot(aes(x = mass_std)) +
    geom_lineribbon(aes(y = Estimate, ymin = Q5.5, ymax = Q94.5),
                    color = clr0dd, size = 1/2, 
                    fill = fll0) +
    geom_point(data = data_brainsize,
               aes(y = brain_size_scl),
               color = clr_dark) +
    labs(subtitle = bquote(italic(R)^2==.(round(r2, digits = 2))),
         x = "body mass (std)",
         y = "brain volume (std)") +
    coord_cartesian(xlim = c(-1.2, 1.5),
                    ylim = ylim)
}

tibble(brms_fit = list(brms_c7_model_brain_size, brms_c7_model_brain_size2,
                       brms_c7_model_brain_size3, brms_c7_model_brain_size4,
                       brms_c7_model_brain_size5, brms_c7_model_brain_size6),
       ylim = list(range(data_brainsize$brain_size_scl),range(data_brainsize$brain_size_scl),
                   range(data_brainsize$brain_size_scl), c(.25, 1.1),
                    c(.1, 1.4), c(-0.25, 1.5))) %>% 
  pmap(make_r2_figure) %>% 
  wrap_plots(nrow = 2)

8.6.1.2 Too few parameters hurts, too

brain_loo_lines <- function(brms_fit, row, ...) {
  
  nd <- tibble(mass_std = seq(from = -2, to = 2, length.out = 200))
  
  # refit the model
  new_fit <- 
    update(brms_fit,
           newdata = filter(data_brainsize, row_number() != row), 
           iter = 2000, warmup = 1000, chains = 4, cores = 4,
           seed = 7,
           refresh = 0,
           ...)
  
  # pull the lines values
  fitted(new_fit, 
         newdata = nd) %>% 
    data.frame() %>% 
    select(Estimate) %>% 
    bind_cols(nd)
  
}

poly1_fits <- tibble(row = 1:7) %>% 
  mutate(post = purrr::map(row, ~brain_loo_lines(brms_fit = brms_c7_model_brain_size,
                                                 row = .))) %>% 
  unnest(post)

poly4_fits <- tibble(row = 1:7) %>% 
  mutate(post = purrr::map(row, ~brain_loo_lines(brms_fit = brms_c7_model_brain_size4, 
                                                 row = ., 
                                                 control = list(adapt_delta = .9995)))) %>% 
  unnest(post)
p1 <- poly1_fits %>% 
  ggplot(aes(x = mass_std, y = Estimate)) +
  coord_cartesian(xlim = range(data_brainsize$mass_std),
                  ylim = range(data_brainsize$brain_size_scl)) +
  labs(subtitle = "brms_c7_model_brain_size") 

p2 <- poly4_fits %>% 
  ggplot(aes(x = mass_std, y = Estimate)) +
  coord_cartesian(xlim = range(data_brainsize$mass_std),
                  ylim =  c(-0.1, 1.4)) +
  labs(subtitle = "brms_c7_model_brain_size4") 

p1 + p2 &
    geom_line(aes(group = row),
            color = clr0d, size = 1/2, alpha = 1/2) &
  geom_point(data = data_brainsize,
             aes(y = brain_size_scl),
             color = clr_dark)

8.6.2 Entropy and accuracy

8.6.2.1 Firing the weatherperson

Current weatherperson

weatherperson <- tibble(prediction = rep(c(1, .6), c(3, 7)),
                        observed = rep(c(emo::ji("cloud_with_rain"), emo::ji("sunny")), c(3, 7)))

weatherperson %>% 
  t() %>% 
  knitr::kable(col.names = 1:10)
1 2 3 4 5 6 7 8 9 10
prediction 1.0 1.0 1.0 0.6 0.6 0.6 0.6 0.6 0.6 0.6
observed 🌧 🌧 🌧 ☀️ ☀️ ☀️ ☀️ ☀️ ☀️ ☀️

Newcomer

newcomer <- tibble(prediction = rep(0, 10),
                   observed = rep(c(emo::ji("cloud_with_rain"), emo::ji("sunny")), c(3, 7)))

newcomer %>% 
  t() %>% 
  knitr::kable(col.names = 1:10)
1 2 3 4 5 6 7 8 9 10
prediction 0 0 0 0 0 0 0 0 0 0
observed 🌧 🌧 🌧 ☀️ ☀️ ☀️ ☀️ ☀️ ☀️ ☀️
weather_happy <- weatherperson %>% 
  rename(weatherperson = "prediction") %>% 
  mutate(newcomer = newcomer$prediction,
         observed_code = if_else(observed == emo::ji("cloud_with_rain"),
                                 1, 0),
         day = row_number()) %>% 
  pivot_longer(c(weatherperson, newcomer), names_to = "person", values_to = "prediction") %>% 
  mutate(hit = ifelse(prediction == observed_code, 1, 1 - prediction - observed_code),
         happy = if_else(prediction == observed_code, 
                         if_else(observed_code == 1, -1, 0), 
                         if_else(observed_code == 1,
                                 -5 * (observed_code - prediction),
                                 -1 * (prediction - observed_code))))

weather_happy  %>% 
  pivot_wider(id_cols = observed:day, names_from = "person", values_from = prediction:happy) %>% 
  dplyr::select(-day) %>% 
  t() %>% 
  knitr::kable(col.names = 1:10)
1 2 3 4 5 6 7 8 9 10
observed 🌧 🌧 🌧 ☀️ ☀️ ☀️ ☀️ ☀️ ☀️ ☀️
observed_code 1 1 1 0 0 0 0 0 0 0
prediction_weatherperson 1.0 1.0 1.0 0.6 0.6 0.6 0.6 0.6 0.6 0.6
prediction_newcomer 0 0 0 0 0 0 0 0 0 0
hit_weatherperson 1.0 1.0 1.0 0.4 0.4 0.4 0.4 0.4 0.4 0.4
hit_newcomer 0 0 0 1 1 1 1 1 1 1
happy_weatherperson -1.0 -1.0 -1.0 -0.6 -0.6 -0.6 -0.6 -0.6 -0.6 -0.6
happy_newcomer -5 -5 -5 0 0 0 0 0 0 0
weather_happy %>% 
  group_by(person) %>% 
  summarise(total_hit = sum(hit),
            total_happy = sum(happy))
#> # A tibble: 2 × 3
#>   person        total_hit total_happy
#>   <chr>             <dbl>       <dbl>
#> 1 newcomer            7         -15  
#> 2 weatherperson       5.8        -7.2
weather_happy %>% count(person, hit) %>% 
  mutate(power = hit ^ n) %>% 
  group_by(person) %>% 
  summarise(joint_likelihood = prod(power))
#> # A tibble: 2 × 2
#>   person        joint_likelihood
#>   <chr>                    <dbl>
#> 1 newcomer               0      
#> 2 weatherperson          0.00164
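
On the log scale this becomes the log score (which the lppd below generalizes); assigning probability zero to an event that then occurs costs \(log(0) = -\infty\):

weather_happy %>% 
  group_by(person) %>% 
  summarise(log_score = sum(log(hit)))
#> # A tibble: 2 × 2
#>   person        log_score
#>   <chr>             <dbl>
#> 1 newcomer        -Inf   
#> 2 weatherperson     -6.41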

8.6.2.2 Divergence depends upon direction

tibble(direction = c("Earth to Mars", "Mars to Earth"),
       p_1       = c(.01, .7),
       q_1       = c(.7, .01)) %>% 
  mutate(p_2 = 1 - p_1,
         q_2 = 1 - q_1) %>%
  mutate(d_kl = (p_1 * log(p_1 / q_1)) + (p_2 * log(p_2 / q_2)))
#> # A tibble: 2 × 6
#>   direction       p_1   q_1   p_2   q_2  d_kl
#>   <chr>         <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 Earth to Mars  0.01  0.7   0.99  0.3   1.14
#> 2 Mars to Earth  0.7   0.01  0.3   0.99  2.62

The log-pointwise-predictive-density (lppd) is not included in the {brms} output, so we have to compute it manually:

(lppd_by_species <- brms_c7_model_brain_size %>%
  log_lik() %>%
  as_tibble() %>% 
  set_names(pull(data_brainsize, species)) %>% 
  pivot_longer(everything(),
               names_to = "species",
               values_to = "log_prob") %>% 
  mutate(prob = exp(log_prob)) %>% 
  group_by(species) %>% 
  summarise(log_probability_score = mean(prob) %>% log()) %>% 
  ungroup())
#> # A tibble: 7 × 2
#>   species     log_probability_score
#>   <chr>                       <dbl>
#> 1 afarensis                   0.379
#> 2 africanus                   0.406
#> 3 boisei                      0.392
#> 4 ergaster                    0.225
#> 5 habilis                     0.325
#> 6 rudolfensis                 0.264
#> 7 sapiens                    -0.593
lppd_by_species %>% 
  summarise(total_log_probability_score = sum(log_probability_score))
#> # A tibble: 1 × 1
#>   total_log_probability_score
#>                         <dbl>
#> 1                        1.40

8.6.2.3 Computing the lppd

brms_log_prob <- brms_c7_model_brain_size %>%
  log_lik() %>%
  as_tibble()

brms_prob <- brms_log_prob %>%
  set_names(pull(data_brainsize, species)) %>% 
  # add an s iteration index, for convenience
  mutate(s = 1:n()) %>% 
  # make it long
  pivot_longer(-s,
               names_to = "species",
               values_to = "log_prob") %>% 
  # compute the probability scores
  mutate(prob = exp(log_prob))

brms_prob_score <- brms_prob %>% 
  group_by(species) %>% 
  summarise(log_probability_score = mean(prob) %>% log()) %>% 
  ungroup()

brms_prob_score %>%
  summarise(total_log_probability_score = sum(log_probability_score))
#> # A tibble: 1 × 1
#>   total_log_probability_score
#>                         <dbl>
#> 1                        1.40

8.6.2.4 Scoring the right data

brms_lppd <- function(brms_fit) {
  
  log_lik(brms_fit) %>% 
    data.frame() %>% 
    pivot_longer(everything(),
                 values_to = "log_prob") %>% 
    mutate(prob = exp(log_prob)) %>% 
    group_by(name) %>% 
    summarise(log_probability_score = mean(prob) %>% log()) %>% 
    summarise(total_log_probability_score = sum(log_probability_score))
  
}

tibble(name = str_c("brms_c7_model_brain_size", c("",2:6))) %>% 
  mutate(brms_fit = purrr::map(name, get)) %>% 
  mutate(lppd = purrr::map(brms_fit, ~brms_lppd(.))) %>% 
  unnest(lppd)
#> # A tibble: 6 × 3
#>   name                      brms_fit  total_log_probability_score
#>   <chr>                     <list>                          <dbl>
#> 1 brms_c7_model_brain_size  <brmsfit>                       1.40 
#> 2 brms_c7_model_brain_size2 <brmsfit>                       0.557
#> 3 brms_c7_model_brain_size3 <brmsfit>                       0.608
#> 4 brms_c7_model_brain_size4 <brmsfit>                      -0.213
#> 5 brms_c7_model_brain_size5 <brmsfit>                       5.87 
#> 6 brms_c7_model_brain_size6 <brmsfit>                      26.0

8.6.3 Information criteria

8.6.3.1 WAIC calculation

brms_c7_model_cars <- brm(data = cars, 
      family = gaussian,
      dist ~ 1 + speed,
      prior = c(prior(normal(0, 100), class = Intercept),
                prior(normal(0, 10), class = b),
                prior(exponential(1), class = sigma)),
      iter = 2000, warmup = 1000,
      chains = 4, cores = 4,
      seed = 42,
      file = "brms/brms_c7_model_cars")
n_cases <- nrow(cars)

log_likelihood_i <- log_lik(brms_c7_model_cars) %>%
  as_tibble() %>%
  set_names(str_pad(1:n_cases,width = 2, pad = 0))

dim(log_likelihood_i)
#> [1] 4000   50
log_mu_l <- log_likelihood_i %>% 
  pivot_longer(everything(),
               names_to = "i",
               values_to = "log_likelihood") %>% 
  mutate(likelihood = exp(log_likelihood)) %>% 
  group_by(i) %>% 
  summarise(log_mean_likelihood = mean(likelihood) %>% log())

(
  cars_lppd <- log_mu_l %>% 
  summarise(lppd = sum(log_mean_likelihood)) %>% 
  pull(lppd) 
)
#> [1] -206.62

Computing \(p_{WAIC}\) from \(V(y_{i})\), the variance in the log-likelihood:

v_i <- log_likelihood_i %>% 
  pivot_longer(everything(),
               names_to = "i",
               values_to = "log_likelihood") %>% 
  group_by(i) %>% 
  summarise(var_loglikelihood = var(log_likelihood))

pwaic <- v_i %>%
  summarise(pwaic = sum(var_loglikelihood)) %>% 
  pull()

pwaic
#> [1] 4.080994

\(WAIC = -2 (lppd - p_{WAIC})\)

-2 * (cars_lppd - pwaic)
#> [1] 421.402
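
The reported WAIC standard error comes from the same pointwise quantities, as \(se_{WAIC} = \sqrt{n \times var(WAIC_{i})}\); a minimal sketch reusing log_mu_l, v_i, and n_cases from above (it should match the se in the {brms} output below):

waic_i <- -2 * (log_mu_l$log_mean_likelihood - v_i$var_loglikelihood)

sqrt(n_cases * var(waic_i))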

{brms} function:

waic(brms_c7_model_cars)
#> 
#> Computed from 4000 by 50 log-likelihood matrix
#> 
#>           Estimate   SE
#> elpd_waic   -210.7  8.2
#> p_waic         4.1  1.5
#> waic         421.4 16.4
#> 
#> 2 (4.0%) p_waic estimates greater than 0.4. We recommend trying loo instead.

Pointwise values:

waic(brms_c7_model_cars)$pointwise %>% 
  as_tibble()
#> # A tibble: 50 × 3
#>    elpd_waic p_waic  waic
#>        <dbl>  <dbl> <dbl>
#>  1     -3.65 0.0217  7.30
#>  2     -4.02 0.0949  8.05
#>  3     -3.68 0.0209  7.37
#>  4     -4.00 0.0597  7.99
#>  5     -3.59 0.0102  7.18
#>  6     -3.74 0.0210  7.48
#>  7     -3.60 0.0103  7.21
#>  8     -3.62 0.0110  7.23
#>  9     -3.98 0.0351  7.96
#> 10     -3.77 0.0171  7.54
#> # … with 40 more rows

8.6.4 Model comparison

8.6.4.1 Model mis-selection

brms_c6_model_fungus_post_treatment <- read_rds("brms/brms_c6_model_fungus_post_treatment.rds")
brms_c6_model_fungus_no_treatment <- read_rds("brms/brms_c6_model_fungus_no_treatment.rds")
brms_c6_model_fungus_only_treatment <- read_rds("brms/brms_c6_model_fungus_only_treatment.rds")
brms_c6_model_fungus_post_treatment <- add_criterion(brms_c6_model_fungus_post_treatment, 
                                                     criterion = "waic") 
brms_c6_model_fungus_no_treatment <- add_criterion(brms_c6_model_fungus_no_treatment, 
                                                     criterion = "waic") 
brms_c6_model_fungus_only_treatment <- add_criterion(brms_c6_model_fungus_only_treatment, 
                                                     criterion = "waic") 

brms_fungus_compare <- loo_compare(brms_c6_model_fungus_post_treatment,
                                   brms_c6_model_fungus_no_treatment,
                                   brms_c6_model_fungus_only_treatment,
                                   criterion = "waic")

print(brms_fungus_compare, simplify = FALSE)
#>                                     elpd_diff se_diff elpd_waic se_elpd_waic
#> brms_c6_model_fungus_post_treatment    0.0       0.0  -180.7       6.7      
#> brms_c6_model_fungus_only_treatment  -20.6       4.9  -201.3       5.4      
#> brms_c6_model_fungus_no_treatment    -22.3       5.8  -203.0       5.7      
#>                                     p_waic se_p_waic waic   se_waic
#> brms_c6_model_fungus_post_treatment    3.4    0.5     361.4   13.5 
#> brms_c6_model_fungus_only_treatment    2.5    0.3     402.7   10.8 
#> brms_c6_model_fungus_no_treatment      1.6    0.2     406.0   11.4

With respect to the output, notice the elpd_diff column and the adjacent se_diff column. Those are our WAIC differences in the elpd metric. The models have been rank-ordered from the highest (brms_c6_model_fungus_post_treatment) to the lowest (brms_c6_model_fungus_no_treatment). The scores listed are the differences between brms_c6_model_fungus_post_treatment and each comparison model. Since brms_c6_model_fungus_post_treatment is the comparison model in the top row, the values there are naturally 0 (i.e., \(x - x = 0\)). Another critical thing to understand: since the {brms} version 2.8.0 update, WAIC and LOO differences are no longer reported in the \(-2 \times x\) metric. Remember how multiplying (lppd - pwaic) by -2 is a historical artifact associated with the frequentist \(\chi^2\) test? Well, the makers of the {loo} package aren't fans, and they no longer support the conversion.

So here’s the deal. The substantive interpretation of the differences is the same whether they are presented in the elpd_diff metric or in the WAIC metric. But if we want to compare our elpd_diff results to those in the text, we have to multiply them by -2; and if we want the associated standard error in the same metric, we need to multiply the se_diff column by 2.
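
A minimal conversion sketch; the values follow directly from the (rounded) comparison table above:

cbind(waic_diff = brms_fungus_compare[, 1] * -2,
      se        = brms_fungus_compare[, 2] * 2)
# e.g. for brms_c6_model_fungus_only_treatment:
# waic_diff = -(-20.6) * 2 = 41.2, se = 4.9 * 2 = 9.8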

brms_c6_model_fungus_post_treatment <- add_criterion(brms_c6_model_fungus_post_treatment, 
                                                     criterion = "loo") 
brms_c6_model_fungus_no_treatment <- add_criterion(brms_c6_model_fungus_no_treatment, 
                                                     criterion = "loo") 
brms_c6_model_fungus_only_treatment <- add_criterion(brms_c6_model_fungus_only_treatment, 
                                                     criterion = "loo") 

loo_compare(brms_c6_model_fungus_post_treatment,
            brms_c6_model_fungus_no_treatment,
            brms_c6_model_fungus_only_treatment,
            criterion = "loo") %>% 
  print(simplify = FALSE)
#>                                     elpd_diff se_diff elpd_loo se_elpd_loo
#> brms_c6_model_fungus_post_treatment    0.0       0.0  -180.7      6.7     
#> brms_c6_model_fungus_only_treatment  -20.6       4.9  -201.3      5.4     
#> brms_c6_model_fungus_no_treatment    -22.3       5.8  -203.0      5.7     
#>                                     p_loo  se_p_loo looic  se_looic
#> brms_c6_model_fungus_post_treatment    3.4    0.5    361.5   13.5  
#> brms_c6_model_fungus_only_treatment    2.6    0.3    402.7   10.8  
#> brms_c6_model_fungus_no_treatment      1.6    0.2    406.0   11.4

Computing the standard error of the WAIC difference between brms_c6_model_fungus_post_treatment and brms_c6_model_fungus_no_treatment by hand:

n <- length(brms_c6_model_fungus_no_treatment$criteria$waic$pointwise[, "waic"])

tibble(waic_no_treatment = brms_c6_model_fungus_no_treatment$criteria$waic$pointwise[, "waic"],
       waic_post_treatment = brms_c6_model_fungus_post_treatment$criteria$waic$pointwise[, "waic"]) %>% 
  mutate(diff = waic_no_treatment - waic_post_treatment) %>% 
  summarise(diff_se = sqrt(n * var(diff)))
#> # A tibble: 1 × 1
#>   diff_se
#>     <dbl>
#> 1    11.6
brms_fungus_compare[2, 2] * 2
#> [1] 9.840665
(brms_fungus_compare[2, 1] * -2) + c(-1, 1) * (brms_fungus_compare[2, 2] * 2) * 2.6
#> [1] 15.65820 66.82966
brms_fungus_compare[, 7:8] %>% 
  data.frame() %>% 
  rownames_to_column("model_name") %>% 
  as_tibble() %>% 
  mutate(model_name = fct_reorder(model_name, waic, .desc = TRUE)) %>%
  ggplot(aes(x = waic, y = model_name, 
             xmin = waic - se_waic, 
             xmax = waic + se_waic)) +
  geom_pointrange(color = clr0dd, 
                  fill = clr0, shape = 21) +
  labs(title = "custom WAIC plot",
       x = NULL, y = NULL) +
  theme(axis.ticks.y = element_blank())

In {brms}, model weights need to be computed explicitly:

model_weights(brms_c6_model_fungus_post_treatment,
            brms_c6_model_fungus_no_treatment,
            brms_c6_model_fungus_only_treatment,
            weights = "waic") %>% 
  round(digits = 2)
#> brms_c6_model_fungus_post_treatment   brms_c6_model_fungus_no_treatment 
#>                                   1                                   0 
#> brms_c6_model_fungus_only_treatment 
#>                                   0
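
These are Akaike weights, \(w_{i} = exp(-0.5~\Delta WAIC_{i}) / \sum_{j} exp(-0.5~\Delta WAIC_{j})\); a quick check using the rounded WAIC values from the fungus comparison above:

waics <- c(post_treatment = 361.4,
           only_treatment = 402.7,
           no_treatment   = 406.0)
delta <- waics - min(waics)

round(exp(-.5 * delta) / sum(exp(-.5 * delta)), digits = 2)
#> post_treatment only_treatment   no_treatment 
#>              1              0              0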

8.6.5 Outliers and other illusions

brms_c5_model_age <- read_rds("brms/brms_c5_model_age.rds") %>% 
   add_criterion(criterion = "loo") 

brms_c5_model_marriage <- read_rds("brms/brms_c5_model_marriage.rds") %>% 
   add_criterion(criterion = "loo") 

brms_c5_model_multiple <- read_rds("brms/brms_c5_model_multiple.rds") %>% 
   add_criterion(criterion = "loo") 

loo_compare(brms_c5_model_age,
            brms_c5_model_marriage,
            brms_c5_model_multiple,
            criterion = "loo") %>% 
  print(simplify = FALSE)
#>                        elpd_diff se_diff elpd_loo se_elpd_loo p_loo se_p_loo
#> brms_c5_model_age        0.0       0.0   -63.0      6.5         3.7   1.8   
#> brms_c5_model_multiple  -0.8       0.4   -63.8      6.4         4.7   1.9   
#> brms_c5_model_marriage  -6.8       4.7   -69.7      5.0         3.0   0.9   
#>                        looic se_looic
#> brms_c5_model_age      125.9  12.9   
#> brms_c5_model_multiple 127.5  12.8   
#> brms_c5_model_marriage 139.4  10.0
loo(brms_c5_model_multiple)
#> 
#> Computed from 4000 by 50 log-likelihood matrix
#> 
#>          Estimate   SE
#> elpd_loo    -63.8  6.4
#> p_loo         4.7  1.9
#> looic       127.5 12.8
#> ------
#> Monte Carlo SE of elpd_loo is 0.1.
#> 
#> All Pareto k estimates are good (k < 0.5).
#> See help('pareto-k-diagnostic') for details.
library(loo)

loo(brms_c5_model_multiple) %>% 
  pareto_k_ids(threshold = 0.4)
#> [1] 13 20
chapter5_models$data_waffle %>% 
  slice(13) %>% 
  select(Location:Loc)
#> # A tibble: 1 × 2
#>   Location Loc  
#>   <fct>    <fct>
#> 1 Idaho    ID
pareto_k_values(loo(brms_c5_model_multiple))[13]
#> [1] 0.4485035
brms_c5_model_multiple <- add_criterion(brms_c5_model_multiple, "waic", 
                                        file = "brms/brms_c5_model_multiple")
p1 <- tibble(pareto_k = brms_c5_model_multiple$criteria$loo$diagnostics$pareto_k,
       p_waic = brms_c5_model_multiple$criteria$waic$pointwise[, "p_waic"],
       Loc = pull(chapter5_models$data_waffle, Loc)) %>% 
  ggplot(aes(x = pareto_k, y = p_waic, color = Loc == "ID")) +
  geom_vline(xintercept = .5, linetype = 3, color = clr_dark) +
  geom_point(aes(shape = Loc == "ID")) +
  geom_text(data = . %>% filter(p_waic > 0.5),
            aes(x = pareto_k - 0.03, label = Loc),
            hjust = 1) +
  scale_color_manual(values = c(clr0dd, clr2)) +
  scale_shape_manual(values = c(1, 19)) +
  labs(subtitle = "Gaussian model (brms_c5_model_multiple)") +
  theme(legend.position = "none")

To use the Student-t distribution in {brms}, set family = student (note the inclusion of \(\nu\) in bf()).

brms_c7_model_multiple <- brm(
  data = chapter5_models$data_waffle, 
  family = student,
  bf(divorce_std ~ 1 + marriage_std + median_age_std, nu = 2),
  prior = c(prior(normal(0, 0.2), class = Intercept),
            prior(normal(0, 0.5), class = b),
            prior(exponential(1), class = sigma)),
  iter = 2000, warmup = 1000,
  chains = 4, cores = 4,
  seed = 42,
  file = "brms/brms_c7_model_multiple")

brms_c7_model_multiple <- add_criterion(brms_c7_model_multiple,
                                        criterion = c("loo", "waic"))

p2 <- tibble(pareto_k = brms_c7_model_multiple$criteria$loo$diagnostics$pareto_k,
       p_waic   = brms_c7_model_multiple$criteria$waic$pointwise[, "p_waic"],
       Loc      = pull(chapter5_models$data_waffle, Loc)) %>% 
  ggplot(aes(x = pareto_k, y = p_waic, color = Loc == "ID")) +
  geom_vline(xintercept = .5, linetype = 3, color = clr_dark) +
  geom_point(aes(shape = Loc == "ID")) +
  geom_text(data = . %>% filter(p_waic > 0.5),
            aes(x = pareto_k - 0.03, label = Loc),
            hjust = 1) +
  scale_color_manual(values = c(clr0dd, clr2)) +
  scale_shape_manual(values = c(1, 19)) +
  labs(subtitle = "Student-t model (brms_c7_model_multiple)") +
  theme(legend.position = "none")

p1 + p2

loo_compare(brms_c5_model_multiple, 
            brms_c7_model_multiple, 
            criterion = "waic") %>%
  print(simplify = FALSE)
#>                        elpd_diff se_diff elpd_waic se_elpd_waic p_waic
#> brms_c5_model_multiple   0.0       0.0   -63.7       6.4          4.6 
#> brms_c7_model_multiple  -2.7       3.0   -66.4       5.8          6.1 
#>                        se_p_waic waic  se_waic
#> brms_c5_model_multiple   1.8     127.3  12.7  
#> brms_c7_model_multiple   1.0     132.8  11.6
bind_rows(as_draws_df(brms_c5_model_multiple),
          as_draws_df(brms_c7_model_multiple)) %>% 
  as_tibble() %>% 
  mutate(fit = rep(c("Gaussian (brms_c5_model_multiple)",
                     "Student-t (brms_c7_model_multiple)"), 
                   each = n() / 2)) %>% 
  pivot_longer(b_Intercept:sigma) %>% 
  mutate(name = factor(name,
                       levels = c("b_Intercept", "b_median_age_std", "b_marriage_std", "sigma"),
                       labels = c("alpha", "beta[a]", "beta[m]", "sigma"))) %>%
  ggplot(aes(x = value, y = fit, color = fit)) +
  stat_pointinterval(.width = .95, aes(fill = after_scale(clr_lighten(color))),
                     size = 3, shape = 21) +
  facet_wrap(~ name, ncol = 1)+
  scale_color_manual(values = c(clr0dd, clr2), guide = "none") +
  labs(x = "posterior", y = NULL)

8.6.6 {brms} \(R^2\)

bayes_R2(brms_c5_model_multiple) %>%
  round(digits = 3) %>% 
  as_tibble() %>% 
  knitr::kable()
Estimate Est.Error Q2.5 Q97.5
0.329 0.087 0.149 0.479
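
Under the hood, bayes_R2() computes \(R^{2} = var(fitted) / (var(fitted) + var(residuals))\) for each posterior draw; a minimal sketch (assuming divorce_std is the response column in chapter5_models$data_waffle):

f <- fitted(brms_c5_model_multiple, summary = FALSE)  # draws x observations
y <- chapter5_models$data_waffle$divorce_std
e <- -1 * sweep(f, 2, y)                              # residuals per draw

r2_draws <- apply(f, 1, var) / (apply(f, 1, var) + apply(e, 1, var))
mean(r2_draws)  # should be close to the Estimate above (~0.33)
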
rbind(bayes_R2(brms_c5_model_multiple), 
      bayes_R2(brms_c5_model_age), 
      bayes_R2(brms_c5_model_marriage)) %>%
  as_tibble() %>%
  mutate(model = c("brms_c5_model_multiple", "brms_c5_model_age", "brms_c5_model_marriage"),
         r_square_posterior_mean = round(Estimate, digits = 3)) %>%
  select(model, r_square_posterior_mean)
#> # A tibble: 3 × 2
#>   model                  r_square_posterior_mean
#>   <chr>                                    <dbl>
#> 1 brms_c5_model_multiple                   0.329
#> 2 brms_c5_model_age                        0.326
#> 3 brms_c5_model_marriage                   0.13
brms_r2 <- cbind(bayes_R2(brms_c5_model_multiple, summary = FALSE),
                 bayes_R2(brms_c5_model_age, summary = FALSE),
                 bayes_R2(brms_c5_model_marriage, summary = FALSE)) %>% 
  as_tibble() %>% 
  set_names(c("multiple", "age", "marriage"))

p1 <- brms_r2  %>% 
  pivot_longer(everything(),
               names_to = "model",
               values_to = "r2") %>% 
  ggplot() +
  geom_density(aes(x = r2, color = model, fill = after_scale(clr_alpha(color))),
               size = .5, adjust = .5) +
  scale_x_continuous(NULL, limits = c(0, 1)) +
  scale_y_continuous(NULL, breaks = NULL) +
  scale_color_manual(values = c(age = clr0, marriage = clr0dd, multiple = clr2)) +
  labs(subtitle = expression(italic(R)^2~distributions)) +
  facet_wrap(model ~ ., ncol = 1) +
  theme(legend.position = "none")

p2 <- brms_r2 %>%
  mutate(diff = multiple - marriage) %>% 
  ggplot(aes(x = diff, y = 0)) +
  stat_halfeye(point_interval = median_qi,
               .width = .95,
               shape = 21,
               fill = fll0, 
               color = clr0dd) +
  scale_y_continuous(NULL, breaks = NULL) +
  labs(subtitle = expression(italic(marriage)~has~lower~italic(R)^2~than~italic(multiple)),
       x = expression(Delta*italic(R)^2))

p1 + p2

8.7 pymc3 section