4 Rethinking: Chapter 3
Sampling the Imaginary
by Richard McElreath, building on the Summary by Solomon Kurz
\[ Pr(vampire|positive) = \frac{Pr(positive|vampire) \times Pr(vampire)}{Pr(positive)} \]
<- .95
pr_positive_on_vamp <- .01
pr_positive_on_mort <- .001
pr_vamp <- pr_positive_on_vamp * pr_vamp + pr_positive_on_mort * (1 - pr_vamp)
pr_positive <- pr_positive_on_vamp * pr_vamp /pr_positive) (pr_vamp_on_positive
#> [1] 0.08683729
4.1 Sampling from a grid approximate posterior
posterior here means simply ‘the probability of p conditional on the data’:
<- function(n_grid = 20, L = 6, W = 3, prior = function(x){rep(1, length(x))}){
grid_approx tibble(p_grid = seq(0, 1, length.out = n_grid),
prior = prior(p_grid),
likelihood = dbinom(L, size = W + L, prob = p_grid),
posterior_unstand = likelihood * prior,
posterior = posterior_unstand / sum(posterior_unstand))
}
<- grid_approx(n_grid = 10^4)
grid_data
<- tibble(sample = 1:(10^4),
samples proportion_water = sample(x = grid_data$p_grid, size = length(sample),
prob = grid_data$posterior, replace = TRUE))
<- samples %>%
p_scatter ggplot(aes(x = sample, y = proportion_water)) +
geom_point(size = .75, shape = 21, color = clr_alpha(clr2,.3), fill = clr_alpha(clr2,.1)) +
scale_x_continuous(expand = c(0,0))
<- samples %>%
p_dens ggplot(aes(x = proportion_water)) +
geom_density( color = clr2, fill = fll2) +
scale_x_continuous(limits = c(0,1), expand = c(0, 0))
+ p_dens p_scatter
4.2 Sampling to Summarize
Once the posterior distribution is created, the model is done.
Typical targets / questions:
- intervals of defined boundaries
- intervals of defined probability mass
- point estimates
4.2.1 Intervals of devined boundaries
sum(grid_data$posterior[grid_data$p_grid < 0.5])
#> [1] 0.171875
sum(samples$proportion_water < .5) / length(samples$proportion_water)
#> [1] 0.1674
sum(samples$proportion_water > .5 & samples$proportion_water < .75) / length(samples$proportion_water)
#> [1] 0.6038
# f_post <- function(x){dbeta(x = x, shape1 = 9 +1 , shape2 = 6 +1)}
<- function(x){dbinom(x = 6, size = 9, prob = x)}
f_post <- function(x){ f_post(x) / integrate(f = f_post,lower = 0, upper = 1)[[1]]} f_post_norm
<- function(x_bounds = c(0, 1),
plot_intervals x_line = as.numeric(NA),
f_posterior = f_post_norm,
data = samples,
ylim = c(0, 3)){
<- ggplot() +
p_d stat_function(fun = f_posterior, xlim = c(0,1),
geom = "area", color = clr0, fill = fll0) +
stat_function(fun = f_posterior, xlim = x_bounds,
geom = "area", color = clr2, fill = fll2) +
labs(y = "density", x = "proportion_water")
<- data %>%
p_d_emp ggplot(aes(x = proportion_water)) +
geom_density( color = clr0, fill = fll0) +
stat_function(fun = function(x){demp(obs = data$proportion_water, x = x)},
xlim = x_bounds,
geom = "area", color = clr2, fill = fll2) +
labs(y = "empirical density")
+ p_d_emp &
p_d geom_vline(data = tibble(x = x_line),
aes(xintercept = x), linetype = 3) &
scale_y_continuous(limits = ylim)&
scale_x_continuous(limits = c(0, 1))
}
plot_intervals(x_bounds = c(0, .5), x_line = .5) /
plot_intervals(x_bounds = c(.5, .75), x_line = c(.5, .75))
4.2.2 Intervals of defined mass
aka.:
- compatibility interval
- credible interval
- percentile interval
special form: highest posterior density interval (HPDI)
<- quantile(samples$proportion_water, probs = .8)
qnt_80 <- quantile(samples$proportion_water, probs = c(.1, .9))
qnt_80_inner plot_intervals(x_bounds = c(0, qnt_80), x_line = qnt_80)/
plot_intervals(x_bounds = qnt_80_inner, x_line = qnt_80_inner)
library(rethinking)
<- purrr::map
map
<- grid_approx(L = 3, W = 0, n_grid = 10^4)
grid_data_skew
<- tibble(sample = 1:(10^4),
samples_skew proportion_water = sample(x = grid_data_skew$p_grid, size = length(sample),
prob = grid_data_skew$posterior, replace = TRUE))
<- function(x){dbinom(x = 3, size = 3, prob = x)}
f_post_skew <- function(x){ f_post_skew(x) / integrate(f = f_post_skew, lower = 0, upper = 1)[[1]]}
f_post_norm_skew
<- PI(samples_skew$proportion_water, prob = .5)
qnt_50_inner <- HPDI(samples_skew$proportion_water, prob = .5)
qnt_50_high_dens
plot_intervals(x_bounds = qnt_50_inner, x_line = qnt_50_inner,
f_posterior = f_post_norm_skew, data = samples_skew, ylim = c(0, 4)) /
plot_intervals(x_bounds = qnt_50_high_dens, x_line = qnt_50_high_dens,
f_posterior = f_post_norm_skew, data = samples_skew, ylim = c(0, 4))
4.3 Point estimates
<- tibble(proportion_water= list(mean, median, chainmode) %>%
point_estimates map_dbl(.f = function(f, vals){ f(vals) },
vals = samples_skew$proportion_water),
statistic = c("mean", "median", "mode"))
<- ggplot() +
p_point_estimates stat_function(fun = f_post_norm_skew, xlim = c(0,1),
geom = "area", color = clr0, fill = fll0) +
geom_vline(data = point_estimates,
aes(xintercept = proportion_water, linetype = statistic),
color = clr1) +
labs(y = "density")
<- function(x){
f_loss map_dbl(x, function(x){ sum( grid_data_skew$posterior * abs( x - grid_data_skew$p_grid))
})}
<- function(x){
f_loss_quad map_dbl(x, function(x){ sum( grid_data_skew$posterior * ( x - grid_data_skew$p_grid) ^ 2)
})}
<- ggplot() +
p_loss stat_function(fun = f_loss,
xlim = c(0,1),
geom = "area", color = clr0, fill = fll0) +
geom_point(data = tibble(x = point_estimates$proportion_water[2],
y = f_loss(point_estimates$proportion_water[2])),
aes(x = x, y = y), shape = 1, size = 3, color = clr1) +
labs(x = "poportion_water", y = "expected proportional loss")
<- ggplot() +
p_loss_quad stat_function(fun = f_loss_quad,
xlim = c(0,1),
geom = "area", color = clr0, fill = fll0) +
geom_point(data = tibble(x = point_estimates$proportion_water[2],
y = f_loss_quad(point_estimates$proportion_water[2])),
aes(x = x, y = y), shape = 1, size = 3, color = clr1) +
labs(x = "poportion_water", y = "expected proportional loss")
+ p_loss + p_loss_quad +
p_point_estimates plot_layout(guide = "collect") & theme(legend.position = "bottom")
4.4 sample to simulate prediction
binomial likelihood
\[ Pr(W | N ,p) = \frac{N!}{W! (N -W)!} p^{W}(1 - p)^{N-W} \]
dbinom( 0:2, size = 2, prob = .7)
#> [1] 0.09 0.42 0.49
rbinom( 10, size = 2, prob = .7)
#> [1] 1 2 1 0 1 2 2 1 1 1
<- function(size, prob){
create_dummy_w tibble(x = rbinom(10^5, size = size, prob = prob),
size = size,
prob = prob)
}
<- create_dummy_w(size = 9, prob = .7)
dummy_w
%>%
dummy_w group_by(x) %>%
count() %>%
ungroup() %>%
ggplot(aes(x = factor(x), y = n)) +
geom_bar(stat = "identity", color = clr1, fill = fll1, width = .6) +
labs(y = "count", x = "dummy water count")
tibble(size = rep(c(3,6,9), each = 3),
prob = rep(c(.3,.6,.9), 3)) %>%
pmap_dfr(create_dummy_w) %>%
group_by(x, size , prob) %>%
count() %>%
ungroup() %>%
ggplot(aes(x = factor(x), y = n)) +
geom_bar(stat = "identity", color = clr1, fill = fll1, width = .6) +
facet_grid(prob ~ size, scales = "free",
space = "free_x", labeller = label_both) +
labs(y = "count", x = "dummy water count") +
theme(panel.background = element_rect(color = clr0d, fill = clr_alpha(clr0d,.2)))
<- grid_approx(n_grid = 10^4 + 1, L = 6, W = 3,
grid_data prior = function(x){rep(1, length(x))}) %>%
mutate(idx = 1:(10^4 + 1))
%>%
grid_data ggplot(aes(x = p_grid))+
geom_line(aes(y = posterior, color = "posterior")) +
scale_color_manual(values = c(posterior = clr1), guide = "none") +
theme(legend.position = "bottom")
<- grid_data %>%
samples slice_sample(n = 10^5 , weight_by = posterior, replace = TRUE) %>%
mutate(w = purrr::map_dbl(p_grid, rbinom, n = 1, size = 9),
seq = map(w, .f = function(x){sample(x = rep(c("W","L"), c(x, 9-x)),
size = 9,
replace = FALSE)}),
max_run_length = map_dbl(seq,.f = function(x){rle(x)$lengths %>% max()}),
n_switches = map_dbl(seq,.f = function(x){(rle(x)$lengths %>% length()) -1}))
<- grid_data %>%
p_posterior ggplot(aes(x = p_grid, y = posterior)) +
geom_area(color = clr0d, fill = fll0) +
geom_segment(data = grid_data %>% filter(idx %in% (1+ (1:9)*1000)),
aes(xend = p_grid,
yend = 0, size = posterior),
color = fll1) +
geom_point(data = grid_data %>% filter(idx %in% (1+ (1:9)*1000)),
color = clr1) +
scale_size_continuous(range = c(.1, 1), guide = "none") +
labs(x = "probability of water", y = "density", title = "Posterior Probability")
<- function(probability, n_draws = 10^5, size = 9) {
simulate_binom rbinom(n_draws, size = size, prob = probability)
}
<- tibble(probability = seq(from = .1, to = .9, by = .1)) %>%
d_small mutate(draws = purrr::map(probability, simulate_binom)) %>%
unnest(draws)
<- d_small %>%
p_small ggplot(aes(x = draws)) +
geom_bar(stat = "count", color = clr1, fill = fll1, width = .6) +
facet_wrap(probability ~ ., nrow = 1)+
labs(title = "Sampling Distributions")
<- samples %>%
p_posterior_predictive ggplot(aes(x = factor(w))) +
geom_bar(stat = "count",
aes(color = w == 6 ,
fill = after_scale(clr_alpha(color, .3))),
width = .6) +
scale_color_manual(values = c(`TRUE` = clr1, `FALSE` = clr0d), guide = "none") +
labs(x = "number of water samples", title = "Posterior Predictive Distribution")
/
p_posterior /
p_small p_posterior_predictive
sum( samples$w == 6 ) / length( samples$w )
#> [1] 0.20034
<- c("w", "l", "w", "w", "w", "l", "w", "l", "w")
globe_data <- rle(globe_data)$lengths %>% max()
globe_run_length <- (rle(globe_data)$lengths %>% length()) -1
globe_n_switches
<- samples %>%
p_run_length ggplot(aes(x = factor(max_run_length))) +
geom_bar(stat = "count", aes(color = max_run_length == globe_run_length,
fill = after_scale(clr_alpha(color))), width = .6) +
scale_color_manual(values = c(`TRUE` = clr2, `FALSE` = clr0d), guide = "none") +
labs(x = "longest run length")
<- samples %>%
p_switches ggplot(aes(x = factor(n_switches))) +
geom_bar(stat = "count", aes(color = n_switches == globe_n_switches,
fill = after_scale(clr_alpha(color))), width = .6) +
scale_color_manual(values = c(`TRUE` = clr2, `FALSE` = clr0d), guide = "none") +
labs(x = "number of switches")
+ p_switches p_run_length
library(rlang)
<- env(
chapter3_models grid_data = grid_data
)
write_rds(chapter3_models, "envs/chapter3_models.rds")
4.5 Homework
<- 1e4
n <- tibble(p_grid = seq( from = 0, to = 1, length.out = n ),
easy_data prior = rep(1 , n),
likelihood = dbinom( 6, size = 9, prob = p_grid),
posterior_unscaled = likelihood * prior,
posterior = posterior_unscaled / sum(posterior_unscaled),
cummulative_posterior = cumsum(posterior))
%>%
easy_data ggplot(aes(x = p_grid)) +
geom_line(aes(y = prior / sum(prior), color = "prior")) +
geom_line(aes(y = likelihood / sum(likelihood),
color = "likelihood")) +
geom_line(aes(y = posterior, color = "posterior"), linetype = 3) +
geom_line(aes(y = cummulative_posterior / sum(cummulative_posterior),
color = "cummulative_posterior"), linetype = 3) +
scale_color_manual(values = c(prior = clr1, likelihood = clr0d,
posterior = clr2, cummulative_posterior = "black")) +
theme(axis.title.y = element_blank(),
axis.text.y = element_blank(),
legend.position = "bottom")
set.seed( 100 )
<- easy_data %>%
easy_samples slice_sample(n = n, weight_by = posterior, replace = TRUE)
%>%
easy_samples ggplot(aes(x = p_grid)) +
geom_density(color = clr0d, fill = fll0) +
scale_x_continuous(limits = c(0,1)) +
geom_vline(data = tibble(x = c(.2, .8)), aes(xintercept = x), linetype = 3)
E1
mean(easy_samples$p_grid <= .2)
#> [1] 5e-04
E2
mean(easy_samples$p_grid > .8)
#> [1] 0.1219
E3
mean( easy_samples$p_grid > .2 & easy_samples$p_grid < .8)
#> [1] 0.8776
E4
quantile(easy_samples$p_grid , probs = .2)
#> 20%
#> 0.5145315
E5
quantile(easy_samples$p_grid , probs = .8)
#> 80%
#> 0.7618962
E6
HPDI(easy_samples$p_grid, prob = .66)
#> |0.66 0.66|
#> 0.5138514 0.7886789
<- easy_samples %>%
p_e6 ggplot(aes(x = p_grid)) +
geom_density(color = clr0d, fill = fll0) +
stat_function(fun = function(x){demp(x = x, obs = easy_samples$p_grid)},
geom = "area",
xlim = HPDI(easy_samples$p_grid, prob = .66),
color = clr2, fill = fll2) +
scale_color_manual(values = c(posterior = clr2), guide = "none") +
theme(legend.position = "bottom")
E7
PI(easy_samples$p_grid, prob = .66)
#> 17% 83%
#> 0.4972327 0.7745775
<- easy_samples %>%
p_e7 ggplot(aes(x = p_grid)) +
geom_density(color = clr0d, fill = fll0) +
stat_function(fun = function(x){demp(x = x, obs = easy_samples$p_grid)},
geom = "area",
xlim = PI(easy_samples$p_grid, prob = .66),
color = clr2, fill = fll2) +
scale_color_manual(values = c(posterior = clr2), guide = "none") +
theme(legend.position = "bottom")
+ p_e7 p_e6
M1
<- grid_approx(n_grid = 10^4 + 1, L = 8, W = 7,
grid_data prior = function(x){rep(1, length(x))}) %>%
mutate(idx = 1:(10^4 + 1))
%>%
grid_data ggplot(aes(x = p_grid))+
geom_line(aes(y = posterior, color = "posterior")) +
scale_color_manual(values = c(posterior = clr2), guide = "none") +
theme(legend.position = "bottom")
M2
<- grid_data %>%
samples slice_sample(n = 10^5 , weight_by = posterior, replace = TRUE) %>%
mutate(w = purrr::map_dbl(p_grid, rbinom, n = 1, size = 15))
HPDI(samples$p_grid, prob = .9)
#> |0.9 0.9|
#> 0.3329 0.7218
M3
<- grid_data %>%
p_posterior ggplot(aes(x = p_grid, y = posterior)) +
geom_area(color = clr0d, fill = fll0) +
geom_segment(data = grid_data %>% filter(idx %in% (1+ (1:9)*1000)),
aes(xend = p_grid,
yend = 0, size = posterior),
color = fll2) +
geom_point(data = grid_data %>% filter(idx %in% (1+ (1:9)*1000)),
color = clr2) +
scale_size_continuous(range = c(.1, 1), guide = "none") +
labs(x = "probability of water", y = "density", title = "Posterior Probability")
<- tibble(probability = seq(from = .1, to = .9, by = .1)) %>%
d_small mutate(draws = purrr::map(probability, simulate_binom, size = 15)) %>%
unnest(draws)
<- d_small %>%
p_small ggplot(aes(x = draws)) +
geom_bar(stat = "count", color = clr2, fill = fll2, width = .6) +
facet_wrap(probability ~ ., nrow = 1)+
labs(title = "Sampling Distributions")
<- samples %>%
p_posterior_predictive ggplot(aes(x = factor(w))) +
geom_bar(stat = "count",
aes(color = w == 8 ,
fill = after_scale(clr_alpha(color, .3))),
width = .6) +
scale_color_manual(values = c(`TRUE` = clr2, `FALSE` = clr0d), guide = "none") +
labs(x = "number of water samples", title = "Posterior Predictive Distribution")
/
p_posterior /
p_small p_posterior_predictive
sum( samples$w == 8 ) / length( samples$w )
#> [1] 0.14738
M4
<- grid_data %>%
samples slice_sample(n = 10^5 , weight_by = posterior, replace = TRUE) %>%
mutate(w = purrr::map_dbl(p_grid, rbinom, n = 1, size = 9))
%>%
samples ggplot(aes(x = factor(w))) +
geom_bar(stat = "count",
aes(color = w == 6 ,
fill = after_scale(clr_alpha(color, .3))),
width = .6) +
scale_color_manual(values = c(`TRUE` = clr2, `FALSE` = clr0d), guide = "none") +
labs(x = "number of water samples", title = "Posterior Predictive Distribution")
sum( samples$w == 6 ) / length( samples$w )
#> [1] 0.17634
M5
<- grid_approx(n_grid = 10^4 + 1, L = 8, W = 7,
grid_data prior = function(x){if_else(x < .5, 0, 1)}) %>%
mutate(idx = 1:(10^4 + 1))
<- grid_data %>%
samples slice_sample(n = 10^5 , weight_by = posterior, replace = TRUE) %>%
mutate(w = purrr::map_dbl(p_grid, rbinom, n = 1, size = 15))
HPDI(samples$p_grid, prob = .9)
#> |0.9 0.9|
#> 0.5000 0.7117
<- grid_data %>%
p_posterior ggplot(aes(x = p_grid, y = posterior)) +
geom_area(color = clr0d, fill = fll0) +
geom_segment(data = grid_data %>% filter(idx %in% (1+ (1:9)*1000)),
aes(xend = p_grid,
yend = 0, size = posterior),
color = fll2) +
geom_point(data = grid_data %>% filter(idx %in% (1+ (1:9)*1000)),
color = clr2) +
scale_size_continuous(range = c(.1, 1), guide = "none") +
labs(x = "probability of water", y = "density", title = "Posterior Probability")
<- tibble(probability = seq(from = .1, to = .9, by = .1)) %>%
d_small mutate(draws = purrr::map(probability, simulate_binom, size = 15)) %>%
unnest(draws)
<- d_small %>%
p_small ggplot(aes(x = draws)) +
geom_bar(stat = "count", color = clr2, fill = fll2, width = .6) +
facet_wrap(probability ~ ., nrow = 1)+
labs(title = "Sampling Distributions")
<- samples %>%
p_posterior_predictive ggplot(aes(x = factor(w))) +
geom_bar(stat = "count",
aes(color = w == 8 ,
fill = after_scale(clr_alpha(color, .3))),
width = .6) +
scale_color_manual(values = c(`TRUE` = clr2, `FALSE` = clr0d), guide = "none") +
labs(x = "number of water samples", title = "Posterior Predictive Distribution")
/
p_posterior /
p_small p_posterior_predictive
sum( samples$w == 8 ) / length( samples$w )
#> [1] 0.15846
M6
<- function(n, n_grid = 1e4, n_posterior_sample = 1e4,
random_tosses prior = function(x){rep(1, length(x))}){
<- tibble(p_grid = seq(0, 1, length.out = n_grid),
grid_data prior = prior(p_grid),
likelihood = dbinom(rbinom(1, size = n, prob = .7),
size = n, prob = p_grid),
posterior_unstand = likelihood * prior,
posterior = posterior_unstand / sum(posterior_unstand))
<- grid_data %>%
samples slice_sample(n = n_posterior_sample, weight_by = posterior, replace = TRUE)
tibble(n = n,
grid_data = list(grid_data),
samples = list(samples),
hpdi = list(HPDI(samples[[1]]$p_grid, prob = .99)),
hpdi_width = diff(hpdi[[1]]))
}
c(c(1:10),((1:100) *30)) %>%
map_dfr(random_tosses) %>%
ggplot(aes(x = n, y = hpdi_width)) +
geom_point(aes(color = hpdi_width < .05)) +
geom_hline(yintercept = .05, color = "black", linetype = 3) +
scale_color_manual(values = c(`FALSE` = clr0d, `TRUE` = clr2),
guide = "none")
H1
library(rethinking)
data(homeworkch3)
<- 1e4 + 1
n_grid <- tibble(p_grid = seq(0, 1, length.out = n_grid),
grid_data prior = (function(x){rep(1, length(x))})(p_grid),
likelihood = dbinom(sum(birth1 + birth2),
size = length(c(birth1, birth2)),
prob = p_grid),
posterior_unstand = likelihood * prior,
posterior = posterior_unstand / sum(posterior_unstand))
<- grid_data %>%
samples slice_sample(n = 1e5,
weight_by = posterior, replace = TRUE)
<- chainmode(samples$p_grid)) (mode_posterior
#> [1] 0.5547754
%>%
grid_data ggplot(aes(x = p_grid, y = posterior)) +
geom_area(color = clr0d, fill = fll0, size = .5) +
geom_vline(xintercept = mode_posterior, color = clr2, linetype = 3)
H2
<- tibble(boundary = c("lower", "upper"),
(percentile_intervals p50 = HPDI(samples$p_grid, prob = .5),
p89 = HPDI(samples$p_grid, prob = .89),
p97 = HPDI(samples$p_grid, prob = .97))) %>%
::kable() knitr
boundary | p50 | p89 | p97 |
---|---|---|---|
lower | 0.5306 | 0.4977 | 0.4775 |
upper | 0.5778 | 0.6090 | 0.6274 |
%>%
grid_data ggplot(aes(x = p_grid, y = posterior)) +
stat_function(fun = function(x){demp(x = x, obs = samples$p_grid)},
geom = "line", color = clr0d, xlim = c(0,1), n = 501) +
stat_function(fun = function(x){demp(x = x, obs = samples$p_grid)},
geom = "area", color = clr2, fill = clr_alpha(fll2,.2),
xlim = percentile_intervals$p50, n = 501) +
stat_function(fun = function(x){demp(x = x, obs = samples$p_grid)},
geom = "area", color = clr2, fill = clr_alpha(fll2,.2),
xlim = percentile_intervals$p89, n = 501) +
stat_function(fun = function(x){demp(x = x, obs = samples$p_grid)},
geom = "area", color = clr2, fill = clr_alpha(fll2,.2),
xlim = percentile_intervals$p97, n = 501)
H3
<- grid_data %>%
random_births slice_sample(n = 1e4,
weight_by = posterior, replace = TRUE) %>%
mutate(births = map(p_grid, .f = function(x){rbinom(n = 200, size = 1, prob = x)}),
n_boys = map_dbl(births, sum),
n_girls = 200 - n_boys,
n_boys_firstborn = map_dbl(births, function(x){ sum(x[1:100]) }))
sum(random_births$n_boys < sum(birth1 + birth2)) / sum(birth1 + birth2)
#> [1] 43.45946
<- random_births %>%
p_all ggplot(aes(x = n_boys)) +
geom_density(color = clr0d, fill = fll0) +
geom_vline(xintercept = sum(birth1 + birth2),
color = clr2, linetype = 3) +
scale_x_continuous(limits = c(0, 200))
<- random_births %>%
p_first ggplot(aes(x = n_boys_firstborn)) +
geom_density(color = clr0d, fill = fll0) +
geom_vline(xintercept = sum(birth1),
color = clr2, linetype = 3) +
scale_x_continuous(limits = c(0, 100))
+ p_first p_all
H5
<- tibble(birth1 = birth1,
births_first_girl birth2 = birth2) %>%
filter(birth1 == 0)
<- length(births_first_girl$birth2)
n_first_girl
<- grid_data %>%
random_births slice_sample(n = 1e4,
weight_by = posterior, replace = TRUE) %>%
mutate(births = map(p_grid, .f = function(x){rbinom(n = n_first_girl, size = 1, prob = x)}),
n_boys = map_dbl(births, sum),
n_girls = n_first_girl - n_boys)
sum(random_births$n_boys < sum(births_first_girl$birth2)) / 1e4
#> [1] 0.9991
%>%
random_births ggplot(aes(x = n_boys)) +
geom_density(color = clr0d, fill = fll0) +
geom_vline(xintercept = sum(births_first_girl$birth2),
color = clr2, linetype = 3) +
scale_x_continuous(limits = c(0, n_first_girl),
expand = c(0, 0))
4.6 {brms} section
<- brm(data = list(w = 6),
brms_c3_6in9 family = binomial(link = "identity"),
| trials(9) ~ 0 + Intercept,
w # this is a flat prior
prior(beta(1, 1), class = b, lb = 0, ub = 1),
iter = 5000, warmup = 1000,
seed = 42,
file = "brms/brms_c3_6in9")
posterior_summary(brms_c3_6in9) %>%
round(digits = 3) %>%
::kable() knitr
Estimate | Est.Error | Q2.5 | Q97.5 | |
---|---|---|---|---|
b_Intercept | 0.637 | 0.140 | 0.346 | 0.879 |
lp__ | -3.310 | 0.748 | -5.392 | -2.780 |
<- 9
n_trials <- fitted(brms_c3_6in9,
samples_brms summary = FALSE,
scale = "linear") %>%
as_tibble() %>%
set_names(nm = "p") %>%
mutate(w = rbinom(n(), size = n_trials, prob = p))
<- samples_brms %>%
p_brms_posterior ggplot(aes(x = p)) +
geom_density(color = clr1, fill = fll1) +
scale_x_continuous(limits = c(0, 1), expand = c(0, 0)) +
labs(y = "density", x = "proportion water",
title = "{brms} posterior probability (6 in 9)")
<- samples_brms %>%
p_brms_posterior_predictive ggplot(aes(x = factor(w))) +
geom_bar(stat = "count",
aes(color = w == 6 ,
fill = after_scale(clr_alpha(color, .3))),
width = .6) +
scale_color_manual(values = c(`TRUE` = clr1, `FALSE` = clr0d), guide = "none") +
labs(y = "count", x = "number of water samples",
title = "posterior predictive distribution")
+ p_brms_posterior_predictive p_brms_posterior
4.7 pymc3 section
Taken from pymc-devs…
import arviz as az
import matplotlib.pyplot as plt
import numpy as np
import pymc3 as pm
import scipy.stats as stats
import seaborn as sns
import matplotlib
from matplotlib.colors import ListedColormap
import matplotlib.font_manager
#matplotlib.font_manager._rebuild()
def posterior_grid_approx(grid_points = 5, success = 6, tosses = 9):
""""""
# define grid
= np.linspace(0, 1, grid_points)
p_grid
# define prior
= np.repeat(5, grid_points) # uniform
prior # prior = (p_grid >= 0.5).astype(int) # truncated
# prior = np.exp(- 5 * abs(p_grid - 0.5)) # double exp
# compute likelihood at each point in the grid
= stats.binom.pmf(success, tosses, p_grid)
likelihood
# compute product of likelihood and prior
= likelihood * prior
unstd_posterior
# standardize the posterior, so it sums to 1
= unstd_posterior / unstd_posterior.sum()
posterior return p_grid, posterior
= 0.95
PrPV = 0.01
PrPM = 0.001
PrV = PrPV * PrV + PrPM * (1 - PrV)
PrP = PrPV * PrV / PrP
PrVP PrVP
#> 0.08683729433272395
= posterior_grid_approx(grid_points = 100, success = 6, tosses = 9)
p_grid, posterior = np.random.choice(p_grid, p = posterior, size = int(1e4), replace = True) samples
= plt.subplots(1, 2, figsize = (12, 4))
fig, (ax0, ax1) "o", alpha = 0.2, color = r.clr3)
ax0.plot(samples, "sample number")
ax0.set_xlabel("proportion water (p)")
ax0.set_ylabel('left'].set_visible(False)
ax0.spines['right'].set_visible(False)
ax0.spines['top'].set_visible(False)
ax0.spines['bottom'].set_visible(False)
ax0.spines[= 'dotted')
ax0.grid(linestyle = ax1, plot_kwargs = {"color": r.clr3})
az.plot_kde(samples, ax "proportion water (p)")
ax1.set_xlabel("density")
ax1.set_ylabel('left'].set_visible(False)
ax1.spines['right'].set_visible(False)
ax1.spines['top'].set_visible(False)
ax1.spines['bottom'].set_visible(False)
ax1.spines[= 'dotted')
ax1.grid(linestyle plt.show()
sum(posterior[p_grid < 0.5])
#> 0.17183313110747475
sum(samples < 0.5) / 1e4
#> 0.1767
sum((samples > 0.5) & (samples < 0.75)) / 1e4
#> 0.6113
80) np.percentile(samples,
#> 0.7575757575757577
10, 90]) np.percentile(samples, [
#> array([0.44444444, 0.80808081])
= posterior_grid_approx(grid_points = 100, success = 3, tosses = 3)
p_grid, posterior = r.clr3)
plt.plot(p_grid, posterior, color "proportion water (p)")
plt.xlabel("Density")
plt.ylabel('left'].set_visible(False)
plt.gca().spines['right'].set_visible(False)
plt.gca().spines['top'].set_visible(False)
plt.gca().spines['bottom'].set_visible(False)
plt.gca().spines[= 'dotted')
plt.grid(linestyle plt.show()
= np.random.choice(p_grid, p = posterior, size = int(1e4), replace = True)
samples 25, 75]) np.percentile(samples, [
#> array([0.71717172, 0.93939394])
= 0.5) az.hdi(samples, hdi_prob
#> array([0.84848485, 1. ])
== max(posterior)] p_grid[posterior
#> array([1.])
0] stats.mode(samples)[
#> array([1.])
np.mean(samples), np.median(samples)
#> (0.8061979797979799, 0.8484848484848485)
sum(posterior * abs(0.5 - p_grid))
#> 0.31626874808692995
= [sum(posterior * abs(p - p_grid)) for p in p_grid]
loss == min(loss)] p_grid[loss
#> array([0.84848485])
range(3), n = 2, p = 0.7) stats.binom.pmf(
#> array([0.09, 0.42, 0.49])
= 2, p = 0.7, size = 1) stats.binom.rvs(n
#> array([1])
= 2, p = 0.7, size = 10) stats.binom.rvs(n
#> array([2, 2, 1, 2, 1, 0, 2, 1, 2, 2])
= stats.binom.rvs(n = 2, p = 0.7, size = int(1e5))
dummy_w == i).mean() for i in range(3)] [(dummy_w
#> [0.08832, 0.42272, 0.48896]
= stats.binom.rvs(n = 9, p = 0.7, size = int(1e5))
dummy_w # dummy_w = stats.binom.rvs(n=9, p=0.6, size=int(1e4))
# dummy_w = stats.binom.rvs(n=9, p=samples)
= 0.7
bar_width = np.arange(0, 11) - bar_width / 2, width = bar_width, color = r.clr3) plt.hist(dummy_w, bins
#> (array([2.0000e+00, 4.2000e+01, 3.9200e+02, 2.1170e+03, 7.4230e+03,
#> 1.7153e+04, 2.6752e+04, 2.6557e+04, 1.5640e+04, 3.9220e+03]), array([-0.35, 0.65, 1.65, 2.65, 3.65, 4.65, 5.65, 6.65, 7.65,
#> 8.65, 9.65]), <BarContainer object of 10 artists>)
0, 9.5) plt.xlim(
#> (0.0, 9.5)
"dummy water count")
plt.xlabel("Frequency")
plt.ylabel('left'].set_visible(False)
plt.gca().spines['right'].set_visible(False)
plt.gca().spines['top'].set_visible(False)
plt.gca().spines['bottom'].set_visible(False)
plt.gca().spines[= 'dotted')
plt.grid(linestyle plt.show()
= posterior_grid_approx(grid_points = 100, success = 6, tosses = 9)
p_grid, posterior 100)
np.random.seed(= np.random.choice(p_grid, p = posterior, size = int(1e4), replace = True) samples
= np.array([1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1,
birth1 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1,
1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0,
0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0,
0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1])
= np.array([0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0,
birth2 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0,
0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1,
1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0,
0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0])
sum(birth1) + sum(birth2)
#> 111
×