Think Bayes - Notes
Notes on the book Think Bayes v2, which introduces Bayesian statistics using computational methods.
1 Bayes’s theorem
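With hypothesis \(H\) and observations \(Obs\) (the notation used throughout these notes):
\[ P(H|Obs) = \frac{P(H) \, P(Obs|H)}{P(Obs)} \]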
2 Pmf / Cdf
The following classes are implemented in the empiricaldist library, which extends the pd.Series class:
- Pmf: Probability mass function
- Cdf: Cumulative distribution function
2.1 Creation and usage
Code
# Imports used throughout these notes
from itertools import product
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.special
import scipy.stats
import seaborn as sns
from empiricaldist import Pmf

# ----------
# Create Pmf
# from a sequence of values
pmf = Pmf.from_seq([1, 0, 0, 1, 1, 1])
# from probabilities evaluated over a range of quantities
range_qs = np.linspace(0, 10, 101)
ps = scipy.stats.norm(5, 1).pdf(range_qs)  # normal distribution
pmf = Pmf(ps, range_qs)
pmf.normalize()
# ------------------
# Specific functions
pmf.mean()
pmf.credible_interval(0.9)
pmf.make_cdf()
pmf.idxmax()
pmf.max_prob()
# ----------
# Create Cdf
# from pmf
cdf = pmf.make_cdf()
pmf = cdf.make_pmf()
2.2 Mixture
TODO: Add corresponding matrix operations
We can compute a distribution as a mixture of other distributions.
Code
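# (Hypothetical reconstruction; the original cell was empty.) One way to obtain
# the table below: a box with one 4-sided, two 6-sided and three 8-sided dice,
# i.e. weights 1/6, 2/6, 3/6, consistent with the probabilities shown. The
# weighted sum is a matrix-vector product (cf. the TODO above).
import numpy as np
import pandas as pd
from empiricaldist import Pmf

dice = [Pmf.from_seq(np.arange(1, sides + 1)) for sides in (4, 6, 8)]
weights = np.array([1, 2, 3]) / 6
df = pd.DataFrame(dice).fillna(0).T  # rows = outcomes 1..8, one column per die
pmf_mixture = Pmf(df @ weights)      # mixture = weighted sum of the columns
pmf_mixture.normalize()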
qs | probs
---|---
1 | 0.159722
2 | 0.159722
3 | 0.159722
4 | 0.159722
5 | 0.118056
6 | 0.118056
7 | 0.062500
8 | 0.062500
2.3 Pmf utilities
Code
from typing import Optional

def pmf_from_dist(dist, qs: np.ndarray) -> Pmf:
    """
    Make a discrete approximation of a scipy distribution.
    dist: SciPy distribution object
    qs: quantities
    """
    ps = dist.pdf(qs)
    pmf = Pmf(ps, qs)
    pmf.normalize()
    return pmf

def pmf_kde_from_sample(sample: np.ndarray, qs: np.ndarray, **options) -> Pmf:
    """
    Make a kernel density estimate from a sample.
    sample: sequence of values
    qs: quantities where we should evaluate the KDE
    """
    kde = scipy.stats.gaussian_kde(sample)
    ps = kde(qs)
    pmf = Pmf(ps, qs, **options)
    pmf.normalize()
    return pmf

def kde_from_pmf(pmf: Pmf, n: int = 101, qs: Optional[np.ndarray] = None, **options) -> Pmf:
    """
    Make a kernel density estimate from a Pmf.
    pmf: Pmf object
    n: number of points
    qs: quantities where the KDE is evaluated (default: n points over the Pmf's range)
    """
    if qs is None:
        qs = np.linspace(pmf.qs.min(), pmf.qs.max(), n)
    kde = scipy.stats.gaussian_kde(pmf.qs, weights=pmf.ps)
    ps = kde.evaluate(qs)
    pmf = Pmf(ps, qs, **options)
    pmf.normalize()
    return pmf
3 Grid method
Given a prior and a likelihood, we can compute the posterior over the entire range of hypotheses.
3.1 1-parameter grid
Steps:
- Define the prior \(P(H)\)
- Define the likelihood \(P(Obs|H)\)
- Multiply prior and likelihood: \(P(H) * P(Obs|H)\)
- Normalize: divide by \(P(Obs) = \sum_{i=1}^n P(H_i) * P(Obs|H_i)\)
- Check that the range of your parameter covers all plausible values (a minimal sketch of these steps follows)
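A minimal sketch of these steps, assuming the same example as section 4.1.1.1 (3 conversions out of 10 trials, uniform prior), which is consistent with the summary printed below:
Code
import numpy as np
import scipy.stats
from empiricaldist import Pmf

nb_conversion, nb_trials = 3, 10  # assumed observations
# Prior P(H): uniform over the parameter range
range_qs = np.linspace(0, 1, 101)
pmf_prior = Pmf(1, range_qs)
pmf_prior.normalize()
# Likelihood P(Obs|H), evaluated for every hypothesis on the grid
likelihood = scipy.stats.binom.pmf(nb_conversion, nb_trials, pmf_prior.qs)
# Multiply and normalize
pmf_posterior = pmf_prior * likelihood
pmf_posterior.normalize()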
Code
print(f"""Parameter value:
- Mean : {pmf_posterior.mean():.3f}
- 90% Credible interval: {pmf_posterior.credible_interval(0.9)}
- Most probable value: {pmf_posterior.idxmax()}
""")
pmf_posterior.plot(
    title="Posterior probability of parameter given observations",
    xlabel="Parameter possible values",
    ylabel="Probability",
)
plt.show()
Parameter value:
- Mean : 0.333
- 90% Credible interval: [0.14 0.56]
- Most probable value: 0.3
3.2 2-parameter grid
We can extend the 1-parameter grid approach to 2 parameters.
Functions:
Code
def make_joint(pmf1: Pmf, pmf2: Pmf) -> pd.DataFrame:
    """
    Compute the outer product of two Pmfs, usually priors of two parameters.
    Output dataframe: (columns = pmf1.qs / index = pmf2.qs)
    """
    X, Y = np.meshgrid(pmf1, pmf2)
    return pd.DataFrame(X * Y, columns=pmf1.qs, index=pmf2.qs)

def plot_joint(
    joint: pd.DataFrame,
    title: str = "Plot joint",
    xlabel: str = "x",
    ylabel: str = "y",
    cmap: str = "Blues",
    add_contour: bool = True,
) -> None:
    """
    Plot a joint distribution with a color mesh.
    x-axis: joint DataFrame columns
    y-axis: joint DataFrame index
    """
    vmax = joint.to_numpy().max() * 1.1
    plt.pcolormesh(
        joint.columns,
        joint.index,
        joint,
        cmap=cmap,
        vmax=vmax,
        shading="nearest",
    )
    plt.colorbar()
    if add_contour:
        plt.contour(joint.columns, joint.index, joint, linewidths=1.5, cmap="gray_r")
    plt.gca().set(title=title, xlabel=xlabel, ylabel=ylabel)
def joint_to_marginal(joint: pd.DataFrame) -> tuple[Pmf, Pmf]:
    """
    Compute the marginal distributions of a joint distribution.
    Output tuple:
    - First distribution: marginal of columns
    - Second distribution: marginal of index
    """
    marginal_columns = joint.sum(axis=0)
    marginal_index = joint.sum(axis=1)
    pmf_marginal_columns = Pmf(marginal_columns)
    pmf_marginal_index = Pmf(marginal_index)
    return pmf_marginal_columns, pmf_marginal_index
Code
# Prior
range_size = np.arange(140, 205, 0.5)
pmf_prior_size_man = Pmf(scipy.stats.norm(178, 7.7).pdf(range_size), range_size)
pmf_prior_size_man.normalize()
pmf_prior_size_woman = Pmf(scipy.stats.norm(163, 7.3).pdf(range_size), range_size)
pmf_prior_size_woman.normalize()
joint_prior_size = make_joint(pmf_prior_size_man, pmf_prior_size_woman)
# Likelihood
# Observed: woman > man
col_vals, index_vals = np.meshgrid(joint_prior_size.columns, joint_prior_size.index)
likelihood = col_vals < index_vals
# Bayes theorem
joint_posterior_size = joint_prior_size * likelihood
joint_posterior_size /= joint_posterior_size.sum().sum()
# Marginal
pmf_posterior_size_man, pmf_posterior_size_woman = joint_to_marginal(joint_posterior_size)
Code
# Plot results (a single plt.show() at the end keeps the three panels in one figure)
plt.subplot(311)
plot_joint(joint_prior_size, title="Prior size distribution", xlabel="Man", ylabel="Woman")
plt.subplot(312)
plot_joint(
    pd.DataFrame(likelihood, columns=joint_prior_size.columns, index=joint_prior_size.index),
    title="Likelihood",
    xlabel="Man",
    ylabel="Woman",
)
plt.subplot(313)
pmf_prior_size_man.plot(label="Prior size man (cm)", linestyle="--", color="orangered")
pmf_prior_size_woman.plot(label="Prior size woman (cm)", linestyle="--", color="navy")
pmf_posterior_size_man.plot(label="Posterior size man (cm)", color="orangered")
pmf_posterior_size_woman.plot(label="Posterior size woman (cm)", color="navy")
plt.gca().set(title="Posterior marginal distributions")
plt.legend()
plt.tight_layout()
plt.show()
4 Probability laws
The following sections introduce several probability laws and their use in Bayesian statistics.
4.1 Binomial
4.1.1 Estimating proportion p
4.1.1.1 Grid Method
Code
nb_conversion, nb_trials = 3, 10
# Prior
range_qs = np.linspace(0, 1, 101)
pmf_prior = Pmf(1, range_qs)
pmf_prior.normalize()
# Likelihood
likelihood = scipy.stats.binom.pmf(nb_conversion, nb_trials, pmf_prior.qs) # binom.pmf(k, n, p)
# Bayes formula
pmf_posterior = pmf_prior * likelihood
pmf_posterior.normalize()
4.1.1.2 Conjugate priors
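The Beta distribution is the conjugate prior of the binomial proportion: a Beta(\(\alpha\), \(\beta\)) prior updated with \(k\) successes out of \(n\) trials yields a Beta(\(\alpha + k\), \(\beta + n - k\)) posterior. A minimal sketch mirroring the grid example above, assuming a uniform Beta(1, 1) prior:
Code
import numpy as np
import scipy.stats
from empiricaldist import Pmf

nb_conversion, nb_trials = 3, 10
alpha, beta = 1, 1  # Beta(1, 1) is the uniform prior
alpha_updated = alpha + nb_conversion
beta_updated = beta + nb_trials - nb_conversion
beta_dist = scipy.stats.beta(alpha_updated, beta_updated)
range_qs = np.linspace(0, 1, 101)
pmf_posterior_conjugate = Pmf(beta_dist.pdf(range_qs), range_qs)
pmf_posterior_conjugate.normalize()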
4.1.1.3 MCMC
Code
nb_conversion, nb_trials = 3, 10
import pymc as pm

with pm.Model() as model:
    conversion_rates = pm.Uniform("conversion_rates", 0, 1)
    nb_conversion = pm.Binomial("nb_conversion", n=nb_trials, observed=nb_conversion, p=conversion_rates)
    trace_binom = pm.sample(1000, return_inferencedata=False)
posterior_samples = trace_binom.get_values("conversion_rates")
pd.Series(posterior_samples).quantile([0.05, 0.95]).round(2)
4.1.2 Estimating count
Code
# Prior
range_qs = np.arange(0, 2000)
pmf_prior = Pmf(1, range_qs)
pmf_prior.normalize()
# Likelihood
likelihood = np.ones(len(pmf_prior))
for nb_conversion in [2, 1, 0]:
    likelihood *= scipy.stats.binom.pmf(nb_conversion, pmf_prior.qs, 1 / 365)
# Bayes formula
pmf_posterior = pmf_prior * likelihood
pmf_posterior.normalize()
4.2 Multinomial
Models n independent trials, each with k possible outcomes and a given probability for each outcome.
Code
# Priors
range_p = np.linspace(0, 1, 51)
range_nb_errors = np.arange(32, 350, 5)
cols = ["p1", "p2", "nb_e"]
prior = pd.DataFrame(list(product(range_p, range_p, range_nb_errors)), columns=cols)
prior["p"] = 1 / len(prior)
prior = Pmf(prior.set_index(cols).p)
# Likelihood
k10 = 20 - 3 # nb seen by first reviewer only
k01 = 15 - 3 # nb seen by second reviewer only
k11 = 3 # nb seen by both reviewers
likelihoods = prior.copy()
for p1, p2, nb_e in prior.index:
    k00 = nb_e - (k10 + k01 + k11)
    obs = [k00, k10, k01, k11]
    probs = [
        (1 - p1) * (1 - p2),  # prob k00
        p1 * (1 - p2),  # prob k10
        (1 - p1) * p2,  # prob k01
        p1 * p2,  # prob k11
    ]
    likelihood = scipy.stats.multinomial.pmf(obs, nb_e, probs)
    likelihoods[p1, p2, nb_e] = likelihood
# Bayes formula
posterior = prior * likelihoods
posterior.normalize()
# Marginal
marginal_nb_e = Pmf(posterior.rename("p").reset_index().groupby("nb_e").p.sum())
4.3 Poisson
4.3.1 Grid method
Code
nb_events_obs = 4
# Prior
range_qs = np.linspace(0, 10, 101)
pmf_prior = Pmf(scipy.stats.gamma(1.4).pdf(range_qs), range_qs)
pmf_prior.normalize()
# Likelihood
likelihood = scipy.stats.poisson(pmf_prior.qs).pmf(nb_events_obs)
# Bayes formula
pmf_posterior = pmf_prior * likelihood
pmf_posterior.normalize()
4.3.2 Conjugate priors
Code
alpha, beta = 1.4, 1
obs_k, obs_t = 4, 1
alpha_updated, beta_updated = alpha + obs_k, beta + obs_t
gamma_dist = scipy.stats.gamma(alpha_updated, scale=1 / beta_updated) # scale factor equal to 1 / b
range_qs = np.linspace(0, 10, 101)
pmf_posterior_conjugate = Pmf(gamma_dist.pdf(range_qs), range_qs)
pmf_posterior_conjugate.normalize()
4.3.3 MCMC
Code
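# Sketch (the original cell was empty): PyMC version of the Poisson update,
# with a Gamma(1.4, 1) prior matching the grid and conjugate sections.
# Assumes pymc is imported as pm, as in the other MCMC cells.
with pm.Model() as model:
    lam = pm.Gamma("lam", alpha=1.4, beta=1)
    nb_events = pm.Poisson("nb_events", mu=lam, observed=4)
    trace_poisson = pm.sample(1000, return_inferencedata=False)
posterior_samples = trace_poisson.get_values("lam")
pd.Series(posterior_samples).quantile([0.05, 0.95]).round(2)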
4.4 Exponential
4.4.1 Grid method
Code
elapsed_time_1 = 11 / 90  # time until the first event, as a fraction of the 90-unit window
elapsed_time_2 = (23 - 11) / 90  # time between the first and second events
# Prior: range of lambda
range_qs = np.linspace(0, 10, 101)
pmf_prior = Pmf(scipy.stats.gamma(1.4).pdf(range_qs), range_qs)
pmf_prior.normalize()
# Likelihood: exponential pdf f(t) = lam * exp(-lam * t), evaluated at the observed waiting times
lams = range_qs
likelihood_1 = lams * np.exp(-lams * elapsed_time_1)
likelihood_2 = lams * np.exp(-lams * elapsed_time_2)
# Bayes formula
pmf_posterior = pmf_prior * likelihood_1 * likelihood_2
pmf_posterior.normalize()
4.5 Normal
The aim is usually to estimate the parameters \((\mu, \sigma^2)\) of the normal distribution.
4.5.1 Grid method
Code
control_obs = [24, 43, 58, 71, 43, 49, 61, 44, 67, 49, 53, 56, 59, 52, 62, 54, 57, 33, 46, 43, 57]
test_obs = [42, 43, 55, 26, 62, 37, 33, 41, 19, 54, 20, 85, 46, 10, 17, 60, 53, 42, 37, 42, 55, 28, 48]
# Define priors
range_mu = np.linspace(30, 60, 201)
pmf_prior_mu = Pmf(1, range_mu)
pmf_prior_mu.normalize()
range_sigma = np.linspace(7, 30, 201)
pmf_prior_sigma = Pmf(1, range_sigma)
pmf_prior_sigma.normalize()
prior_joint = make_joint(pmf_prior_mu, pmf_prior_sigma)
# Likelihood
# Control group
mu_mesh, sigma_mesh, obs_mesh = np.meshgrid(pmf_prior_mu.qs, pmf_prior_sigma.qs, control_obs)
likelihood_control = scipy.stats.norm(mu_mesh, sigma_mesh).pdf(obs_mesh)
likelihood_control = likelihood_control.prod(axis=-1)
# Test group
mu_mesh, sigma_mesh, obs_mesh = np.meshgrid(pmf_prior_mu.qs, pmf_prior_sigma.qs, test_obs)
likelihood_test = scipy.stats.norm(mu_mesh, sigma_mesh).pdf(obs_mesh)
likelihood_test = likelihood_test.prod(axis=-1)
# Bayes formula
posterior_joint_control = prior_joint * likelihood_control
posterior_joint_control /= posterior_joint_control.sum().sum()
posterior_joint_test = prior_joint * likelihood_test
posterior_joint_test /= posterior_joint_test.sum().sum()
# Marginal
pmf_posterior_mu_control, pmf_posterior_sigma_control = joint_to_marginal(posterior_joint_control)
pmf_posterior_mu_test, pmf_posterior_sigma_test = joint_to_marginal(posterior_joint_test)
# Diff of mu
posterior_mu_diff = pmf_posterior_mu_control.sub_dist(pmf_posterior_mu_test)
posterior_mu_diff = kde_from_pmf(posterior_mu_diff)
Code
plt.subplot(221)
# Pass the grid coordinates so the axes show the actual mu / sigma values
plt.contour(posterior_joint_control.columns, posterior_joint_control.index, posterior_joint_control, cmap="Blues")
plt.contour(posterior_joint_test.columns, posterior_joint_test.index, posterior_joint_test, cmap="Oranges")
plt.gca().set(title="Joint posterior density", xlabel="mu", ylabel="sigma")
plt.subplot(222)
pmf_posterior_mu_control.plot(label="Control mu")
pmf_posterior_mu_test.plot(label="Test mu")
plt.legend()
plt.gca().set(title="Marginal posterior density of mu")
plt.subplot(223)
pmf_posterior_sigma_control.plot(label="Control sigma")
pmf_posterior_sigma_test.plot(label="Test sigma")
plt.legend()
plt.gca().set(title="Marginal posterior density of sigma")
plt.subplot(224)
posterior_mu_diff.plot(label="Difference mu")
plt.legend()
plt.gca().set(title="Posterior mu difference")
plt.tight_layout()
plt.show()
4.5.2 MCMC
Code
with pm.Model() as model:
    mu = pm.Uniform("mu", 30, 60)
    sigma = pm.Uniform("sigma", 7, 30)
    result = pm.Normal("result", mu=mu, sigma=sigma, observed=control_obs)
    trace_norm = pm.sample(1000, return_inferencedata=False)
posterior_samples = trace_norm.get_values("mu")
pd.Series(posterior_samples).quantile([0.05, 0.95]).round(2)
4.6 Weibull
The Weibull distribution is used in survival analysis to model the time until an event (e.g. death).
Here we'll use the weibull_min implementation of SciPy. Given a scale parameter \(\lambda\) and a shape parameter \(k\), its pdf describes the distribution of the time until an event.
4.6.1 Parameters’ influence
- \(\lambda\): Scale (characteristic lifetime)
- \(k\): Shape
Code
range_wb = np.linspace(0, 10, 101)
fig, axs = plt.subplots(nrows=1, ncols=2)
for lam in [1, 3, 5]:
    for k in [3]:
        sns.lineplot(x=range_wb, y=scipy.stats.weibull_min(c=k, scale=lam).pdf(range_wb), label=f"(lam;k) = ({lam};{k})", ax=axs[0])
for lam in [3]:
    for k in [1, 3, 5]:
        sns.lineplot(x=range_wb, y=scipy.stats.weibull_min(c=k, scale=lam).pdf(range_wb), label=f"(lam;k) = ({lam};{k})", ax=axs[1])
axs[0].title.set_text('Variation of lambda')
axs[1].title.set_text('Variation of k')
plt.tight_layout()
plt.show()
Code
# Generate distribution and observations
np.random.seed(1)
# Distribution
k, lam = 0.8, 3
actual_dist = scipy.stats.weibull_min(c=k, scale=lam)
# Observations
obs_time = 8
nb_obs = 20
start = np.random.uniform(0, 8, size=nb_obs)
duration = actual_dist.rvs(size=nb_obs, random_state=42)
df_obs = pd.DataFrame({"start": start, "duration": duration}).round(2)
df_obs["end"] = df_obs["start"] + df_obs["duration"]
df_obs["censored"] = df_obs["end"] > obs_time
df_obs["obs_t"] = df_obs["end"].clip(upper=obs_time) - df_obs["start"]
Code
plt.hlines(
y=df_obs.index,
xmin=df_obs.start,
xmax=df_obs.end,
color=df_obs.censored.apply(lambda c: 'blue' if not c else "red"),
alpha=0.4)
plt.scatter(df_obs.end, df_obs.index, color='blue')
plt.axvline(x=obs_time)
plt.gca().set(title="Subjects' lifetime", xlabel="Lifetime", ylabel="Subject n°")
plt.show()
4.6.2 Grid method
Code
# Define priors
range_k = np.linspace(0.1, 2, 101)
pmf_prior_k = Pmf(1, range_k)
pmf_prior_k.normalize()
range_lam = np.linspace(0.1, 6, 101)
pmf_prior_lam = Pmf(1, range_lam)
pmf_prior_lam.normalize()
joint_prior = make_joint(pmf_prior_k, pmf_prior_lam)
# Likelihood
k_mesh, lam_mesh, time_mesh = np.meshgrid(range_k, range_lam, df_obs[~df_obs.censored].obs_t)
likelihood_uncensored = scipy.stats.weibull_min(c=k_mesh, scale=lam_mesh).pdf(time_mesh)
likelihood_uncensored = likelihood_uncensored.prod(axis=-1)
k_mesh, lam_mesh, time_mesh = np.meshgrid(range_k, range_lam, df_obs[df_obs.censored].obs_t)
likelihood_censored = scipy.stats.weibull_min(c=k_mesh, scale=lam_mesh).sf(time_mesh)
likelihood_censored = likelihood_censored.prod(axis=-1)
# Bayes formula
joint_posterior = joint_prior * likelihood_uncensored * likelihood_censored
joint_posterior /= joint_posterior.sum().sum()
# Marginal
posterior_k, posterior_lam = joint_to_marginal(joint_posterior)
TODO:
- Add MCMC with censored and uncensored data
- Add Hypergeometric distribution
5 Regression
5.1 Logistic regression
The model is \(P(y = 1 | x) = \text{expit}(a x + b)\); the aim is to estimate the values of \(a\) and \(b\).
5.1.1 Grid method
Code
temp = [66, 70, 69, 68, 67, 72, 73, 70, 57, 63, 70, 78, 67, 53, 67, 75, 70, 81, 76, 79, 75, 76, 58]
converted = [0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1]
df = pd.DataFrame({"temp": temp, "conv": converted})
# Normalize data
offset = df["temp"].mean()
df["x"] = df["temp"] - offset
df["y"] = df["conv"]
Code
# Priors
range_a = np.linspace(-0.8, 0.1, 51)
range_b = np.linspace(-5, 1, 51)
cols = ["a", "b"]
prior = pd.DataFrame(list(product(range_a, range_b)), columns=cols)
prior["p"] = 1 / len(prior)
prior = Pmf(prior.set_index(cols).p)
# Likelihood
likelihood = prior.copy()
for a, b in prior.index:
    likelihoods_converted = scipy.special.expit(a * df[df.y == 1].x + b)
    likelihoods_not_converted = 1 - scipy.special.expit(a * df[df.y == 0].x + b)
    likelihood[a, b] = likelihoods_converted.prod() * likelihoods_not_converted.prod()
# Bayes formula
posterior = prior * likelihood
posterior /= posterior.sum()
# Posterior marginal
posterior_a = Pmf(posterior.rename("p").reset_index().groupby("a").p.sum())
posterior_b = Pmf(posterior.rename("p").reset_index().groupby("b").p.sum())
5.1.2 MCMC - PyMC
Code
with pm.Model() as model:
    a = pm.Uniform("a", -0.8, 0.1)
    b = pm.Uniform("b", -5, 1)
    est_expit = pm.math.invlogit(a * df.x + b)  # sigmoid / expit
    bernoulli = pm.Bernoulli("trials", p=est_expit, observed=df.y)
    trace_logistic = pm.sample(1000, return_inferencedata=False)
a_posterior_samples = trace_logistic.get_values("a")
b_posterior_samples = trace_logistic.get_values("b")
posterior_samples = pd.DataFrame({"a": a_posterior_samples, "b": b_posterior_samples})
posterior_samples.mean().round(2)
5.1.3 MCMC - Bambi
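A minimal sketch with Bambi's formula interface (an assumed alternative, fitting the same model on the df defined above):
Code
import arviz as az
import bambi as bmb

model = bmb.Model("y ~ x", df, family="bernoulli")
results = model.fit(draws=1000)
az.summary(results)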
5.1.4 Interpretation
5.1.4.1 Interpretation of intercept
We can use the posterior distribution of the intercept to get the probability when x = 0:
\(b = \log(O(H)) \implies P(H) = \frac{1}{1 + e^{-b}}\)
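For example, since \(x\) is the centered temperature, \(x = 0\) corresponds to the average temperature. A small sketch (reusing posterior_b computed in the grid section) of the implied conversion probability there:
Code
probs_at_mean_temp = scipy.special.expit(posterior_b.choice(1000))
pd.Series(probs_at_mean_temp).quantile([0.05, 0.5, 0.95]).round(2)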
5.1.4.2 Interpretation of slope
The distribution of \(a\) gives us the log likelihood ratio, so:
\[ a = \log\left(\frac{P(Obs|H)}{P(Obs|\bar{H})}\right) \implies likelihood \: ratio = \frac{P(Obs|H)}{P(Obs|\bar{H})} = e^a \]
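Similarly, a small sketch of the posterior of this likelihood ratio, reusing posterior_a from the grid section:
Code
likelihood_ratios = np.exp(posterior_a.choice(1000))
pd.Series(likelihood_ratios).quantile([0.05, 0.5, 0.95]).round(2)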
5.1.4.3 Uncertainty on logistic curve
By leveraging simulations on the posteriors, we can define a credible interval around our logistic regression:
- For each value of \(x\):
  - Compute probabilities by sampling from the posterior distributions of \(a\) and \(b\)
  - Compute the related credible interval

Note that sampling \(a\) and \(b\) independently from their marginal posteriors ignores their correlation; sampling pairs from the joint posterior would be more faithful.
Code
# Simulation parameters
RANGE_T = np.linspace(30, 85, 101)
RANGE_X = RANGE_T - offset
NB_SAMPLE_PER_X = 10_000
QUANTS = [0.05, 0.5, 0.95]
x_to_probs_thresh = {} # Will contain the q5, q50 and q95 of the distribution of probs for each x
# For each value of x
for x in RANGE_X:
    # compute n probs by sampling from posteriors
    log_odds = posterior_a.choice(NB_SAMPLE_PER_X) * x + posterior_b.choice(NB_SAMPLE_PER_X)
    probs = scipy.special.expit(log_odds)
    x_to_probs_thresh[x] = pd.Series(probs).quantile(QUANTS)
# Format the data
x_to_probs_thresh = pd.DataFrame(x_to_probs_thresh).T.rename_axis("x").reset_index()
x_to_probs_thresh["temp"] = RANGE_T
Code
plt.fill_between(
x_to_probs_thresh["temp"],
x_to_probs_thresh[0.05],
x_to_probs_thresh[0.95],
color="cornflowerblue",
alpha=0.2,
)
sns.lineplot(data=x_to_probs_thresh, x="temp", y=0.5, label="Logistic model")
sns.scatterplot(
data=df, x="temp", y="y", alpha=0.2, color="orangered", label="Observed data"
)
plt.gca().set(
title="Median P(Y = 1 | T°) with 90% credible interval",
xlabel="T°",
ylabel="P(Y=1)",
)
plt.show()
5.2 Linear regression
The model is \(y = a x + b + \epsilon\) with \(\epsilon \sim \mathcal{N}(0, \sigma)\); the aim is to estimate the values of \(a\), \(b\) and \(\sigma\).
5.2.1 Grid method
Code
year = [1967, 1968, 1969, 1970, 1971, 1972, 1973, 1974, 1975, 1976, 1977, 1978, 1979, 1980, 1981, 1982, 1983, 1984, 1985, 1986, 1987, 1988, 1989, 1990, 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020]
amount_fall = [28.6, 44.7, 99.2, 66.8, 54.6, 68.7, 12.5, 46.5, 61.9, 57.7, 65.5, 100.6, 34.4, 27.7, 54.2, 64.1, 50.2, 56.3, 49.7, 38.3, 89.3, 53.5, 38.5, 52.5, 30.7, 39.0, 80.1, 88.7, 46.0, 124.2, 77.1, 24.1, 56.6, 43.7, 90.8, 61.5, 112.0, 51.8, 110.7, 40.6, 54.2, 59.1, 85.8, 57.6, 82.3, 31.1, 99.1, 80.6, 141.1, 64.0, 68.0, 90.4, 59.6, 12.2]
df_snow = pd.DataFrame({"year": year, "amount_fall": amount_fall})
offset = df_snow.year.mean()
df_snow["x"] = df_snow.year - offset
df_snow["y"] = df_snow.amount_fall
Code
# Priors
range_a = np.linspace(-0.5, 1.5, 51)
range_b = np.linspace(54, 75, 21)
range_sigma = np.linspace(20, 35, 31) # epsilon sigma
cols = ["a", "b", "sigma"]
prior = pd.DataFrame(list(product(range_a, range_b, range_sigma)), columns=cols)
prior["p"] = 1 / len(prior)
prior = Pmf(prior.set_index(cols).p)
# Likelihood
likelihood = prior.copy()
for a, b, sigma in prior.index:
    y_est = a * df_snow.x + b
    residuals = df_snow.y - y_est
    likelihoods = scipy.stats.norm(0, sigma).pdf(residuals)
    likelihood[a, b, sigma] = likelihoods.prod()
# Bayes formula
posterior = prior * likelihood
posterior.normalize()
# Marginal
marginal_a = Pmf(posterior.rename("p").reset_index().groupby("a").p.sum())
marginal_b = Pmf(posterior.rename("p").reset_index().groupby("b").p.sum())
marginal_sigma = Pmf(posterior.rename("p").reset_index().groupby("sigma").p.sum())
5.2.2 MCMC - PyMC
Code
with pm.Model() as model:
    a = pm.Uniform("a", -0.5, 1.5)
    b = pm.Uniform("b", 54, 75)
    sigma = pm.Uniform("sigma", 20, 35)
    est_y = a * df_snow.x + b
    y = pm.Normal("y", mu=est_y, sigma=sigma, observed=df_snow.y)
    trace_linear = pm.sample(1000, return_inferencedata=False)
a_posterior_samples = trace_linear.get_values("a")
b_posterior_samples = trace_linear.get_values("b")
sigma_posterior_samples = trace_linear.get_values("sigma")
posterior_samples = pd.DataFrame(
{
"a": a_posterior_samples,
"b": b_posterior_samples,
"sigma": sigma_posterior_samples,
}
)
posterior_samples.mean().round(2)
5.2.3 MCMC - Bambi
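A minimal sketch with Bambi (an assumed alternative; the Gaussian family is Bambi's default for a linear model):
Code
import arviz as az
import bambi as bmb

model = bmb.Model("y ~ x", df_snow)
results = model.fit(draws=1000)
az.summary(results)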
5.2.4 Uncertainty on regression curve
By leveraging simulations on the posteriors, we can define a credible interval around our linear regression:
- For each value of \(x\):
  - Compute predictions by sampling from the posterior distributions of \(a\), \(b\) and \(\sigma\)
  - Compute the related credible interval

(As in the logistic case, sampling the marginals independently ignores posterior correlations.)
Code
# Simulation parameters
RANGE_YEAR = np.linspace(df_snow.year.min(), df_snow.year.max(), 101)
RANGE_X = RANGE_YEAR - offset
NB_SAMPLE_PER_X = 10_000
QUANTS = [0.05, 0.5, 0.95]
x_to_y_thresh = {}  # will contain the q5, q50 and q95 of the predicted values for each x
# For each value of x
for x in RANGE_X:
    samples_a = marginal_a.choice(NB_SAMPLE_PER_X)
    samples_b = marginal_b.choice(NB_SAMPLE_PER_X)
    samples_sigma = marginal_sigma.choice(NB_SAMPLE_PER_X)
    # compute n predicted values by sampling from posteriors
    ys = samples_a * x + samples_b + scipy.stats.norm(0, samples_sigma).rvs(NB_SAMPLE_PER_X)
    x_to_y_thresh[x] = pd.Series(ys).quantile(QUANTS)
# Format the data
x_to_y_thresh = pd.DataFrame(x_to_y_thresh).T.rename_axis("x").reset_index()
x_to_y_thresh["year"] = RANGE_YEAR
Code
plt.fill_between(
x_to_y_thresh["year"],
x_to_y_thresh[0.05],
x_to_y_thresh[0.95],
color="cornflowerblue",
alpha=0.2,
)
sns.lineplot(data=x_to_y_thresh, x="year", y=0.5, label="Linear model")
sns.scatterplot(
data=df_snow, x="year", y="y", alpha=0.2, color="orangered", label="Observed data"
)
plt.gca().set(
title="Median Snowfall with 90% credible interval",
xlabel="Year",
ylabel="Snowfall (cm)",
)
plt.show()