import pymc as pm
import pandas as pd
import numpy as np
import arviz as az
from pymc.math import dot
import pytensor.tensor as pt
%load_ext lab_black
# 13. Multiple Regression: Brozek Index Prediction
Adapted from fat2d.odc and fatmulti.odc (they appear to be identical). This is the same dataset as the first regression example, available for download here.
# Load the body-fat dataset (tab-separated); column "y" is the Brozek index
# response and the remaining columns are the body-measurement predictors.
data = pd.read_csv("../data/fat.tsv", sep="\t")
y = data["y"].to_numpy(copy=True)
X = data.iloc[:, 1:].to_numpy(copy=True)
# add intercept
X_aug = np.concatenate((np.ones((X.shape[0], 1)), X), axis=1)
n, p = X_aug.shape
# Zellner's g
# Conventional scale for the g-prior; here g = p**2 (so g = 225 with p = 15).
g = p**2
# NOTE: the bare expressions below are notebook cells; the tuples following
# each one are the displayed outputs, not code.
n, p, g
(252, 15, 225)
X_aug.shape
(252, 15)
# Prior mean for the regression coefficients: zero vector of length p.
mu_beta = np.zeros(p)
with pm.Model() as m2d:
    # Precision of the observation noise; variance tracked for the R^2 terms.
    tau = pm.Gamma("tau", 0.01, 0.01)
    variance = pm.Deterministic("variance", 1 / tau)

    # Zellner g-prior on beta: MvNormal with precision (tau / g) * X'X.
    # FIX: use p (number of columns incl. intercept, defined above) instead of
    # the hard-coded (15, 15), so the model works for any design matrix.
    tau_matrix = pt.fill(pt.zeros((p, p)), tau)
    tau_beta = tau_matrix / g * dot(X_aug.T, X_aug)
    beta = pm.MvNormal("beta", mu_beta, tau=tau_beta)

    # Linear predictor and Gaussian likelihood (precision parameterization).
    mu = dot(X_aug, beta)
    pm.Normal("likelihood", mu=mu, tau=tau, observed=y)

    # Bayesian R2 from fat2d.odc
    sse = (n - p) * variance
    cy = y - y.mean()
    sst = dot(cy, cy)
    br2 = pm.Deterministic("br2", 1 - sse / sst)
    br2_adj = pm.Deterministic("br2_adj", 1 - (n - 1) * variance / sst)

    trace = pm.sample(1000)
    ppc = pm.sample_posterior_predictive(trace)
Show code cell output
Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [tau, beta]
100.00% [8000/8000 05:02<00:00 Sampling 4 chains, 0 divergences]
Sampling 4 chains for 1_000 tune and 1_000 draw iterations (4_000 + 4_000 draws total) took 303 seconds.
Sampling: [likelihood]
100.00% [4000/4000 00:00<00:00]
# Posterior summary table with 95% highest-density intervals.
az.summary(trace, hdi_prob=0.95)
mean | sd | hdi_2.5% | hdi_97.5% | mcse_mean | mcse_sd | ess_bulk | ess_tail | r_hat | |
---|---|---|---|---|---|---|---|---|---|
beta[0] | -15.194 | 16.247 | -47.402 | 15.271 | 0.378 | 0.270 | 1848.0 | 2197.0 | 1.0 |
beta[1] | 0.056 | 0.030 | -0.000 | 0.117 | 0.001 | 0.000 | 2840.0 | 2779.0 | 1.0 |
beta[2] | -0.081 | 0.051 | -0.180 | 0.020 | 0.001 | 0.001 | 1904.0 | 2276.0 | 1.0 |
beta[3] | -0.051 | 0.105 | -0.252 | 0.158 | 0.002 | 0.002 | 2354.0 | 2564.0 | 1.0 |
beta[4] | 0.067 | 0.282 | -0.477 | 0.629 | 0.006 | 0.004 | 2522.0 | 2951.0 | 1.0 |
beta[5] | -0.445 | 0.223 | -0.887 | -0.015 | 0.004 | 0.003 | 3368.0 | 2648.0 | 1.0 |
beta[6] | -0.030 | 0.102 | -0.238 | 0.167 | 0.002 | 0.001 | 2832.0 | 2976.0 | 1.0 |
beta[7] | 0.875 | 0.088 | 0.704 | 1.045 | 0.002 | 0.001 | 2974.0 | 2991.0 | 1.0 |
beta[8] | -0.205 | 0.139 | -0.477 | 0.066 | 0.003 | 0.002 | 2613.0 | 2747.0 | 1.0 |
beta[9] | 0.227 | 0.140 | -0.052 | 0.494 | 0.003 | 0.002 | 3136.0 | 2892.0 | 1.0 |
beta[10] | -0.001 | 0.237 | -0.446 | 0.458 | 0.004 | 0.003 | 3192.0 | 2778.0 | 1.0 |
beta[11] | 0.157 | 0.208 | -0.277 | 0.542 | 0.003 | 0.003 | 3804.0 | 3102.0 | 1.0 |
beta[12] | 0.148 | 0.162 | -0.163 | 0.459 | 0.003 | 0.002 | 3599.0 | 2831.0 | 1.0 |
beta[13] | 0.429 | 0.191 | 0.049 | 0.785 | 0.003 | 0.002 | 3723.0 | 2965.0 | 1.0 |
beta[14] | -1.469 | 0.515 | -2.440 | -0.469 | 0.009 | 0.007 | 3423.0 | 3119.0 | 1.0 |
tau | 0.060 | 0.005 | 0.050 | 0.071 | 0.000 | 0.000 | 3490.0 | 2580.0 | 1.0 |
variance | 16.892 | 1.520 | 14.082 | 19.992 | 0.026 | 0.018 | 3490.0 | 2580.0 | 1.0 |
br2 | 0.735 | 0.024 | 0.686 | 0.779 | 0.000 | 0.000 | 3490.0 | 2580.0 | 1.0 |
br2_adj | 0.719 | 0.025 | 0.667 | 0.766 | 0.000 | 0.000 | 3490.0 | 2580.0 | 1.0 |
# Collapse (chain, draw) into one sample dimension, then transpose —
# presumably to get shape (n_samples, n_obs) as az.r2_score expects; confirm
# against the arviz docs if the dataset shape changes.
y_pred = ppc.posterior_predictive.stack(sample=("chain", "draw"))["likelihood"].values.T
az.r2_score(y, y_pred)
r2 0.653819
r2_std 0.024084
dtype: float64
Original paper:
Zellner, A. (1986). “On Assessing Prior Distributions and Bayesian Regression Analysis with g Prior Distributions”. In Goel, P.; Zellner, A. (eds.). Bayesian Inference and Decision Techniques: Essays in Honor of Bruno de Finetti. Studies in Bayesian Econometrics and Statistics. Vol. 6. New York: Elsevier. pp. 233–243. ISBN 978-0-444-87712-3.
%load_ext watermark
%watermark -n -u -v -iv -p pytensor
Last updated: Wed Mar 22 2023
Python implementation: CPython
Python version : 3.11.0
IPython version : 8.9.0
pytensor: 2.10.1
pandas : 1.5.3
arviz : 0.14.0
numpy : 1.24.2
pytensor: 2.10.1
pymc : 5.1.2