13. Multiple Regression: Brozek Index Prediction*

import pymc as pm
import pandas as pd
import numpy as np
import arviz as az
from pymc.math import dot
import pytensor.tensor as pt

13. Multiple Regression: Brozek Index Prediction*#

Adapted from fat2d.odc and fatmulti.odc (they appear to be identical). This is the same dataset as the first regression example, available for download here.

This is not necessarily the best model for multiple regression; rather, it is just an attempt at implementing the lecture code, which is using Zellner’s G-prior Zellner [1986]. For multivariate regression in PyMC, we recommend pm.LKJCholeskyCov or pm.LKJCorr as your covariance matrix prior.

data = pd.read_csv("../data/fat.tsv", sep="\t")

y = data["y"].to_numpy(copy=True)
X = data.iloc[:, 1:].to_numpy(copy=True)

# add intercept
X_aug = np.concatenate((np.ones((X.shape[0], 1)), X), axis=1)
n, p = X_aug.shape

# Zellner's g
g = p**2

n, p, g

(252, 15, 225)

mu_beta = np.zeros(p)

with pm.Model() as m2d:
    tau = pm.Gamma("tau", 0.01, 0.01)
    variance = pm.Deterministic("variance", 1 / tau)

    tau_matrix = pt.fill(pt.zeros((15, 15)), tau)
    tau_beta = tau_matrix / g * dot(X_aug.T, X_aug)
    beta = pm.MvNormal("beta", mu_beta, tau=tau_beta)

    mu = dot(X_aug, beta)
    pm.Normal("likelihood", mu=mu, tau=tau, observed=y)

    # Bayesian R2 from fat2d.odc
    sse = (n - p) * variance
    cy = y - y.mean()
    sst = dot(cy, cy)
    br2 = pm.Deterministic("br2", 1 - sse / sst)
    br2_adj = pm.Deterministic("br2_adj", 1 - (n - 1) * variance / sst)

    trace = pm.sample(1000)
    ppc = pm.sample_posterior_predictive(trace)

az.summary(trace, hdi_prob=0.95)

	mean	sd	hdi_2.5%	hdi_97.5%	mcse_mean	mcse_sd	ess_bulk	ess_tail	r_hat
beta[0]	-15.031	16.576	-45.559	16.813	0.353	0.272	2219.0	2267.0	1.0
beta[1]	0.056	0.031	-0.001	0.118	0.001	0.000	3536.0	3260.0	1.0
beta[2]	-0.081	0.052	-0.183	0.018	0.001	0.001	2301.0	2362.0	1.0
beta[3]	-0.056	0.105	-0.262	0.146	0.002	0.001	3175.0	3214.0	1.0
beta[4]	0.053	0.285	-0.480	0.613	0.005	0.004	3119.0	2857.0	1.0
beta[5]	-0.441	0.225	-0.889	-0.024	0.004	0.003	3456.0	2912.0	1.0
beta[6]	-0.028	0.101	-0.231	0.165	0.002	0.001	3516.0	2901.0	1.0
beta[7]	0.874	0.088	0.695	1.040	0.001	0.001	3545.0	2925.0	1.0
beta[8]	-0.200	0.141	-0.490	0.056	0.003	0.002	2682.0	2755.0	1.0
beta[9]	0.225	0.139	-0.048	0.498	0.002	0.002	3178.0	2759.0	1.0
beta[10]	0.003	0.235	-0.438	0.476	0.004	0.004	3414.0	2592.0	1.0
beta[11]	0.153	0.208	-0.241	0.577	0.003	0.003	4039.0	2692.0	1.0
beta[12]	0.148	0.165	-0.178	0.455	0.003	0.002	3584.0	2960.0	1.0
beta[13]	0.419	0.196	0.048	0.799	0.003	0.002	3888.0	2837.0	1.0
beta[14]	-1.470	0.499	-2.501	-0.559	0.008	0.006	3686.0	3192.0	1.0
tau	0.060	0.005	0.050	0.070	0.000	0.000	3663.0	2827.0	1.0
variance	16.920	1.548	14.172	20.159	0.026	0.018	3663.0	2827.0	1.0
br2	0.734	0.024	0.683	0.777	0.000	0.000	3663.0	2827.0	1.0
br2_adj	0.718	0.026	0.664	0.764	0.000	0.000	3663.0	2827.0	1.0

y_pred = ppc.posterior_predictive.stack(sample=("chain", "draw"))["likelihood"].values.T
az.r2_score(y, y_pred)

r2        0.654267
r2_std    0.023808
dtype: float64

%load_ext watermark
%watermark -n -u -v -iv -p pytensor

Last updated: Sun Mar 09 2025

Python implementation: CPython
Python version       : 3.12.7
IPython version      : 8.29.0

pytensor: 2.26.4

arviz   : 0.20.0
pandas  : 2.2.3
pytensor: 2.26.4
pymc    : 5.19.1
numpy   : 1.26.4