import pymc as pm
import pandas as pd
import numpy as np
import arviz as az
19. Paraguay Vaccination Status*
This example goes over a multilevel (hierarchical) logistic regression model. It also shows how to use the PyMC coordinates system.
Adapted from Unit 7: paraguay.odc and paraguaynocluster.odc
Data can be found here.
This study considers factors influencing the vaccination status of 3424 children born to 2552 mothers across 264 clusters in Paraguay. In this analysis, we're specifically interested in mother-level factors related to child immunization; however, the clusters introduce additional random variation that the model should account for.
ID3: cluster number
VACCODE: = 1 if fully immunized, = 0 otherwise
LB.TOT: number of live births
MAGE2: = 1 if mother's age < 20, = 0 otherwise
UN2: = 1 if in a consensual union, = 0 otherwise
TOILET2: = 1 if the household has an unsafe toilet, = 0 otherwise
PR.SPOC1: = 1 if the spouse is an unskilled laborer, = 0 otherwise
SPANISH2: = 1 if Spanish is not the household language, = 0 otherwise
We need to add a random effect by cluster. This is a good use case for the PyMC coordinates system.
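Concretely, for child $i$ with covariate row $\mathbf{x}_i$ and cluster $j(i)$, the model below is (a sketch matching the priors in the PyMC code that follows):

$$
\begin{aligned}
y_i &\sim \mathrm{Bernoulli}(p_i), \qquad \operatorname{logit}(p_i) = \mathbf{x}_i^\top \boldsymbol{\beta} + u_{j(i)} \\
\beta_k &\sim \mathcal{N}(0, \tau = 0.001), \qquad u_j \sim \mathcal{N}\left(0, \tau_{\text{cluster}}\right), \qquad \tau_{\text{cluster}} \sim \mathrm{Gamma}(0.01, 0.01)
\end{aligned}
$$

where $\tau$ denotes a precision (inverse variance), following the BUGS convention that PyMC's `tau` argument mirrors.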
data = pd.read_csv("../data/paraguay.csv")
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3424 entries, 0 to 3423
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype
---  ------    --------------  -----
 0   ID3       3424 non-null   int64
 1   VACCODE   3424 non-null   int64
 2   LB.TOT    3424 non-null   int64
 3   MAGE2     3424 non-null   int64
 4   UN2       3424 non-null   int64
 5   TOILET2   3424 non-null   int64
 6   PR.SPOC1  3424 non-null   int64
 7   SPANISH2  3424 non-null   int64
dtypes: int64(8)
memory usage: 214.1 KB
y = data["VACCODE"].to_numpy()
# separate array for clusters
clusters = data["ID3"].to_numpy()
X = data.drop(["VACCODE", "ID3"], axis=1).to_numpy()
# prepend a column of ones for the intercept
X_aug = np.concatenate((np.ones((X.shape[0], 1)), X), axis=1)
cols = X_aug.shape[1]
y.shape, clusters.shape, X_aug.shape
((3424,), (3424,), (3424, 7))
# set up alternate coordinates, the ID3 or clusters column
cluster_idx, clusters = pd.factorize(data.ID3)
coords = {"cluster": clusters, "id": data.index.to_numpy()}
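`pd.factorize` maps each distinct cluster label to a 0-based integer code (used below to index the random effects) and returns the unique labels in order of first appearance (used as the coordinate values). A toy sketch with made-up labels, not the real ID3 values:

```python
codes, uniques = pd.factorize(pd.Series([5, 5, 12, 7, 12]))
print(codes)    # [0 0 1 2 1]
print(uniques)  # Index([5, 12, 7], dtype='int64')
```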
# note that the coords dict is passed to pm.Model call
with pm.Model(coords=coords) as m:
    X_data = pm.Data("X_data", X_aug)
    y_data = pm.Data("y_data", y)
    clust_idx = pm.Data("cluster_idx", cluster_idx, dims="id")

    # prior on the cluster random-effect precision (and the implied variance)
    cluster_tau = pm.Gamma("cluster_tau", 0.01, 0.01)
    cluster_variance = pm.Deterministic("cluster_variance", 1 / cluster_tau)

    # fixed-effect coefficients (intercept + 6 predictors)
    beta = pm.Normal("beta", 0, tau=1e-3, shape=cols)

    # one random effect per cluster, indexed per observation
    cluster_effect = pm.Normal("cluster_effect", 0, tau=cluster_tau, dims="cluster")
    p = pm.math.dot(X_data, beta) + cluster_effect[clust_idx]

    pm.Bernoulli("likelihood", logit_p=p, observed=y_data)

    trace = pm.sample(3000)
Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [cluster_tau, beta, cluster_effect]
Sampling 4 chains for 1_000 tune and 3_000 draw iterations (4_000 + 12_000 draws total) took 21 seconds.
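Because `cluster_effect` was declared with `dims="cluster"`, its posterior draws are labeled with the original ID3 values, so a cluster's random effect can be pulled out by label rather than by position. A quick sketch (the label `1` is assumed to exist in ID3):

```python
# posterior mean of the random effect for the cluster labeled 1 (assumed label)
trace.posterior["cluster_effect"].sel(cluster=1).mean().item()
```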
az.summary(
    trace,
    var_names=["beta", "cluster_variance"],
    filter_vars="like",
    kind="stats",
)
|  | mean | sd | hdi_3% | hdi_97% |
|---|---|---|---|---|
| beta[0] | 1.445 | 0.130 | 1.206 | 1.694 |
| beta[1] | -0.071 | 0.015 | -0.099 | -0.043 |
| beta[2] | -0.566 | 0.204 | -0.939 | -0.175 |
| beta[3] | -0.196 | 0.095 | -0.370 | -0.017 |
| beta[4] | -0.693 | 0.133 | -0.943 | -0.444 |
| beta[5] | -0.285 | 0.110 | -0.483 | -0.069 |
| beta[6] | -0.621 | 0.098 | -0.802 | -0.428 |
| cluster_variance | 0.529 | 0.094 | 0.364 | 0.711 |
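Here `beta[0]` is the intercept, and `beta[1]` through `beta[6]` follow the column order of `X`: LB.TOT, MAGE2, UN2, TOILET2, PR.SPOC1, and SPANISH2. The coefficients live on the log-odds scale, so exponentiating turns them into odds ratios; a post-processing sketch (not part of the original notebook):

```python
# posterior-mean odds ratios; values below 1 mean lower odds of full immunization
odds_ratios = np.exp(trace.posterior["beta"].mean(dim=("chain", "draw")))
print(odds_ratios.values)
```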
This was originally based on this old coordinates example.
No clusters
with pm.Model() as m_nc:
    X_data = pm.Data("X_data", X_aug)
    y_data = pm.Data("y_data", y)

    # same fixed effects as above, but no cluster random effect
    beta = pm.Normal("beta", 0, tau=1e-3, shape=cols)
    p = pm.math.dot(X_data, beta)

    pm.Bernoulli("likelihood", logit_p=p, observed=y_data)

    trace_nc = pm.sample(3000)
Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [beta]
Sampling 4 chains for 1_000 tune and 3_000 draw iterations (4_000 + 12_000 draws total) took 13 seconds.
az.summary(
    trace_nc,
    var_names=["beta"],
    filter_vars="like",
    kind="stats",
)
|  | mean | sd | hdi_3% | hdi_97% |
|---|---|---|---|---|
| beta[0] | 1.429 | 0.107 | 1.227 | 1.631 |
| beta[1] | -0.064 | 0.014 | -0.089 | -0.038 |
| beta[2] | -0.565 | 0.193 | -0.931 | -0.202 |
| beta[3] | -0.189 | 0.085 | -0.344 | -0.024 |
| beta[4] | -0.716 | 0.116 | -0.934 | -0.499 |
| beta[5] | -0.451 | 0.089 | -0.622 | -0.290 |
| beta[6] | -0.607 | 0.084 | -0.764 | -0.446 |
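To check whether the cluster random effect actually improves predictive fit, the two models could be compared with PSIS-LOO. This step is not in the original notebook, and `pm.sample` does not store pointwise log-likelihoods by default, so they have to be computed first:

```python
# hypothetical comparison of the clustered and no-cluster models
with m:
    pm.compute_log_likelihood(trace)
with m_nc:
    pm.compute_log_likelihood(trace_nc)

az.compare({"clustered": trace, "no clusters": trace_nc})
```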
%load_ext watermark
%watermark -n -u -v -iv -p pytensor
Last updated: Sat Nov 09 2024
Python implementation: CPython
Python version : 3.12.7
IPython version : 8.29.0
pytensor: 2.26.0
arviz : 0.20.0
pymc : 5.18.0
pandas: 2.2.3
numpy : 1.26.4