import pymc as pm
import pandas as pd
import numpy as np
import arviz as az

19. Paraguay Vaccination Status

This example goes over a multilevel, or hierarchical, logistic regression model. It also shows how to use the PyMC coordinate system.

Adapted from Unit 7: paraguay.odc and paraguaynocluster.odc

The data file, paraguay.csv, is in the course's data directory (loaded below from ../data/paraguay.csv).

This study considers factors influencing the vaccination status among 3424 children of 2552 mothers among 264 clusters in Paraguay. In this analysis, we’re specifically interested in mother-level factors related to child immunization. However, there is randomness associated with different clusters.

  • ID3: Cluster number

  • VACCODE: =1 if fully immunized, =0 otherwise

  • LB.TOT: No. of live births

  • MAGE2: mother age <20 =1, otherwise = 0

  • UN2: consensual union = 1, otherwise = 0

  • TOILET2: unsafe toilet in hh = 1, otherwise = 0

  • PR.SPOC1: spouse unskilled laborer = 1, otherwise = 0

  • SPANISH2: Spanish not hh language = 1, otherwise = 0

We need to add a random effect by cluster. This is a good use case for the PyMC coordinates system.

# Load the Paraguay vaccination dataset (one row per child; 3424 rows).
data = pd.read_csv("../data/paraguay.csv")
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3424 entries, 0 to 3423
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype
---  ------    --------------  -----
 0   ID3       3424 non-null   int64
 1   VACCODE   3424 non-null   int64
 2   LB.TOT    3424 non-null   int64
 3   MAGE2     3424 non-null   int64
 4   UN2       3424 non-null   int64
 5   TOILET2   3424 non-null   int64
 6   PR.SPOC1  3424 non-null   int64
 7   SPANISH2  3424 non-null   int64
dtypes: int64(8)
memory usage: 214.1 KB
# Response: 1 if the child is fully immunized, 0 otherwise.
y = data["VACCODE"].to_numpy()
# Cluster labels are kept in their own array for the random effect.
clusters = data["ID3"].to_numpy()
# Predictors only: drop the response and the cluster id.
X = data.drop(columns=["VACCODE", "ID3"]).to_numpy()
# Prepend an intercept column of ones to the design matrix.
X_aug = np.column_stack((np.ones(len(X)), X))
cols = X_aug.shape[1]
y.shape, clusters.shape, X_aug.shape
((3424,), (3424,), (3424, 7))
# Map each row's ID3 value to a dense integer index (cluster_idx) and
# keep the unique cluster labels for the "cluster" coordinate.
cluster_idx, clusters = pd.factorize(data["ID3"])
# Two named dimensions: one entry per cluster, one per observation (row).
coords = dict(cluster=clusters, id=data.index.to_numpy())
# note that the coords dict is passed to pm.Model call
with pm.Model(coords=coords) as m:
    # Design matrix (intercept + 6 predictors) and response as shared data.
    X_data = pm.Data("X_data", X_aug)
    y_data = pm.Data("y_data", y)
    # Per-row cluster index; dims="id" ties it to the observation dimension.
    clust_idx = pm.Data("cluster_idx", cluster_idx, dims="id")

    # Precision of the cluster random effects (diffuse BUGS-style Gamma prior).
    cluster_tau = pm.Gamma("cluster_tau", 0.01, 0.01)
    # Track the implied variance (1/precision) so it shows up in summaries.
    cluster_variance = pm.Deterministic("cluster_variance", 1 / cluster_tau)
    # Diffuse Normal priors on the coefficients (tau=1e-3 -> sd ~ 31.6).
    beta = pm.Normal("beta", 0, tau=1e-3, shape=cols)

    # One random intercept per cluster, mapped back to rows via clust_idx.
    cluster_effect = pm.Normal(
        "cluster_effect", 0, tau=cluster_tau, dims="cluster"
    )
    p = pm.math.dot(X_data, beta) + cluster_effect[clust_idx]

    # Bernoulli likelihood on the logit scale.
    pm.Bernoulli("likelihood", logit_p=p, observed=y_data)

    trace = pm.sample(3000)
Hide code cell output
Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [cluster_tau, beta, cluster_effect]

Sampling 4 chains for 1_000 tune and 3_000 draw iterations (4_000 + 12_000 draws total) took 21 seconds.
# Posterior summary for the coefficients and the cluster variance;
# filter_vars="like" matches any variable name containing these substrings.
az.summary(
    trace, var_names=["beta", "cluster_variance"], filter_vars="like", kind="stats"
)
mean sd hdi_3% hdi_97%
beta[0] 1.445 0.130 1.206 1.694
beta[1] -0.071 0.015 -0.099 -0.043
beta[2] -0.566 0.204 -0.939 -0.175
beta[3] -0.196 0.095 -0.370 -0.017
beta[4] -0.693 0.133 -0.943 -0.444
beta[5] -0.285 0.110 -0.483 -0.069
beta[6] -0.621 0.098 -0.802 -0.428
cluster_variance 0.529 0.094 0.364 0.711

This model was originally based on an older coordinates example from the PyMC documentation.

No clusters

with pm.Model() as m_nc:
    # Same design matrix and response as shared data. pm.Data is always
    # mutable in current PyMC, so the deprecated mutable=True kwarg is
    # dropped (it emits a FutureWarning and will raise in a future release).
    X_data = pm.Data("X_data", X_aug)
    y_data = pm.Data("y_data", y)

    # Diffuse Normal priors on the regression coefficients.
    beta = pm.Normal("beta", 0, tau=1e-3, shape=cols)

    # Linear predictor on the logit scale -- no cluster random effect here.
    p = pm.math.dot(X_data, beta)

    pm.Bernoulli("likelihood", logit_p=p, observed=y_data)

    trace_nc = pm.sample(3000)
Hide code cell output
/Users/aaron/miniforge3/envs/pymc_macos15/lib/python3.12/site-packages/pymc/data.py:440: FutureWarning: Data is now always mutable. Specifying the `mutable` kwarg will raise an error in a future release
  warnings.warn(
Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [beta]

Sampling 4 chains for 1_000 tune and 3_000 draw iterations (4_000 + 12_000 draws total) took 13 seconds.
# Summarize only the coefficients: the no-cluster model has no
# cluster_variance variable, so requesting it here (as the clustered
# model's summary does) would silently match nothing under
# filter_vars="like".
az.summary(
    trace_nc,
    var_names=["beta"],
    filter_vars="like",
    kind="stats",
)
mean sd hdi_3% hdi_97%
beta[0] 1.429 0.107 1.227 1.631
beta[1] -0.064 0.014 -0.089 -0.038
beta[2] -0.565 0.193 -0.931 -0.202
beta[3] -0.189 0.085 -0.344 -0.024
beta[4] -0.716 0.116 -0.934 -0.499
beta[5] -0.451 0.089 -0.622 -0.290
beta[6] -0.607 0.084 -0.764 -0.446
# Record the execution environment (date, Python, and package versions).
%load_ext watermark
%watermark -n -u -v -iv -p pytensor
Last updated: Sat Nov 09 2024

Python implementation: CPython
Python version       : 3.12.7
IPython version      : 8.29.0

pytensor: 2.26.0

arviz : 0.20.0
pymc  : 5.18.0
pandas: 2.2.3
numpy : 1.26.4