{ "cells": [ { "cell_type": "code", "execution_count": 1, "id": "8781b1e5-e41b-4219-8229-88a1a7c71dc2", "metadata": {}, "outputs": [], "source": [ "import pymc as pm\n", "import pandas as pd\n", "import numpy as np\n", "import arviz as az\n", "from pymc.math import dot\n", "import pytensor.tensor as pt\n", "\n", "%load_ext lab_black" ] }, { "cell_type": "markdown", "id": "bc073c69-254b-46e2-ba24-9782ac2e96aa", "metadata": { "tags": [] }, "source": [ "# 13. Multiple Regression: Brozek Index Prediction*" ] }, { "cell_type": "markdown", "id": "740924ab-f64c-468c-8225-cbdd2c1c9598", "metadata": {}, "source": [ "Adapted from [fat2d.odc](https://raw.githubusercontent.com/areding/6420-pymc/main/original_examples/Codes4Unit7/fat1.odc) and [fatmulti.odc](https://raw.githubusercontent.com/areding/6420-pymc/main/original_examples/Codes4Unit7/fatmulti.odc) (they appear to be identical). This is the same dataset as [the first regression example](./Unit7-demo-regression.ipynb), available for download [here](https://raw.githubusercontent.com/areding/6420-pymc/main/data/fat.tsv)." ] }, { "cell_type": "code", "execution_count": 2, "id": "de022d74-1d1e-4670-8a3e-70bd45ff404b", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(252, 15, 225)" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data = pd.read_csv(\"../data/fat.tsv\", sep=\"\\t\")\n", "\n", "y = data[\"y\"].to_numpy(copy=True)\n", "X = data.iloc[:, 1:].to_numpy(copy=True)\n", "\n", "# add intercept\n", "X_aug = np.concatenate((np.ones((X.shape[0], 1)), X), axis=1)\n", "n, p = X_aug.shape\n", "\n", "# Zellner's g\n", "g = p**2\n", "\n", "n, p, g" ] }, { "cell_type": "code", "execution_count": 3, "id": "48e3b1a7-68da-46fc-addf-0b34746c9707", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(252, 15)" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "X_aug.shape" ] }, { "cell_type": "code", "execution_count": 4, "id": "4f08b917-8797-4f60-9e06-b614e8e4c27a", "metadata": {}, "outputs": [], "source": [ "mu_beta = np.zeros(p)" ] }, { "cell_type": "code", "execution_count": 5, "id": "321af628-9851-4986-b2df-7b31d99fb6b8", "metadata": { "tags": [ "hide-output" ] }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Auto-assigning NUTS sampler...\n", "Initializing NUTS using jitter+adapt_diag...\n", "Multiprocess sampling (4 chains in 4 jobs)\n", "NUTS: [tau, beta]\n" ] }, { "data": { "text/html": [ "\n", "\n" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "\n", "
\n", " \n", " 100.00% [8000/8000 05:02<00:00 Sampling 4 chains, 0 divergences]\n", "
\n", " " ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stderr", "output_type": "stream", "text": [ "Sampling 4 chains for 1_000 tune and 1_000 draw iterations (4_000 + 4_000 draws total) took 303 seconds.\n", "Sampling: [likelihood]\n" ] }, { "data": { "text/html": [ "\n", "\n" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "\n", "
\n", " \n", " 100.00% [4000/4000 00:00<00:00]\n", "
\n", " " ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "with pm.Model() as m2d:\n", " tau = pm.Gamma(\"tau\", 0.01, 0.01)\n", " variance = pm.Deterministic(\"variance\", 1 / tau)\n", "\n", " tau_matrix = pt.fill(pt.zeros((15, 15)), tau)\n", " tau_beta = tau_matrix / g * dot(X_aug.T, X_aug)\n", " beta = pm.MvNormal(\"beta\", mu_beta, tau=tau_beta)\n", "\n", " mu = dot(X_aug, beta)\n", " pm.Normal(\"likelihood\", mu=mu, tau=tau, observed=y)\n", "\n", " # Bayesian R2 from fat2d.odc\n", " sse = (n - p) * variance\n", " cy = y - y.mean()\n", " sst = dot(cy, cy)\n", " br2 = pm.Deterministic(\"br2\", 1 - sse / sst)\n", " br2_adj = pm.Deterministic(\"br2_adj\", 1 - (n - 1) * variance / sst)\n", "\n", " trace = pm.sample(1000)\n", " ppc = pm.sample_posterior_predictive(trace)" ] }, { "cell_type": "code", "execution_count": 6, "id": "e60158dc-1962-42b8-9b7c-dc3d0060cd0e", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
meansdhdi_2.5%hdi_97.5%mcse_meanmcse_sdess_bulkess_tailr_hat
beta[0]-15.19416.247-47.40215.2710.3780.2701848.02197.01.0
beta[1]0.0560.030-0.0000.1170.0010.0002840.02779.01.0
beta[2]-0.0810.051-0.1800.0200.0010.0011904.02276.01.0
beta[3]-0.0510.105-0.2520.1580.0020.0022354.02564.01.0
beta[4]0.0670.282-0.4770.6290.0060.0042522.02951.01.0
beta[5]-0.4450.223-0.887-0.0150.0040.0033368.02648.01.0
beta[6]-0.0300.102-0.2380.1670.0020.0012832.02976.01.0
beta[7]0.8750.0880.7041.0450.0020.0012974.02991.01.0
beta[8]-0.2050.139-0.4770.0660.0030.0022613.02747.01.0
beta[9]0.2270.140-0.0520.4940.0030.0023136.02892.01.0
beta[10]-0.0010.237-0.4460.4580.0040.0033192.02778.01.0
beta[11]0.1570.208-0.2770.5420.0030.0033804.03102.01.0
beta[12]0.1480.162-0.1630.4590.0030.0023599.02831.01.0
beta[13]0.4290.1910.0490.7850.0030.0023723.02965.01.0
beta[14]-1.4690.515-2.440-0.4690.0090.0073423.03119.01.0
tau0.0600.0050.0500.0710.0000.0003490.02580.01.0
variance16.8921.52014.08219.9920.0260.0183490.02580.01.0
br20.7350.0240.6860.7790.0000.0003490.02580.01.0
br2_adj0.7190.0250.6670.7660.0000.0003490.02580.01.0
\n", "
" ], "text/plain": [ " mean sd hdi_2.5% hdi_97.5% mcse_mean mcse_sd ess_bulk \\\n", "beta[0] -15.194 16.247 -47.402 15.271 0.378 0.270 1848.0 \n", "beta[1] 0.056 0.030 -0.000 0.117 0.001 0.000 2840.0 \n", "beta[2] -0.081 0.051 -0.180 0.020 0.001 0.001 1904.0 \n", "beta[3] -0.051 0.105 -0.252 0.158 0.002 0.002 2354.0 \n", "beta[4] 0.067 0.282 -0.477 0.629 0.006 0.004 2522.0 \n", "beta[5] -0.445 0.223 -0.887 -0.015 0.004 0.003 3368.0 \n", "beta[6] -0.030 0.102 -0.238 0.167 0.002 0.001 2832.0 \n", "beta[7] 0.875 0.088 0.704 1.045 0.002 0.001 2974.0 \n", "beta[8] -0.205 0.139 -0.477 0.066 0.003 0.002 2613.0 \n", "beta[9] 0.227 0.140 -0.052 0.494 0.003 0.002 3136.0 \n", "beta[10] -0.001 0.237 -0.446 0.458 0.004 0.003 3192.0 \n", "beta[11] 0.157 0.208 -0.277 0.542 0.003 0.003 3804.0 \n", "beta[12] 0.148 0.162 -0.163 0.459 0.003 0.002 3599.0 \n", "beta[13] 0.429 0.191 0.049 0.785 0.003 0.002 3723.0 \n", "beta[14] -1.469 0.515 -2.440 -0.469 0.009 0.007 3423.0 \n", "tau 0.060 0.005 0.050 0.071 0.000 0.000 3490.0 \n", "variance 16.892 1.520 14.082 19.992 0.026 0.018 3490.0 \n", "br2 0.735 0.024 0.686 0.779 0.000 0.000 3490.0 \n", "br2_adj 0.719 0.025 0.667 0.766 0.000 0.000 3490.0 \n", "\n", " ess_tail r_hat \n", "beta[0] 2197.0 1.0 \n", "beta[1] 2779.0 1.0 \n", "beta[2] 2276.0 1.0 \n", "beta[3] 2564.0 1.0 \n", "beta[4] 2951.0 1.0 \n", "beta[5] 2648.0 1.0 \n", "beta[6] 2976.0 1.0 \n", "beta[7] 2991.0 1.0 \n", "beta[8] 2747.0 1.0 \n", "beta[9] 2892.0 1.0 \n", "beta[10] 2778.0 1.0 \n", "beta[11] 3102.0 1.0 \n", "beta[12] 2831.0 1.0 \n", "beta[13] 2965.0 1.0 \n", "beta[14] 3119.0 1.0 \n", "tau 2580.0 1.0 \n", "variance 2580.0 1.0 \n", "br2 2580.0 1.0 \n", "br2_adj 2580.0 1.0 " ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "az.summary(trace, hdi_prob=0.95)" ] }, { "cell_type": "code", "execution_count": 7, "id": "4a6e7cfe-08b8-40c2-ba19-ae4e353ddf2d", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "r2 0.653819\n", "r2_std 0.024084\n", "dtype: float64" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "y_pred = ppc.posterior_predictive.stack(sample=(\"chain\", \"draw\"))[\"likelihood\"].values.T\n", "az.r2_score(y, y_pred)" ] }, { "cell_type": "markdown", "id": "cbaeb295-78b8-4212-bab4-7029b37dbf66", "metadata": {}, "source": [ "- https://arxiv.org/abs/1702.01201\n", "\n", "- https://towardsdatascience.com/linear-regression-model-selection-through-zellners-g--prior-da5f74635a03\n", "\n", "- https://en.wikipedia.org/wiki/G-prior\\\n", "\n", "Original paper:\n", "\n", "Zellner, A. (1986). \"On Assessing Prior Distributions and Bayesian Regression Analysis with g Prior Distributions\". In Goel, P.; Zellner, A. (eds.). Bayesian Inference and Decision Techniques: Essays in Honor of Bruno de Finetti. Studies in Bayesian Econometrics and Statistics. Vol. 6. New York: Elsevier. pp. 233 243. ISBN 978-0-444-87712-3." ] }, { "cell_type": "code", "execution_count": 8, "id": "f298b004-1364-475c-99f1-452c423978a9", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Last updated: Wed Mar 22 2023\n", "\n", "Python implementation: CPython\n", "Python version : 3.11.0\n", "IPython version : 8.9.0\n", "\n", "pytensor: 2.10.1\n", "\n", "pandas : 1.5.3\n", "arviz : 0.14.0\n", "numpy : 1.24.2\n", "pytensor: 2.10.1\n", "pymc : 5.1.2\n", "\n" ] } ], "source": [ "%load_ext watermark\n", "%watermark -n -u -v -iv -p pytensor" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.0" } }, "nbformat": 4, "nbformat_minor": 5 }