{ "cells": [ { "cell_type": "code", "execution_count": 5, "id": "334fb185-d552-42c0-875a-bc6839408e05", "metadata": {}, "outputs": [], "source": [ "import pymc as pm\n", "import numpy as np\n", "import pandas as pd\n", "import arviz as az" ] }, { "cell_type": "markdown", "id": "e6298873-237d-41b0-9452-7812d26b8ff3", "metadata": {}, "source": [ "# 10. SSVS: Hald*" ] }, { "cell_type": "markdown", "id": "b46cdf8e-f486-4d40-a2b3-3da23bda254b", "metadata": {}, "source": [ "Adapted from [Haldssvs.odc](https://raw.githubusercontent.com/areding/6420-pymc/main/original_examples/Codes4Unit9/Haldssvs.odc).\n", "\n", "SVSS can be looked at as adding an indicator variable to each covariate, then randomly turning each one on and off as you sample." ] }, { "cell_type": "code", "execution_count": 2, "id": "09c1a672-ec64-40b7-989f-255039d5cb12", "metadata": {}, "outputs": [], "source": [ "data = pd.read_csv(\"../data/hald_data.csv\")\n", "y = data[\"y\"].to_numpy()\n", "X = data.drop(\"y\", axis=1).to_numpy()\n", "\n", "X_centered = (X - X.mean(axis=0)) / X.std(axis=0)" ] }, { "cell_type": "code", "execution_count": 6, "id": "85efbc45-0ad0-4f3e-b154-cacc99b26166", "metadata": { "tags": [ "hide-output" ] }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Multiprocess sampling (4 chains in 4 jobs)\n", "CompoundStep\n", ">BinaryGibbsMetropolis: [delta]\n", ">NUTS: [alpha, tau, intercept]\n" ] }, { "data": { "text/html": [ "\n", "\n" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "\n", "
\n", " \n", " 100.00% [44000/44000 00:18<00:00 Sampling 4 chains, 0 divergences]\n", "
\n", " " ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stderr", "output_type": "stream", "text": [ "Sampling 4 chains for 1_000 tune and 10_000 draw iterations (4_000 + 40_000 draws total) took 19 seconds.\n" ] } ], "source": [ "_, p = X.shape\n", "draws = 10000\n", "chains = 4\n", "\n", "with pm.Model() as m_svss:\n", " # SVSS prior\n", " delta = pm.Bernoulli(\"delta\", p=0.5, shape=p)\n", " alpha = pm.Normal(\"alpha\", 0, tau=0.1, shape=p)\n", " beta = pm.Deterministic(\"beta\", delta * alpha)\n", "\n", " tau = pm.Gamma(\"tau\", 0.1, 0.1)\n", " intercept = pm.Normal(\"intercept\", 0, tau=0.001)\n", "\n", " mu = intercept + pm.math.dot(X_centered, beta)\n", " pm.Normal(\"likelihood\", mu, tau=tau, observed=y)\n", "\n", " trace = pm.sample(draws=draws, chains=chains, target_accept=0.95)" ] }, { "cell_type": "code", "execution_count": 7, "id": "4e6dfe42-9659-47d1-9853-2f72cc4bc184", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
meansdhdi_3%hdi_97%
delta[0]1.0000.0211.01.0
delta[1]0.9310.2540.01.0
delta[2]0.4300.4950.01.0
delta[3]0.9410.2351.01.0
\n", "
" ], "text/plain": [ " mean sd hdi_3% hdi_97%\n", "delta[0] 1.000 0.021 1.0 1.0\n", "delta[1] 0.931 0.254 0.0 1.0\n", "delta[2] 0.430 0.495 0.0 1.0\n", "delta[3] 0.941 0.235 1.0 1.0" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "az.summary(trace, var_names=\"delta\", kind=\"stats\")" ] }, { "cell_type": "markdown", "id": "bf73349c-5ed5-41de-a740-3f147ad374cc", "metadata": {}, "source": [ "Delta values, representing how often each feature was selected for the model, match the BUGS results. But what about looking at how often each model was selected? I really don't want to code that quadruple for-loop (the model variable in the BUGS example) or try to vectorize it.\n", "\n", "Let's look in the trace. We can view the delta variable across all chains and samples." ] }, { "cell_type": "code", "execution_count": 8, "id": "348211f0-da7b-4b8d-be3c-db0c0885794f", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "
<xarray.DataArray 'delta' (chain: 4, draw: 10000, delta_dim_0: 4)>\n",
       "array([[[1, 1, 0, 1],\n",
       "        [1, 1, 1, 1],\n",
       "        [1, 1, 1, 1],\n",
       "        ...,\n",
       "        [1, 1, 0, 0],\n",
       "        [1, 1, 0, 0],\n",
       "        [1, 1, 1, 0]],\n",
       "\n",
       "       [[1, 1, 0, 1],\n",
       "        [1, 1, 0, 1],\n",
       "        [1, 1, 0, 1],\n",
       "        ...,\n",
       "        [1, 1, 1, 1],\n",
       "        [1, 1, 1, 1],\n",
       "        [1, 1, 0, 1]],\n",
       "\n",
       "       [[1, 1, 1, 1],\n",
       "        [1, 1, 1, 1],\n",
       "        [1, 1, 1, 1],\n",
       "        ...,\n",
       "        [1, 1, 0, 1],\n",
       "        [1, 1, 0, 1],\n",
       "        [1, 1, 0, 1]],\n",
       "\n",
       "       [[1, 1, 0, 1],\n",
       "        [1, 1, 0, 1],\n",
       "        [1, 1, 0, 1],\n",
       "        ...,\n",
       "        [1, 1, 0, 1],\n",
       "        [1, 1, 0, 1],\n",
       "        [1, 1, 0, 1]]])\n",
       "Coordinates:\n",
       "  * chain        (chain) int64 0 1 2 3\n",
       "  * draw         (draw) int64 0 1 2 3 4 5 6 ... 9994 9995 9996 9997 9998 9999\n",
       "  * delta_dim_0  (delta_dim_0) int64 0 1 2 3
" ], "text/plain": [ "\n", "array([[[1, 1, 0, 1],\n", " [1, 1, 1, 1],\n", " [1, 1, 1, 1],\n", " ...,\n", " [1, 1, 0, 0],\n", " [1, 1, 0, 0],\n", " [1, 1, 1, 0]],\n", "\n", " [[1, 1, 0, 1],\n", " [1, 1, 0, 1],\n", " [1, 1, 0, 1],\n", " ...,\n", " [1, 1, 1, 1],\n", " [1, 1, 1, 1],\n", " [1, 1, 0, 1]],\n", "\n", " [[1, 1, 1, 1],\n", " [1, 1, 1, 1],\n", " [1, 1, 1, 1],\n", " ...,\n", " [1, 1, 0, 1],\n", " [1, 1, 0, 1],\n", " [1, 1, 0, 1]],\n", "\n", " [[1, 1, 0, 1],\n", " [1, 1, 0, 1],\n", " [1, 1, 0, 1],\n", " ...,\n", " [1, 1, 0, 1],\n", " [1, 1, 0, 1],\n", " [1, 1, 0, 1]]])\n", "Coordinates:\n", " * chain (chain) int64 0 1 2 3\n", " * draw (draw) int64 0 1 2 3 4 5 6 ... 9994 9995 9996 9997 9998 9999\n", " * delta_dim_0 (delta_dim_0) int64 0 1 2 3" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "trace.posterior.delta" ] }, { "cell_type": "markdown", "id": "3f8f1288-d538-40ca-a4c6-8d83d98a8518", "metadata": {}, "source": [ "We just need a way to count each unique combination of predictors. I didn't see an obvious way to do that in xarray, so I'm copying the delta values to a NumPy array, then reshaping to include all draws for each chain along dimension 0, and finally getting the unique vectors of length p along with their counts, which we can use to calculate the probability of each model being selected." ] }, { "cell_type": "code", "execution_count": 9, "id": "7b9f167f-fee5-4c48-b571-f9a74f252fd1", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[0 0 1 1]: prob=0.000175\n", "[0 1 1 1]: prob=0.00025\n", "[1 0 0 1]: prob=0.0112\n", "[1 0 1 1]: prob=0.0578\n", "[1 1 0 0]: prob=0.0394\n", "[1 1 0 1]: prob=0.519\n", "[1 1 1 0]: prob=0.0192\n", "[1 1 1 1]: prob=0.353\n" ] } ], "source": [ "rows = chains * draws\n", "\n", "deltas = trace.posterior.delta.to_numpy()\n", "models, counts = np.unique(\n", " deltas.reshape((rows, p)), axis=0, return_counts=True\n", ")\n", "\n", "for model, count in zip(models, counts):\n", " print(f\"{model}: prob={count/rows:.3}\")" ] }, { "cell_type": "markdown", "id": "072f5589-b133-454f-afdc-32940cf13d85", "metadata": {}, "source": [ "The most chosen models are the same as BUGS: intercept + x0 + x1 + x3 is selected over 50% of the time, intercept + x0 + x1 + x2 + x3 about a third of the time, and so on." ] }, { "cell_type": "code", "execution_count": 10, "id": "ec019ecd-a8e4-4b61-8fb8-6d732d5202c5", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Last updated: Wed Mar 22 2023\n", "\n", "Python implementation: CPython\n", "Python version : 3.11.0\n", "IPython version : 8.9.0\n", "\n", "pytensor: 2.10.1\n", "\n", "pymc : 5.1.2\n", "numpy : 1.24.2\n", "pandas: 1.5.3\n", "arviz : 0.14.0\n", "\n" ] } ], "source": [ "%load_ext watermark\n", "%watermark -n -u -v -iv -p pytensor" ] }, { "cell_type": "code", "execution_count": null, "id": "adf0ac48-61c1-4ffb-a926-bc709feae1b0", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.0" } }, "nbformat": 4, "nbformat_minor": 5 }