diff --git a/module3/exo3/exercice_en.ipynb b/module3/exo3/exercice_en.ipynb
index 0bbbe371b01e359e381e43239412d77bf53fb1fb..ccc2812356fefcefdab74c88bfd68339c1ecc087 100644
--- a/module3/exo3/exercice_en.ipynb
+++ b/module3/exo3/exercice_en.ipynb
@@ -1,5 +1,1901 @@
{
- "cells": [],
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Smoker | \n",
+ " Status | \n",
+ " Age | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " Yes | \n",
+ " Alive | \n",
+ " 21.0 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " Yes | \n",
+ " Alive | \n",
+ " 19.3 | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " No | \n",
+ " Dead | \n",
+ " 57.5 | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " No | \n",
+ " Alive | \n",
+ " 47.1 | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " Yes | \n",
+ " Alive | \n",
+ " 81.4 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Smoker Status Age\n",
+ "0 Yes Alive 21.0\n",
+ "1 Yes Alive 19.3\n",
+ "2 No Dead 57.5\n",
+ "3 No Alive 47.1\n",
+ "4 Yes Alive 81.4"
+ ]
+ },
+ "execution_count": 2,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "import pandas as pd\n",
+ "\n",
+ "df = pd.read_csv(\"../../Subject6_smoking.csv\")\n",
+ "df.head()\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " Smoker Status Age\n",
+ "0 Yes Alive 21.0\n",
+ "1 Yes Alive 19.3\n",
+ "2 No Dead 57.5\n",
+ "3 No Alive 47.1\n",
+ "4 Yes Alive 81.4\n",
+ "Status Alive Dead\n",
+ "Smoker \n",
+ "No 502 230\n",
+ "Yes 443 139\n",
+ "Smoker\n",
+ "No 49.815847\n",
+ "Yes 44.269759\n",
+ "Name: Age, dtype: float64\n",
+ "Smoker Status\n",
+ "No Alive 40.347410\n",
+ " Dead 70.481739\n",
+ "Yes Alive 39.648984\n",
+ " Dead 58.996403\n",
+ "Name: Age, dtype: float64\n"
+ ]
+ }
+ ],
+ "source": [
+ "import pandas as pd\n",
+ "\n",
+ "# Load the dataset\n",
+ "df = pd.read_csv(\"../../Subject6_smoking.csv\")\n",
+ "\n",
+ "# Quick look at the first rows\n",
+ "print(df.head())\n",
+ "\n",
+ "# Summary by smoker status\n",
+ "summary = df.groupby(\"Smoker\")[\"Status\"].value_counts().unstack()\n",
+ "print(summary)\n",
+ "\n",
+ "# Mean age by smoker status\n",
+ "mean_age = df.groupby(\"Smoker\")[\"Age\"].mean()\n",
+ "print(mean_age)\n",
+ "\n",
+ "# Check by smoker *and* status to see the paradox\n",
+ "cross_summary = df.groupby([\"Smoker\", \"Status\"])[\"Age\"].mean()\n",
+ "print(cross_summary)\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "import seaborn as sns\n",
+ "import matplotlib.pyplot as plt\n",
+ "\n",
+ "# Side-by-side boxplots\n",
+ "plt.figure(figsize=(8,5))\n",
+ "sns.boxplot(data=df, x=\"Smoker\", y=\"Age\", hue=\"Status\")\n",
+ "plt.title(\"Age distribution by Smoker status and Survival\")\n",
+ "plt.show()\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "image/png": "iVBORw0KGgoAAAANSUhEUgAAAfIAAAFNCAYAAAD7De1wAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMi4zLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvIxREBQAAIABJREFUeJzt3Xt8VfWd7vHPQ4IFvMutCCra4KXaihirjpdiFYutgm1Pi1Onxo4ex6pAbaczHs/0VDvKsWecjooztXTaGmcqXouiR6lIQa3HWhEpqNgSWwQRAWPlIkgJfM8fa4XZxAR2MCsra+d5v1557az7d+29kmf/fmvvtRQRmJmZWTH1yLsAMzMz23UOcjMzswJzkJuZmRWYg9zMzKzAHORmZmYF5iA3MzMrMAe5dQpJ10j6z/T3AyWtl1TVQeu+TdK3099HSnq9I9abru8USb/rqPWVrHeopJBU3dHrzlJac03edVSCrnQMlP4NfcD13C7puo6oycrnIO+GJM2R9CdJH8pj+xGxNCL2iIgtO5pP0oWSflXG+i6NiH/siNpaBlVEPBURh3XEurMiaR9JP5H0pqR1kn4v6e/zrqujlHsclMzfZQLyg5B0sqT/J2mNpLclPS3puCy21ZF/Q9b5HOTdjKShwClAAGNyLaYDdFSrvuD+BdgDOALYm+R1fTXXinbCr9uOSdoLeBiYDOwHDAauBTbtwrokyf/rK5hf3O7nAuDXwO1AXekESX0lPSRpraTnJF1X2hKSdLikmWnr4HeSvtTWRiQdLOmJtIU4E+hXMm27FlPa4vpDOu8fJZ0v6QjgNuDEtBv+nXTe2yX9QNIjkt4FTmutO0/S1ZLekrRE0vkl4+dIurhkeFtrT9KT6ejfptsc17KrXtIR6TrekfSSpDEl026X9K+S/m+6L89K+shOXo+/lvSGpBWSvpmu58OSNkjqW7LuYyWtltSzlXUcB9wZEX+KiK0R8UpE3FeybEi6TNLitK5/lPQRSc+kr/U9knYrmf+/S2pIX+fpkvZvrfC0xbhM0mnpcJvHR2uvWyvra89x8FlJL6T1L5N0Tcmqml/Hd9JlTlTJqZ10+Z0eg23s8yfS5+2d9DW7tcVzF5IuTZ/rP6XHg9JpVZJuTI/LPwCfbW0bqUMBImJqRGyJiI0R8VhELEjXtbP9mSPpeklPAxuAqyXNbbEvV0qaXvL6XJf+vkjS2SXzVac1j0iH71XS+7NG0pOSjtzBflhniAj/dKMfoAG4DDgW2AwMLJl2V/rTB/gosAz4VTpt93T4q0A1MAJ4Cziyje08A3wf+BBwKrAO+M902lCSHoHqdL1rgcPSaYOa1wlc2Lz9kvXeDqwBTiJ5I9orHXddOn0k0FSy7U8C75asfw5wccn6tttGWldNyfBI4PX0957p83c1sBvwqXS/Diup7W3gE+m+/Qy4q43np/k5mJo+Bx8DVgNnpNMfAb5WMv+/AJPbWNe/Ay+lr82wVqYHMB3YCziSpFU3CziEpAX/MlCXzvup9HUdkT5/k4EnWz4/wKfT4+ET5Rwfrb1uLWps73EwMn3OegAfB1YC57Y8vkrmv4b0+GvPMdjKc3kscEK63FBgEfD1Fs/Pw8A+wIHpazo6nXYp8ApwAEkre3bLOkvWsxfQCNQDZwH7tpje5v6UHOdL09e7On2d15UeH8BzwHklr0/z39D/An5WMt9ngVdKhv8a2DM9Pm4C5rf4+7wu7/9z3e3HLfJuRNLJwEHAPRHxPEn365fTaVXAF4DvRMSGiHiZ5J9Is7OBJRHx04hoioh5wP3Af2tlOweStBK/HRGbIuJJ4KEdlLYVOEpS74hYEREv7WRXHoyIpyNpfb7XxjzN234C+L9Am70H7XACSRf2DRHx54j4Jck/7b8smefnEfGbiGgiCfLhO1nntRHxbkQsBH5asq564K9g22vzl8B/tLGO8em2rgBeTlvTZ7WY53sRsTZ9bl8EHouIP0TEGuBR4Jh0vvOBn0TEvIjYBPwPktbw0JJ1fRGYAnwmIn6Tjivn+NjZ61b2cRARcyJiYbquBSRviD7Z1vxlKGvbEfF8RPw63cclwA9b2e4NEfFORCwlCevmY+BLwE0RsSwi3gb+9w72by1wMkk4/whYnfaODGzHPt0eES+lta4BHiQ9viQNAw4neYPX0p3AGEl90uEvp+Oaa/tJRKxLj49rgKMl7d2OuqyDOci7lzqSf+BvpcN38l/d6/1J3rkvK5m/9PeDgOPTLsV30i7O84EPt7Kd/YE/RcS7JeNea62gdJ5xJK2VFWm39OE72Y9lO5ne2rZb7R5up/2BZRGxtcW6B5cMv1ny+waS4N+R0n0prfNB4KOSDgFGAWtKQnM7kXS7ToqIY4G+wD3AvZL2K5ltZcnvG1sZbq5zf0peq4hYT9IyLN3Hr5O8GVxYMq6c46PN1629x4Gk4yXNTk83rEmX69fW/DvSnm1LOlTSw2nX8lpgUivbbesY2J/3v947qmtRRFwYEUOAo9Llbyp3v3j/830n//VG8cvAAxGxoZXtNpD0NJyThvmYdNnm0wM3SHo13f8l6WK79Nxbx3CQdxOSepO0CD6Z/hN6E7iS5N300SRdgE3AkJLFDij5fRnwRETsU/KzR0R8rZXNrQD2lbR7ybgD26otIn4REaNIujRfIWmBQNIaaXWRtvcU2tj2G+nv75KcOmjW2huRtrwBHKDtPzh0ILC8HetoqfQ53lZn2mK9hyQMv0LbrfHtpC25SSTdxQfvQj1vkIQyAOnz2Jft9/GLwLmSvl4yrpzjY4evWzuPgztJWpMHRMTeJOfRtYP5d/i672DbLf0gnT4sIvYiOc2iNuZtaQXvf73LEhGvkHRbH5WOKuc4bvk8PAb0kzScJNDvfP8i20xN5xkLvJyGOyRvAMYCZ5B01w9Nx5f7HFgGHOTdx7nAFpJz38PTnyOAp4ALIvkq2M+BayT1SVskF5Qs/zBwqKSvSOqZ/hyXfhhpOxHxGjAXuFbSbmmX/jmtFSVpoKQxaWBsAtandULSahxS+mGidmje9ikk3b73puPnA59P97EGuKjFcitJzh235lmSf6B/l+7/yHS/7tqF+pp9O63lSJLzy3eXTLuD5PzwGOA/W1kWAEnfTl+L3ST1AiYC7wC78v33O4GvShqu5OuJk4Bn027kZm8ApwMTJF2Wjiv7+GhjH9p7HOwJvB0R70n6BOkpotRqkq7y0tdxPnCqkmsY7E1yyqCcbbe0J8n59PXp30hrb2Tbcg/JczZE0r7AVW3NqOSDg9+UNCQdPoAkWH+9s/1pS3q65z7gn0jO0c/cwex3AWeS7F9p4O9J8hw1kryRmLSz7Vr2HOTdRx3w00i+w/1m8w9wK3B++mnXK0jeZb9J0gKcSvp1l4hYR/KHfR7JP/I3ge+RfOClNV8Gjif58Nd3SEKpNT2Ab6brfJvkfGNzOPyS5ENcb0p6q/XFW/Um8Kd0nT8DLk1bNJB8aOzPJOFQn04vdQ1Qn3YPb3dePSL+TBKqZ5F8kOvfSN4EvcKue4LkA3SzgBsj4rGS7T1NEkjzWgRpS0Fyfv0tkn0eBXw27RZvl4iYBXyb5Pz2CuAjJK95y/mWkoT530u6eBeOj5baexxcBnxX0jqSD2fdU1LbBuB64On0dTwhImaSvElaADxP8sajnG239Lckx/Y6klb73W3M15ofAb8AfgvMI3nj3JZ1JH8/zyr5lP+vST7b8M10H3e0PztyJ0lr+t402FsVEStIPrD6F7z/zeVrJD00L/NfbywsR4rYWS+ldVeSvgd8OCLqdjqzZULSL0m+WvbveddiZl2TW+S2Tdqd93ElPkHS7Twt77q6KyVX8RpB+1p9ZtbNFPoShtbh9iTpTt8fWAX8M8mnp62TSaon+VzDxLTb2sysVe5aNzMzKzB3rZuZmRWYg9zMzKzACnGOvF+/fjF06NC8yzAzM+sUzz///FsR0b+ceQsR5EOHDmXu3Lk7n9HMzKwCSNrhJXxLuWvdzMyswBzkZmZmBeYgNzMzKzAHuZmZWYE5yM3MzArMQW5mZlZgmQa5pImSXpT0kqSvp+P2kzRT0uL0cd8sazAzM6tkmQW5pKOA/w58AjgaOFvSMOAqYFZEDCO5B/NVWdVgZmZW6bJskR8B/DoiNqQ3sH8C+BwwFqhP52m+w5OZmZntgiyv7PYicL2kvsBG4DPAXGBgRKwAiIgVkgZkWIOZmXWwyZMn09DQ0KHrXL58OQCDBw/u0PUC1NTUMH78+A5fb1eRWZBHxCJJ3wNmAuuB3wJN5S4v6RLgEoADDzwwkxrNrLJkETDgkOkMGzduzLuEwsr0WusR8WPgxwCSJgGvAyslDUpb44OAVW0sOwWYAlBbW+ubpptZbhwy28vijcfEiRMBuPnmmzt83ZUu0yCXNCAiVkk6EPg8cCJwMFAH3JA+PphlDVaZ3PKy1mT1/DpkrCvL+u5n96fnyDcDl0fEnyTdANwj6SJgKfDFjGswK5tbXmZWNFl3rZ/SyrhG4PQst2uVzy0vM7OEr+xmZmZWYA5yMzOzAnOQm5mZFZiD3MzMrMAc5GZmZgXmIDczMyswB7mZmVmBOcjNzMwKzEFuZmZWYA7yAmlsbGTChAk0NjbmXYqZmXURDvICqa+vZ+HChdxxxx15l2JmZl2Eg7wgGhsbmTFjBhHBjBkz3Co3MzPAQV4Y9fX1bN26FYAtW7a4VW5mZoCDvDAef/xxmpqaAGhqamLmzJk5V2RmZl2Bg7wgzjjjDKqrk7vOVldXM2rUqJwrMjOzrsBBXhB1dXX06JG8XFVVVVxwwQU5V2RmZl2Bg7wg+vbty+jRo5HE6NGj6du3b94lmZlZF1CddwFWvrq6OpYsWeLWuJmZbeMgL5C+fftyyy235F2GmZl1Ie5aNzMzKzAHuZmZWYE5yAvE11o3M7OWfI68QEqvtX7llVfmXY6ZdXGTJ0+moaEh7zLK0lznxIkTc66kfDU1NYwfPz7vMhzkRdHyWusXXHCBv4JmZjvU0NDA4pde4MA9tuRdyk7ttjnpIN702tycKynP0vVVeZewjYO8IFq71rpb5Wa2MwfusYWrR6zNu4yKM2neXnmXsI3PkReEr7VuZmatyTTIJV0p6SVJL0qaKqmXpP0kzZS0OH3cN8saKoWvtW5mZq3JLMglDQYmALURcRRQBZwHXAXMiohhwKx02HbC11o3M7PWZN21Xg30llQN9AHeAMYC9en0euDcjGuoCL7WupmZtSazII+I5cCNwFJgBbAmIh4DBkbEinSeFcCA1paXdImkuZLmrl69OqsyC6Wuro6Pfexjbo2bmdk2WXat70vS+j4Y2B/YXdJflbt8REyJiNqIqO3fv39WZRZK87XW3Ro3M7NmWXatnwH8MSJWR8Rm4OfAXwArJQ0CSB9XZViDmZlZRcsyyJcCJ0jqI0nA6cAiYDpQl85TBzyYYQ1mZmYVLbMLwkTEs5LuA+YBTcALwBRgD+AeSReRhP0Xs6rBzMys0mV6ZbeI+A7wnRajN5G0zs3MzOwD8pXdzMzMCsxBbmZmVmAOcjMzswJzkJuZmRWYg9zMzKzAHORmZmYF5iA3MzMrMAe5mZlZgTnIzczMCsxBbmZmVmAOcjMzswJzkJuZmRWYg9zMzKzAHORmZmYFlultTLuryZMn09DQ0OHrXb58OQCDBw/u8HXX1NQwfvz4Dl+vmZlly0FeIBs3bsy7BDMz62Ic5BnIqmU7ceJEAG6++eZM1m9mlWX58uW8u66KSfP2yruUivPauip2T3tJ8+Zz5GZmZgXmFrmZWYUaPHgwm5pWcPWItXmXUnEmzduLD2XweaVd4Ra5mZlZgTnIzczMCsxBbmZmVmAOcjMzswJzkJuZmRWYg9zMzKzAMgtySYdJml/ys1bS1yXtJ2mmpMXp475Z1WBmZlbpMgvyiPhdRAyPiOHAscAGYBpwFTArIoYBs9JhMzMz2wWd1bV+OvBqRLwGjAXq0/H1wLmdVIOZmVnF6awgPw+Ymv4+MCJWAKSPAzqpBjMzs4qTeZBL2g0YA9zbzuUukTRX0tzVq1dnU5yZmVnBdUaL/CxgXkSsTIdXShoEkD6uam2hiJgSEbURUdu/f/9OKNPMzKx4OiPI/5L/6lYHmA7Upb/XAQ92Qg1mZmYVKdMgl9QHGAX8vGT0DcAoSYvTaTdkWYOZmVkly/Q2phGxAejbYlwjyafYzczM7APyld3MzMwKzEFuZmZWYA5yMzOzAsv0HLnZ5MmTaWhoyLuMsjXXOnHixJwrKV9NTQ3jx4/Puwwzy4mD3DLV0NDA4pde4MA9tuRdSll225x0Um16bW7OlZRn6fqqvEsws5w5yC1zB+6xhatHrM27jIo0ad5eeZdgZjnzOXIzM7MCc5CbmZkVmIPczMyswBzkZmZmBeYgNzMzKzAHuZmZWYE5yM3MzArMQW5mZlZgDnIzM7MCc5CbmZkVmIPczMyswBzkZmZmBeabppiZVbCl66sKcXOdlRuSduXAPltzrqQ8S9dXMSzvIlIOcjOzClVTU5N3CWX7c0MDAB86qBg1D6PrPL8OcjOzCjV+/Pi8SyjbxIkTAbj55ptzrqR4fI7czMyswBzkZmZmBeYgNzMzKzAHuZmZWYFlGuSS9pF0n6RXJC2SdKKk/STNlLQ4fdw3yxrMzMwqWdYt8puBGRFxOHA0sAi4CpgVEcOAWemwmZmZ7YLMglzSXsCpwI8BIuLPEfEOMBaoT2erB87NqgYzM7NKl2WL/BBgNfBTSS9I+ndJuwMDI2IFQPo4IMMazMzMKlqWQV4NjAB+EBHHAO/Sjm50SZdImitp7urVq7Oq0czMrNCyvLLb68DrEfFsOnwfSZCvlDQoIlZIGgSsam3hiJgCTAGora2NDOu0DC1fvpx31xXjWs9F9Nq6KnZfvjzvMswsR5m1yCPiTWCZpMPSUacDLwPTgbp0XB3wYFY1mJmZVbqsr7U+HviZpN2APwBfJXnzcI+ki4ClwBczrsFyNHjwYDY1reDqEWvzLqUiTZq3Fx8aPDjvMswsR5kGeUTMB2pbmXR6lts1MzPrLnxlNzMzswJzkJuZmRWYg9zMzKzAHORmZmYF5iA3MzMrMAe5mZlZgTnIzczMCsxBbmZmVmAOcjMzswLbaZBLGijpx5IeTYc/ml5e1czMzHJWTov8duAXwP7p8O+Br2dVkJmZmZWvnCDvFxH3AFsBIqIJ2JJpVWZmZlaWcoL8XUl9gQCQdAKwJtOqzMzMrCzl3P3sGyT3EP+IpKeB/sB/y7QqMzMzK8tOgzwi5kn6JHAYIOB3EbE588rMzMxsp3Ya5JI+32LUoZLWAAsjYlU2ZZmZmVk5yulavwg4EZidDo8Efk0S6N+NiP/IqDYzMzPbiXKCfCtwRESshOR75cAPgOOBJwEHuZmZWU7K+dT60OYQT60CDo2ItwGfKzczM8tROS3ypyQ9DNybDn8BeFLS7sA7mVVmZmZmO1VOkF8OfB44OR3+DTAoIt4FTsuqMDMzM9u5cr5+FpJeJTkn/iXgj8D9WRdmZpVr8uTJNDQ05F1G2ZprnThxYs6VlK+mpobx48fnXYZ1gjaDXNKhwHnAXwKNwN2AIsKtcDP7QBoaGpj/4iK29Nkv71LK0uPPAcDzf1i5kzm7hqoNb+ddgnWiHbXIXwGeAs6JiAYASVd2SlVmVvG29NmPjYd/Ju8yKlLvVx7JuwTrRDv61PoXgDeB2ZJ+JOl0kiu7mZmZWRfRZpBHxLSIGAccDswBrgQGSvqBpDM7qT4zMzPbgZ1+jzwi3o2In0XE2cAQYD5wVTkrl7RE0kJJ8yXNTcftJ2mmpMXp474faA/MzMy6sXIuCLNNRLwdET+MiE+1Y7HTImJ4RNSmw1cBsyJiGDCLMt8UmJmZ2fu1K8g7yFigPv29Hjg3hxrMzMwqQjkXhPkgAnhMUgA/jIgpwMCIWAEQESskDWhtQUmXAJcAHHjggZkU5++yZm/58uX0y7sIM7MKlnWQnxQRb6RhPVPSK+UumIb+FIDa2trIojh/lzVbVRveZo9ePaFn3pWYmVWuTIM8It5IH1dJmgZ8AlgpaVDaGh9EchOW3Pi7rNnp/cojsHVd3mWYmVW0zM6RS9pd0p7NvwNnAi8C04G6dLY64MGsajAzM6t0WbbIBwLTJDVv586ImCHpOeAeSRcBS4EvZliDmZlZRcssyCPiD8DRrYxvBE7PartmZmbdSR5fPzMzM7MO4iA3MzMrMAe5mZlZgTnIzczMCsxBbmZmVmAOcjMzswJzkJuZmRWYg9zMzKzAHORmZmYF5iA3MzMrMAe5mZlZgWV9P3Izlq6vYtK8vfIuoywrNyTvbQf22ZpzJeVZur6KYXkXYWa5cpBbpnr37s3gmpq8yyjbnxsaAPjQQcWoeRhQU6Dn18w6noPcMjV48GBuvvnmvMso28SJEwEKVbOZdW8+R25mZlZgDnIzM7MCc5CbmZkVmIPczMyswBzkZmZmBeYgNzMzKzAHuZmZWYE5yM3MzArMQW5mZlZgDnIzM7MCc5CbmZkVWOZBLqlK0guSHk6H95M0U9Li9HHfrGswMzOrVJ3RIp8ILCoZvgqYFRHDgFnpsJmZme2CTINc0hDgs8C/l4weC9Snv9cD52ZZg5mZWSXLukV+E/B3wNaScQMjYgVA+jgg4xrMzMwqVmZBLulsYFVEPL+Ly18iaa6kuatXr+7g6szMzCpDli3yk4AxkpYAdwGfkvSfwEpJgwDSx1WtLRwRUyKiNiJq+/fvn2GZZmZmxZVZkEfE/4iIIRExFDgP+GVE/BUwHahLZ6sDHsyqBjMzs0qXx/fIbwBGSVoMjEqHzczMbBdUd8ZGImIOMCf9vRE4vTO2uzPLly+nasMaer/ySN6lVKSqDY0sX96UdxlmZhXNV3YzMzMrsE5pkXdVgwcP5s1N1Ww8/DN5l1KRer/yCIMHD8y7DOuC3BuWLfeGdS9ukZuZmRVYt26Rm1k+3BuWLfeGdS9ukZuZmRWYg9zMzKzAHORmZmYF5iA3MzMrMAe5mZlZgTnIzczMCsxfPzMzs3aZPHkyDQ0NHbrO5vVNnDixQ9cLUFNTw/jx4zt8vV2Fg9zMzHLXu3fvvEsoLAe5mZm1SyW3bovI58jNzMwKzEFuZmZWYA5yMzOzAnOQm5mZFZiD3MzMrMAc5GZmZgXmIDczMyswB7mZmVmBOcjNzMwKzEFuZmZWYA5yMzOzAnOQm5mZFVhmQS6pl6TfSPqtpJckXZuO30/STEmL08d9s6rBzMys0mXZIt8EfCoijgaGA6MlnQBcBcyKiGHArHTYzMy6scbGRiZMmEBjY2PepRROZkEeifXpYM/0J4CxQH06vh44N6sazMysGOrr61m4cCF33HFH3qUUTqbnyCVVSZoPrAJmRsSzwMCIWAGQPg7IsgYzM+vaGhsbefTRR4kIHn30UbfK2ynTII+ILRExHBgCfELSUeUuK+kSSXMlzV29enV2RZqZWa7q6+tpamoCYPPmzW6Vt1N1Z2wkIt6RNAcYDayUNCgiVkgaRNJab22ZKcAUgNra2uiMOs26os2bN/P666/z3nvv5V1Kh/nSl77EuU1biJ693zctAl5/5z1++sxS1m3akkN11tlmzpxJRPJvPiJ47LHHuPLKK3OuqjgyC3JJ/YHNaYj3Bs4AvgdMB+qAG9LHB7OqwawSvP766+y5554MHToUSXmX0yGWLl3Khk2b2dpr7/dNiwj267uOrwK3zPlj5xdnnW7gwIEsWbJku2ErX5Yt8kFAvaQqki78eyLiYUnPAPdIughYCnwxwxp2qmrD2/R+5ZE8Syhbj/fWArC11145V1Keqg1vA/6D/KDee++9igrxnZHEbn32ZMg+vfIuxTrJm2++ucNh27HMgjwiFgDHtDK+ETg9q+22R01NTd4ltEtDwzoAag4pSjgOLNxz3FV1lxBvJolutsvd2oc//OHtWuQf/vCH8yumgDrlHHlXNX78+LxLaJeJEycCcPPNN+dciRXN9ddfz5133klVVRU9evTghz/8Ic888wyXXHIJffr02eGyN910U1nzme2qlStX7nDYdsyXaDWrcM888wwPP/ww8+bNY8GCBTz++OMccMAB3HTTTWzYsGGny5c7n9muGjVq1LZeJ0mceeaZOVdULA5yswq3YsUK+vXrx4c+9CEA+vXrx3333ccbb7zBaaedxmmnnQbA1772NWpraznyyCP5zne+A8Att9zyvvn22GOPbeu+7777uPDCCwG49957Oeqoozj66KM59dRTO3EPrejq6uro2bMnAD179uSCCy7IuaJicZCbVbgzzzyTZcuWceihh3LZZZfxxBNPMGHCBPbff39mz57N7NmzgaT7fe7cuSxYsIAnnniCBQsWtDpfW7773e/yi1/8gt/+9rdMnz69M3bNKkTfvn0ZPXo0kjjrrLPo27dv3iUVioPcrMLtsccePP/880yZMoX+/fszbtw4br/99vfNd8899zBixAiOOeYYXnrpJV5++eV2beekk07iwgsv5Ec/+hFbtvj739Y+Y8aMoU+fPpxzzjl5l1I4DnKzbqCqqoqRI0dy7bXXcuutt3L//fdvN/2Pf/wjN954I7NmzWLBggV89rOfbfMCNKWfoC+d57bbbuO6665j2bJlDB8+3JfZtHaZPn06GzZs4KGHHsq7lMLp1p9aN+sOfve739GjRw+GDRsGwPz58znooINYsmQJ69ato1+/fqxdu5bdd9+dvffem5UrV/Loo48ycuRIAPbcc89t80FysY5FixZx2GGHMW3aNPbcc08AXn31VY4//niOP/54HnroIZYtW7bjLtKtW+jx3po2J2vzxi5zjQdfwyFbjY2NzJgxg4hgxowZXHDBBe5ebwcHuVmFW79+PePHj+edd96hurqampoapkyZwtSpUznrrLMYNGgQs2fP5phjjuHII4/kkEMO4aSTTtq2/CWXXLLdfDfccANnn302BxxwAEcddRTr1yc3OfzWt77F4sWLiQhOP/10jj766DZrav7g3Y7sVl3FsV3kmgm+hkO26uvrt52OaWpq4o477vAlWttBzde37cpqa2tj7ty5eZeRO3+PPHtd8TletGgRRxxxRN5ldLpC2dTIAAAL+0lEQVSutN9d8bioJJ/5zGe2+4pjnz59eOSRrtEbkxdJz0dEbTnz+hy5mZnl6uSTT95u+JRTTsmpkmJykJuZWa662yWIO5qD3MzMcvXUU0/tcNh2zEFuZma5OuOMM6iuTj57XV1dzahRo3KuqFgc5GZmlqu6ujp69EjiqKqqypdobScHuZmZ5ar0Eq2jR4/2d8jbyUFuZmWZNm0aknjllVcAWLJkCUcddRQAc+fOZcKECXmWZwVXV1fHxz72MbfGd4EvCGNWMFd841useuvtDlvfgH77cev3/2mn802dOpWTTz6Zu+66i2uuuWa7abW1tdTWlvWVV7NW9e3bl1tuuSXvMgrJQW5WMKveeptXB36y41a48omdzrJ+/XqefvppZs+ezZgxY94X5HPmzOHGG29k+vTpHHLIIcyfP5999tkHgJqaGp5++ml69OjBpZdeytKlS4HkPuelV5Azs13jrnUz26kHHniA0aNHc+ihh7Lffvsxb968Vufr0aMHY8eOZdq0aQA8++yzDB06lIEDBzJx4kSuvPJKnnvuOe6//34uvvjiztwFs4rlIDeznZo6dSrnnXceAOeddx5Tp05tc95x48Zx9913A3DXXXcxbtw4AB5//HGuuOIKhg8fzpgxY1i7di3r1q3LvnizCueudSukyZMn09DQ0OHrbV5n87W1O1JNTQ3jx4/v8PVmrbGxkV/+8pe8+OKLSGLLli1I4rLLLmt1/hNPPJGGhgZWr17NAw88wD/8wz8AsHXrVp555hl69+7dmeWbVTy3yM1K9O7d20HTwn333ccFF1zAa6+9xpIlS1i2bBkHH3wwr7/+eqvzS+Jzn/sc3/jGNzjiiCO2fZXozDPP5NZbb9023/z58zulfrNK5xa5FVIRW7ZFNXXqVK666qrtxn3hC19g0qRJbS4zbtw4jjvuOG6//fZt42655RYuv/xyPv7xj9PU1MSpp57KbbfdllXZZt2Gg9ysYAb026+sT5q3a307MGfOnPeNmzBhwnbfGx85ciQjR47cNlxbW0vLWyT369dv27lzM+s4DnKzginnO99m1n34HLmZmeWusbGRCRMm0NjYmHcphZNZkEs6QNJsSYskvSRpYjp+P0kzJS1OH/fNqgYzMyuG+vp6Fi5cyB133JF3KYWTZYu8CfhmRBwBnABcLumjwFXArIgYBsxKh83MrJtqbGxkxowZRAQzZsxwq7ydMgvyiFgREfPS39cBi4DBwFigPp2tHjg3qxrMzKzrq6+vZ+vWrQBs2bLFrfJ26pRz5JKGAscAzwIDI2IFJGEPDOiMGszMrGt6/PHHaWpqAqCpqYmZM2fmXFGxZB7kkvYA7ge+HhFr27HcJZLmSpq7evXq7Ao0s52qqqpi+PDhHHnkkRx99NF8//vf39aC+qCuueYabrzxxg5ZlxXTGWecQXV18iWq6upqRo0alXNFxZLp188k9SQJ8Z9FxM/T0SslDYqIFZIGAataWzYipgBTAGpra6O1ecy6o6u/eQVr3lrZYevbu99AJv3zrTucp3fv3tuuxLZq1Sq+/OUvs2bNGq699toOq8O6r7q6OmbMmAEkbxp9T/L2ySzIJQn4MbAoIr5fMmk6UAfckD4+mFUNZpVozVsr+fuPvNJh6/veq+2bf8CAAUyZMoXjjjuOa665hq1bt3LVVVcxZ84cNm3axOWXX87f/M3fsH79esaOHcuf/vQnNm/ezHXXXcfYsWMBuP7667njjjs44IAD6N+/P8cee2yH7Y8VT9++fRk9ejQPPfQQo0eP3nZZXytPli3yk4CvAAslNV9U+WqSAL9H0kXAUuCLGdZgZhk45JBD2Lp1K6tWreLBBx9k77335rnnnmPTpk2cdNJJnHnmmRxwwAFMmzaNvfbai7feeosTTjiBMWPGMG/ePO666y5eeOEFmpqaGDFihIPcqKurY8mSJW6N74LMgjwifgWojcmnZ7VdM+sczZdgfeyxx1iwYAH33XcfAGvWrGHx4sUMGTKEq6++mieffJIePXqwfPlyVq5cyVNPPcXnPvc5+vTpA8CYMWNy2wfrOvr27cstt9ySdxmF5Eu0ZsC32LRK94c//IGqqioGDBhARDB58mQ+/elPbzfP7bffzurVq3n++efp2bMnQ4cO5b333gOSO6RlwX971h35Eq0F4ltsWlewevVqLr30Uq644gok8elPf5of/OAHbN68GYDf//73vPvuu6xZs4YBAwbQs2dPZs+ezWuvvQbAqaeeyrRp09i4cSPr1q3joYceynN3yuK/PevK3CLPgN9dW6XZuHEjw4cPZ/PmzVRXV/OVr3yFb3zjGwBcfPHFLFmyhBEjRhAR9O/fnwceeIDzzz+fc845h9raWoYPH87hhx8OwIgRIxg3bhzDhw/noIMO4pRTTumwOv23Z92RWt5qsCuqra2NuXPn5l2GWS4WLVrEEUccsW04j6+f5aHlfpt1J5Kej4jacuZ1i9ysYLpi6JpZfnyO3MzMrMAc5GZmZgXmIDcrgCJ8lqUjdbf9NfsgHORmXVyvXr1obGzsNuEWETQ2NtKrV6+8SzErBH/YzayLGzJkCK+//jrd6S6AvXr1YsiQIXmXYVYIDnKzLq5nz54cfPDBeZdhZl2Uu9bNzMwKzEFuZmZWYA5yMzOzAivEJVolrQZey7sO22X9gLfyLsKsG/LfXnEdFBH9y5mxEEFuxSZpbrnXDDazjuO/ve7BXetmZmYF5iA3MzMrMAe5dYYpeRdg1k35b68b8DlyMzOzAnOL3MzMrMAc5PaBKPErSWeVjPuSpBl51mXWnUgKSf9cMvy3kq7JsSTrRA5y+0AiOTdzKfB9Sb0k7Q5cD1yeb2Vm3com4POS+uVdiHU+B7l9YBHxIvAQ8PfAd4A7IuJVSXWSfiNpvqR/k9RDUrWk/5C0UNKLkibkW71ZRWgi+WDblS0nSDpI0ixJC9LHAzu/PMuS735mHeVaYB7wZ6BW0lHA54C/iIgmSVOA84BXgX4R8TEASfvkVbBZhflXYIGk/9Ni/K0kb67rJf01cAtwbqdXZ5lxkFuHiIh3Jd0NrI+ITZLOAI4D5koC6A0sA34BHCbpZuAR4LG8ajarJBGxVtIdwARgY8mkE4HPp7//B9Ay6K3gHOTWkbamPwACfhIR3245k6SPA2eR/MP5AnBJp1VoVtluIukZ++kO5vF3jiuMz5FbVh4HvtT84RtJfSUdKKk/yfUL7iU5nz4izyLNKklEvA3cA1xUMvr/kZzWAjgf+FVn12XZcovcMhERCyVdCzwuqQewmeTT7VuAHyvpbw+SD8iZWcf5Z+CKkuEJwE8kfQtYDXw1l6osM76ym5mZWYG5a93MzKzAHORmZmYF5iA3MzMrMAe5mZlZgTnIzczMCsxBblahJP1PSS+l19ieL+n4D7i+kZIe7qj6zKxj+HvkZhVI0onA2cCI9JK5/YDdcqynOiKa8tq+WSVzi9ysMg0C3oqITQAR8VZEvCFpiaRJkp6RNFfSCEm/kPSqpEth2z3m/ym9O91CSeNarlzScZJekHSIpN0l/UTSc+m4sek8F0q6V9JD+Jr6Zplxi9ysMj0G/C9Jvye5XO7dEfFEOm1ZRJwo6V+A24GTgF7AS8BtJDfYGA4cDfQDnpP0ZPOKJf0FMBkYGxFLJU0CfhkRf53eze43kh5PZz8R+Hh66VAzy4CD3KwCRcR6SccCpwCnAXdLuiqdPD19XAjsERHrgHWS3kuD+GRgakRsAVZKeoLkTnZrgSNI7nt9ZkS8ka7nTGCMpL9Nh3sBzfe8nukQN8uWg9ysQqVBPAeYI2khUJdO2pQ+bi35vXm4muTOdW1ZQRLUxwDNQS7gCxHxu9IZ0w/XvfsBdsHMyuBz5GYVSNJhkoaVjBoOvFbm4k8C4yRVpXerOxX4TTrtHeCzwCRJI9NxvwDGpzfCQdIxH7R+Myufg9ysMu0B1Et6WdIC4KPANWUuOw1YAPwW+CXwdxHxZvPEiFgJnAP8a9rq/kegJ7BA0ovpsJl1Et/9zMzMrMDcIjczMyswB7mZmVmBOcjNzMwKzEFuZmZWYA5yMzOzAnOQm5mZFZiD3MzMrMAc5GZmZgX2/wFVt3W7VP6QsQAAAABJRU5ErkJggg==\n",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {
+ "needs_background": "light"
+ },
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "%matplotlib inline\n",
+ "import seaborn as sns\n",
+ "import matplotlib.pyplot as plt\n",
+ "\n",
+ "plt.figure(figsize=(8,5))\n",
+ "sns.boxplot(data=df, x=\"Smoker\", y=\"Age\", hue=\"Status\")\n",
+ "plt.title(\"Age distribution by Smoker status and Survival\")\n",
+ "plt.show()\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Status Alive Dead Mortality Rate\n",
+ "Smoker \n",
+ "No 502 230 0.314208\n",
+ "Yes 443 139 0.238832\n"
+ ]
+ }
+ ],
+ "source": [
+ "summary = df.groupby(\"Smoker\")[\"Status\"].value_counts().unstack()\n",
+ "summary[\"Mortality Rate\"] = summary[\"Dead\"] / summary.sum(axis=1)\n",
+ "print(summary)\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Status Alive Dead Mortality Rate\n",
+ "AgeGroup Smoker \n",
+ "18–34 No 212 6 0.027523\n",
+ " Yes 172 5 0.028249\n",
+ "34–54 No 180 19 0.095477\n",
+ " Yes 196 41 0.172996\n",
+ "55–64 No 81 40 0.330579\n",
+ " Yes 64 51 0.443478\n",
+ "65+ No 28 165 0.854922\n",
+ " Yes 7 42 0.857143\n"
+ ]
+ }
+ ],
+ "source": [
+ "bins = [18, 34, 54, 64, 120]\n",
+ "labels = [\"18–34\", \"34–54\", \"55–64\", \"65+\"]\n",
+ "df[\"AgeGroup\"] = pd.cut(df[\"Age\"], bins=bins, labels=labels, right=True)\n",
+ "\n",
+ "age_summary = df.groupby([\"AgeGroup\", \"Smoker\"])[\"Status\"].value_counts().unstack()\n",
+ "age_summary[\"Mortality Rate\"] = age_summary[\"Dead\"] / age_summary.sum(axis=1)\n",
+ "print(age_summary)\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [
+ {
+ "ename": "ValueError",
+ "evalue": "Neither the `x` nor `y` variable appears to be numeric.",
+ "output_type": "error",
+ "traceback": [
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+ "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)",
+ "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0mplt\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfigure\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfigsize\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m8\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;36m5\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 6\u001b[0;31m \u001b[0msns\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mbarplot\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdf\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mx\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m\"Smoker\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdf\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"Status\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m==\u001b[0m\u001b[0;34m\"Dead\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mestimator\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mlambda\u001b[0m \u001b[0mx\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0msum\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mx\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m/\u001b[0m\u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mx\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 7\u001b[0m \u001b[0mplt\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mylabel\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"Mortality Rate\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 8\u001b[0m \u001b[0mplt\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtitle\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"Overall mortality rate by smoking status\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+ "\u001b[0;32m/opt/conda/lib/python3.6/site-packages/seaborn/categorical.py\u001b[0m in \u001b[0;36mbarplot\u001b[0;34m(x, y, hue, data, order, hue_order, estimator, ci, n_boot, units, orient, color, palette, saturation, errcolor, errwidth, capsize, dodge, ax, **kwargs)\u001b[0m\n\u001b[1;32m 2957\u001b[0m \u001b[0mestimator\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mci\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mn_boot\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0munits\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2958\u001b[0m \u001b[0morient\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcolor\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mpalette\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msaturation\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 2959\u001b[0;31m errcolor, errwidth, capsize, dodge)\n\u001b[0m\u001b[1;32m 2960\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2961\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0max\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+ "\u001b[0;32m/opt/conda/lib/python3.6/site-packages/seaborn/categorical.py\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(self, x, y, hue, data, order, hue_order, estimator, ci, n_boot, units, orient, color, palette, saturation, errcolor, errwidth, capsize, dodge)\u001b[0m\n\u001b[1;32m 1594\u001b[0m \u001b[0;34m\"\"\"Initialize the plotter.\"\"\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1595\u001b[0m self.establish_variables(x, y, hue, data, orient,\n\u001b[0;32m-> 1596\u001b[0;31m order, hue_order, units)\n\u001b[0m\u001b[1;32m 1597\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mestablish_colors\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcolor\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mpalette\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msaturation\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1598\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mestimate_statistic\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mestimator\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mci\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mn_boot\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+ "\u001b[0;32m/opt/conda/lib/python3.6/site-packages/seaborn/categorical.py\u001b[0m in \u001b[0;36mestablish_variables\u001b[0;34m(self, x, y, hue, data, orient, order, hue_order, units)\u001b[0m\n\u001b[1;32m 152\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 153\u001b[0m \u001b[0;31m# Figure out the plotting orientation\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 154\u001b[0;31m \u001b[0morient\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0minfer_orient\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mx\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0morient\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 155\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 156\u001b[0m \u001b[0;31m# Option 2a:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+ "\u001b[0;32m/opt/conda/lib/python3.6/site-packages/seaborn/categorical.py\u001b[0m in \u001b[0;36minfer_orient\u001b[0;34m(self, x, y, orient)\u001b[0m\n\u001b[1;32m 355\u001b[0m \u001b[0;32melif\u001b[0m \u001b[0mis_not_numeric\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0my\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 356\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mis_not_numeric\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mx\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 357\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mValueError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mno_numeric\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 358\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 359\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0;34m\"h\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+ "\u001b[0;31mValueError\u001b[0m: Neither the `x` nor `y` variable appears to be numeric."
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "import seaborn as sns\n",
+ "import matplotlib.pyplot as plt\n",
+ "%matplotlib inline\n",
+ "\n",
+ "plt.figure(figsize=(8,5))\n",
+ "sns.barplot(data=df, x=\"Smoker\", y=(df[\"Status\"]==\"Dead\"), estimator=lambda x: sum(x)/len(x))\n",
+ "plt.ylabel(\"Mortality Rate\")\n",
+ "plt.title(\"Overall mortality rate by smoking status\")\n",
+ "plt.show()\n",
+ "\n",
+ "plt.figure(figsize=(10,6))\n",
+ "sns.barplot(data=df, x=\"AgeGroup\", y=(df[\"Status\"]==\"Dead\"), hue=\"Smoker\",\n",
+ " estimator=lambda x: sum(x)/len(x))\n",
+ "plt.ylabel(\"Mortality Rate\")\n",
+ "plt.title(\"Mortality rate by age group and smoking status\")\n",
+ "plt.show()\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/opt/conda/lib/python3.6/site-packages/scipy/stats/stats.py:1713: FutureWarning: Using a non-tuple sequence for multidimensional indexing is deprecated; use `arr[tuple(seq)]` instead of `arr[seq]`. In the future this will be interpreted as an array index, `arr[np.array(seq)]`, which will result either in an error or a different result.\n",
+ " return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval\n"
+ ]
+ },
+ {
+ "data": {
+ "image/png": "\n",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {
+ "needs_background": "light"
+ },
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "import seaborn as sns\n",
+ "import matplotlib.pyplot as plt\n",
+ "\n",
+ "df[\"Death\"] = (df[\"Status\"] == \"Dead\").astype(int)\n",
+ "\n",
+ "plt.figure(figsize=(8,5))\n",
+ "sns.barplot(data=df, x=\"Smoker\", y=\"Death\", estimator=lambda x: x.mean())\n",
+ "plt.ylabel(\"Mortality Rate\")\n",
+ "plt.title(\"Overall mortality rate by smoking status\")\n",
+ "plt.show()\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {},
+ "outputs": [
+ {
+ "ename": "ValueError",
+ "evalue": "Neither the `x` nor `y` variable appears to be numeric.",
+ "output_type": "error",
+ "traceback": [
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+ "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)",
+ "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0mplt\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfigure\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfigsize\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m8\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;36m5\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 6\u001b[0;31m \u001b[0msns\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mbarplot\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdf\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mx\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m\"Smoker\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdf\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"Status\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m==\u001b[0m\u001b[0;34m\"Dead\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mestimator\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mlambda\u001b[0m \u001b[0mx\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0msum\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mx\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m/\u001b[0m\u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mx\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 7\u001b[0m \u001b[0mplt\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mylabel\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"Mortality Rate\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 8\u001b[0m \u001b[0mplt\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtitle\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"Overall mortality rate by smoking status\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+ "\u001b[0;32m/opt/conda/lib/python3.6/site-packages/seaborn/categorical.py\u001b[0m in \u001b[0;36mbarplot\u001b[0;34m(x, y, hue, data, order, hue_order, estimator, ci, n_boot, units, orient, color, palette, saturation, errcolor, errwidth, capsize, dodge, ax, **kwargs)\u001b[0m\n\u001b[1;32m 2957\u001b[0m \u001b[0mestimator\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mci\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mn_boot\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0munits\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2958\u001b[0m \u001b[0morient\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcolor\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mpalette\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msaturation\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 2959\u001b[0;31m errcolor, errwidth, capsize, dodge)\n\u001b[0m\u001b[1;32m 2960\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2961\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0max\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+ "\u001b[0;32m/opt/conda/lib/python3.6/site-packages/seaborn/categorical.py\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(self, x, y, hue, data, order, hue_order, estimator, ci, n_boot, units, orient, color, palette, saturation, errcolor, errwidth, capsize, dodge)\u001b[0m\n\u001b[1;32m 1594\u001b[0m \u001b[0;34m\"\"\"Initialize the plotter.\"\"\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1595\u001b[0m self.establish_variables(x, y, hue, data, orient,\n\u001b[0;32m-> 1596\u001b[0;31m order, hue_order, units)\n\u001b[0m\u001b[1;32m 1597\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mestablish_colors\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcolor\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mpalette\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msaturation\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1598\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mestimate_statistic\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mestimator\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mci\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mn_boot\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+ "\u001b[0;32m/opt/conda/lib/python3.6/site-packages/seaborn/categorical.py\u001b[0m in \u001b[0;36mestablish_variables\u001b[0;34m(self, x, y, hue, data, orient, order, hue_order, units)\u001b[0m\n\u001b[1;32m 152\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 153\u001b[0m \u001b[0;31m# Figure out the plotting orientation\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 154\u001b[0;31m \u001b[0morient\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0minfer_orient\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mx\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0morient\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 155\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 156\u001b[0m \u001b[0;31m# Option 2a:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+ "\u001b[0;32m/opt/conda/lib/python3.6/site-packages/seaborn/categorical.py\u001b[0m in \u001b[0;36minfer_orient\u001b[0;34m(self, x, y, orient)\u001b[0m\n\u001b[1;32m 355\u001b[0m \u001b[0;32melif\u001b[0m \u001b[0mis_not_numeric\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0my\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 356\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mis_not_numeric\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mx\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 357\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mValueError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mno_numeric\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 358\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 359\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0;34m\"h\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+ "\u001b[0;31mValueError\u001b[0m: Neither the `x` nor `y` variable appears to be numeric."
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "import seaborn as sns\n",
+ "import matplotlib.pyplot as plt\n",
+ "%matplotlib inline\n",
+ "\n",
+ "plt.figure(figsize=(8,5))\n",
+ "sns.barplot(data=df, x=\"Smoker\", y=(df[\"Status\"]==\"Dead\"), estimator=lambda x: sum(x)/len(x))\n",
+ "plt.ylabel(\"Mortality Rate\")\n",
+ "plt.title(\"Overall mortality rate by smoking status\")\n",
+ "plt.show()\n",
+ "\n",
+ "plt.figure(figsize=(10,6))\n",
+ "sns.barplot(data=df, x=\"AgeGroup\", y=(df[\"Status\"]==\"Dead\"), hue=\"Smoker\",\n",
+ " estimator=lambda x: sum(x)/len(x))\n",
+ "plt.ylabel(\"Mortality Rate\")\n",
+ "plt.title(\"Mortality rate by age group and smoking status\")\n",
+ "plt.show()\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "image/png": "\n",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {
+ "needs_background": "light"
+ },
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "import pandas as pd\n",
+ "import matplotlib.pyplot as plt\n",
+ "import seaborn as sns\n",
+ "import numpy as np\n",
+ "\n",
+ "# Numeric mortality flag\n",
+ "df[\"DeadFlag\"] = (df[\"Status\"] == \"Dead\").astype(int)\n",
+ "\n",
+ "# Define age groups\n",
+ "bins = [18, 34, 54, 64, df[\"Age\"].max()]\n",
+ "labels = [\"18–34\", \"34–54\", \"55–64\", \"65+\"]\n",
+ "df[\"AgeGroup\"] = pd.cut(df[\"Age\"], bins=bins, labels=labels, right=True, include_lowest=True)\n",
+ "\n",
+ "# Plot\n",
+ "plt.figure(figsize=(10, 6))\n",
+ "sns.barplot(\n",
+ " data=df,\n",
+ " x=\"Smoker\",\n",
+ " y=\"DeadFlag\",\n",
+ " hue=\"AgeGroup\",\n",
+ " estimator=np.mean,\n",
+ " ci=None\n",
+ ")\n",
+ "plt.ylabel(\"Mortality Rate\")\n",
+ "plt.title(\"Mortality rate by Smoking Status and Age Group\")\n",
+ "plt.ylim(0, 1)\n",
+ "plt.legend(title=\"Age Group\")\n",
+ "plt.show()\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "(1314, 3)\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Smoker | \n",
+ " Status | \n",
+ " Age | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " Yes | \n",
+ " Alive | \n",
+ " 21.0 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " Yes | \n",
+ " Alive | \n",
+ " 19.3 | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " No | \n",
+ " Dead | \n",
+ " 57.5 | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " No | \n",
+ " Alive | \n",
+ " 47.1 | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " Yes | \n",
+ " Alive | \n",
+ " 81.4 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Smoker Status Age\n",
+ "0 Yes Alive 21.0\n",
+ "1 Yes Alive 19.3\n",
+ "2 No Dead 57.5\n",
+ "3 No Alive 47.1\n",
+ "4 Yes Alive 81.4"
+ ]
+ },
+ "execution_count": 12,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Cell 1 — Setup & load data\n",
+ "%matplotlib inline\n",
+ "import pandas as pd\n",
+ "import numpy as np\n",
+ "import matplotlib.pyplot as plt\n",
+ "import seaborn as sns\n",
+ "import statsmodels.formula.api as smf\n",
+ "from statsmodels.stats.proportion import proportion_confint\n",
+ "\n",
+ "sns.set(style=\"whitegrid\")\n",
+ "\n",
+ "# Adjust this path if necessary\n",
+ "data_path = \"../../Subject6_smoking.csv\"\n",
+ "\n",
+ "df = pd.read_csv(data_path)\n",
+ "# preview\n",
+ "print(df.shape)\n",
+ "df.head()\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "No 732\n",
+ "Yes 582\n",
+ "Name: Smoker, dtype: int64\n",
+ "Alive 945\n",
+ "Dead 369\n",
+ "Name: Status, dtype: int64\n",
+ "Mean age: 47.35936073059361\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Cell 2 — Clean & prepare columns\n",
+ "# Standardize column names if needed\n",
+ "df.columns = [c.strip() for c in df.columns]\n",
+ "\n",
+ "# Create numeric death flag\n",
+ "df['Death'] = (df['Status'].str.strip().str.lower() == \"dead\").astype(int)\n",
+ "\n",
+ "# Make Smoker categorical with consistent labels\n",
+ "df['Smoker'] = df['Smoker'].str.strip().replace({'Yes':'Yes','No':'No'})\n",
+ "\n",
+ "# Basic checks\n",
+ "print(df['Smoker'].value_counts())\n",
+ "print(df['Status'].value_counts())\n",
+ "print(\"Mean age:\", df['Age'].mean())\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Overall counts and mortality rates by smoking status (with 95% CI):\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " alive | \n",
+ " dead | \n",
+ " total | \n",
+ " mortality_rate | \n",
+ " ci_lower | \n",
+ " ci_upper | \n",
+ "
\n",
+ " \n",
+ " | Smoker | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | No | \n",
+ " 502 | \n",
+ " 230 | \n",
+ " 732 | \n",
+ " 0.314208 | \n",
+ " 0.281624 | \n",
+ " 0.348731 | \n",
+ "
\n",
+ " \n",
+ " | Yes | \n",
+ " 443 | \n",
+ " 139 | \n",
+ " 582 | \n",
+ " 0.238832 | \n",
+ " 0.205976 | \n",
+ " 0.275112 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " alive dead total mortality_rate ci_lower ci_upper\n",
+ "Smoker \n",
+ "No 502 230 732 0.314208 0.281624 0.348731\n",
+ "Yes 443 139 582 0.238832 0.205976 0.275112"
+ ]
+ },
+ "execution_count": 14,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Cell 3 — Step 1: Tabulate overall counts and mortality rates by smoking status\n",
+ "grouped = df.groupby('Smoker')['Death'].agg(['sum','count']).rename(columns={'sum':'dead','count':'total'})\n",
+ "grouped['alive'] = grouped['total'] - grouped['dead']\n",
+ "grouped['mortality_rate'] = grouped['dead'] / grouped['total']\n",
+ "\n",
+ "# 95% Wilson confidence intervals for proportion (more stable than plain normal)\n",
+ "cis = grouped.apply(lambda row: proportion_confint(count=int(row['dead']), nobs=int(row['total']), alpha=0.05, method='wilson'), axis=1)\n",
+ "grouped['ci_lower'] = [c[0] for c in cis]\n",
+ "grouped['ci_upper'] = [c[1] for c in cis]\n",
+ "\n",
+ "# Display nicely\n",
+ "display_cols = ['alive','dead','total','mortality_rate','ci_lower','ci_upper']\n",
+ "print(\"Overall counts and mortality rates by smoking status (with 95% CI):\")\n",
+ "grouped[display_cols]\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "image/png": "\n",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "# Cell 4 — Bar plot with error bars (overall)\n",
+ "plt.figure(figsize=(6,4))\n",
+ "x = grouped.index.tolist()\n",
+ "rates = grouped['mortality_rate'].values\n",
+ "err_low = rates - grouped['ci_lower'].values\n",
+ "err_high = grouped['ci_upper'].values - rates\n",
+ "\n",
+ "plt.bar(x, rates, yerr=[err_low, err_high], capsize=6)\n",
+ "plt.ylim(0, 1)\n",
+ "plt.ylabel(\"Mortality rate (20 yr)\")\n",
+ "plt.title(\"Overall mortality rate by smoking status\\n(95% Wilson CI)\")\n",
+ "plt.show()\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "metadata": {},
+ "outputs": [
+ {
+ "ename": "ValueError",
+ "evalue": "Shape of passed values is (8, 2), indices imply (8, 4)",
+ "output_type": "error",
+ "traceback": [
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+ "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)",
+ "\u001b[0;32m/opt/conda/lib/python3.6/site-packages/pandas/core/internals.py\u001b[0m in \u001b[0;36mcreate_block_manager_from_arrays\u001b[0;34m(arrays, names, axes)\u001b[0m\n\u001b[1;32m 4637\u001b[0m \u001b[0mblocks\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mform_blocks\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0marrays\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mnames\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maxes\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 4638\u001b[0;31m \u001b[0mmgr\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mBlockManager\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mblocks\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maxes\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 4639\u001b[0m \u001b[0mmgr\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_consolidate_inplace\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+ "\u001b[0;32m/opt/conda/lib/python3.6/site-packages/pandas/core/internals.py\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(self, blocks, axes, do_integrity_check, fastpath)\u001b[0m\n\u001b[1;32m 3032\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mdo_integrity_check\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 3033\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_verify_integrity\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 3034\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
+ "\u001b[0;32m/opt/conda/lib/python3.6/site-packages/pandas/core/internals.py\u001b[0m in \u001b[0;36m_verify_integrity\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 3243\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mblock\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_verify_integrity\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0mblock\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mshape\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m!=\u001b[0m \u001b[0mmgr_shape\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 3244\u001b[0;31m \u001b[0mconstruction_error\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtot_items\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mblock\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mshape\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0maxes\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 3245\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mitems\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m!=\u001b[0m \u001b[0mtot_items\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+ "\u001b[0;32m/opt/conda/lib/python3.6/site-packages/pandas/core/internals.py\u001b[0m in \u001b[0;36mconstruction_error\u001b[0;34m(tot_items, block_shape, axes, e)\u001b[0m\n\u001b[1;32m 4607\u001b[0m raise ValueError(\"Shape of passed values is {0}, indices imply {1}\".format(\n\u001b[0;32m-> 4608\u001b[0;31m passed, implied))\n\u001b[0m\u001b[1;32m 4609\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
+ "\u001b[0;31mValueError\u001b[0m: Shape of passed values is (8, 2), indices imply (8, 4)",
+ "\nDuring handling of the above exception, another exception occurred:\n",
+ "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)",
+ "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 10\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 11\u001b[0m \u001b[0;31m# add Wilson CI per group\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 12\u001b[0;31m \u001b[0mcis_age\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0magg\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mapply\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;32mlambda\u001b[0m \u001b[0mrow\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mproportion_confint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcount\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrow\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'dead'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mnobs\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrow\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'total'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0malpha\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m0.05\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmethod\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'wilson'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maxis\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 13\u001b[0m \u001b[0magg\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'ci_lower'\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0mc\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mc\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mcis_age\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 14\u001b[0m \u001b[0magg\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'ci_upper'\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0mc\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mc\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mcis_age\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+ "\u001b[0;32m/opt/conda/lib/python3.6/site-packages/pandas/core/frame.py\u001b[0m in \u001b[0;36mapply\u001b[0;34m(self, func, axis, broadcast, raw, reduce, args, **kwds)\u001b[0m\n\u001b[1;32m 4875\u001b[0m \u001b[0mf\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maxis\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4876\u001b[0m \u001b[0mreduce\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mreduce\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 4877\u001b[0;31m ignore_failures=ignore_failures)\n\u001b[0m\u001b[1;32m 4878\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4879\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_apply_broadcast\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mf\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maxis\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+ "\u001b[0;32m/opt/conda/lib/python3.6/site-packages/pandas/core/frame.py\u001b[0m in \u001b[0;36m_apply_standard\u001b[0;34m(self, func, axis, ignore_failures, reduce)\u001b[0m\n\u001b[1;32m 4988\u001b[0m \u001b[0mindex\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4989\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 4990\u001b[0;31m \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_constructor\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mresults\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mindex\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mindex\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 4991\u001b[0m \u001b[0mresult\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcolumns\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mres_index\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4992\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
+ "\u001b[0;32m/opt/conda/lib/python3.6/site-packages/pandas/core/frame.py\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(self, data, index, columns, dtype, copy)\u001b[0m\n\u001b[1;32m 328\u001b[0m dtype=dtype, copy=copy)\n\u001b[1;32m 329\u001b[0m \u001b[0;32melif\u001b[0m \u001b[0misinstance\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdict\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 330\u001b[0;31m \u001b[0mmgr\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_init_dict\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mindex\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcolumns\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdtype\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 331\u001b[0m \u001b[0;32melif\u001b[0m \u001b[0misinstance\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mma\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mMaskedArray\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 332\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mnumpy\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mma\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmrecords\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mmrecords\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+ "\u001b[0;32m/opt/conda/lib/python3.6/site-packages/pandas/core/frame.py\u001b[0m in \u001b[0;36m_init_dict\u001b[0;34m(self, data, index, columns, dtype)\u001b[0m\n\u001b[1;32m 459\u001b[0m \u001b[0marrays\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mk\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mk\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mkeys\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 460\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 461\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0m_arrays_to_mgr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0marrays\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdata_names\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mindex\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcolumns\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdtype\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 462\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 463\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m_init_ndarray\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mvalues\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mindex\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcolumns\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcopy\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mFalse\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+ "\u001b[0;32m/opt/conda/lib/python3.6/site-packages/pandas/core/frame.py\u001b[0m in \u001b[0;36m_arrays_to_mgr\u001b[0;34m(arrays, arr_names, index, columns, dtype)\u001b[0m\n\u001b[1;32m 6171\u001b[0m \u001b[0maxes\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0m_ensure_index\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcolumns\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0m_ensure_index\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mindex\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 6172\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 6173\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mcreate_block_manager_from_arrays\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0marrays\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0marr_names\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maxes\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 6174\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 6175\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
+ "\u001b[0;32m/opt/conda/lib/python3.6/site-packages/pandas/core/internals.py\u001b[0m in \u001b[0;36mcreate_block_manager_from_arrays\u001b[0;34m(arrays, names, axes)\u001b[0m\n\u001b[1;32m 4640\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mmgr\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4641\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mValueError\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 4642\u001b[0;31m \u001b[0mconstruction_error\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0marrays\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0marrays\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mshape\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maxes\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 4643\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4644\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
+ "\u001b[0;32m/opt/conda/lib/python3.6/site-packages/pandas/core/internals.py\u001b[0m in \u001b[0;36mconstruction_error\u001b[0;34m(tot_items, block_shape, axes, e)\u001b[0m\n\u001b[1;32m 4606\u001b[0m \u001b[0;32mraise\u001b[0m \u001b[0mValueError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"Empty data passed with indices specified.\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4607\u001b[0m raise ValueError(\"Shape of passed values is {0}, indices imply {1}\".format(\n\u001b[0;32m-> 4608\u001b[0;31m passed, implied))\n\u001b[0m\u001b[1;32m 4609\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4610\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
+ "\u001b[0;31mValueError\u001b[0m: Shape of passed values is (8, 2), indices imply (8, 4)"
+ ]
+ }
+ ],
+ "source": [
+ "# Cell 5 — Step 2: Add age groups and compute mortality within each strata\n",
+ "bins = [18, 34, 54, 64, df['Age'].max()+1] # last bin includes max\n",
+ "labels = ['18-34','34-54','55-64','65+']\n",
+ "df['AgeGroup'] = pd.cut(df['Age'], bins=bins, labels=labels, right=True, include_lowest=True)\n",
+ "\n",
+ "# Group by age group and smoker\n",
+ "agg = df.groupby(['AgeGroup','Smoker'])['Death'].agg(['sum','count']).rename(columns={'sum':'dead','count':'total'})\n",
+ "agg['alive'] = agg['total'] - agg['dead']\n",
+ "agg['mortality_rate'] = agg['dead'] / agg['total']\n",
+ "\n",
+ "# add Wilson CI per group\n",
+ "cis_age = agg.apply(lambda row: proportion_confint(count=int(row['dead']), nobs=int(row['total']), alpha=0.05, method='wilson'), axis=1)\n",
+ "agg['ci_lower'] = [c[0] for c in cis_age]\n",
+ "agg['ci_upper'] = [c[1] for c in cis_age]\n",
+ "\n",
+ "print(\"Mortality by Age group and Smoker (with 95% CI):\")\n",
+ "agg[['alive','dead','total','mortality_rate','ci_lower','ci_upper']]\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 17,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import pandas as pd\n",
+ "from statsmodels.stats.proportion import proportion_confint\n",
+ "\n",
+ "# Example: your aggregated DataFrame (agg)\n",
+ "# Make sure these columns exist: 'dead', 'total'\n",
+ "# agg = pd.DataFrame(...)\n",
+ "\n",
+ "# Compute CIs as two separate lists\n",
+ "ci_lower = []\n",
+ "ci_upper = []\n",
+ "\n",
+ "for _, row in agg.iterrows():\n",
+ " low, up = proportion_confint(\n",
+ " count=int(row['dead']),\n",
+ " nobs=int(row['total']),\n",
+ " alpha=0.05,\n",
+ " method='wilson'\n",
+ " )\n",
+ " ci_lower.append(low)\n",
+ " ci_upper.append(up)\n",
+ "\n",
+ "# Add them to the DataFrame\n",
+ "agg['ci_lower'] = ci_lower\n",
+ "agg['ci_upper'] = ci_upper\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 18,
+ "metadata": {},
+ "outputs": [
+ {
+ "ename": "ValueError",
+ "evalue": "Shape of passed values is (8, 2), indices imply (8, 4)",
+ "output_type": "error",
+ "traceback": [
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+ "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)",
+ "\u001b[0;32m/opt/conda/lib/python3.6/site-packages/pandas/core/internals.py\u001b[0m in \u001b[0;36mcreate_block_manager_from_arrays\u001b[0;34m(arrays, names, axes)\u001b[0m\n\u001b[1;32m 4637\u001b[0m \u001b[0mblocks\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mform_blocks\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0marrays\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mnames\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maxes\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 4638\u001b[0;31m \u001b[0mmgr\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mBlockManager\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mblocks\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maxes\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 4639\u001b[0m \u001b[0mmgr\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_consolidate_inplace\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+ "\u001b[0;32m/opt/conda/lib/python3.6/site-packages/pandas/core/internals.py\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(self, blocks, axes, do_integrity_check, fastpath)\u001b[0m\n\u001b[1;32m 3032\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mdo_integrity_check\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 3033\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_verify_integrity\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 3034\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
+ "\u001b[0;32m/opt/conda/lib/python3.6/site-packages/pandas/core/internals.py\u001b[0m in \u001b[0;36m_verify_integrity\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 3243\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mblock\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_verify_integrity\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0mblock\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mshape\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m!=\u001b[0m \u001b[0mmgr_shape\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 3244\u001b[0;31m \u001b[0mconstruction_error\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtot_items\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mblock\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mshape\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0maxes\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 3245\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mitems\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m!=\u001b[0m \u001b[0mtot_items\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+ "\u001b[0;32m/opt/conda/lib/python3.6/site-packages/pandas/core/internals.py\u001b[0m in \u001b[0;36mconstruction_error\u001b[0;34m(tot_items, block_shape, axes, e)\u001b[0m\n\u001b[1;32m 4607\u001b[0m raise ValueError(\"Shape of passed values is {0}, indices imply {1}\".format(\n\u001b[0;32m-> 4608\u001b[0;31m passed, implied))\n\u001b[0m\u001b[1;32m 4609\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
+ "\u001b[0;31mValueError\u001b[0m: Shape of passed values is (8, 2), indices imply (8, 4)",
+ "\nDuring handling of the above exception, another exception occurred:\n",
+ "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)",
+ "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 10\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 11\u001b[0m \u001b[0;31m# add Wilson CI per group\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 12\u001b[0;31m \u001b[0mcis_age\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0magg\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mapply\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;32mlambda\u001b[0m \u001b[0mrow\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mproportion_confint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcount\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrow\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'dead'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mnobs\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrow\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'total'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0malpha\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m0.05\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmethod\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'wilson'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maxis\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 13\u001b[0m \u001b[0magg\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'ci_lower'\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0mc\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mc\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mcis_age\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 14\u001b[0m \u001b[0magg\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'ci_upper'\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0mc\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mc\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mcis_age\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+ "\u001b[0;32m/opt/conda/lib/python3.6/site-packages/pandas/core/frame.py\u001b[0m in \u001b[0;36mapply\u001b[0;34m(self, func, axis, broadcast, raw, reduce, args, **kwds)\u001b[0m\n\u001b[1;32m 4875\u001b[0m \u001b[0mf\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maxis\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4876\u001b[0m \u001b[0mreduce\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mreduce\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 4877\u001b[0;31m ignore_failures=ignore_failures)\n\u001b[0m\u001b[1;32m 4878\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4879\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_apply_broadcast\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mf\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maxis\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+ "\u001b[0;32m/opt/conda/lib/python3.6/site-packages/pandas/core/frame.py\u001b[0m in \u001b[0;36m_apply_standard\u001b[0;34m(self, func, axis, ignore_failures, reduce)\u001b[0m\n\u001b[1;32m 4988\u001b[0m \u001b[0mindex\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4989\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 4990\u001b[0;31m \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_constructor\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mresults\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mindex\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mindex\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 4991\u001b[0m \u001b[0mresult\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcolumns\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mres_index\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4992\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
+ "\u001b[0;32m/opt/conda/lib/python3.6/site-packages/pandas/core/frame.py\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(self, data, index, columns, dtype, copy)\u001b[0m\n\u001b[1;32m 328\u001b[0m dtype=dtype, copy=copy)\n\u001b[1;32m 329\u001b[0m \u001b[0;32melif\u001b[0m \u001b[0misinstance\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdict\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 330\u001b[0;31m \u001b[0mmgr\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_init_dict\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mindex\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcolumns\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdtype\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 331\u001b[0m \u001b[0;32melif\u001b[0m \u001b[0misinstance\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mma\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mMaskedArray\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 332\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mnumpy\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mma\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmrecords\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mmrecords\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+ "\u001b[0;32m/opt/conda/lib/python3.6/site-packages/pandas/core/frame.py\u001b[0m in \u001b[0;36m_init_dict\u001b[0;34m(self, data, index, columns, dtype)\u001b[0m\n\u001b[1;32m 459\u001b[0m \u001b[0marrays\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mk\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mk\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mkeys\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 460\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 461\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0m_arrays_to_mgr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0marrays\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdata_names\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mindex\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcolumns\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdtype\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 462\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 463\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m_init_ndarray\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mvalues\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mindex\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcolumns\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcopy\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mFalse\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+ "\u001b[0;32m/opt/conda/lib/python3.6/site-packages/pandas/core/frame.py\u001b[0m in \u001b[0;36m_arrays_to_mgr\u001b[0;34m(arrays, arr_names, index, columns, dtype)\u001b[0m\n\u001b[1;32m 6171\u001b[0m \u001b[0maxes\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0m_ensure_index\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcolumns\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0m_ensure_index\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mindex\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 6172\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 6173\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mcreate_block_manager_from_arrays\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0marrays\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0marr_names\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maxes\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 6174\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 6175\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
+ "\u001b[0;32m/opt/conda/lib/python3.6/site-packages/pandas/core/internals.py\u001b[0m in \u001b[0;36mcreate_block_manager_from_arrays\u001b[0;34m(arrays, names, axes)\u001b[0m\n\u001b[1;32m 4640\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mmgr\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4641\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mValueError\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 4642\u001b[0;31m \u001b[0mconstruction_error\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0marrays\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0marrays\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mshape\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maxes\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 4643\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4644\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
+ "\u001b[0;32m/opt/conda/lib/python3.6/site-packages/pandas/core/internals.py\u001b[0m in \u001b[0;36mconstruction_error\u001b[0;34m(tot_items, block_shape, axes, e)\u001b[0m\n\u001b[1;32m 4606\u001b[0m \u001b[0;32mraise\u001b[0m \u001b[0mValueError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"Empty data passed with indices specified.\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4607\u001b[0m raise ValueError(\"Shape of passed values is {0}, indices imply {1}\".format(\n\u001b[0;32m-> 4608\u001b[0;31m passed, implied))\n\u001b[0m\u001b[1;32m 4609\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4610\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
+ "\u001b[0;31mValueError\u001b[0m: Shape of passed values is (8, 2), indices imply (8, 4)"
+ ]
+ }
+ ],
+ "source": [
+ "# Cell 5 — Step 2: Add age groups and compute mortality within each strata\n",
+ "bins = [18, 34, 54, 64, df['Age'].max()+1] # last bin includes max\n",
+ "labels = ['18-34','34-54','55-64','65+']\n",
+ "df['AgeGroup'] = pd.cut(df['Age'], bins=bins, labels=labels, right=True, include_lowest=True)\n",
+ "\n",
+ "# Group by age group and smoker\n",
+ "agg = df.groupby(['AgeGroup','Smoker'])['Death'].agg(['sum','count']).rename(columns={'sum':'dead','count':'total'})\n",
+ "agg['alive'] = agg['total'] - agg['dead']\n",
+ "agg['mortality_rate'] = agg['dead'] / agg['total']\n",
+ "\n",
+ "# add Wilson CI per group\n",
+ "cis_age = agg.apply(lambda row: proportion_confint(count=int(row['dead']), nobs=int(row['total']), alpha=0.05, method='wilson'), axis=1)\n",
+ "agg['ci_lower'] = [c[0] for c in cis_age]\n",
+ "agg['ci_upper'] = [c[1] for c in cis_age]\n",
+ "\n",
+ "print(\"Mortality by Age group and Smoker (with 95% CI):\")\n",
+ "agg[['alive','dead','total','mortality_rate','ci_lower','ci_upper']]\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 19,
+ "metadata": {},
+ "outputs": [
+ {
+ "ename": "TypeError",
+ "evalue": "aggregate() missing 1 required positional argument: 'arg'",
+ "output_type": "error",
+ "traceback": [
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+ "\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)",
+ "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 8\u001b[0m agg = df.groupby('Smoker').agg(\n\u001b[1;32m 9\u001b[0m \u001b[0malive\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'Alive'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'sum'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 10\u001b[0;31m \u001b[0mdead\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'Dead'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'sum'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 11\u001b[0m ).reset_index()\n\u001b[1;32m 12\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
+ "\u001b[0;31mTypeError\u001b[0m: aggregate() missing 1 required positional argument: 'arg'"
+ ]
+ }
+ ],
+ "source": [
+ "import pandas as pd\n",
+ "from statsmodels.stats.proportion import proportion_confint\n",
+ "\n",
+ "# Example: df is your main dataset\n",
+ "# Columns needed: 'Smoker' ('Yes'/'No'), 'Alive'/'Dead' status (or 'Death' 1/0)\n",
+ "\n",
+ "# Tabulate\n",
+ "agg = df.groupby('Smoker').agg(\n",
+ " alive=('Alive', 'sum'),\n",
+ " dead=('Dead', 'sum')\n",
+ ").reset_index()\n",
+ "\n",
+ "# Total per group\n",
+ "agg['total'] = agg['alive'] + agg['dead']\n",
+ "\n",
+ "# Mortality rate\n",
+ "agg['mortality_rate'] = agg['dead'] / agg['total']\n",
+ "\n",
+ "# Confidence intervals\n",
+ "ci_lower, ci_upper = [], []\n",
+ "for _, row in agg.iterrows():\n",
+ " low, up = proportion_confint(\n",
+ " count=int(row['dead']),\n",
+ " nobs=int(row['total']),\n",
+ " alpha=0.05,\n",
+ " method='wilson'\n",
+ " )\n",
+ " ci_lower.append(low)\n",
+ " ci_upper.append(up)\n",
+ "\n",
+ "agg['ci_lower'] = ci_lower\n",
+ "agg['ci_upper'] = ci_upper\n",
+ "\n",
+ "print(agg)\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 20,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/opt/conda/lib/python3.6/site-packages/pandas/core/groupby.py:4291: FutureWarning: using a dict with renaming is deprecated and will be removed in a future version\n",
+ " return super(DataFrameGroupBy, self).aggregate(arg, *args, **kwargs)\n"
+ ]
+ },
+ {
+ "ename": "KeyError",
+ "evalue": "'Alive'",
+ "output_type": "error",
+ "traceback": [
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+ "\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)",
+ "\u001b[0;32m/opt/conda/lib/python3.6/site-packages/pandas/core/indexes/base.py\u001b[0m in \u001b[0;36mget_loc\u001b[0;34m(self, key, method, tolerance)\u001b[0m\n\u001b[1;32m 2524\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 2525\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_engine\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget_loc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2526\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mKeyError\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+ "\u001b[0;32mpandas/_libs/index.pyx\u001b[0m in \u001b[0;36mpandas._libs.index.IndexEngine.get_loc\u001b[0;34m()\u001b[0m\n",
+ "\u001b[0;32mpandas/_libs/index.pyx\u001b[0m in \u001b[0;36mpandas._libs.index.IndexEngine.get_loc\u001b[0;34m()\u001b[0m\n",
+ "\u001b[0;32mpandas/_libs/hashtable_class_helper.pxi\u001b[0m in \u001b[0;36mpandas._libs.hashtable.PyObjectHashTable.get_item\u001b[0;34m()\u001b[0m\n",
+ "\u001b[0;32mpandas/_libs/hashtable_class_helper.pxi\u001b[0m in \u001b[0;36mpandas._libs.hashtable.PyObjectHashTable.get_item\u001b[0;34m()\u001b[0m\n",
+ "\u001b[0;31mKeyError\u001b[0m: 'Alive'",
+ "\nDuring handling of the above exception, another exception occurred:\n",
+ "\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)",
+ "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 1\u001b[0m agg = df.groupby('Smoker').agg({\n\u001b[1;32m 2\u001b[0m \u001b[0;34m'Alive'\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;34m'sum'\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 3\u001b[0;31m \u001b[0;34m'Dead'\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;34m'sum'\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 4\u001b[0m }).reset_index()\n\u001b[1;32m 5\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
+ "\u001b[0;32m/opt/conda/lib/python3.6/site-packages/pandas/core/groupby.py\u001b[0m in \u001b[0;36maggregate\u001b[0;34m(self, arg, *args, **kwargs)\u001b[0m\n\u001b[1;32m 4289\u001b[0m versionadded=''))\n\u001b[1;32m 4290\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0maggregate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0marg\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 4291\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0msuper\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mDataFrameGroupBy\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0maggregate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0marg\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 4292\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4293\u001b[0m \u001b[0magg\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0maggregate\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+ "\u001b[0;32m/opt/conda/lib/python3.6/site-packages/pandas/core/groupby.py\u001b[0m in \u001b[0;36maggregate\u001b[0;34m(self, arg, *args, **kwargs)\u001b[0m\n\u001b[1;32m 3722\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3723\u001b[0m \u001b[0m_level\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mkwargs\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpop\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'_level'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 3724\u001b[0;31m \u001b[0mresult\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mhow\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_aggregate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0marg\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0m_level\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0m_level\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 3725\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mhow\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3726\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mresult\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+ "\u001b[0;32m/opt/conda/lib/python3.6/site-packages/pandas/core/base.py\u001b[0m in \u001b[0;36m_aggregate\u001b[0;34m(self, arg, *args, **kwargs)\u001b[0m\n\u001b[1;32m 476\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 477\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 478\u001b[0;31m \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_agg\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0marg\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0m_agg_1dim\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 479\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mSpecificationError\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 480\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
+ "\u001b[0;32m/opt/conda/lib/python3.6/site-packages/pandas/core/base.py\u001b[0m in \u001b[0;36m_agg\u001b[0;34m(arg, func)\u001b[0m\n\u001b[1;32m 427\u001b[0m \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcompat\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mOrderedDict\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 428\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mfname\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0magg_how\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mcompat\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0miteritems\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0marg\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 429\u001b[0;31m \u001b[0mresult\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mfname\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfname\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0magg_how\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 430\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mresult\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 431\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
+ "\u001b[0;32m/opt/conda/lib/python3.6/site-packages/pandas/core/base.py\u001b[0m in \u001b[0;36m_agg_1dim\u001b[0;34m(name, how, subset)\u001b[0m\n\u001b[1;32m 406\u001b[0m \u001b[0maggregate\u001b[0m \u001b[0ma\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m-\u001b[0m\u001b[0mdim\u001b[0m \u001b[0;32mwith\u001b[0m \u001b[0mhow\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 407\u001b[0m \"\"\"\n\u001b[0;32m--> 408\u001b[0;31m \u001b[0mcolg\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_gotitem\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mname\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mndim\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msubset\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0msubset\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 409\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mcolg\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mndim\u001b[0m \u001b[0;34m!=\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 410\u001b[0m raise SpecificationError(\"nested dictionary is ambiguous \"\n",
+ "\u001b[0;32m/opt/conda/lib/python3.6/site-packages/pandas/core/groupby.py\u001b[0m in \u001b[0;36m_gotitem\u001b[0;34m(self, key, ndim, subset)\u001b[0m\n\u001b[1;32m 4316\u001b[0m \u001b[0;32melif\u001b[0m \u001b[0mndim\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4317\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0msubset\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 4318\u001b[0;31m \u001b[0msubset\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mobj\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mkey\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 4319\u001b[0m return SeriesGroupBy(subset, selection=key,\n\u001b[1;32m 4320\u001b[0m grouper=self.grouper)\n",
+ "\u001b[0;32m/opt/conda/lib/python3.6/site-packages/pandas/core/frame.py\u001b[0m in \u001b[0;36m__getitem__\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m 2137\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_getitem_multilevel\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2138\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 2139\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_getitem_column\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2140\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2141\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m_getitem_column\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mkey\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+ "\u001b[0;32m/opt/conda/lib/python3.6/site-packages/pandas/core/frame.py\u001b[0m in \u001b[0;36m_getitem_column\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m 2144\u001b[0m \u001b[0;31m# get column\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2145\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcolumns\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mis_unique\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 2146\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_get_item_cache\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2147\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2148\u001b[0m \u001b[0;31m# duplicate columns & possible reduce dimensionality\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+ "\u001b[0;32m/opt/conda/lib/python3.6/site-packages/pandas/core/generic.py\u001b[0m in \u001b[0;36m_get_item_cache\u001b[0;34m(self, item)\u001b[0m\n\u001b[1;32m 1840\u001b[0m \u001b[0mres\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcache\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mitem\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1841\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mres\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1842\u001b[0;31m \u001b[0mvalues\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_data\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mitem\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1843\u001b[0m \u001b[0mres\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_box_item_values\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mitem\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mvalues\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1844\u001b[0m \u001b[0mcache\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mitem\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mres\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+ "\u001b[0;32m/opt/conda/lib/python3.6/site-packages/pandas/core/internals.py\u001b[0m in \u001b[0;36mget\u001b[0;34m(self, item, fastpath)\u001b[0m\n\u001b[1;32m 3841\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3842\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0misna\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mitem\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 3843\u001b[0;31m \u001b[0mloc\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mitems\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget_loc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mitem\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 3844\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3845\u001b[0m \u001b[0mindexer\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0marange\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mitems\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0misna\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mitems\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+ "\u001b[0;32m/opt/conda/lib/python3.6/site-packages/pandas/core/indexes/base.py\u001b[0m in \u001b[0;36mget_loc\u001b[0;34m(self, key, method, tolerance)\u001b[0m\n\u001b[1;32m 2525\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_engine\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget_loc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2526\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mKeyError\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 2527\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_engine\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget_loc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_maybe_cast_indexer\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2528\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2529\u001b[0m \u001b[0mindexer\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget_indexer\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mkey\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmethod\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mmethod\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtolerance\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mtolerance\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+ "\u001b[0;32mpandas/_libs/index.pyx\u001b[0m in \u001b[0;36mpandas._libs.index.IndexEngine.get_loc\u001b[0;34m()\u001b[0m\n",
+ "\u001b[0;32mpandas/_libs/index.pyx\u001b[0m in \u001b[0;36mpandas._libs.index.IndexEngine.get_loc\u001b[0;34m()\u001b[0m\n",
+ "\u001b[0;32mpandas/_libs/hashtable_class_helper.pxi\u001b[0m in \u001b[0;36mpandas._libs.hashtable.PyObjectHashTable.get_item\u001b[0;34m()\u001b[0m\n",
+ "\u001b[0;32mpandas/_libs/hashtable_class_helper.pxi\u001b[0m in \u001b[0;36mpandas._libs.hashtable.PyObjectHashTable.get_item\u001b[0;34m()\u001b[0m\n",
+ "\u001b[0;31mKeyError\u001b[0m: 'Alive'"
+ ]
+ }
+ ],
+ "source": [
+ "agg = df.groupby('Smoker').agg({\n",
+ " 'Alive': 'sum',\n",
+ " 'Dead': 'sum'\n",
+ "}).reset_index()\n",
+ "\n",
+ "# Total per group\n",
+ "agg['total'] = agg['Alive'] + agg['Dead']\n",
+ "\n",
+ "# Mortality rate\n",
+ "agg['mortality_rate'] = agg['Dead'] / agg['total']\n",
+ "\n",
+ "# Confidence intervals\n",
+ "from statsmodels.stats.proportion import proportion_confint\n",
+ "\n",
+ "ci_lower, ci_upper = [], []\n",
+ "for _, row in agg.iterrows():\n",
+ " low, up = proportion_confint(\n",
+ " count=int(row['Dead']),\n",
+ " nobs=int(row['total']),\n",
+ " alpha=0.05,\n",
+ " method='wilson'\n",
+ " )\n",
+ " ci_lower.append(low)\n",
+ " ci_upper.append(up)\n",
+ "\n",
+ "agg['ci_lower'] = ci_lower\n",
+ "agg['ci_upper'] = ci_upper\n",
+ "\n",
+ "print(agg)\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 21,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Status Smoker alive dead total mortality_rate ci_lower ci_upper\n",
+ "0 No 502 230 732 0.314208 0.281624 0.348731\n",
+ "1 Yes 443 139 582 0.238832 0.205976 0.275112\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Count alive and dead per smoking group\n",
+ "agg = df.groupby(['Smoker', 'Status']).size().unstack(fill_value=0).reset_index()\n",
+ "\n",
+ "# Rename columns for clarity\n",
+ "agg = agg.rename(columns={'Alive': 'alive', 'Dead': 'dead'})\n",
+ "\n",
+ "# Total and mortality rate\n",
+ "agg['total'] = agg['alive'] + agg['dead']\n",
+ "agg['mortality_rate'] = agg['dead'] / agg['total']\n",
+ "\n",
+ "# Confidence intervals\n",
+ "from statsmodels.stats.proportion import proportion_confint\n",
+ "ci_lower, ci_upper = [], []\n",
+ "\n",
+ "for _, row in agg.iterrows():\n",
+ " low, up = proportion_confint(\n",
+ " count=int(row['dead']),\n",
+ " nobs=int(row['total']),\n",
+ " alpha=0.05,\n",
+ " method='wilson'\n",
+ " )\n",
+ " ci_lower.append(low)\n",
+ " ci_upper.append(up)\n",
+ "\n",
+ "agg['ci_lower'] = ci_lower\n",
+ "agg['ci_upper'] = ci_upper\n",
+ "\n",
+ "print(agg)\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 22,
+ "metadata": {},
+ "outputs": [
+ {
+ "ename": "TypeError",
+ "evalue": "aggregate() missing 1 required positional argument: 'arg'",
+ "output_type": "error",
+ "traceback": [
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+ "\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)",
+ "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 14\u001b[0m agg = df.groupby('Smoker').agg(\n\u001b[1;32m 15\u001b[0m \u001b[0malive\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'alive'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'sum'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 16\u001b[0;31m \u001b[0mdead\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'dead'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'sum'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 17\u001b[0m ).reset_index()\n\u001b[1;32m 18\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
+ "\u001b[0;31mTypeError\u001b[0m: aggregate() missing 1 required positional argument: 'arg'"
+ ]
+ }
+ ],
+ "source": [
+ "import pandas as pd\n",
+ "import numpy as np\n",
+ "from scipy.stats import norm\n",
+ "import statsmodels.api as sm\n",
+ "import seaborn as sns\n",
+ "import matplotlib.pyplot as plt\n",
+ "\n",
+ "# --- 1. Mortality by Smoking Status ---\n",
+ "\n",
+ "# Create Alive/Dead counts if not already present\n",
+ "df['alive'] = (df['Status'] == 'Alive').astype(int)\n",
+ "df['dead'] = (df['Status'] == 'Dead').astype(int)\n",
+ "\n",
+ "agg = df.groupby('Smoker').agg(\n",
+ " alive=('alive', 'sum'),\n",
+ " dead=('dead', 'sum')\n",
+ ").reset_index()\n",
+ "\n",
+ "agg['total'] = agg['alive'] + agg['dead']\n",
+ "agg['mortality_rate'] = agg['dead'] / agg['total']\n",
+ "\n",
+ "# 95% confidence intervals (normal approx)\n",
+ "z = norm.ppf(0.975)\n",
+ "agg['ci_lower'] = agg['mortality_rate'] - z * np.sqrt(\n",
+ " agg['mortality_rate'] * (1 - agg['mortality_rate']) / agg['total']\n",
+ ")\n",
+ "agg['ci_upper'] = agg['mortality_rate'] + z * np.sqrt(\n",
+ " agg['mortality_rate'] * (1 - agg['mortality_rate']) / agg['total']\n",
+ ")\n",
+ "\n",
+ "print(\"\\nMortality by Smoking Status:\")\n",
+ "print(agg)\n",
+ "\n",
+ "sns.barplot(\n",
+ " data=agg,\n",
+ " x='Smoker', y='mortality_rate', \n",
+ " yerr=[agg['mortality_rate'] - agg['ci_lower'], agg['ci_upper'] - agg['mortality_rate']]\n",
+ ")\n",
+ "plt.ylabel('Mortality Rate')\n",
+ "plt.title('Mortality Rate by Smoking Status with 95% CI')\n",
+ "plt.show()\n",
+ "\n",
+ "# --- 2. Mortality by Smoking Status & Age Group ---\n",
+ "\n",
+ "# Create age groups\n",
+ "bins = [18, 34, 54, 64, np.inf]\n",
+ "labels = ['18-34', '35-54', '55-64', '65+']\n",
+ "df['age_group'] = pd.cut(df['Age'], bins=bins, labels=labels, right=True)\n",
+ "\n",
+ "agg_age = df.groupby(['age_group', 'Smoker']).agg(\n",
+ " alive=('alive', 'sum'),\n",
+ " dead=('dead', 'sum')\n",
+ ").reset_index()\n",
+ "\n",
+ "agg_age['total'] = agg_age['alive'] + agg_age['dead']\n",
+ "agg_age['mortality_rate'] = agg_age['dead'] / agg_age['total']\n",
+ "\n",
+ "print(\"\\nMortality by Smoking Status & Age Group:\")\n",
+ "print(agg_age)\n",
+ "\n",
+ "sns.catplot(\n",
+ " data=agg_age, kind='bar',\n",
+ " x='age_group', y='mortality_rate', hue='Smoker'\n",
+ ")\n",
+ "plt.ylabel('Mortality Rate')\n",
+ "plt.title('Mortality Rate by Smoking Status and Age Group')\n",
+ "plt.show()\n",
+ "\n",
+ "# --- 3. Logistic Regression ---\n",
+ "\n",
+ "df['Death'] = (df['Status'] == 'Dead').astype(int)\n",
+ "\n",
+ "X = df[['Age']]\n",
+ "X['Smoker'] = (df['Smoker'] == 'Yes').astype(int)\n",
+ "X = sm.add_constant(X)\n",
+ "y = df['Death']\n",
+ "\n",
+ "model = sm.Logit(y, X).fit()\n",
+ "print(model.summary())\n",
+ "\n",
+ "# Predict for plotting\n",
+ "age_range = np.linspace(df['Age'].min(), df['Age'].max(), 100)\n",
+ "pred_df = pd.DataFrame({\n",
+ " 'const': 1,\n",
+ " 'Age': np.tile(age_range, 2),\n",
+ " 'Smoker': np.repeat([0, 1], len(age_range))\n",
+ "})\n",
+ "pred_df['pred'] = model.predict(pred_df)\n",
+ "\n",
+ "sns.lineplot(\n",
+ " data=pred_df, x='Age', y='pred', hue='Smoker'\n",
+ ")\n",
+ "plt.ylabel('Predicted Probability of Death')\n",
+ "plt.title('Logistic Regression: Death ~ Age + Smoker')\n",
+ "plt.show()\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 23,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " Smoker alive dead total mortality_rate ci_lower ci_upper\n",
+ "0 No 502 230 732 0.314208 0.280580 0.347835\n",
+ "1 Yes 443 139 582 0.238832 0.204192 0.273471\n"
+ ]
+ },
+ {
+ "data": {
+ "image/png": "\n",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "import pandas as pd\n",
+ "import numpy as np\n",
+ "import matplotlib.pyplot as plt\n",
+ "import seaborn as sns\n",
+ "from scipy.stats import norm\n",
+ "\n",
+ "# Example: load your dataset\n",
+ "# df = pd.read_csv(\"your_data.csv\")\n",
+ "\n",
+ "# Step 1: aggregate alive/dead counts by smoking\n",
+ "agg = df.groupby('Smoker').agg({\n",
+ " 'alive': 'sum',\n",
+ " 'dead': 'sum'\n",
+ "}).reset_index()\n",
+ "\n",
+ "# Add total and mortality rate\n",
+ "agg['total'] = agg['alive'] + agg['dead']\n",
+ "agg['mortality_rate'] = agg['dead'] / agg['total']\n",
+ "\n",
+ "# 95% CI for proportion\n",
+ "z = norm.ppf(0.975)\n",
+ "agg['ci_lower'] = agg['mortality_rate'] - z * np.sqrt(\n",
+ " (agg['mortality_rate'] * (1 - agg['mortality_rate'])) / agg['total']\n",
+ ")\n",
+ "agg['ci_upper'] = agg['mortality_rate'] + z * np.sqrt(\n",
+ " (agg['mortality_rate'] * (1 - agg['mortality_rate'])) / agg['total']\n",
+ ")\n",
+ "\n",
+ "print(agg)\n",
+ "\n",
+ "# Plot\n",
+ "sns.barplot(x='Smoker', y='mortality_rate', data=agg, yerr=[\n",
+ " agg['mortality_rate'] - agg['ci_lower'],\n",
+ " agg['ci_upper'] - agg['mortality_rate']\n",
+ "])\n",
+ "plt.ylabel('Mortality Rate')\n",
+ "plt.title('Mortality Rate by Smoking Status')\n",
+ "plt.show()\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 24,
+ "metadata": {},
+ "outputs": [
+ {
+ "ename": "KeyError",
+ "evalue": "'age'",
+ "output_type": "error",
+ "traceback": [
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+ "\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)",
+ "\u001b[0;32m/opt/conda/lib/python3.6/site-packages/pandas/core/indexes/base.py\u001b[0m in \u001b[0;36mget_loc\u001b[0;34m(self, key, method, tolerance)\u001b[0m\n\u001b[1;32m 2524\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 2525\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_engine\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget_loc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2526\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mKeyError\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+ "\u001b[0;32mpandas/_libs/index.pyx\u001b[0m in \u001b[0;36mpandas._libs.index.IndexEngine.get_loc\u001b[0;34m()\u001b[0m\n",
+ "\u001b[0;32mpandas/_libs/index.pyx\u001b[0m in \u001b[0;36mpandas._libs.index.IndexEngine.get_loc\u001b[0;34m()\u001b[0m\n",
+ "\u001b[0;32mpandas/_libs/hashtable_class_helper.pxi\u001b[0m in \u001b[0;36mpandas._libs.hashtable.PyObjectHashTable.get_item\u001b[0;34m()\u001b[0m\n",
+ "\u001b[0;32mpandas/_libs/hashtable_class_helper.pxi\u001b[0m in \u001b[0;36mpandas._libs.hashtable.PyObjectHashTable.get_item\u001b[0;34m()\u001b[0m\n",
+ "\u001b[0;31mKeyError\u001b[0m: 'age'",
+ "\nDuring handling of the above exception, another exception occurred:\n",
+ "\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)",
+ "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0mbins\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;36m18\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m34\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m54\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m64\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0minf\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0mlabels\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;34m'18-34'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'35-54'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'55-64'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'65+'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 4\u001b[0;31m \u001b[0mdf\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'age_group'\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcut\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdf\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'age'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mbins\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mbins\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlabels\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mlabels\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mright\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 5\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 6\u001b[0m \u001b[0;31m# Aggregate\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+ "\u001b[0;32m/opt/conda/lib/python3.6/site-packages/pandas/core/frame.py\u001b[0m in \u001b[0;36m__getitem__\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m 2137\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_getitem_multilevel\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2138\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 2139\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_getitem_column\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2140\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2141\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m_getitem_column\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mkey\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+ "\u001b[0;32m/opt/conda/lib/python3.6/site-packages/pandas/core/frame.py\u001b[0m in \u001b[0;36m_getitem_column\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m 2144\u001b[0m \u001b[0;31m# get column\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2145\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcolumns\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mis_unique\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 2146\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_get_item_cache\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2147\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2148\u001b[0m \u001b[0;31m# duplicate columns & possible reduce dimensionality\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+ "\u001b[0;32m/opt/conda/lib/python3.6/site-packages/pandas/core/generic.py\u001b[0m in \u001b[0;36m_get_item_cache\u001b[0;34m(self, item)\u001b[0m\n\u001b[1;32m 1840\u001b[0m \u001b[0mres\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcache\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mitem\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1841\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mres\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1842\u001b[0;31m \u001b[0mvalues\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_data\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mitem\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1843\u001b[0m \u001b[0mres\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_box_item_values\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mitem\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mvalues\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1844\u001b[0m \u001b[0mcache\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mitem\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mres\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+ "\u001b[0;32m/opt/conda/lib/python3.6/site-packages/pandas/core/internals.py\u001b[0m in \u001b[0;36mget\u001b[0;34m(self, item, fastpath)\u001b[0m\n\u001b[1;32m 3841\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3842\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0misna\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mitem\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 3843\u001b[0;31m \u001b[0mloc\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mitems\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget_loc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mitem\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 3844\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3845\u001b[0m \u001b[0mindexer\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0marange\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mitems\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0misna\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mitems\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+ "\u001b[0;32m/opt/conda/lib/python3.6/site-packages/pandas/core/indexes/base.py\u001b[0m in \u001b[0;36mget_loc\u001b[0;34m(self, key, method, tolerance)\u001b[0m\n\u001b[1;32m 2525\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_engine\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget_loc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2526\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mKeyError\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 2527\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_engine\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget_loc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_maybe_cast_indexer\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2528\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2529\u001b[0m \u001b[0mindexer\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget_indexer\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mkey\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmethod\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mmethod\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtolerance\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mtolerance\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+ "\u001b[0;32mpandas/_libs/index.pyx\u001b[0m in \u001b[0;36mpandas._libs.index.IndexEngine.get_loc\u001b[0;34m()\u001b[0m\n",
+ "\u001b[0;32mpandas/_libs/index.pyx\u001b[0m in \u001b[0;36mpandas._libs.index.IndexEngine.get_loc\u001b[0;34m()\u001b[0m\n",
+ "\u001b[0;32mpandas/_libs/hashtable_class_helper.pxi\u001b[0m in \u001b[0;36mpandas._libs.hashtable.PyObjectHashTable.get_item\u001b[0;34m()\u001b[0m\n",
+ "\u001b[0;32mpandas/_libs/hashtable_class_helper.pxi\u001b[0m in \u001b[0;36mpandas._libs.hashtable.PyObjectHashTable.get_item\u001b[0;34m()\u001b[0m\n",
+ "\u001b[0;31mKeyError\u001b[0m: 'age'"
+ ]
+ }
+ ],
+ "source": [
+ "# Define age groups\n",
+ "bins = [18, 34, 54, 64, np.inf]\n",
+ "labels = ['18-34', '35-54', '55-64', '65+']\n",
+ "df['age_group'] = pd.cut(df['age'], bins=bins, labels=labels, right=True)\n",
+ "\n",
+ "# Aggregate\n",
+ "agg_age = df.groupby(['age_group', 'Smoker']).agg({\n",
+ " 'alive': 'sum',\n",
+ " 'dead': 'sum'\n",
+ "}).reset_index()\n",
+ "\n",
+ "# Add total, mortality, CIs\n",
+ "agg_age['total'] = agg_age['alive'] + agg_age['dead']\n",
+ "agg_age['mortality_rate'] = agg_age['dead'] / agg_age['total']\n",
+ "agg_age['ci_lower'] = agg_age['mortality_rate'] - z * np.sqrt(\n",
+ " (agg_age['mortality_rate'] * (1 - agg_age['mortality_rate'])) / agg_age['total']\n",
+ ")\n",
+ "agg_age['ci_upper'] = agg_age['mortality_rate'] + z * np.sqrt(\n",
+ " (agg_age['mortality_rate'] * (1 - agg_age['mortality_rate'])) / agg_age['total']\n",
+ ")\n",
+ "\n",
+ "print(agg_age)\n",
+ "\n",
+ "# Plot\n",
+ "sns.barplot(x='age_group', y='mortality_rate', hue='Smoker', data=agg_age)\n",
+ "plt.ylabel('Mortality Rate')\n",
+ "plt.title('Mortality Rate by Smoking Status and Age Group')\n",
+ "plt.show()\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 25,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "Index(['Smoker', 'Status', 'Age', 'Death', 'AgeGroup', 'alive', 'dead'], dtype='object')"
+ ]
+ },
+ "execution_count": 25,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df.columns\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 26,
+ "metadata": {},
+ "outputs": [
+ {
+ "ename": "TypeError",
+ "evalue": "aggregate() missing 1 required positional argument: 'arg'",
+ "output_type": "error",
+ "traceback": [
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+ "\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)",
+ "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 6\u001b[0m agg = df.groupby('AgeGroup').agg(\n\u001b[1;32m 7\u001b[0m \u001b[0malive\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'alive'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'sum'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 8\u001b[0;31m \u001b[0mdead\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'dead'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'sum'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 9\u001b[0m ).reset_index()\n\u001b[1;32m 10\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
+ "\u001b[0;31mTypeError\u001b[0m: aggregate() missing 1 required positional argument: 'arg'"
+ ]
+ }
+ ],
+ "source": [
+ "import numpy as np\n",
+ "import pandas as pd\n",
+ "from statsmodels.stats.proportion import proportion_confint\n",
+ "\n",
+ "# Step 2: Aggregate by AgeGroup\n",
+ "agg = df.groupby('AgeGroup').agg(\n",
+ " alive=('alive', 'sum'),\n",
+ " dead=('dead', 'sum')\n",
+ ").reset_index()\n",
+ "\n",
+ "# Total in each group\n",
+ "agg['total'] = agg['alive'] + agg['dead']\n",
+ "\n",
+ "# Mortality rate\n",
+ "agg['mortality_rate'] = agg['dead'] / agg['total']\n",
+ "\n",
+ "# Wilson confidence intervals\n",
+ "cis_age = agg.apply(\n",
+ " lambda row: proportion_confint(\n",
+ " count=int(row['dead']),\n",
+ " nobs=int(row['total']),\n",
+ " alpha=0.05,\n",
+ " method='wilson'\n",
+ " ),\n",
+ " axis=1\n",
+ ")\n",
+ "\n",
+ "agg['ci_lower'] = [c[0] for c in cis_age]\n",
+ "agg['ci_upper'] = [c[1] for c in cis_age]\n",
+ "\n",
+ "agg\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 27,
+ "metadata": {},
+ "outputs": [
+ {
+ "ename": "TypeError",
+ "evalue": "aggregate() missing 1 required positional argument: 'arg'",
+ "output_type": "error",
+ "traceback": [
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+ "\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)",
+ "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 10\u001b[0m agg = df.groupby('AgeGroup').agg(\n\u001b[1;32m 11\u001b[0m \u001b[0malive\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'alive'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'sum'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 12\u001b[0;31m \u001b[0mdead\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'dead'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'sum'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 13\u001b[0m ).reset_index()\n\u001b[1;32m 14\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
+ "\u001b[0;31mTypeError\u001b[0m: aggregate() missing 1 required positional argument: 'arg'"
+ ]
+ }
+ ],
+ "source": [
+ "import numpy as np\n",
+ "import pandas as pd\n",
+ "from statsmodels.stats.proportion import proportion_confint\n",
+ "\n",
+ "# Step 1: Create alive/dead indicator columns\n",
+ "df['alive'] = (df['Status'] == 'Alive').astype(int)\n",
+ "df['dead'] = (df['Status'] == 'Dead').astype(int)\n",
+ "\n",
+ "# Step 2: Aggregate by AgeGroup\n",
+ "agg = df.groupby('AgeGroup').agg(\n",
+ " alive=('alive', 'sum'),\n",
+ " dead=('dead', 'sum')\n",
+ ").reset_index()\n",
+ "\n",
+ "# Total in each group\n",
+ "agg['total'] = agg['alive'] + agg['dead']\n",
+ "\n",
+ "# Mortality rate\n",
+ "agg['mortality_rate'] = agg['dead'] / agg['total']\n",
+ "\n",
+ "# Wilson confidence intervals\n",
+ "cis_age = agg.apply(\n",
+ " lambda row: proportion_confint(\n",
+ " count=int(row['dead']),\n",
+ " nobs=int(row['total']),\n",
+ " alpha=0.05,\n",
+ " method='wilson'\n",
+ " ),\n",
+ " axis=1\n",
+ ")\n",
+ "\n",
+ "agg['ci_lower'] = [c[0] for c in cis_age]\n",
+ "agg['ci_upper'] = [c[1] for c in cis_age]\n",
+ "\n",
+ "agg\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 28,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " AgeGroup | \n",
+ " alive | \n",
+ " dead | \n",
+ " total | \n",
+ " mortality_rate | \n",
+ " ci_lower | \n",
+ " ci_upper | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " 18-34 | \n",
+ " 389 | \n",
+ " 11 | \n",
+ " 400 | \n",
+ " 0.027500 | \n",
+ " 0.015424 | \n",
+ " 0.048565 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " 34-54 | \n",
+ " 376 | \n",
+ " 60 | \n",
+ " 436 | \n",
+ " 0.137615 | \n",
+ " 0.108430 | \n",
+ " 0.173129 | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " 55-64 | \n",
+ " 145 | \n",
+ " 91 | \n",
+ " 236 | \n",
+ " 0.385593 | \n",
+ " 0.325799 | \n",
+ " 0.449053 | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " 65+ | \n",
+ " 35 | \n",
+ " 207 | \n",
+ " 242 | \n",
+ " 0.855372 | \n",
+ " 0.805503 | \n",
+ " 0.894135 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " AgeGroup alive dead total mortality_rate ci_lower ci_upper\n",
+ "0 18-34 389 11 400 0.027500 0.015424 0.048565\n",
+ "1 34-54 376 60 436 0.137615 0.108430 0.173129\n",
+ "2 55-64 145 91 236 0.385593 0.325799 0.449053\n",
+ "3 65+ 35 207 242 0.855372 0.805503 0.894135"
+ ]
+ },
+ "execution_count": 28,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Step 1: Create alive/dead columns\n",
+ "df['alive'] = (df['Status'] == 'Alive').astype(int)\n",
+ "df['dead'] = (df['Status'] == 'Dead').astype(int)\n",
+ "\n",
+ "# Step 2: Aggregate using old syntax\n",
+ "agg = df.groupby('AgeGroup').agg({\n",
+ " 'alive': 'sum',\n",
+ " 'dead': 'sum'\n",
+ "}).reset_index()\n",
+ "\n",
+ "# Total in each group\n",
+ "agg['total'] = agg['alive'] + agg['dead']\n",
+ "\n",
+ "# Mortality rate\n",
+ "agg['mortality_rate'] = agg['dead'] / agg['total']\n",
+ "\n",
+ "# Confidence intervals\n",
+ "from statsmodels.stats.proportion import proportion_confint\n",
+ "ci_bounds = [proportion_confint(row['dead'], row['total'], alpha=0.05, method='wilson')\n",
+ " for _, row in agg.iterrows()]\n",
+ "agg['ci_lower'] = [c[0] for c in ci_bounds]\n",
+ "agg['ci_upper'] = [c[1] for c in ci_bounds]\n",
+ "\n",
+ "agg\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 29,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " AgeGroup | \n",
+ " Smoker | \n",
+ " alive | \n",
+ " dead | \n",
+ " total | \n",
+ " mortality_rate | \n",
+ " ci_lower | \n",
+ " ci_upper | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " 18-34 | \n",
+ " No | \n",
+ " 213 | \n",
+ " 6 | \n",
+ " 219 | \n",
+ " 0.027397 | \n",
+ " 0.012616 | \n",
+ " 0.058473 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " 18-34 | \n",
+ " Yes | \n",
+ " 176 | \n",
+ " 5 | \n",
+ " 181 | \n",
+ " 0.027624 | \n",
+ " 0.011856 | \n",
+ " 0.063027 | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " 34-54 | \n",
+ " No | \n",
+ " 180 | \n",
+ " 19 | \n",
+ " 199 | \n",
+ " 0.095477 | \n",
+ " 0.061977 | \n",
+ " 0.144299 | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " 34-54 | \n",
+ " Yes | \n",
+ " 196 | \n",
+ " 41 | \n",
+ " 237 | \n",
+ " 0.172996 | \n",
+ " 0.130158 | \n",
+ " 0.226265 | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " 55-64 | \n",
+ " No | \n",
+ " 81 | \n",
+ " 40 | \n",
+ " 121 | \n",
+ " 0.330579 | \n",
+ " 0.253108 | \n",
+ " 0.418476 | \n",
+ "
\n",
+ " \n",
+ " | 5 | \n",
+ " 55-64 | \n",
+ " Yes | \n",
+ " 64 | \n",
+ " 51 | \n",
+ " 115 | \n",
+ " 0.443478 | \n",
+ " 0.355968 | \n",
+ " 0.534642 | \n",
+ "
\n",
+ " \n",
+ " | 6 | \n",
+ " 65+ | \n",
+ " No | \n",
+ " 28 | \n",
+ " 165 | \n",
+ " 193 | \n",
+ " 0.854922 | \n",
+ " 0.798312 | \n",
+ " 0.897680 | \n",
+ "
\n",
+ " \n",
+ " | 7 | \n",
+ " 65+ | \n",
+ " Yes | \n",
+ " 7 | \n",
+ " 42 | \n",
+ " 49 | \n",
+ " 0.857143 | \n",
+ " 0.733323 | \n",
+ " 0.929036 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " AgeGroup Smoker alive dead total mortality_rate ci_lower ci_upper\n",
+ "0 18-34 No 213 6 219 0.027397 0.012616 0.058473\n",
+ "1 18-34 Yes 176 5 181 0.027624 0.011856 0.063027\n",
+ "2 34-54 No 180 19 199 0.095477 0.061977 0.144299\n",
+ "3 34-54 Yes 196 41 237 0.172996 0.130158 0.226265\n",
+ "4 55-64 No 81 40 121 0.330579 0.253108 0.418476\n",
+ "5 55-64 Yes 64 51 115 0.443478 0.355968 0.534642\n",
+ "6 65+ No 28 165 193 0.854922 0.798312 0.897680\n",
+ "7 65+ Yes 7 42 49 0.857143 0.733323 0.929036"
+ ]
+ },
+ "execution_count": 29,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Group by AgeGroup & Smoker\n",
+ "agg2 = df.groupby(['AgeGroup', 'Smoker']).agg({\n",
+ " 'alive': 'sum',\n",
+ " 'dead': 'sum'\n",
+ "}).reset_index()\n",
+ "\n",
+ "# Totals and mortality rates\n",
+ "agg2['total'] = agg2['alive'] + agg2['dead']\n",
+ "agg2['mortality_rate'] = agg2['dead'] / agg2['total']\n",
+ "\n",
+ "# Confidence intervals\n",
+ "ci_bounds2 = [proportion_confint(row['dead'], row['total'], alpha=0.05, method='wilson')\n",
+ " for _, row in agg2.iterrows()]\n",
+ "agg2['ci_lower'] = [c[0] for c in ci_bounds2]\n",
+ "agg2['ci_upper'] = [c[1] for c in ci_bounds2]\n",
+ "\n",
+ "agg2\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 30,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Optimization terminated successfully.\n",
+ " Current function value: 0.412727\n",
+ " Iterations 7\n",
+ "Optimization terminated successfully.\n",
+ " Current function value: 0.354560\n",
+ " Iterations 7\n"
+ ]
+ },
+ {
+ "ename": "AttributeError",
+ "evalue": "'LogitResults' object has no attribute 'get_prediction'",
+ "output_type": "error",
+ "traceback": [
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+ "\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)",
+ "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 15\u001b[0m \u001b[0mage_range\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlinspace\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdf\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"Age\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmin\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdf\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"Age\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmax\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m100\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 16\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 17\u001b[0;31m \u001b[0mpred_smokers\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mmodel_smokers\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget_prediction\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mDataFrame\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m{\u001b[0m\u001b[0;34m\"Age\"\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mage_range\u001b[0m\u001b[0;34m}\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 18\u001b[0m \u001b[0mpred_nonsmokers\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mmodel_nonsmokers\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget_prediction\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mDataFrame\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m{\u001b[0m\u001b[0;34m\"Age\"\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mage_range\u001b[0m\u001b[0;34m}\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 19\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
+ "\u001b[0;32m/opt/conda/lib/python3.6/site-packages/statsmodels/base/wrapper.py\u001b[0m in \u001b[0;36m__getattribute__\u001b[0;34m(self, attr)\u001b[0m\n\u001b[1;32m 33\u001b[0m \u001b[0;32mpass\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 34\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 35\u001b[0;31m \u001b[0mobj\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mgetattr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mresults\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mattr\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 36\u001b[0m \u001b[0mdata\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mresults\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmodel\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 37\u001b[0m \u001b[0mhow\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_wrap_attrs\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mattr\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+ "\u001b[0;31mAttributeError\u001b[0m: 'LogitResults' object has no attribute 'get_prediction'"
+ ]
+ }
+ ],
+ "source": [
+ "import numpy as np\n",
+ "import pandas as pd\n",
+ "import matplotlib.pyplot as plt\n",
+ "import seaborn as sns\n",
+ "import statsmodels.api as sm\n",
+ "import statsmodels.formula.api as smf\n",
+ "\n",
+ "# Logistic regression for smokers\n",
+ "model_smokers = smf.logit(\"Death ~ Age\", data=df[df[\"Smoker\"] == \"Yes\"]).fit()\n",
+ "\n",
+ "# Logistic regression for non-smokers\n",
+ "model_nonsmokers = smf.logit(\"Death ~ Age\", data=df[df[\"Smoker\"] == \"No\"]).fit()\n",
+ "\n",
+ "# Create a dataframe for predictions\n",
+ "age_range = np.linspace(df[\"Age\"].min(), df[\"Age\"].max(), 100)\n",
+ "\n",
+ "pred_smokers = model_smokers.get_prediction(pd.DataFrame({\"Age\": age_range}))\n",
+ "pred_nonsmokers = model_nonsmokers.get_prediction(pd.DataFrame({\"Age\": age_range}))\n",
+ "\n",
+ "# Put into DataFrames\n",
+ "smokers_df = pd.DataFrame({\n",
+ " \"Age\": age_range,\n",
+ " \"Predicted\": pred_smokers.predicted_mean,\n",
+ " \"Lower\": pred_smokers.conf_int()[:,0],\n",
+ " \"Upper\": pred_smokers.conf_int()[:,1],\n",
+ " \"Smoker\": \"Yes\"\n",
+ "})\n",
+ "\n",
+ "nonsmokers_df = pd.DataFrame({\n",
+ " \"Age\": age_range,\n",
+ " \"Predicted\": pred_nonsmokers.predicted_mean,\n",
+ " \"Lower\": pred_nonsmokers.conf_int()[:,0],\n",
+ " \"Upper\": pred_nonsmokers.conf_int()[:,1],\n",
+ " \"Smoker\": \"No\"\n",
+ "})\n",
+ "\n",
+ "plot_df = pd.concat([smokers_df, nonsmokers_df])\n",
+ "\n",
+ "# Plot\n",
+ "plt.figure(figsize=(8,6))\n",
+ "sns.lineplot(data=plot_df, x=\"Age\", y=\"Predicted\", hue=\"Smoker\")\n",
+ "plt.fill_between(smokers_df[\"Age\"], smokers_df[\"Lower\"], smokers_df[\"Upper\"], color=\"red\", alpha=0.2)\n",
+ "plt.fill_between(nonsmokers_df[\"Age\"], nonsmokers_df[\"Lower\"], nonsmokers_df[\"Upper\"], color=\"blue\", alpha=0.2)\n",
+ "plt.ylabel(\"Probability of Death\")\n",
+ "plt.title(\"Logistic Regression: Death Probability vs Age\")\n",
+ "plt.show()\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 31,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Optimization terminated successfully.\n",
+ " Current function value: 0.412727\n",
+ " Iterations 7\n",
+ "Optimization terminated successfully.\n",
+ " Current function value: 0.354560\n",
+ " Iterations 7\n"
+ ]
+ },
+ {
+ "ename": "AttributeError",
+ "evalue": "module 'seaborn' has no attribute 'lineplot'",
+ "output_type": "error",
+ "traceback": [
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+ "\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)",
+ "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 40\u001b[0m \u001b[0;31m# Plot\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 41\u001b[0m \u001b[0mplt\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfigure\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfigsize\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m8\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;36m6\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 42\u001b[0;31m \u001b[0msns\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlineplot\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mplot_df\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mx\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m\"Age\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m\"Predicted\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mhue\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m\"Smoker\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 43\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0msmoker_val\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcolor\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mzip\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"Yes\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"No\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;34m\"red\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"blue\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 44\u001b[0m \u001b[0msubset\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mplot_df\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mplot_df\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"Smoker\"\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0msmoker_val\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+ "\u001b[0;31mAttributeError\u001b[0m: module 'seaborn' has no attribute 'lineplot'"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "import numpy as np\n",
+ "import pandas as pd\n",
+ "import matplotlib.pyplot as plt\n",
+ "import seaborn as sns\n",
+ "import statsmodels.api as sm\n",
+ "import statsmodels.formula.api as smf\n",
+ "\n",
+ "# Fit models separately\n",
+ "model_smokers = smf.logit(\"Death ~ Age\", data=df[df[\"Smoker\"] == \"Yes\"]).fit()\n",
+ "model_nonsmokers = smf.logit(\"Death ~ Age\", data=df[df[\"Smoker\"] == \"No\"]).fit()\n",
+ "\n",
+ "# Prediction age range\n",
+ "age_range = np.linspace(df[\"Age\"].min(), df[\"Age\"].max(), 100)\n",
+ "\n",
+ "def pred_ci(model, age_vals):\n",
+ " X = sm.add_constant(pd.DataFrame({\"Age\": age_vals}))\n",
+ " pred_probs = model.predict(X)\n",
+ " # standard errors for log-odds\n",
+ " se = np.sqrt(np.diag(np.dot(np.dot(X, model.cov_params()), X.T)))\n",
+ " # 95% CI in log-odds, then convert to probability\n",
+ " logit = np.log(pred_probs / (1 - pred_probs))\n",
+ " ci_lower = 1 / (1 + np.exp(-(logit - 1.96 * se)))\n",
+ " ci_upper = 1 / (1 + np.exp(-(logit + 1.96 * se)))\n",
+ " return pred_probs, ci_lower, ci_upper\n",
+ "\n",
+ "# Predictions for smokers\n",
+ "pred_s, lower_s, upper_s = pred_ci(model_smokers, age_range)\n",
+ "# Predictions for non-smokers\n",
+ "pred_n, lower_n, upper_n = pred_ci(model_nonsmokers, age_range)\n",
+ "\n",
+ "# Create plot DataFrame\n",
+ "plot_df = pd.DataFrame({\n",
+ " \"Age\": np.concatenate([age_range, age_range]),\n",
+ " \"Predicted\": np.concatenate([pred_s, pred_n]),\n",
+ " \"Lower\": np.concatenate([lower_s, lower_n]),\n",
+ " \"Upper\": np.concatenate([upper_s, upper_n]),\n",
+ " \"Smoker\": [\"Yes\"]*len(age_range) + [\"No\"]*len(age_range)\n",
+ "})\n",
+ "\n",
+ "# Plot\n",
+ "plt.figure(figsize=(8,6))\n",
+ "sns.lineplot(data=plot_df, x=\"Age\", y=\"Predicted\", hue=\"Smoker\")\n",
+ "for smoker_val, color in zip([\"Yes\", \"No\"], [\"red\", \"blue\"]):\n",
+ " subset = plot_df[plot_df[\"Smoker\"] == smoker_val]\n",
+ " plt.fill_between(subset[\"Age\"], subset[\"Lower\"], subset[\"Upper\"], color=color, alpha=0.2)\n",
+ "\n",
+ "plt.ylabel(\"Probability of Death\")\n",
+ "plt.title(\"Logistic Regression: Death Probability vs Age\")\n",
+ "plt.show()\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 32,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "image/png": "\n",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "plt.figure(figsize=(8,6))\n",
+ "\n",
+ "# Smokers\n",
+ "plt.plot(age_range, pred_s, color=\"red\", label=\"Smokers\")\n",
+ "plt.fill_between(age_range, lower_s, upper_s, color=\"red\", alpha=0.2)\n",
+ "\n",
+ "# Non-smokers\n",
+ "plt.plot(age_range, pred_n, color=\"blue\", label=\"Non-smokers\")\n",
+ "plt.fill_between(age_range, lower_n, upper_n, color=\"blue\", alpha=0.2)\n",
+ "\n",
+ "plt.xlabel(\"Age\")\n",
+ "plt.ylabel(\"Probability of Death\")\n",
+ "plt.title(\"Logistic Regression: Death Probability vs Age\")\n",
+ "plt.legend()\n",
+ "plt.show()\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
@@ -16,10 +1912,9 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.6.3"
+ "version": "3.6.4"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
-