{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Sujet 6 : Autour du Paradoxe de Simpson" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import matplotlib.pyplot as plt\n", "import numpy as np" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "scrolled": true }, "outputs": [], "source": [ "data_url = \"https://gitlab.inria.fr/learninglab/mooc-rr/mooc-rr-ressources/-/raw/master/module3/Practical_session/Subject6_smoking.csv\"" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "data_file = \"Subject6_smoking.csv\"\n", "\n", "import os\n", "import urllib.request\n", "if not os.path.exists(data_file):\n", " urllib.request.urlretrieve(data_url, data_file)" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
SmokerStatusAge
0YesAlive21.0
1YesAlive19.3
2NoDead57.5
3NoAlive47.1
4YesAlive81.4
5NoAlive36.8
6NoAlive23.8
7YesDead57.5
8YesAlive24.8
9YesAlive49.5
10YesAlive30.0
11NoDead66.0
12YesAlive49.2
13NoAlive58.4
14NoDead60.6
15NoAlive25.1
16NoAlive43.5
17NoAlive27.1
18NoAlive58.3
19YesAlive65.7
20NoDead73.2
21YesAlive38.3
22NoAlive33.4
23YesDead62.3
24NoAlive18.0
25NoAlive56.2
26YesAlive59.2
27NoAlive25.8
28NoDead36.9
29NoAlive20.2
............
1284YesDead36.0
1285YesAlive48.3
1286NoAlive63.1
1287NoAlive60.8
1288YesDead39.3
1289NoAlive36.7
1290NoAlive63.8
1291NoDead71.3
1292NoAlive57.7
1293NoAlive63.2
1294NoAlive46.6
1295YesDead82.4
1296YesAlive38.3
1297YesAlive32.7
1298NoAlive39.7
1299YesDead60.0
1300NoDead71.0
1301NoAlive20.5
1302NoAlive44.4
1303YesAlive31.2
1304YesAlive47.8
1305YesAlive60.9
1306NoDead61.4
1307YesAlive43.0
1308NoAlive42.1
1309YesAlive35.9
1310NoAlive22.3
1311YesDead62.1
1312NoDead88.6
1313NoAlive39.1
\n", "

1314 rows × 3 columns

\n", "
" ], "text/plain": [ " Smoker Status Age\n", "0 Yes Alive 21.0\n", "1 Yes Alive 19.3\n", "2 No Dead 57.5\n", "3 No Alive 47.1\n", "4 Yes Alive 81.4\n", "5 No Alive 36.8\n", "6 No Alive 23.8\n", "7 Yes Dead 57.5\n", "8 Yes Alive 24.8\n", "9 Yes Alive 49.5\n", "10 Yes Alive 30.0\n", "11 No Dead 66.0\n", "12 Yes Alive 49.2\n", "13 No Alive 58.4\n", "14 No Dead 60.6\n", "15 No Alive 25.1\n", "16 No Alive 43.5\n", "17 No Alive 27.1\n", "18 No Alive 58.3\n", "19 Yes Alive 65.7\n", "20 No Dead 73.2\n", "21 Yes Alive 38.3\n", "22 No Alive 33.4\n", "23 Yes Dead 62.3\n", "24 No Alive 18.0\n", "25 No Alive 56.2\n", "26 Yes Alive 59.2\n", "27 No Alive 25.8\n", "28 No Dead 36.9\n", "29 No Alive 20.2\n", "... ... ... ...\n", "1284 Yes Dead 36.0\n", "1285 Yes Alive 48.3\n", "1286 No Alive 63.1\n", "1287 No Alive 60.8\n", "1288 Yes Dead 39.3\n", "1289 No Alive 36.7\n", "1290 No Alive 63.8\n", "1291 No Dead 71.3\n", "1292 No Alive 57.7\n", "1293 No Alive 63.2\n", "1294 No Alive 46.6\n", "1295 Yes Dead 82.4\n", "1296 Yes Alive 38.3\n", "1297 Yes Alive 32.7\n", "1298 No Alive 39.7\n", "1299 Yes Dead 60.0\n", "1300 No Dead 71.0\n", "1301 No Alive 20.5\n", "1302 No Alive 44.4\n", "1303 Yes Alive 31.2\n", "1304 Yes Alive 47.8\n", "1305 Yes Alive 60.9\n", "1306 No Dead 61.4\n", "1307 Yes Alive 43.0\n", "1308 No Alive 42.1\n", "1309 Yes Alive 35.9\n", "1310 No Alive 22.3\n", "1311 Yes Dead 62.1\n", "1312 No Dead 88.6\n", "1313 No Alive 39.1\n", "\n", "[1314 rows x 3 columns]" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data = pd.read_csv(data_url)\n", "data" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
SmokerStatusAge
\n", "
" ], "text/plain": [ "Empty DataFrame\n", "Columns: [Smoker, Status, Age]\n", "Index: []" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data[data.isnull().any(axis=1)]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 1. Représentez dans un tableau le nombre total de femmes vivantes et décédées sur la période en fonction de leur habitude de tabagisme. Calculez dans chaque groupe (fumeuses / non fumeuses) le taux de mortalité (le rapport entre le nombre de femmes décédées dans un groupe et le nombre total de femmes dans ce groupe). Vous pourrez proposer une représentation graphique de ces données et calculer des intervalles de confiance si vous le souhaitez. En quoi ce résultat est-il surprenant ?" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
SmokerNoYes
Status
Alive502443
Dead230139
\n", "
" ], "text/plain": [ "Smoker No Yes\n", "Status \n", "Alive 502 443\n", "Dead 230 139" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data_crosstab=pd.crosstab(data['Status'],data['Smoker'])\n", "data_crosstab" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Mortality of smokers is (%): 23.883161512027492\n" ] } ], "source": [ "mortality_smoker=data_crosstab['Yes']/data_crosstab['Yes'].sum()*100\n", "print(\"Mortality of smokers is (%):\",mortality_smoker['Dead'])" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Mortality of no smokers is (%) : 31.420765027322407\n" ] } ], "source": [ "mortality_nosmoker=data_crosstab['No']/data_crosstab['No'].sum()*100\n", "print(\"Mortality of no smokers is (%) :\",mortality_nosmoker['Dead'])" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "mortality=(mortality_smoker,mortality_nosmoker)" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "plt.pie(data_crosstab['Yes'], labels=['Alive','Dead'],autopct='%1.2f%%')\n", "plt.title('Smokers')\n", "plt.show()" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Confidence interval for mortality of smokers is (%) : (20.419137788669218, 27.347185235385766)\n" ] } ], "source": [ "z_score = 1.96\n", "n_yes=443+139\n", "se_smoker = np.sqrt(mortality_smoker['Dead'] * (100 - mortality_smoker['Dead'])/n_yes)\n", "lsmoker = mortality_smoker['Dead'] - z_score* se_smoker #lower limit of the CI\n", "usmoker = mortality_smoker['Dead'] + z_score* se_smoker #upper limit of the CI\n", "CIsmoker = (lsmoker,usmoker)\n", "print (\"Confidence interval for mortality of smokers is (%) :\",CIsmoker)" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "image/png": "iVBORw0KGgoAAAANSUhEUgAAAWQAAAD7CAYAAABdXO4CAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMi4zLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvIxREBQAAIABJREFUeJzt3XecVNX9//HXZ2YLsCBVUFQcFStGiVExdo19NGqKLUb9KYkm0cSv7TsxbbGOUaOxYonRaFSMXcf2U6yIXbEX0DGgCEgZWJbt5/vHHXRdF9g6596Z9/PxmMfuzu7cfS8PeHP23HvPMeccIiLiX8x3ABERCaiQRURCQoUsIhISKmQRkZBQIYuIhIQKWUQkJFTIUnLM7Bgze853DpG2VMjSZWaWNbM5ZlbV6rnxZvZUF4+3o5k9b2Y5M1tgZlPMbJseCywScipk6a4y4HfdPYiZrQY8CFwODAHWAiYA9d09dm8xszLfGaS4qJCluy4ETjOzQe190sy2N7OX86Pel81s+xUcZyMA59xtzrlm59wy59xjzrk388c5Jj9ivsTMFpnZx/ljH2NmM81srpkd3er7DjSzf5nZPDP71Mz+aGbt/n03swvN7DkzG5j/+Fgze8/MFprZo2a2bquvdWb2GzP7CPjIApfkv3/OzN40s8279CcpJU+FLN31CvAUcFrbT5jZECADXAYMBf4GZMxsaDvH+RBoNrObzGxfMxvczteMA97MH+tW4HZgG2A0cCRwhZn1z3/t5cBAYH1gF+Ao4P+1yRczs+uALYC9nHM5MzsIOBP4EbA68CxwW5scB+WzbAbsBexM8B/KIOBQYH472UVWSYUsPeHPwElmtnqb55PAR865m51zTc6524D3gQPaHsA5txjYEXDAdcA8M7vfzEa0+rJPnHP/dM41A5OAdYCznHP1zrnHgAZgtJnFCYrx9865Jc65LHAx8PNWxyonKNohwAHOudr888cD5zvn3nPONQHnAWNbj5Lzn1/gnFsGNAIDgE0Ay79udif+
7ES+okKWbnPOvU0w/5tq86mRwKdtnvuUYH64veO855w7xjm3NrB5/vWXtvqSOa3eX5Z/Tdvn+gPDgIo237vt9x0NHAhMcM41tHp+XeDv+WmRRcACwNq8dmarzJOBK4ArgTlmdm1+Plyk01TI0lP+AvyCbxbX5wQF19oo4LNVHcw59z5wI0Exd9aXBCPX1t+77fd9j2AK42Ez27jV8zOB451zg1o9+jrnnm8dr03Wy5xz3wPGEExdnN6FzCIqZOkZzrnpBNMIv2319EPARmZ2hJmVmdmhBPOuD7Z9vZltYmanmtna+Y/XAQ4HXuhClmbgDuBcMxuQn244BbilzdfdRjBf/LiZbZB/eiLwezMbk88x0Mx+uqLvZWbbmNk4MysHlgJ1QHNnM4uACll61lnAV9ckO+fmA/sDpxKc6DoD2N8592U7r11CcKLsRTNbSlDEb+df2xUnERTkx8BzBCcBb2j7Rc65m/K5J5tZwjl3D3ABcLuZLc5n2Hcl32c1gjnvhQTTIvOBi7qYWUqcaYF6EZFw0AhZRCQkVMgiIiGhQhYRCQkVsohISKiQRURCQoUsIhISKmQRkZBQIYuIhIQKWUQkJFTIIiIhoUIWEQkJFbKISEiokEVEQkKFLCISEipkEZGQUCGLiISECllEJCRUyCIiIaFCFhEJCRWyiEhIqJBFREJChSwiEhIqZBGRkFAhi4iEhApZRCQkVMgiIiFR5juASFuJVMaAgcAQoAroB/TNv60EKvKPcoK/ww5oyj+aW72//ONGYAmQW/7IppM1hfuJRDrGnHO+M0gJSKQyZcBawKj8Yx1gdYLSHdrmMRiI93KkZmAxX5f0ImAO8Bnwef6x/P3PsulkbS/nEVEhS89JpDJDgDHApsD6fF2+o4CR9H7J9qYcMAuYDnzY+pFNJ7/wGUyKhwq5g8zsYOBuYFPn3PtmlgAedM5tbmZbA0c5537rM2OhJFKZEcBm7TyG+8zl0WLgI4KCfhd4HXgtm07O9ppKIkeF3EFmdgewJvCEc666dSF7DdbLEqnMIGAbYBywbf79NbyGio4vyJfz8kc2ncx6TSShpkLuADPrD3wA7Abc75zbpM0IeVfgNOCHwMfAWOfcovxrpwM7AC3ARIJf3wFOds5NKeTPsSqJVCYObAVsR1C+2wIbAuYzV5FZALwAPA08A7ySTSeb/EaSsNBVFh1zEPCIc+5DM1tgZlsR/MP6Budci5ndBxwM/NPMxgFZ59wcM7sVuMQ595yZjQIeJZhr9SZ/NcPmwA+A3YFdgNV8ZioBQ4D98g+ApYlUZipBOT8DvJhNJ+t8hRO/VMgdczhwaf792/MfX7mCr50E/Bn4J3BY/mOAPYDNzL4abK5mZgOcc0t6JfEKJFKZ9fm6gHendOd9w6KK4O/GHvmP6xOpzPPAQ8CD2XTyfW/JpOA0ZbEKZjaU4Oz6XILrXeP5t7sAD7SesnDO7W9B434EfB94CdjaOTffzL4E1nHOLStk/vwoeBzBKP8gYONCfn/pthlAJv94KptONnjOI71IhbwKZnY8sJVz7vhWzz0N/BG4um0h5z9/IcGJr6HOuf3yz90KvO6cuzD/8Vjn3Bu9kTmRylQQjH4PIpjXXrM3vo8UXA3wOPAAcG82nfzWtJlEmwp5FczsKSDtnHuk1XO/BfYlGPG2V8hbAy8Dxzjnbso/N4xgmmNTgqmiZ5xzJ/RUzkQqUw4kgUMJ5ic1F1zcGgnKeRJBOec855EeoEKOuEQqsy1wFMF89VDPccSPeoIpjVuAjKY1okuFHEGJVGZt4Of5h9crNSR0FhKMmq/JppO9MiUmvUeFHBH5KYmDgV8SXA+tlfpkVV4ErgYm6VK6aFAhh1wilRlJUMK/RCfnpGsWAjcCE7Pp5Iees8hKqJBDKpHKbAP8D/ATgmUmRXrCZODvwAPZdFL/+ENGhRwi+WuGDwZOBbb3HEeK2zvABcBtunU7PFTIIZBIZWIEI+E/EdzKLFIoWeAi4IZs
OlnQm5bk21TIHuWL+FCCm0w28xxHSttcgqmMK3VNsz8qZA/yq6odRlDEm3iOI9LaQuB84HJdmVF4KuQCS6QyBwB/RUUs4TaTYJGsf2XTyRbfYUqFCrlAEqnMd4BLCFZaE4mKt4BUNp18yHeQUqBC7mWJVGY4cDZwHNHeU05K21PAadl08lXfQYqZCrmXJFKZSuBk4Ey00I8UhxbgKuAP2XRyse8wxUiF3AsSqczOwPUE2x+JFJvPgZOz6eR/fAcpNirkHpRIZQYQnLA7Hu1DJ8XvIeA32ri156iQe0gildmPYBPTdXxnESmgWuAs4KJsOtnsO0zUqZC7KZHKDCW4oP5nvrOIeDQV+Fk2nfzEd5Ao0xKO3ZBIZfYG3kVlLPJ9YFoilTnKd5Ao0wi5CxKpTBlwDnAGmisWaWsScEI2nVzkO0jUqJA7KZHKrAPcBuzgO4tIiM0Efp5NJ5/2HSRKNGXRCfnbnt9AZSyyKusAkxOpTMp3kCjRCLkD8osBXUCwTrGIdM7twLFa3nPVVMirkL+2+HZgP99ZRCLsNeCgbDo503eQMFMhr0R+vvhBYAvfWUSKwFzgx9l08jnfQcJKc8grkEhlvkewa6/KWKRnDCeYVx7vO0hYqZDbkUhlDgKeQbs8i/S0cuA6nexrnwq5jUQqcwJwF9DPdxaRInZ+IpVJ+w4RNppDbiWRypwEXOY7h0gJmUiwQJF2JUGF/JVEKnMywY4ekddSV8P8hy+j4cv/AjBsv99hZZXMf/RKXHMDFoszZM9fUTly42+9dtbVxxKr6AuxGBaLs+bRlwLQMOfjdl9fN+tdFjx2FRYvZ9gPT6d88Eha6mqYd98FDD/kLMx0I6Os0m3AUdl0ssl3EN9UyEAilTmdYNnMovBl5m9Urj2GAVvujWtuxDXWM+++C1ht6wPpu8HWLJvxMrkX72KNI779G+Osq49lzaMvId5v4DeenzPpT+2+fu495zJ4l2Noys1l2SevMmT38SyYfD39Ro+jz6jvFOpHluh7gOAKjEbfQXwq+TnkRCrze4qojFvqa6mb+Q79t9gLAIuXE+vTP/hcQ+1XXxPvP7Tzx27n9RYrwzU14JrqsVgZjQtn07xkvspYOusA4JZEKlPSnVTSI+REKvNbgqUzi0YwtXA55UNH0TD3EyrXGM3gH/yS5sXzmHPHnwEHroU1jryIsoHDv/X6WROPI54v8P5j92XA2H0AaPxyZruvXz6VYeUVDEueysIn/8GgnY6kfMhaBfyppYjcAIzPppMlWUwlW8iJVOZg4E6K7LeE+tkf8cXNp7LGkRdSOXJjFjx+DbGKfrQ01FK5zuZUbbwDS997lpppjzDisHO/9fqmJfMpGzCU5qWLmDPpjwzZ8wT6rLM5Cx6/ZpWvr5v5NrUfTmXAd/dj0bO3YLE4g3c/jnjV4EL9+FIc/ppNJ//XdwgfiqqMOiqRymwH/Jsi/PnLBgwjPmDYVyfs+m28Aw1zZlDz1hP022j74LlNdqR+9ocreH0wFRGvGkS/jb5P/efB163q9c45cs9PYuAOh7Noyq0M2vEIqsbsxuJXH+iVn1OK2hmJVOZ3vkP4UHSFtCqJVGYD4H6gr+8svSHefzBlqw2jcf4sAOo+nUb5sFHE+w+hfuZbXz83eOS3XtvSUEdLfe1X79d98joVq6+bP+7KX7/07Sfou8HWxPv0xzXWg8XALHhfpPMuSaQyP/YdotBKasoiv93SVIp8N+iGOR8z/5HLcM1NlA1ag6H7nUzjl5+y8PFrcS3NWFkFQ/b6NZVrjKZpyXzmP3IZI346gcZFXzDv7nOCg7S0ULXZLgzc/lAA6ma90+7rAVoa65h75wRGHHI2Fi+jbubbLHjsaixexrAfnqH5ZOmqGmBcNp1813eQQimZQs4vofkEsIvvLCLSYR8A22bTycW+gxRCKU1ZnIvKWCRqNgZuTKQyJXGHUUkUciKV2Z9g/zsRiZ6DgZK46qLopywSqcxawDSg83dCiEhY
NAN7Z9PJJ3wH6U1FXcj5u34eB3bznUVEum0msHkxzycX+5TFaaiMRYrFOsCFvkP0pqIdISdSmfWAdyjS641FSpQD9simk5N9B+kNxTxCvhKVsUixMeD6RCpT5TtIbyjKQs7f4bOv7xwi0ivWA873HaI3FN2URSKVGQC8B+j2MJHi5YDvZdPJ130H6UnFOEKegMpYpNgZUHR78hXVCDl/Iu9DoMx3FhEpiD2K6drkYhsh/xGVsUgpSRfTbdVFU8j50fFRvnOISEFtDfzUd4ieUjSFjEbHIqXq3PxqjpFXFIWs0bFISRtNsElq5BVFIQMpNDoWKWUn+Q7QEyJ/lUUilekPzAb6+84iIl5tnk0n3/EdojuKYYR8KCpjEYETfQformIo5PG+A4hIKPw8kcoM9B2iOyJdyIlUZjNgO985RCQUqoDDfYfojkgXMnCc7wAiEio/8h2gOyJ7Ui9/d85nwJq+s4hIaDQCw7Pp5CLfQboiyiPkLVAZi8g3lQP7+w7RVVEu5H18BxCRUDrId4CuinIh7+07gIiE0j6JVCaSuwVFspDz27fs4DuHiIRSFTDOd4iuiGQhE+wkXeE7hIiE1ra+A3RFVAv5+74DiEiobeM7QFdEtZC/4zuAiISaRsgFtLnvACISaqMSqcwI3yE6K3KFnF/dLeE7h4iEXuSmLSJXyMAYgh1nRURWZn3fATorioWs6QoR6Yi1fAforCgWsm6XFpGOUCEXwBDfAUQkEkb6DtBZKmQRKVYaIRfAYN8BRCQSIje9GcVC1ghZRDoicgsMRbGQI71nlogUTJnvAJ0VucBAk+8AEi7lNDWcXnb7CzGiufuN9J5fn/lC7Krzzm7xnaOjoljIy3wHkHD5e/kVz+8Xf2lX3zkkjB52cLbvEB0WxSkLFbJ8ZaxN/2Df2EtaG1va00J1LlK/NkWxkOt8B5BwiNPc9O+Kc1vMKPedRUKp1neAzopiIWuELABcWH7NlCqr39R3DgmtL3wH6KwoFvIS3wHEv80sO+Pg2HPb+c4hoaZCLoBZvgOIX0ZLy6SKs+vMqPSdRUJNhVwAn/oOIH6dU3bDswNs2RjfOST0ZvsO0FlRLORPfAcQfzawzz49Ij45ktvzSMF97jtAZ0WxkD/wHUB8ce7OigmLzKJ3S6x48bbvAJ0VuULOppOfoRN7JemPZbc8O9hqtvSdQyLjdd8BOityhZz3lu8AUljr2hezjos/vJXvHBIZ86jOfeY7RGdFtZCf9R1ACuuuiuq5ZvT3nUMiI3KjY4huIT/jO4AUzqlldzw7zBZrdCydoUIuoOeAZt8hpPetxbzZJ8bv1byxdFYkB22RLORsOrkYeMN3Dul9d1ZOmGXGar5zSKQsA570HaIrIlnIeZH8H1A67lfx+6asaQu28Z1DIucpqnORXPMmyoX8kO8A0ntGsGDu6WWTNvOdQyLpYd8BuirKhfwkEbxXXTrmPxUTPo6ZNrSVLonsYC2yhZxNJ5uBO3znkJ53TPyRqaNi87SSm3TFW1TnZvgO0VWRLeS8f/sOID1rCLn5fy67ebTvHBJZ//AdoDsiXcjZdPIlYLrvHNJz7qg4+4OYudV955BIqgdu9h2iOyJdyHm3+g4gPeOw+OQXR8c+3953Domse6nOLfAdojuKoZCvBRp8h5DuWY2a3Lll/0j4ziGRdr3vAN0V+ULOr/6mueSIu63i3Lfi5kb4ziGR9RHwhO8Q3RX5Qs77KxCp7b7lawfGprwyJvbpjr5zSKSdT3Uu8h1QFIWcTSffBx7wnUM6r4plSy4uv3qk7xwSaVkifjJvuaIo5LwLfAeQzrul4vw3yqxFhSzdcT7VuSbfIXpC0RRyNp18HnjKdw7puL1iL78+1qZrqkK6YyZwo+8QPaVoCjnvf4AW3yFk1fpRt/TK8suGmmG+s0iknU91rmiusiqqQs6mk28Q8Tt1SsUNFRe+Wm7No3znkEibRnDZa9EoqkLO+wOwyHcIWbGdY9PeHGfvaapC
usMBv6E6V1QbVRRdIWfTyXnAWb5zSPsqaai7rvziAWbF93dPCupmqnNTfIfoacX6j+IK4H3fIeTbJpZf8kKlNa3nO4dEWg44w3eI3lCUhZxNJxuBY9G+e6Eyzt59d9fYtJ1855DI+wPVuTm+Q/SGoixkgGw6ORU4z3cOCZTT1HBTxQXlZsR9Z5FIexS4yneI3lK0hZx3FvCy7xACl5VfPrWPNW7oO4dE2pfAMcVwi/SKFHUhZ9PJJuBIoNZ3llI21qZ/sE/sZS2rKd01nupcUW/bVtSFDJBNJz8ETvOdo1TFaW66teJcZ0a57ywSaddSnbvPd4jeVvSFDJBNJ68G7vSdoxRdVD5xSj+r38R3jp5S1+TY9roatpxYw5iravjLk3UA/OedRsZcVUNswmJe+Xzl55KbWxzfvaaG/W/9+he30x+rY5Mratji6hoOnlTLorrgt/Ip/21ii6tr2Oa6GqYvCG5CXVTn2PuWpThXtL+5t/UOwV24Ra8kCjnvaOB13yFKyWaWnXFQbEpRbVZaGYfJR1cx7YT+vHF8FY/MaOKFWU1sPjzG3Yf0Zed1V33O8u8vNrDpsG/+09tzgzLe/nUVb/6qPxsNiXH+s/UAXDy1gbsO6ct5u/fh6peDO4TPfrqeM3esxKwk7jpfCBxIda4kph1LppCz6WQtcCBQ1HNQYRGjpXlSxdl1ZlT6ztKTzIz+FUERNrZAYzMYsOnqcTYetuoynrW4hcxHTYzfquIbz++1QRllseC4260dZ9aSYDRcHodlTVDb6CiPw4wFLXy2pIVdEmU9+4OFUzNweJR3ke6skilkgGw6ORM4mGAzROlF55Td8NwAWzbGd47e0NziGDuxhuEXLmHP9csYt3bHy/HkR+r46x59iK1kcHvDG43sOzo45u93rOSXD9Rx6YsNnLhtBX+YXMfZuxXV/3ErcwrVuUd9hyikkipkgGw6+QIw3neOYrahzcoeHp+8re8cvSUeM944oT+zThnAS5838/bcjt1/9OCHjQyvMr43csUj6XOfqacsBj/7TnAOdOwacV4YX8WTR1fx8cIWRg6I4YBD76zlyLuXMaemaBc3vJLq3GW+QxRayRUyQDadvAWY4DtHcXLuPxUTcmb09Z2ktw3qY+y6bhmPTO/Y2uhT/tvM/R80kbh0CYfduYzJnzRx5N3Lvvr8TW808OBHTfz7R32/NT/snOOcZ+r5086VTHi6ngm7VnLkFuVc9mLRrDzZ2t3A73yH8KEkCxkgm05WAxf5zlFs/lR2yzODbOmWvnP0lnlLW766AmJZo+PxT5rYZFjH/hmdv0cfZp0ygOzJA7j9J33Zfb0ybvlR8P/WI9ObuGBKA/cf1pd+5d+ez7hpWiPJDcsY3NeobYSYBY/axp772ULiYYJ545Jc9qAkzgysSDadPD2RyvQBTvSdpRisa1/MOjb+8Na+c/Sm2TWOo++tpbkFWhwcMqac/Tcq5573Gjnp4Trm1TqSt9Yydo0Yjx5ZxedLWhh/fx0P/azfSo974kPLqG+GPW8OLibYbu04E/cPyrq20XHTtEYeOzI4xinbVfDjO5ZREYfbflxUv4g8CfyomBac7ywroWsZ25VIZQy4BviF7yxR90rlCa8Ns8Vb+c4hkTQV2IvqXI3vID6V7JTFctl00gEnAP/ynSXKTiub9KzKWLroVWC/Ui9j0Aj5K4lUJg7cABzlO0vUrMW82c9V/q7KjNV8Z5HIeYrgxo/FvoOEQcmPkJfLppPNwDHA3zxHiZy7KqtnqYylC+4B9lEZf00j5HYkUpn/BdK+c0TBb+L3Tjm9/I4dfOeQyLkeOKFUr6ZYERXyCiRSmZ8RTGFUrOprS9UIFsydWnliRcwY5DuLRMr5VOfO9B0ijDRlsQLZdPLfwN5oB+sV+k/FhI9VxtIJDcDxKuMVUyGvRDadfAoYR7D8n7RybPzhqaNi84pqJTfpVV8Au1Gdu9Z3kDDTlEUHJFKZKuA64HDfWcJgCLn5r1T+
2sXMDfOdRSLhRYIbPj73HSTsVMidkEhlTgIuhtLe/eLxilOfHx2brS2ZpCP+CfyK6pxWWOwAFXInJVKZ7YE7gLV8Z/HhsPjkF9Pl14/znUNCrxb4H01RdI4KuQsSqcxw4BZgT99ZCmkgNYteqzy+Pm5uhO8sEmqvAUdQnfvAd5Co0Um9Lsimk3MJrsD4DbDUc5yCua3inHdUxrISLcBfge1Uxl2jEXI3JVKZDYAbgR09R+lVB8amvPL3iiuLeiU36ZZZwFFU5570HSTKVMg9IJHKxICTgXOBPp7j9Lgqli2ZVvmLJWXWMtJ3FgmlfwKnUp1b6DtI1KmQe1AildkEuAkoqu2L7qn40zPfjc3Y2XcOCZ0PCG70eNp3kGKhOeQelE0n3we+DxwPzPccp0fsHXvp9bE2YyffOSRU6gm2QNtSZdyzNELuJYlUZghwDkE5R/I/vn7ULZ1W+Yv55dY8yncWCY2nCUbFOmnXC1TIvSyRynwXuByI3Ipot1ec/cx2sfc0VSEAHwK/pzp3t+8gxUyFXCCJVObnBEt6RuLE2M6xaW/eVH7B5mbRHN1Lj5lDMD1xHdW5jm2vLV2mQi6gRCrTF/g1kAJCuw5EJQ11b1WOn11hTev5ziLe1BDsyn6xtlYqHBWyB4lUpj/BZXKnQviWr7yxPP30rvE3d/GdQ7xoIlhIawLVuTm+w5QaFbJHiVRmMHAa8Fugv+c4AIyzd9+9veKcjc2I+84iBXcXcCbVuQ99BylVKuQQSKQyqxOMlo/H44i5nKaGtyqP+28faxztK4MUXBMwiWBq4nXfYUqdCjlE8lMZxxJMZxR8/nZi+d+e3if+iqYqSkMOuBa4jOrcLN9hJKBCDqFEKhMHDiYYNRdkV47v2kcf3F3xlw3MKCvE9xNvssClwD90si58VMghl19/+RTgQOidsozT3PRW5fjp/ax+k944vnjngMeBq4H7tdNzeKmQIyKRyowAjgaOAzbqyWNfUn7lUwfHp+zak8eUUFhAsPDPNVTnPvIdRlZNhRxBiVRmJ4Ji/inQrzvH2syyMzIVZ65jRkWPhJMweIFgNHwH1bk632Gk41TIEZZIZVYj2Hj1GLow1xyjpXla5S/eH2DLxvR0Nim4V4G7gbu6s86EmTUDbxHsG9lEsHrhpc65lu4GNLNqoMY5d1F3j1WsVMhFIpHKjAJ+TDBq3g6wVb3mvLLrnz6ibLKuqoimFuB5ghK+m+rcpz1xUDOrcc71z78/HLgVmOKc+0sPHLsaFfJKqZCLUCKVWZugnH9CsKjRt8p5Q5uVfazijBFm9C10PumyJuBJghK+l+rcFz39DVoXcv7j9YGXCW71jxGsx7IrUAlc6Zy7xsz6A/cBgwlG1n90zt2Xf/0fgKOAmcA84FUV8oqpkItcIpUZCRwE7AfsBvQD596o/OVbg2zpFn7TSQfUAf+f4C66+3t7V462hZx/biGwCcGVPsOdc+eYWSUwheA3splAP+fcYjMbRjCHvSGwFcH2ZuMIrhB6DZioQl4xXXNa5LLp5OfAVcBViVSmD7DLofEntx1kSw/xHE1WbD7wBEEJPxSC64WX/4a1F7CFmf0k//FAguKdBZxnZjsTTKWsBYwAdgLucc7VApjZ/QVNHUEq5BKSTSfrgEch+ShcfDbVA0cAuwM/yD8SPvOVKAe8TzDafB54PkyLv+enLJqBuQTFfJJz7tE2X3MMsDrwPedco5ll+XpvSf0K3gkq5FIWrOZ1W/4B1QPXJyjmnYCxwKbo70hPqwVe4usCnhrWzUHNbHVgInCFc86Z2aPAr8xscr54NwI+Ixgpz80/txuwbv4QzwA3mlma4O/RAcA1hf9JokNzyLJi1QMrgTEE5bxlq7cDfcaKmJkExbu8gKeFeaH3di57uxn4m3OuxcxiBNuSHUAwWp5HcH6iHHgg//YNghPJ+zrnsq1O6n1KMLXxruaQV0yFLJ1XPXA9vlnSY/l6VFSqZgPTWz0+AF7Uwj3SGSpk
6RnVAwfxdUFvTLBV1Vr5tyMg8usrO4IR3vR2HjOozi31mE2KhApZel/1wDhBKY9s9VirnfeHeko2INhpAAAA8ElEQVRYS7Duw/z8YwHBSayP+Wbp6jZk6VUqZAmPYM56TYLyriSYk6xo5217z7X9XCPBvnBL27xd/v5ilpevilZCQoUsIhIS2uJdRCQkVMgiIiGhQhYRCQkVsohISKiQRURCQoUsIhISKmQRkZBQIYuIhIQKWUQkJFTIIiIhoUIWEQkJFbKISEiokEVEQkKFLCISEipkEZGQUCGLiISECllEJCRUyCIiIaFCFhEJCRWyiEhIqJBFREJChSwiEhIqZBGRkFAhi4iEhApZRCQkVMgiIiGhQhYRCQkVsohISKiQRURCQoUsIhISKmQRkZD4PxpnLyBEAtLrAAAAAElFTkSuQmCC\n", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "plt.pie(data_crosstab['No'],labels=['Alive','Dead'],autopct='%1.2f%%')\n", "plt.title('No Smokers')\n", "plt.show()" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Confidence interval for mortality of no smokers is (%) : (28.057932601982447, 27.24599393736745)\n" ] } ], "source": [ "n_no=502+230\n", "se_nosmoker = np.sqrt(mortality_nosmoker['Dead'] * (100 - mortality_nosmoker['Dead'])/n_no)\n", "lnosmoker = mortality_nosmoker['Dead'] - z_score* se_nosmoker #lower limit of the CI\n", "unosmoker = mortality_smoker['Dead'] + z_score* se_nosmoker #upper limit of the CI\n", "CInosmoker = (lnosmoker,unosmoker)\n", "print (\"Confidence interval for mortality of no smokers is (%) :\",CInosmoker)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "__The mortality is higher in no smokers group than in smokers group, but this rate of no smoker mortality is not included in its confidence interval.__ " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 2. Reprenez la question 1 (effectifs et taux de mortalité) en rajoutant une nouvelle catégorie liée à la classe d'âge. On considérera par exemple les classes suivantes : 18-34 ans, 34-54 ans, 55-64 ans, plus de 65 ans. En quoi ce résultat est-il surprenant ? Arrivez-vous à expliquer ce paradoxe ? De même, vous pourrez proposer une représentation graphique de ces données pour étayer vos explications." ] }, { "cell_type": "code", "execution_count": 14, "metadata": { "scrolled": true }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
SmokerStatusAgeAgeGroup
0YesAlive21.018-34
1YesAlive19.318-34
2NoDead57.555-64
3NoAlive47.135-54
4YesAlive81.4>65
5NoAlive36.835-54
6NoAlive23.818-34
7YesDead57.555-64
8YesAlive24.818-34
9YesAlive49.535-54
10YesAlive30.018-34
11NoDead66.0>65
12YesAlive49.235-54
13NoAlive58.455-64
14NoDead60.655-64
15NoAlive25.118-34
16NoAlive43.535-54
17NoAlive27.118-34
18NoAlive58.355-64
19YesAlive65.7>65
20NoDead73.2>65
21YesAlive38.335-54
22NoAlive33.418-34
23YesDead62.355-64
24NoAlive18.018-34
25NoAlive56.255-64
26YesAlive59.255-64
27NoAlive25.818-34
28NoDead36.935-54
29NoAlive20.218-34
...............
1284YesDead36.035-54
1285YesAlive48.335-54
1286NoAlive63.155-64
1287NoAlive60.855-64
1288YesDead39.335-54
1289NoAlive36.735-54
1290NoAlive63.855-64
1291NoDead71.3>65
1292NoAlive57.755-64
1293NoAlive63.255-64
1294NoAlive46.635-54
1295YesDead82.4>65
1296YesAlive38.335-54
1297YesAlive32.718-34
1298NoAlive39.735-54
1299YesDead60.055-64
1300NoDead71.0>65
1301NoAlive20.518-34
1302NoAlive44.435-54
1303YesAlive31.218-34
1304YesAlive47.835-54
1305YesAlive60.955-64
1306NoDead61.455-64
1307YesAlive43.035-54
1308NoAlive42.135-54
1309YesAlive35.935-54
1310NoAlive22.318-34
1311YesDead62.155-64
1312NoDead88.6>65
1313NoAlive39.135-54
\n", "

1314 rows × 4 columns

\n", "
" ], "text/plain": [ " Smoker Status Age AgeGroup\n", "0 Yes Alive 21.0 18-34\n", "1 Yes Alive 19.3 18-34\n", "2 No Dead 57.5 55-64\n", "3 No Alive 47.1 35-54\n", "4 Yes Alive 81.4 >65\n", "5 No Alive 36.8 35-54\n", "6 No Alive 23.8 18-34\n", "7 Yes Dead 57.5 55-64\n", "8 Yes Alive 24.8 18-34\n", "9 Yes Alive 49.5 35-54\n", "10 Yes Alive 30.0 18-34\n", "11 No Dead 66.0 >65\n", "12 Yes Alive 49.2 35-54\n", "13 No Alive 58.4 55-64\n", "14 No Dead 60.6 55-64\n", "15 No Alive 25.1 18-34\n", "16 No Alive 43.5 35-54\n", "17 No Alive 27.1 18-34\n", "18 No Alive 58.3 55-64\n", "19 Yes Alive 65.7 >65\n", "20 No Dead 73.2 >65\n", "21 Yes Alive 38.3 35-54\n", "22 No Alive 33.4 18-34\n", "23 Yes Dead 62.3 55-64\n", "24 No Alive 18.0 18-34\n", "25 No Alive 56.2 55-64\n", "26 Yes Alive 59.2 55-64\n", "27 No Alive 25.8 18-34\n", "28 No Dead 36.9 35-54\n", "29 No Alive 20.2 18-34\n", "... ... ... ... ...\n", "1284 Yes Dead 36.0 35-54\n", "1285 Yes Alive 48.3 35-54\n", "1286 No Alive 63.1 55-64\n", "1287 No Alive 60.8 55-64\n", "1288 Yes Dead 39.3 35-54\n", "1289 No Alive 36.7 35-54\n", "1290 No Alive 63.8 55-64\n", "1291 No Dead 71.3 >65\n", "1292 No Alive 57.7 55-64\n", "1293 No Alive 63.2 55-64\n", "1294 No Alive 46.6 35-54\n", "1295 Yes Dead 82.4 >65\n", "1296 Yes Alive 38.3 35-54\n", "1297 Yes Alive 32.7 18-34\n", "1298 No Alive 39.7 35-54\n", "1299 Yes Dead 60.0 55-64\n", "1300 No Dead 71.0 >65\n", "1301 No Alive 20.5 18-34\n", "1302 No Alive 44.4 35-54\n", "1303 Yes Alive 31.2 18-34\n", "1304 Yes Alive 47.8 35-54\n", "1305 Yes Alive 60.9 55-64\n", "1306 No Dead 61.4 55-64\n", "1307 Yes Alive 43.0 35-54\n", "1308 No Alive 42.1 35-54\n", "1309 Yes Alive 35.9 35-54\n", "1310 No Alive 22.3 18-34\n", "1311 Yes Dead 62.1 55-64\n", "1312 No Dead 88.6 >65\n", "1313 No Alive 39.1 35-54\n", "\n", "[1314 rows x 4 columns]" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "bins= [17,34,54,64,100]\n", "labels = 
['18-34','35-54','55-64','>65']\n", "data['AgeGroup']=pd.cut(data['Age'], bins=bins, labels=labels,)\n", "data" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
SmokerNoYes
StatusAliveDeadAliveDead
AgeGroup
18-3421361765
35-541801919641
55-6481406451
>6528165742
\n", "
" ], "text/plain": [ "Smoker No Yes \n", "Status Alive Dead Alive Dead\n", "AgeGroup \n", "18-34 213 6 176 5\n", "35-54 180 19 196 41\n", "55-64 81 40 64 51\n", ">65 28 165 7 42" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data_cross_age=pd.crosstab(data['AgeGroup'], [data['Smoker'],data['Status']])\n", "data_cross_age" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Mortality of smokers by age range is (%): AgeGroup\n", "18-34 3.597122\n", "35-54 29.496403\n", "55-64 36.690647\n", ">65 30.215827\n", "Name: Dead, dtype: float64\n" ] } ], "source": [ "mortality_smoker_age=(data_cross_age['Yes']/data_cross_age['Yes'].sum()*100)\n", "print(\"Mortality of smokers by age range is (%):\", mortality_smoker_age['Dead'])" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Confidence intervals for mortality of smokers by age are (%) : (AgeGroup\n", "18-34 0.501334\n", "35-54 21.915187\n", "55-64 28.678290\n", ">65 22.581964\n", "Name: Dead, dtype: float64, AgeGroup\n", "18-34 6.692911\n", "35-54 37.077618\n", "55-64 44.703005\n", ">65 37.849691\n", "Name: Dead, dtype: float64)\n" ] } ], "source": [ "n_yes_age=5+41+51+42\n", "se_smoker_age = np.sqrt(mortality_smoker_age['Dead'] * (100 - mortality_smoker_age['Dead'])/n_yes_age)\n", "lsmoker = mortality_smoker_age['Dead'] - z_score* se_smoker_age #lower limit of the CI\n", "usmoker = mortality_smoker_age['Dead'] + z_score* se_smoker_age #upper limit of the CI\n", "CIsmoker_age =(lsmoker,usmoker)\n", "print (\"Confidence intervals for mortality of smokers by age are (%) :\", CIsmoker_age)" ] }, { "cell_type": "code", "execution_count": 18, "metadata": { "hideOutput": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Mortality of no smokers by age range is (%): AgeGroup\n", "18-34 
2.608696\n", "35-54 8.260870\n", "55-64 17.391304\n", ">65 71.739130\n", "Name: Dead, dtype: float64\n" ] } ], "source": [ "mortality_nosmoker_age=(data_cross_age['No']/data_cross_age['No'].sum()*100)\n", "print(\"Mortality of no smokers by age range is (%):\", mortality_nosmoker_age['Dead'])" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Confidence intervals for mortality of no smokers by age are (%) : (AgeGroup\n", "18-34 0.548711\n", "35-54 4.703063\n", "55-64 12.492714\n", ">65 65.919935\n", "Name: Dead, dtype: float64, AgeGroup\n", "18-34 4.668680\n", "35-54 11.818676\n", "55-64 22.289895\n", ">65 77.558326\n", "Name: Dead, dtype: float64)\n" ] } ], "source": [ "n_no_age=6+19+40+165\n", "se_nosmoker_age = np.sqrt(mortality_nosmoker_age['Dead'] * (100 - mortality_nosmoker_age['Dead'])/n_no_age)\n", "lnosmoker = mortality_nosmoker_age['Dead'] - z_score* se_nosmoker_age #lower limit of the CI\n", "unosmoker = mortality_nosmoker_age['Dead'] + z_score* se_nosmoker_age #upper limit of the CI\n", "CInosmoker_age =(lnosmoker,unosmoker)\n", "print (\"Confidence intervals for mortality of no smokers by age are (%) :\", CInosmoker_age)" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "data_cross_age.plot(kind='bar')\n", "plt.ylabel('Number of persons')\n", "plt.xlabel('Age group (years)')\n", "plt.show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The mortality rate of no smokers at age >65 years is higher than smokers (70% against 30%, IC=95). This paradoxe coule be explained by different size of samples between smokers and no smokers, and other factors like health conditions are not respected in analysis." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### 3. Afin d'éviter un biais induit par des regroupements en tranches d'âges arbitraires et non régulières, il est envisageable d'essayer de réaliser une régression logistique. Si on introduit une variable Death valant 1 ou 0 pour indiquer si l'individu est décédé durant la période de 20 ans, on peut étudier le modèle Death ~ Age pour étudier la probabilité de décès en fonction de l'âge selon que l'on considère le groupe des fumeuses ou des non fumeuses. Ces régressions vous permettent-elles de conclure sur la nocivité du tabagisme ? Vous pourrez proposer une représentation graphique de ces régressions (en n'omettant pas les régions de confiance)." ] }, { "cell_type": "code", "execution_count": 21, "metadata": { "scrolled": true }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
SmokerStatusAgeAgeGroupDeath
0YesAlive21.018-340.0
1YesAlive19.318-340.0
2NoDead57.555-641.0
3NoAlive47.135-540.0
4YesAlive81.4>650.0
5NoAlive36.835-540.0
6NoAlive23.818-340.0
7YesDead57.555-641.0
8YesAlive24.818-340.0
9YesAlive49.535-540.0
10YesAlive30.018-340.0
11NoDead66.0>651.0
12YesAlive49.235-540.0
13NoAlive58.455-640.0
14NoDead60.655-641.0
15NoAlive25.118-340.0
16NoAlive43.535-540.0
17NoAlive27.118-340.0
18NoAlive58.355-640.0
19YesAlive65.7>650.0
20NoDead73.2>651.0
21YesAlive38.335-540.0
22NoAlive33.418-340.0
23YesDead62.355-641.0
24NoAlive18.018-340.0
25NoAlive56.255-640.0
26YesAlive59.255-640.0
27NoAlive25.818-340.0
28NoDead36.935-541.0
29NoAlive20.218-340.0
..................
1284YesDead36.035-541.0
1285YesAlive48.335-540.0
1286NoAlive63.155-640.0
1287NoAlive60.855-640.0
1288YesDead39.335-541.0
1289NoAlive36.735-540.0
1290NoAlive63.855-640.0
1291NoDead71.3>651.0
1292NoAlive57.755-640.0
1293NoAlive63.255-640.0
1294NoAlive46.635-540.0
1295YesDead82.4>651.0
1296YesAlive38.335-540.0
1297YesAlive32.718-340.0
1298NoAlive39.735-540.0
1299YesDead60.055-641.0
1300NoDead71.0>651.0
1301NoAlive20.518-340.0
1302NoAlive44.435-540.0
1303YesAlive31.218-340.0
1304YesAlive47.835-540.0
1305YesAlive60.955-640.0
1306NoDead61.455-641.0
1307YesAlive43.035-540.0
1308NoAlive42.135-540.0
1309YesAlive35.935-540.0
1310NoAlive22.318-340.0
1311YesDead62.155-641.0
1312NoDead88.6>651.0
1313NoAlive39.135-540.0
\n", "

1314 rows × 5 columns

\n", "
" ], "text/plain": [ " Smoker Status Age AgeGroup Death\n", "0 Yes Alive 21.0 18-34 0.0\n", "1 Yes Alive 19.3 18-34 0.0\n", "2 No Dead 57.5 55-64 1.0\n", "3 No Alive 47.1 35-54 0.0\n", "4 Yes Alive 81.4 >65 0.0\n", "5 No Alive 36.8 35-54 0.0\n", "6 No Alive 23.8 18-34 0.0\n", "7 Yes Dead 57.5 55-64 1.0\n", "8 Yes Alive 24.8 18-34 0.0\n", "9 Yes Alive 49.5 35-54 0.0\n", "10 Yes Alive 30.0 18-34 0.0\n", "11 No Dead 66.0 >65 1.0\n", "12 Yes Alive 49.2 35-54 0.0\n", "13 No Alive 58.4 55-64 0.0\n", "14 No Dead 60.6 55-64 1.0\n", "15 No Alive 25.1 18-34 0.0\n", "16 No Alive 43.5 35-54 0.0\n", "17 No Alive 27.1 18-34 0.0\n", "18 No Alive 58.3 55-64 0.0\n", "19 Yes Alive 65.7 >65 0.0\n", "20 No Dead 73.2 >65 1.0\n", "21 Yes Alive 38.3 35-54 0.0\n", "22 No Alive 33.4 18-34 0.0\n", "23 Yes Dead 62.3 55-64 1.0\n", "24 No Alive 18.0 18-34 0.0\n", "25 No Alive 56.2 55-64 0.0\n", "26 Yes Alive 59.2 55-64 0.0\n", "27 No Alive 25.8 18-34 0.0\n", "28 No Dead 36.9 35-54 1.0\n", "29 No Alive 20.2 18-34 0.0\n", "... ... ... ... ... 
...\n", "1284 Yes Dead 36.0 35-54 1.0\n", "1285 Yes Alive 48.3 35-54 0.0\n", "1286 No Alive 63.1 55-64 0.0\n", "1287 No Alive 60.8 55-64 0.0\n", "1288 Yes Dead 39.3 35-54 1.0\n", "1289 No Alive 36.7 35-54 0.0\n", "1290 No Alive 63.8 55-64 0.0\n", "1291 No Dead 71.3 >65 1.0\n", "1292 No Alive 57.7 55-64 0.0\n", "1293 No Alive 63.2 55-64 0.0\n", "1294 No Alive 46.6 35-54 0.0\n", "1295 Yes Dead 82.4 >65 1.0\n", "1296 Yes Alive 38.3 35-54 0.0\n", "1297 Yes Alive 32.7 18-34 0.0\n", "1298 No Alive 39.7 35-54 0.0\n", "1299 Yes Dead 60.0 55-64 1.0\n", "1300 No Dead 71.0 >65 1.0\n", "1301 No Alive 20.5 18-34 0.0\n", "1302 No Alive 44.4 35-54 0.0\n", "1303 Yes Alive 31.2 18-34 0.0\n", "1304 Yes Alive 47.8 35-54 0.0\n", "1305 Yes Alive 60.9 55-64 0.0\n", "1306 No Dead 61.4 55-64 1.0\n", "1307 Yes Alive 43.0 35-54 0.0\n", "1308 No Alive 42.1 35-54 0.0\n", "1309 Yes Alive 35.9 35-54 0.0\n", "1310 No Alive 22.3 18-34 0.0\n", "1311 Yes Dead 62.1 55-64 1.0\n", "1312 No Dead 88.6 >65 1.0\n", "1313 No Alive 39.1 35-54 0.0\n", "\n", "[1314 rows x 5 columns]" ] }, "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data.loc[data.Status == 'Alive', 'Death']=0\n", "data.loc[data.Status == 'Dead', 'Death']=1\n", "data" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [], "source": [ "import seaborn as sns" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/opt/conda/lib/python3.6/site-packages/scipy/stats/stats.py:1713: FutureWarning: Using a non-tuple sequence for multidimensional indexing is deprecated; use `arr[tuple(seq)]` instead of `arr[seq]`. In the future this will be interpreted as an array index, `arr[np.array(seq)]`, which will result either in an error or a different result.\n", " return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval\n" ] }, { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "sns.lmplot(x=\"Age\", y=\"Death\",hue=\"Smoker\", data=data, logistic=True, y_jitter=.1)\n", "plt.show()" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [], "source": [ "import statsmodels.api as sm\n", "from statsmodels.formula.api import logit" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Optimization terminated successfully.\n", " Current function value: 0.364541\n", " Iterations 8\n" ] }, { "data": { "text/html": [ "\n", "\n", "\n", " \n", "\n", "\n", " \n", "\n", "\n", " \n", "\n", "\n", " \n", "\n", "\n", " \n", "\n", "\n", " \n", "\n", "\n", " \n", "\n", "
Logit Regression Results
Dep. Variable: Death No. Observations: 1314
Model: Logit Df Residuals: 1309
Method: MLE Df Model: 4
Date: Tue, 27 Jul 2021 Pseudo R-squ.: 0.3860
Time: 20:58:25 Log-Likelihood: -479.01
converged: True LL-Null: -780.16
LLR p-value: 4.919e-129
\n", "\n", "\n", " \n", "\n", "\n", " \n", "\n", "\n", " \n", "\n", "\n", " \n", "\n", "\n", " \n", "\n", "\n", " \n", "\n", "
coef std err z P>|z| [0.025 0.975]
Intercept -3.7947 0.321 -11.809 0.000 -4.425 -3.165
Smoker[T.Yes] 0.4528 0.176 2.577 0.010 0.108 0.797
AgeGroup[T.35-54] 1.6950 0.336 5.039 0.000 1.036 2.354
AgeGroup[T.55-64] 3.1024 0.334 9.279 0.000 2.447 3.758
AgeGroup[T.>65] 5.4917 0.364 15.104 0.000 4.779 6.204
" ], "text/plain": [ "\n", "\"\"\"\n", " Logit Regression Results \n", "==============================================================================\n", "Dep. Variable: Death No. Observations: 1314\n", "Model: Logit Df Residuals: 1309\n", "Method: MLE Df Model: 4\n", "Date: Tue, 27 Jul 2021 Pseudo R-squ.: 0.3860\n", "Time: 20:58:25 Log-Likelihood: -479.01\n", "converged: True LL-Null: -780.16\n", " LLR p-value: 4.919e-129\n", "=====================================================================================\n", " coef std err z P>|z| [0.025 0.975]\n", "-------------------------------------------------------------------------------------\n", "Intercept -3.7947 0.321 -11.809 0.000 -4.425 -3.165\n", "Smoker[T.Yes] 0.4528 0.176 2.577 0.010 0.108 0.797\n", "AgeGroup[T.35-54] 1.6950 0.336 5.039 0.000 1.036 2.354\n", "AgeGroup[T.55-64] 3.1024 0.334 9.279 0.000 2.447 3.758\n", "AgeGroup[T.>65] 5.4917 0.364 15.104 0.000 4.779 6.204\n", "=====================================================================================\n", "\"\"\"" ] }, "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [ "formula=('Death~Smoker+AgeGroup')\n", "model=logit(formula=formula, data=data).fit()\n", "model.summary()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The results show that, once age group is accounted for, smoking has a significant positive effect on women's mortality (coefficient 0.45, p-value < 0.05), but age is a much stronger explanatory factor: the coefficients of all age groups are larger and their p-values are far below 0.05. The pseudo R-squared is 0.3860, i.e. the model improves the log-likelihood of the intercept-only model by only about 39%, so smoking and age group explain only part of the mortality." ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.4" } }, "nbformat": 4, "nbformat_minor": 2 }