From 303f7d4b28c8d054cdd0057237dd35e38127fc40 Mon Sep 17 00:00:00 2001
From: 096323827ddd6110f7ba1fd93f19e12c
<096323827ddd6110f7ba1fd93f19e12c@app-learninglab.inria.fr>
Date: Tue, 6 Apr 2021 20:34:11 +0000
Subject: [PATCH] Final 2
---
module3/exo3/saev_final.ipynb | 1032 +++++++++++++++++++++++++++++++++
1 file changed, 1032 insertions(+)
create mode 100644 module3/exo3/saev_final.ipynb
diff --git a/module3/exo3/saev_final.ipynb b/module3/exo3/saev_final.ipynb
new file mode 100644
index 0000000..5481a9e
--- /dev/null
+++ b/module3/exo3/saev_final.ipynb
@@ -0,0 +1,1032 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Sujet 6 : Autour du Paradoxe de Simpson"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "En 1972-1974, à Whickham, une ville du nord-est de l'Angleterre, située à environ 6,5 kilomètres au sud-ouest de Newcastle upon Tyne, un sondage d'un sixième des électeurs a été effectué afin d'éclairer des travaux sur les maladies thyroïdiennes et cardiaques (Tunbridge et al. 1977). Une suite de cette étude a été menée vingt ans plus tard (Vanderpump et al. 1995). Certains des résultats avaient trait au tabagisme et cherchaient à savoir si les individus étaient toujours en vie lors de la seconde étude. Par simplicité, nous nous restreindrons aux femmes et parmi celles-ci aux 1314 qui ont été catégorisées comme \"fumant actuellement\" ou \"n'ayant jamais fumé\". Il y avait relativement peu de femmes dans le sondage initial ayant fumé et ayant arrêté depuis (162) et très peu pour lesquelles l'information n'était pas disponible (18). La survie à 20 ans a été déterminée pour l'ensemble des femmes du premier sondage."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Préparation"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Chargement des données"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
Smoker
\n",
+ "
Status
\n",
+ "
Age
\n",
+ "
\n",
+ " \n",
+ " \n",
+ "
\n",
+ "
0
\n",
+ "
Yes
\n",
+ "
Alive
\n",
+ "
21.0
\n",
+ "
\n",
+ "
\n",
+ "
1
\n",
+ "
Yes
\n",
+ "
Alive
\n",
+ "
19.3
\n",
+ "
\n",
+ "
\n",
+ "
2
\n",
+ "
No
\n",
+ "
Dead
\n",
+ "
57.5
\n",
+ "
\n",
+ "
\n",
+ "
3
\n",
+ "
No
\n",
+ "
Alive
\n",
+ "
47.1
\n",
+ "
\n",
+ "
\n",
+ "
4
\n",
+ "
Yes
\n",
+ "
Alive
\n",
+ "
81.4
\n",
+ "
\n",
+ "
\n",
+ "
5
\n",
+ "
No
\n",
+ "
Alive
\n",
+ "
36.8
\n",
+ "
\n",
+ "
\n",
+ "
6
\n",
+ "
No
\n",
+ "
Alive
\n",
+ "
23.8
\n",
+ "
\n",
+ "
\n",
+ "
7
\n",
+ "
Yes
\n",
+ "
Dead
\n",
+ "
57.5
\n",
+ "
\n",
+ "
\n",
+ "
8
\n",
+ "
Yes
\n",
+ "
Alive
\n",
+ "
24.8
\n",
+ "
\n",
+ "
\n",
+ "
9
\n",
+ "
Yes
\n",
+ "
Alive
\n",
+ "
49.5
\n",
+ "
\n",
+ "
\n",
+ "
10
\n",
+ "
Yes
\n",
+ "
Alive
\n",
+ "
30.0
\n",
+ "
\n",
+ "
\n",
+ "
11
\n",
+ "
No
\n",
+ "
Dead
\n",
+ "
66.0
\n",
+ "
\n",
+ "
\n",
+ "
12
\n",
+ "
Yes
\n",
+ "
Alive
\n",
+ "
49.2
\n",
+ "
\n",
+ "
\n",
+ "
13
\n",
+ "
No
\n",
+ "
Alive
\n",
+ "
58.4
\n",
+ "
\n",
+ "
\n",
+ "
14
\n",
+ "
No
\n",
+ "
Dead
\n",
+ "
60.6
\n",
+ "
\n",
+ "
\n",
+ "
15
\n",
+ "
No
\n",
+ "
Alive
\n",
+ "
25.1
\n",
+ "
\n",
+ "
\n",
+ "
16
\n",
+ "
No
\n",
+ "
Alive
\n",
+ "
43.5
\n",
+ "
\n",
+ "
\n",
+ "
17
\n",
+ "
No
\n",
+ "
Alive
\n",
+ "
27.1
\n",
+ "
\n",
+ "
\n",
+ "
18
\n",
+ "
No
\n",
+ "
Alive
\n",
+ "
58.3
\n",
+ "
\n",
+ "
\n",
+ "
19
\n",
+ "
Yes
\n",
+ "
Alive
\n",
+ "
65.7
\n",
+ "
\n",
+ "
\n",
+ "
20
\n",
+ "
No
\n",
+ "
Dead
\n",
+ "
73.2
\n",
+ "
\n",
+ "
\n",
+ "
21
\n",
+ "
Yes
\n",
+ "
Alive
\n",
+ "
38.3
\n",
+ "
\n",
+ "
\n",
+ "
22
\n",
+ "
No
\n",
+ "
Alive
\n",
+ "
33.4
\n",
+ "
\n",
+ "
\n",
+ "
23
\n",
+ "
Yes
\n",
+ "
Dead
\n",
+ "
62.3
\n",
+ "
\n",
+ "
\n",
+ "
24
\n",
+ "
No
\n",
+ "
Alive
\n",
+ "
18.0
\n",
+ "
\n",
+ "
\n",
+ "
25
\n",
+ "
No
\n",
+ "
Alive
\n",
+ "
56.2
\n",
+ "
\n",
+ "
\n",
+ "
26
\n",
+ "
Yes
\n",
+ "
Alive
\n",
+ "
59.2
\n",
+ "
\n",
+ "
\n",
+ "
27
\n",
+ "
No
\n",
+ "
Alive
\n",
+ "
25.8
\n",
+ "
\n",
+ "
\n",
+ "
28
\n",
+ "
No
\n",
+ "
Dead
\n",
+ "
36.9
\n",
+ "
\n",
+ "
\n",
+ "
29
\n",
+ "
No
\n",
+ "
Alive
\n",
+ "
20.2
\n",
+ "
\n",
+ "
\n",
+ "
...
\n",
+ "
...
\n",
+ "
...
\n",
+ "
...
\n",
+ "
\n",
+ "
\n",
+ "
1284
\n",
+ "
Yes
\n",
+ "
Dead
\n",
+ "
36.0
\n",
+ "
\n",
+ "
\n",
+ "
1285
\n",
+ "
Yes
\n",
+ "
Alive
\n",
+ "
48.3
\n",
+ "
\n",
+ "
\n",
+ "
1286
\n",
+ "
No
\n",
+ "
Alive
\n",
+ "
63.1
\n",
+ "
\n",
+ "
\n",
+ "
1287
\n",
+ "
No
\n",
+ "
Alive
\n",
+ "
60.8
\n",
+ "
\n",
+ "
\n",
+ "
1288
\n",
+ "
Yes
\n",
+ "
Dead
\n",
+ "
39.3
\n",
+ "
\n",
+ "
\n",
+ "
1289
\n",
+ "
No
\n",
+ "
Alive
\n",
+ "
36.7
\n",
+ "
\n",
+ "
\n",
+ "
1290
\n",
+ "
No
\n",
+ "
Alive
\n",
+ "
63.8
\n",
+ "
\n",
+ "
\n",
+ "
1291
\n",
+ "
No
\n",
+ "
Dead
\n",
+ "
71.3
\n",
+ "
\n",
+ "
\n",
+ "
1292
\n",
+ "
No
\n",
+ "
Alive
\n",
+ "
57.7
\n",
+ "
\n",
+ "
\n",
+ "
1293
\n",
+ "
No
\n",
+ "
Alive
\n",
+ "
63.2
\n",
+ "
\n",
+ "
\n",
+ "
1294
\n",
+ "
No
\n",
+ "
Alive
\n",
+ "
46.6
\n",
+ "
\n",
+ "
\n",
+ "
1295
\n",
+ "
Yes
\n",
+ "
Dead
\n",
+ "
82.4
\n",
+ "
\n",
+ "
\n",
+ "
1296
\n",
+ "
Yes
\n",
+ "
Alive
\n",
+ "
38.3
\n",
+ "
\n",
+ "
\n",
+ "
1297
\n",
+ "
Yes
\n",
+ "
Alive
\n",
+ "
32.7
\n",
+ "
\n",
+ "
\n",
+ "
1298
\n",
+ "
No
\n",
+ "
Alive
\n",
+ "
39.7
\n",
+ "
\n",
+ "
\n",
+ "
1299
\n",
+ "
Yes
\n",
+ "
Dead
\n",
+ "
60.0
\n",
+ "
\n",
+ "
\n",
+ "
1300
\n",
+ "
No
\n",
+ "
Dead
\n",
+ "
71.0
\n",
+ "
\n",
+ "
\n",
+ "
1301
\n",
+ "
No
\n",
+ "
Alive
\n",
+ "
20.5
\n",
+ "
\n",
+ "
\n",
+ "
1302
\n",
+ "
No
\n",
+ "
Alive
\n",
+ "
44.4
\n",
+ "
\n",
+ "
\n",
+ "
1303
\n",
+ "
Yes
\n",
+ "
Alive
\n",
+ "
31.2
\n",
+ "
\n",
+ "
\n",
+ "
1304
\n",
+ "
Yes
\n",
+ "
Alive
\n",
+ "
47.8
\n",
+ "
\n",
+ "
\n",
+ "
1305
\n",
+ "
Yes
\n",
+ "
Alive
\n",
+ "
60.9
\n",
+ "
\n",
+ "
\n",
+ "
1306
\n",
+ "
No
\n",
+ "
Dead
\n",
+ "
61.4
\n",
+ "
\n",
+ "
\n",
+ "
1307
\n",
+ "
Yes
\n",
+ "
Alive
\n",
+ "
43.0
\n",
+ "
\n",
+ "
\n",
+ "
1308
\n",
+ "
No
\n",
+ "
Alive
\n",
+ "
42.1
\n",
+ "
\n",
+ "
\n",
+ "
1309
\n",
+ "
Yes
\n",
+ "
Alive
\n",
+ "
35.9
\n",
+ "
\n",
+ "
\n",
+ "
1310
\n",
+ "
No
\n",
+ "
Alive
\n",
+ "
22.3
\n",
+ "
\n",
+ "
\n",
+ "
1311
\n",
+ "
Yes
\n",
+ "
Dead
\n",
+ "
62.1
\n",
+ "
\n",
+ "
\n",
+ "
1312
\n",
+ "
No
\n",
+ "
Dead
\n",
+ "
88.6
\n",
+ "
\n",
+ "
\n",
+ "
1313
\n",
+ "
No
\n",
+ "
Alive
\n",
+ "
39.1
\n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
1314 rows × 3 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Smoker Status Age\n",
+ "0 Yes Alive 21.0\n",
+ "1 Yes Alive 19.3\n",
+ "2 No Dead 57.5\n",
+ "3 No Alive 47.1\n",
+ "4 Yes Alive 81.4\n",
+ "5 No Alive 36.8\n",
+ "6 No Alive 23.8\n",
+ "7 Yes Dead 57.5\n",
+ "8 Yes Alive 24.8\n",
+ "9 Yes Alive 49.5\n",
+ "10 Yes Alive 30.0\n",
+ "11 No Dead 66.0\n",
+ "12 Yes Alive 49.2\n",
+ "13 No Alive 58.4\n",
+ "14 No Dead 60.6\n",
+ "15 No Alive 25.1\n",
+ "16 No Alive 43.5\n",
+ "17 No Alive 27.1\n",
+ "18 No Alive 58.3\n",
+ "19 Yes Alive 65.7\n",
+ "20 No Dead 73.2\n",
+ "21 Yes Alive 38.3\n",
+ "22 No Alive 33.4\n",
+ "23 Yes Dead 62.3\n",
+ "24 No Alive 18.0\n",
+ "25 No Alive 56.2\n",
+ "26 Yes Alive 59.2\n",
+ "27 No Alive 25.8\n",
+ "28 No Dead 36.9\n",
+ "29 No Alive 20.2\n",
+ "... ... ... ...\n",
+ "1284 Yes Dead 36.0\n",
+ "1285 Yes Alive 48.3\n",
+ "1286 No Alive 63.1\n",
+ "1287 No Alive 60.8\n",
+ "1288 Yes Dead 39.3\n",
+ "1289 No Alive 36.7\n",
+ "1290 No Alive 63.8\n",
+ "1291 No Dead 71.3\n",
+ "1292 No Alive 57.7\n",
+ "1293 No Alive 63.2\n",
+ "1294 No Alive 46.6\n",
+ "1295 Yes Dead 82.4\n",
+ "1296 Yes Alive 38.3\n",
+ "1297 Yes Alive 32.7\n",
+ "1298 No Alive 39.7\n",
+ "1299 Yes Dead 60.0\n",
+ "1300 No Dead 71.0\n",
+ "1301 No Alive 20.5\n",
+ "1302 No Alive 44.4\n",
+ "1303 Yes Alive 31.2\n",
+ "1304 Yes Alive 47.8\n",
+ "1305 Yes Alive 60.9\n",
+ "1306 No Dead 61.4\n",
+ "1307 Yes Alive 43.0\n",
+ "1308 No Alive 42.1\n",
+ "1309 Yes Alive 35.9\n",
+ "1310 No Alive 22.3\n",
+ "1311 Yes Dead 62.1\n",
+ "1312 No Dead 88.6\n",
+ "1313 No Alive 39.1\n",
+ "\n",
+ "[1314 rows x 3 columns]"
+ ]
+ },
+ "execution_count": 2,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "%matplotlib inline\n",
+ "import matplotlib.pyplot as plt\n",
+ "import pandas as pd\n",
+ "\n",
+ "csv_url = 'https://gitlab.inria.fr/learninglab/mooc-rr/mooc-rr-ressources/-/raw/master/module3/Practical_session/Subject6_smoking.csv?inline=false'\n",
+ "raw_data = pd.read_csv(csv_url)\n",
+ "raw_data"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 1. Calcul des taux de mortalité globaux"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Tri des données brutes par catégories : fumeuses / non-fumeuses et vivantes / décédées."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Split the survey by smoking habit, then each group by 20-year survival status.\n",
+ "smoker = raw_data.loc[raw_data['Smoker'].eq('Yes')]\n",
+ "non_smoker = raw_data.loc[raw_data['Smoker'].eq('No')]\n",
+ "smoker_alive = smoker.loc[smoker['Status'].eq('Alive')]\n",
+ "smoker_dead = smoker.loc[smoker['Status'].eq('Dead')]\n",
+ "non_smoker_alive = non_smoker.loc[non_smoker['Status'].eq('Alive')]\n",
+ "non_smoker_dead = non_smoker.loc[non_smoker['Status'].eq('Dead')]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Calcul des effectifs :"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 17,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "
Vivant
\n",
+ "
Décédé
\n",
+ "
Total
\n",
+ "
\n",
+ " \n",
+ " \n",
+ "
\n",
+ "
0
\n",
+ "
Fumeur
\n",
+ "
443
\n",
+ "
139
\n",
+ "
945
\n",
+ "
\n",
+ "
\n",
+ "
1
\n",
+ "
Non Fumeur
\n",
+ "
502
\n",
+ "
230
\n",
+ "
369
\n",
+ "
\n",
+ "
\n",
+ "
2
\n",
+ "
Total
\n",
+ "
945
\n",
+ "
673
\n",
+ "
1314
\n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Vivant Décédé Total\n",
+ "0 Fumeur 443 139 945\n",
+ "1 Non Fumeur 502 230 369\n",
+ "2 Total 945 673 1314"
+ ]
+ },
+ "execution_count": 17,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Head counts for each (smoking habit, 20-year status) combination.\n",
+ "nb_sma = len(smoker_alive)\n",
+ "nb_smd = len(smoker_dead)\n",
+ "nb_nsma = len(non_smoker_alive)\n",
+ "nb_nsmd = len(non_smoker_dead)\n",
+ "\n",
+ "# Contingency table: the 'Total' column sums each row, the 'Total' row sums each column.\n",
+ "tab = pd.DataFrame({'': ['Fumeur', 'Non Fumeur', 'Total'],\n",
+ "                    'Vivant': [nb_sma, nb_nsma, nb_sma + nb_nsma],\n",
+ "                    'Décédé': [nb_smd, nb_nsmd, nb_smd + nb_nsmd],\n",
+ "                    'Total': [nb_sma + nb_smd, nb_nsma + nb_nsmd, nb_sma + nb_smd + nb_nsma + nb_nsmd]\n",
+ "                   }, columns = ['', 'Vivant', 'Décédé', 'Total'])\n",
+ "tab"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Calcul des taux de mortalité :"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 22,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Taux de mortalité chez les fumeuses: 0.239\n",
+ "Taux de mortalité chez les non-fumeuses: 0.314\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(f'Taux de mortalité chez les fumeuses: {nb_smd / (nb_smd + nb_sma):.3f}')\n",
+ "print(f'Taux de mortalité chez les non-fumeuses: {nb_nsmd / (nb_nsmd + nb_nsma):.3f}')\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "On constate, de manière surprenante, que le taux de mortalité est plus élevé chez les femmes non-fumeuses que chez les femmes fumeuses."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 2. Calcul des taux de mortalité par tranche d'âge"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "On décide maintenant de calculer les taux de mortalité par tranche d'âge : 18-34 ans, 35-54 ans, 55-64 ans, 65 ans et plus."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 40,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Taux de mortalité :\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "
"
+ ]
+ },
+ "metadata": {
+ "needs_background": "light"
+ },
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "# Age brackets in whole years (bounds included); 999 stands for 'no upper limit'.\nslices = [(18, 34), (35, 54), (55, 64), (65, 999)]\n",
+ "nbs, rates = {}, {}  # per bracket: [smokers, non-smokers] head counts / death rates\nfor lo, hi in slices:\n    # Ages are fractional, so select [lo, hi + 1) to keep e.g. 34.5 in the 18-34 bracket.\n    in_slice = raw_data[(raw_data['Age'] >= lo) & (raw_data['Age'] < hi + 1)]\n    groups = [in_slice[in_slice['Smoker'] == flag] for flag in ('Yes', 'No')]\n    nbs[(lo, hi)] = [len(grp) for grp in groups]\n    rates[(lo, hi)] = [(grp['Status'] == 'Dead').mean() for grp in groups]\n",
+ "\n# Each bracket is plotted at its lower bound.\nx = [sli[0] for sli in slices]\npl1, = plt.plot(x, [rates[sli][0] for sli in slices], 'or')\npl2, = plt.plot(x, [rates[sli][1] for sli in slices], 'ob')\n",
+ "plt.legend([pl1, pl2], ['Fumeuses', 'Non Fumeuses'])\nplt.xlabel('Borne inférieure de la tranche (ans)')\nplt.ylabel('Taux de mortalité')\n",
+ "plt.title(\"Taux de mortalité par tranche d'age\")\n",
+ "plt.show()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Ces résultats semblent contredire ceux obtenus dans la première partie. En effet, on constate que pour chaque tranche d'âge prise séparément, le taux de mortalité est plus élevé chez les fumeuses que chez les non-fumeuses. Cela illustre bien le paradoxe de Simpson. Ces résultats contradictoires peuvent être expliqués en affichant les effectifs de chaque catégorie :"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 41,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
Age
\n",
+ "
Fumeur
\n",
+ "
Non fumeur
\n",
+ "
\n",
+ " \n",
+ " \n",
+ "
\n",
+ "
0
\n",
+ "
(18, 34)
\n",
+ "
179
\n",
+ "
219
\n",
+ "
\n",
+ "
\n",
+ "
1
\n",
+ "
(35, 54)
\n",
+ "
229
\n",
+ "
191
\n",
+ "
\n",
+ "
\n",
+ "
2
\n",
+ "
(55, 64)
\n",
+ "
115
\n",
+ "
119
\n",
+ "
\n",
+ "
\n",
+ "
3
\n",
+ "
(65, 999)
\n",
+ "
49
\n",
+ "
193
\n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Age Fumeur Non fumeur\n",
+ "0 (18, 34) 179 219\n",
+ "1 (35, 54) 229 191\n",
+ "2 (55, 64) 115 119\n",
+ "3 (65, 999) 49 193"
+ ]
+ },
+ "execution_count": 41,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# One row per age bracket: (bracket, number of smokers, number of non-smokers).\n",
+ "tab = pd.DataFrame(\n",
+ "    [(sli, nbs[sli][0], nbs[sli][1]) for sli in slices],\n",
+ "    columns=['Age', 'Fumeur', 'Non fumeur'])\n",
+ "tab"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "On peut constater que la répartition des fumeuses / non-fumeuses est relativement équilibrée par tranche d'âge, mis à part pour la dernière tranche (65 ans et plus). En effet, dans cette tranche, le nombre de non-fumeuses est près de 4 fois supérieur à celui des fumeuses. Or c'est aussi dans cette tranche qu'on enregistre logiquement le plus haut taux de mortalité, et ce quelle que soit la catégorie. Ainsi, ce grand nombre de personnes âgées ayant participé à l'étude, couplé au fort taux de mortalité pour cette tranche d'âge, augmente fortement le taux de mortalité total pour les non-fumeuses, ce qui est moins le cas chez les fumeuses. Cela peut être confirmé en calculant les taux de mortalité totaux en retirant les 65 ans et plus :"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 101,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Taux de mortalité chez les fumeuses: 0.182\n",
+ "Taux de mortalité chez les non-fumeuses: 0.121\n",
+ "89.9\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Same global mortality rates as in section 1, restricted to women under 65.\n",
+ "# Dedicated names are used so that `smoker`, `non_smoker`, ... computed on the\n",
+ "# full survey keep their meaning if an earlier cell is re-run after this one.\n",
+ "under_65 = raw_data[raw_data['Age'] < 65]\n",
+ "\n",
+ "smoker_u65 = under_65[under_65['Smoker'] == 'Yes']\n",
+ "non_smoker_u65 = under_65[under_65['Smoker'] == 'No']\n",
+ "smoker_dead_u65 = smoker_u65[smoker_u65['Status'] == 'Dead']\n",
+ "non_smoker_dead_u65 = non_smoker_u65[non_smoker_u65['Status'] == 'Dead']\n",
+ "\n",
+ "print(f'Taux de mortalité chez les fumeuses: {len(smoker_dead_u65) / len(smoker_u65):.3f}')\n",
+ "print(f'Taux de mortalité chez les non-fumeuses: {len(non_smoker_dead_u65) / len(non_smoker_u65):.3f}')\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 3. Étude probabiliste"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Afin d'éviter un biais induit par des regroupements en tranches d'âges arbitraires et non régulières, nous réalisons une régression logistique. "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 103,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "image/png": "\n",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {
+ "needs_background": "light"
+ },
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "import numpy as np\n",
+ "from sklearn.linear_model import LogisticRegression\n",
+ "\n",
+ "def list_to_array(l):\n",
+ "    \"\"\"Return the values of `l` as an (n, 1) column array, the 2-D layout passed to fit/predict_proba.\"\"\"\n",
+ "    return np.asarray(l).reshape(-1, 1)\n",
+ "\n",
+ "# Integer age grid covering the whole observed range (oldest age included).\n",
+ "xs = list(range(int(min(raw_data['Age'])), int(max(raw_data['Age'])) + 1))\n",
+ "probs = []  # one list of death probabilities per group: smokers first, then non-smokers\n",
+ "for fil in ['Yes', 'No']:\n",
+ "    filter_data = raw_data[raw_data['Smoker'] == fil]\n",
+ "    ages = filter_data['Age']\n",
+ "    deaths = (filter_data['Status'] == 'Dead').astype(int)\n",
+ "    model = LogisticRegression(solver='liblinear', random_state=0).fit(list_to_array(ages), deaths)\n",
+ "    probs.append(model.predict_proba(list_to_array(xs))[:,1].tolist())\n",
+ "plt.plot(xs, probs[0], '-r', label='Fumeuses')\n",
+ "plt.plot(xs, probs[1], '-b', label='Non Fumeuses')\n",
+ "plt.xlabel('Age (ans)')\n",
+ "plt.ylabel('Probabilité de décès')\n",
+ "plt.title(\"Probabilité de décès en fonction de l'age.\")\n",
+ "plt.legend()\n",
+ "plt.show()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "On peut constater sur ce graphique que jusqu'à 65 ans, fumer semble augmenter la probabilité de décès sur une période de 20 ans, ce qui suggère une diminution de l'espérance de vie. La tendance s'inverse à partir de 65 ans, ce qui s'explique en partie par la différence d'effectifs dans cette tranche d'âge."
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.6.4"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
--
2.18.1