diff --git a/module3/exo3/exercice.ipynb b/module3/exo3/exercice.ipynb
index 0bbbe371b01e359e381e43239412d77bf53fb1fb..b5330838a9712f81cbfb3bd3b8c4d260cce6402e 100644
--- a/module3/exo3/exercice.ipynb
+++ b/module3/exo3/exercice.ipynb
@@ -1,5 +1,785 @@
{
- "cells": [],
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Sujet 6 : Autour du Paradoxe de Simpson"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 88,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "%matplotlib inline\n",
+ "import matplotlib.pyplot as plt\n",
+ "import pandas as pd"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 89,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Smoker | \n",
+ " Status | \n",
+ " Age | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " Yes | \n",
+ " Alive | \n",
+ " 21.0 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " Yes | \n",
+ " Alive | \n",
+ " 19.3 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " No | \n",
+ " Dead | \n",
+ " 57.5 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " No | \n",
+ " Alive | \n",
+ " 47.1 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " Yes | \n",
+ " Alive | \n",
+ " 81.4 | \n",
+ "
\n",
+ " \n",
+ " 5 | \n",
+ " No | \n",
+ " Alive | \n",
+ " 36.8 | \n",
+ "
\n",
+ " \n",
+ " 6 | \n",
+ " No | \n",
+ " Alive | \n",
+ " 23.8 | \n",
+ "
\n",
+ " \n",
+ " 7 | \n",
+ " Yes | \n",
+ " Dead | \n",
+ " 57.5 | \n",
+ "
\n",
+ " \n",
+ " 8 | \n",
+ " Yes | \n",
+ " Alive | \n",
+ " 24.8 | \n",
+ "
\n",
+ " \n",
+ " 9 | \n",
+ " Yes | \n",
+ " Alive | \n",
+ " 49.5 | \n",
+ "
\n",
+ " \n",
+ " 10 | \n",
+ " Yes | \n",
+ " Alive | \n",
+ " 30.0 | \n",
+ "
\n",
+ " \n",
+ " 11 | \n",
+ " No | \n",
+ " Dead | \n",
+ " 66.0 | \n",
+ "
\n",
+ " \n",
+ " 12 | \n",
+ " Yes | \n",
+ " Alive | \n",
+ " 49.2 | \n",
+ "
\n",
+ " \n",
+ " 13 | \n",
+ " No | \n",
+ " Alive | \n",
+ " 58.4 | \n",
+ "
\n",
+ " \n",
+ " 14 | \n",
+ " No | \n",
+ " Dead | \n",
+ " 60.6 | \n",
+ "
\n",
+ " \n",
+ " 15 | \n",
+ " No | \n",
+ " Alive | \n",
+ " 25.1 | \n",
+ "
\n",
+ " \n",
+ " 16 | \n",
+ " No | \n",
+ " Alive | \n",
+ " 43.5 | \n",
+ "
\n",
+ " \n",
+ " 17 | \n",
+ " No | \n",
+ " Alive | \n",
+ " 27.1 | \n",
+ "
\n",
+ " \n",
+ " 18 | \n",
+ " No | \n",
+ " Alive | \n",
+ " 58.3 | \n",
+ "
\n",
+ " \n",
+ " 19 | \n",
+ " Yes | \n",
+ " Alive | \n",
+ " 65.7 | \n",
+ "
\n",
+ " \n",
+ " 20 | \n",
+ " No | \n",
+ " Dead | \n",
+ " 73.2 | \n",
+ "
\n",
+ " \n",
+ " 21 | \n",
+ " Yes | \n",
+ " Alive | \n",
+ " 38.3 | \n",
+ "
\n",
+ " \n",
+ " 22 | \n",
+ " No | \n",
+ " Alive | \n",
+ " 33.4 | \n",
+ "
\n",
+ " \n",
+ " 23 | \n",
+ " Yes | \n",
+ " Dead | \n",
+ " 62.3 | \n",
+ "
\n",
+ " \n",
+ " 24 | \n",
+ " No | \n",
+ " Alive | \n",
+ " 18.0 | \n",
+ "
\n",
+ " \n",
+ " 25 | \n",
+ " No | \n",
+ " Alive | \n",
+ " 56.2 | \n",
+ "
\n",
+ " \n",
+ " 26 | \n",
+ " Yes | \n",
+ " Alive | \n",
+ " 59.2 | \n",
+ "
\n",
+ " \n",
+ " 27 | \n",
+ " No | \n",
+ " Alive | \n",
+ " 25.8 | \n",
+ "
\n",
+ " \n",
+ " 28 | \n",
+ " No | \n",
+ " Dead | \n",
+ " 36.9 | \n",
+ "
\n",
+ " \n",
+ " 29 | \n",
+ " No | \n",
+ " Alive | \n",
+ " 20.2 | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " 1284 | \n",
+ " Yes | \n",
+ " Dead | \n",
+ " 36.0 | \n",
+ "
\n",
+ " \n",
+ " 1285 | \n",
+ " Yes | \n",
+ " Alive | \n",
+ " 48.3 | \n",
+ "
\n",
+ " \n",
+ " 1286 | \n",
+ " No | \n",
+ " Alive | \n",
+ " 63.1 | \n",
+ "
\n",
+ " \n",
+ " 1287 | \n",
+ " No | \n",
+ " Alive | \n",
+ " 60.8 | \n",
+ "
\n",
+ " \n",
+ " 1288 | \n",
+ " Yes | \n",
+ " Dead | \n",
+ " 39.3 | \n",
+ "
\n",
+ " \n",
+ " 1289 | \n",
+ " No | \n",
+ " Alive | \n",
+ " 36.7 | \n",
+ "
\n",
+ " \n",
+ " 1290 | \n",
+ " No | \n",
+ " Alive | \n",
+ " 63.8 | \n",
+ "
\n",
+ " \n",
+ " 1291 | \n",
+ " No | \n",
+ " Dead | \n",
+ " 71.3 | \n",
+ "
\n",
+ " \n",
+ " 1292 | \n",
+ " No | \n",
+ " Alive | \n",
+ " 57.7 | \n",
+ "
\n",
+ " \n",
+ " 1293 | \n",
+ " No | \n",
+ " Alive | \n",
+ " 63.2 | \n",
+ "
\n",
+ " \n",
+ " 1294 | \n",
+ " No | \n",
+ " Alive | \n",
+ " 46.6 | \n",
+ "
\n",
+ " \n",
+ " 1295 | \n",
+ " Yes | \n",
+ " Dead | \n",
+ " 82.4 | \n",
+ "
\n",
+ " \n",
+ " 1296 | \n",
+ " Yes | \n",
+ " Alive | \n",
+ " 38.3 | \n",
+ "
\n",
+ " \n",
+ " 1297 | \n",
+ " Yes | \n",
+ " Alive | \n",
+ " 32.7 | \n",
+ "
\n",
+ " \n",
+ " 1298 | \n",
+ " No | \n",
+ " Alive | \n",
+ " 39.7 | \n",
+ "
\n",
+ " \n",
+ " 1299 | \n",
+ " Yes | \n",
+ " Dead | \n",
+ " 60.0 | \n",
+ "
\n",
+ " \n",
+ " 1300 | \n",
+ " No | \n",
+ " Dead | \n",
+ " 71.0 | \n",
+ "
\n",
+ " \n",
+ " 1301 | \n",
+ " No | \n",
+ " Alive | \n",
+ " 20.5 | \n",
+ "
\n",
+ " \n",
+ " 1302 | \n",
+ " No | \n",
+ " Alive | \n",
+ " 44.4 | \n",
+ "
\n",
+ " \n",
+ " 1303 | \n",
+ " Yes | \n",
+ " Alive | \n",
+ " 31.2 | \n",
+ "
\n",
+ " \n",
+ " 1304 | \n",
+ " Yes | \n",
+ " Alive | \n",
+ " 47.8 | \n",
+ "
\n",
+ " \n",
+ " 1305 | \n",
+ " Yes | \n",
+ " Alive | \n",
+ " 60.9 | \n",
+ "
\n",
+ " \n",
+ " 1306 | \n",
+ " No | \n",
+ " Dead | \n",
+ " 61.4 | \n",
+ "
\n",
+ " \n",
+ " 1307 | \n",
+ " Yes | \n",
+ " Alive | \n",
+ " 43.0 | \n",
+ "
\n",
+ " \n",
+ " 1308 | \n",
+ " No | \n",
+ " Alive | \n",
+ " 42.1 | \n",
+ "
\n",
+ " \n",
+ " 1309 | \n",
+ " Yes | \n",
+ " Alive | \n",
+ " 35.9 | \n",
+ "
\n",
+ " \n",
+ " 1310 | \n",
+ " No | \n",
+ " Alive | \n",
+ " 22.3 | \n",
+ "
\n",
+ " \n",
+ " 1311 | \n",
+ " Yes | \n",
+ " Dead | \n",
+ " 62.1 | \n",
+ "
\n",
+ " \n",
+ " 1312 | \n",
+ " No | \n",
+ " Dead | \n",
+ " 88.6 | \n",
+ "
\n",
+ " \n",
+ " 1313 | \n",
+ " No | \n",
+ " Alive | \n",
+ " 39.1 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
1314 rows × 3 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Smoker Status Age\n",
+ "0 Yes Alive 21.0\n",
+ "1 Yes Alive 19.3\n",
+ "2 No Dead 57.5\n",
+ "3 No Alive 47.1\n",
+ "4 Yes Alive 81.4\n",
+ "5 No Alive 36.8\n",
+ "6 No Alive 23.8\n",
+ "7 Yes Dead 57.5\n",
+ "8 Yes Alive 24.8\n",
+ "9 Yes Alive 49.5\n",
+ "10 Yes Alive 30.0\n",
+ "11 No Dead 66.0\n",
+ "12 Yes Alive 49.2\n",
+ "13 No Alive 58.4\n",
+ "14 No Dead 60.6\n",
+ "15 No Alive 25.1\n",
+ "16 No Alive 43.5\n",
+ "17 No Alive 27.1\n",
+ "18 No Alive 58.3\n",
+ "19 Yes Alive 65.7\n",
+ "20 No Dead 73.2\n",
+ "21 Yes Alive 38.3\n",
+ "22 No Alive 33.4\n",
+ "23 Yes Dead 62.3\n",
+ "24 No Alive 18.0\n",
+ "25 No Alive 56.2\n",
+ "26 Yes Alive 59.2\n",
+ "27 No Alive 25.8\n",
+ "28 No Dead 36.9\n",
+ "29 No Alive 20.2\n",
+ "... ... ... ...\n",
+ "1284 Yes Dead 36.0\n",
+ "1285 Yes Alive 48.3\n",
+ "1286 No Alive 63.1\n",
+ "1287 No Alive 60.8\n",
+ "1288 Yes Dead 39.3\n",
+ "1289 No Alive 36.7\n",
+ "1290 No Alive 63.8\n",
+ "1291 No Dead 71.3\n",
+ "1292 No Alive 57.7\n",
+ "1293 No Alive 63.2\n",
+ "1294 No Alive 46.6\n",
+ "1295 Yes Dead 82.4\n",
+ "1296 Yes Alive 38.3\n",
+ "1297 Yes Alive 32.7\n",
+ "1298 No Alive 39.7\n",
+ "1299 Yes Dead 60.0\n",
+ "1300 No Dead 71.0\n",
+ "1301 No Alive 20.5\n",
+ "1302 No Alive 44.4\n",
+ "1303 Yes Alive 31.2\n",
+ "1304 Yes Alive 47.8\n",
+ "1305 Yes Alive 60.9\n",
+ "1306 No Dead 61.4\n",
+ "1307 Yes Alive 43.0\n",
+ "1308 No Alive 42.1\n",
+ "1309 Yes Alive 35.9\n",
+ "1310 No Alive 22.3\n",
+ "1311 Yes Dead 62.1\n",
+ "1312 No Dead 88.6\n",
+ "1313 No Alive 39.1\n",
+ "\n",
+ "[1314 rows x 3 columns]"
+ ]
+ },
+ "execution_count": 89,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Download the survey data: one row per person with Smoker, Status and Age columns.\n",
+ "data_url = \"https://gitlab.inria.fr/learninglab/mooc-rr/mooc-rr-ressources/-/raw/master/module3/Practical_session/Subject6_smoking.csv\"\n",
+ "raw_data = pd.read_csv(filepath_or_buffer=data_url)\n",
+ "\n",
+ "# Last expression of the cell: rendered as the cell output.\n",
+ "raw_data"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 1. Taux de mortalité en fonction de l'habitude de tabagisme"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 47,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " taux de décès parmi les non-fumeurs (%) | \n",
+ " 31.420765 | \n",
+ "
\n",
+ " \n",
+ " taux de décès parmi les fumeurs (%) | \n",
+ " 23.883162 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " \n",
+ "taux de décès parmi les non-fumeurs (%) 31.420765\n",
+ "taux de décès parmi les fumeurs (%) 23.883162"
+ ]
+ },
+ "execution_count": 47,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Head-counts indexed as data[smoking][status]:\n",
+ "#   first index  -> NONSMOKER (0) or SMOKER (1)\n",
+ "#   second index -> ALIVE (0) or DEAD (1)\n",
+ "NONSMOKER, SMOKER = 0, 1\n",
+ "ALIVE, DEAD = 0, 1\n",
+ "data = [[0, 0] for _ in (NONSMOKER, SMOKER)]\n",
+ "\n",
+ "for _, person in raw_data.iterrows():\n",
+ "    smoking = SMOKER if person[\"Smoker\"] == \"Yes\" else NONSMOKER\n",
+ "    status = DEAD if person[\"Status\"] == \"Dead\" else ALIVE\n",
+ "    data[smoking][status] += 1\n",
+ "\n",
+ "# Percentage of dead people within each group, non-smokers first.\n",
+ "mortality_rate = [\n",
+ "    data[group][DEAD]/sum(data[group])*100\n",
+ "    for group in (NONSMOKER, SMOKER)\n",
+ "]\n",
+ "\n",
+ "row_labels = [\"taux de décès parmi les non-fumeurs (%)\", \"taux de décès parmi les fumeurs (%)\"]\n",
+ "pd.DataFrame(mortality_rate, columns=[\"\"], index=row_labels)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Conclusion : le taux de décès est plus élevé parmi les non-fumeurs. Ne pas fumer serait-il dangereux ?\n",
+ " \n",
+ "## 2. Taux de mortalité en fonction de l'âge et de l'habitude de tabagisme"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 52,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " taux de décès parmi les non-fumeurs (%) | \n",
+ " taux de décès parmi les fumeurs (%) | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 18-34 ans | \n",
+ " 2.739726 | \n",
+ " 3.296703 | \n",
+ "
\n",
+ " \n",
+ " 34-54 ans | \n",
+ " 9.547739 | \n",
+ " 17.299578 | \n",
+ "
\n",
+ " \n",
+ " 55-64 ans | \n",
+ " 33.057851 | \n",
+ " 43.859649 | \n",
+ "
\n",
+ " \n",
+ " 65+ ans | \n",
+ " 85.492228 | \n",
+ " 85.714286 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " taux de décès parmi les non-fumeurs (%) \\\n",
+ "18-34 ans 2.739726 \n",
+ "34-54 ans 9.547739 \n",
+ "55-64 ans 33.057851 \n",
+ "65+ ans 85.492228 \n",
+ "\n",
+ " taux de décès parmi les fumeurs (%) \n",
+ "18-34 ans 3.296703 \n",
+ "34-54 ans 17.299578 \n",
+ "55-64 ans 43.859649 \n",
+ "65+ ans 85.714286 "
+ ]
+ },
+ "execution_count": 52,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "age_categories = [\"18-34 ans\", \"34-54 ans\", \"55-64 ans\", \"65+ ans\"]\n",
+ "# data[age class][is smoking][is dead] = number of people\n",
+ "data = [[[0, 0], [0, 0]] for _ in age_categories]\n",
+ "\n",
+ "for _, e in raw_data.iterrows():\n",
+ "    # Age classes are contiguous: <= 34, (34, 54], (54, 64], > 64.\n",
+ "    # Ages are fractional, so the third class must start right after 54;\n",
+ "    # testing `> 55` would leave ages in (54, 55] in the youngest class.\n",
+ "    if e[\"Age\"] > 64:\n",
+ "        age = 3\n",
+ "    elif e[\"Age\"] > 54:\n",
+ "        age = 2\n",
+ "    elif e[\"Age\"] > 34:\n",
+ "        age = 1\n",
+ "    else:\n",
+ "        age = 0\n",
+ "    # The boolean comparisons index the inner lists as 0 (False) or 1 (True).\n",
+ "    data[age][e[\"Smoker\"]==\"Yes\"][e[\"Status\"]==\"Dead\"] += 1\n",
+ "\n",
+ "# Percentage of deaths per age class: [non-smokers, smokers].\n",
+ "mortality_rate = [[\n",
+ "    e[NONSMOKER][DEAD]/sum(e[NONSMOKER])*100,\n",
+ "    e[SMOKER][DEAD]/sum(e[SMOKER])*100\n",
+ "] for e in data]\n",
+ "\n",
+ "pd.DataFrame(mortality_rate, index=age_categories, columns=[\"taux de décès parmi les non-fumeurs (%)\", \"taux de décès parmi les fumeurs (%)\"])"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Conclusion : dans chaque catégorie d'âge, le taux de décès est plus faible chez les non-fumeurs que chez les fumeurs.\n",
+ "\n",
+ "Cela semble contredire le résultat précédent ? Pas forcément. Pour y voir plus clair, étudions le nombre de fumeurs et de non-fumeurs dans chaque catégorie d'âge.\n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 58,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " nb non-fumeurs | \n",
+ " nb fumeurs | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 18-34 ans | \n",
+ " 219 | \n",
+ " 182 | \n",
+ "
\n",
+ " \n",
+ " 34-54 ans | \n",
+ " 199 | \n",
+ " 237 | \n",
+ "
\n",
+ " \n",
+ " 55-64 ans | \n",
+ " 121 | \n",
+ " 114 | \n",
+ "
\n",
+ " \n",
+ " 65+ ans | \n",
+ " 193 | \n",
+ " 49 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " nb non-fumeurs nb fumeurs\n",
+ "18-34 ans 219 182\n",
+ "34-54 ans 199 237\n",
+ "55-64 ans 121 114\n",
+ "65+ ans 193 49"
+ ]
+ },
+ "execution_count": 58,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Group sizes per age class, alive and dead together: [non-smokers, smokers].\n",
+ "nb = [\n",
+ "    [sum(counts[group]) for group in (NONSMOKER, SMOKER)]\n",
+ "    for counts in data\n",
+ "]\n",
+ "\n",
+ "column_labels = [\"nb non-fumeurs\", \"nb fumeurs\"]\n",
+ "pd.DataFrame(nb, index=age_categories, columns=column_labels)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "On remarque le piège ! La plupart des personnes de 65 ans et plus ne fument pas (~80 %), mais elles ont le taux de décès le plus élevé, qu'elles fument ou non. En agrégeant les catégories d'âge, elles font donc remonter énormément le nombre de décès parmi les non-fumeurs et peu parmi les fumeurs. D'où un taux de décès global plus élevé parmi les non-fumeurs."
+ ]
+ }
+ ],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
@@ -16,10 +796,9 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.6.3"
+ "version": "3.6.4"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
-