# %% [markdown]
# ## Around Simpson's paradox

# %%
# IPython magic, kept as a comment so that this cell is valid plain Python.
# Recent Jupyter kernels render matplotlib figures inline by default; restore
# the magic if figures do not show up in an older environment.
# %matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# %% [markdown]
# The data set is hosted on
# [gitlab](https://gitlab.inria.fr/learninglab/mooc-rr/mooc-rr-ressources/-/raw/master/module3/Practical_session/Subject6_smoking.csv?inline=false).

# %%
# Location of the raw data, named once so that it can be changed in one place.
DATA_URL = (
    "https://gitlab.inria.fr/learninglab/mooc-rr/mooc-rr-ressources/-/raw/master/"
    "module3/Practical_session/Subject6_smoking.csv?inline=false"
)

# Read the CSV file into a pandas DataFrame.
datas = pd.read_csv(DATA_URL)

# The rest of the notebook indexes these three columns; fail early and clearly
# if the remote file ever changes shape.
assert {"Smoker", "Status", "Age"} <= set(datas.columns), "unexpected CSV columns"

# %% [markdown]
# We define a function that counts the women by smoking habit (smoker /
# non-smoker) and vital status (alive / dead).
# %%
def nb_etat_datas(data):
    """Count the records of ``data`` by smoking habit and vital status.

    Parameters
    ----------
    data : mapping
        Anything indexable by ``"Status"`` and ``"Smoker"`` that yields two
        iterables read in parallel (a pandas DataFrame or a dict of lists).
        Only the exact labels ``"Alive"`` / ``"Dead"`` and ``"Yes"`` / ``"No"``
        are counted; a record carrying any other value is ignored.

    Returns
    -------
    tuple of int
        ``(s_d, s_l, ns_d, ns_l)``: smokers dead, smokers alive,
        non-smokers dead, non-smokers alive.
    """
    # Local import: keeps the function self-contained without touching the
    # notebook's import cell.
    from collections import Counter

    # One pass over the (status, smoker) pairs; a missing key counts as 0.
    pair_counts = Counter(zip(data["Status"], data["Smoker"]))

    s_d = pair_counts[("Dead", "Yes")]    # smoker and dead
    s_l = pair_counts[("Alive", "Yes")]   # smoker and alive
    ns_d = pair_counts[("Dead", "No")]    # not smoker and dead
    ns_l = pair_counts[("Alive", "No")]   # not smoker and alive

    return s_d, s_l, ns_d, ns_l
# %%
# nb_w    : number of women in the study
# nb_s_d  : smokers, dead
# nb_s_l  : smokers, alive
# nb_ns_d : non-smokers, dead
# nb_ns_l : non-smokers, alive
nb_w = len(datas)
smoker_or_not = ["smoker", "not smoker"]
nb_s_d, nb_s_l, nb_ns_d, nb_ns_l = nb_etat_datas(datas)

# %%
live = pd.Series([nb_s_l, nb_ns_l], index=smoker_or_not)
dead = pd.Series([nb_s_d, nb_ns_d], index=smoker_or_not)
df = pd.DataFrame({"alive": live, "dead": dead})

# %% [markdown]
# Number of women alive and dead, by smoking habit.

# %%
df

# %% [markdown]
# Mortality rate among smokers and among non-smokers.

# %%
n_smokers = nb_s_d + nb_s_l
n_non_smokers = nb_ns_d + nb_ns_l

t_m_smoker = nb_s_d / n_smokers           # mortality rate, smokers
t_m_nsmoker = nb_ns_d / n_non_smokers     # mortality rate, non-smokers

# ".2f" prints two decimals. The previous spec " 0.2" was a general format
# with two *significant digits*, which only looks the same for values such as
# 0.24 (0.0123 would print as 0.012, 1.0 as 1).
print(f"Le taux de mortalité chez les femmes fumeuses est de : {t_m_smoker:.2f}")
print(f"Le taux de mortalité chez les femmes non fumeuse est de : {t_m_nsmoker:.2f}")

# %% [markdown]
# The two mortality rates as a bar chart. The width of each bar is the size
# of the group it describes.

# %%
# One rectangle per group, as wide as the group is large. Drawing one
# unit-wide bar per woman (as before) created about 1,300 patches and, because
# of an inclusive range end, one bar too many in each group.
LEFT_MARGIN = 100   # blank space before the first bar
GROUP_GAP = 100     # blank space between the two bars

fig, ax = plt.subplots()
b_fumeuse = ax.bar(LEFT_MARGIN, t_m_smoker * 100,
                   width=n_smokers, align="edge", color="blue")
b_nfumeuse = ax.bar(LEFT_MARGIN + n_smokers + GROUP_GAP, t_m_nsmoker * 100,
                    width=n_non_smokers, align="edge", color="red")

ax.axis([0, 2 * nb_w, 0, 100])
ax.set_title("Taux de mortalité")
ax.set_ylabel("Pourcentage %")
leg = ax.legend([b_fumeuse, b_nfumeuse], ['fumeuse', 'non fumeuse'])

# %% [markdown]
# Taken at face value, the figures and the chart above show a *lower*
# mortality rate among the women who smoke than among those who do not. A
# naive reading would conclude that smoking is good for one's health.
#
# Note that these are crude mortality rates over the whole sample: they say
# nothing yet about how old the women in each group are.

# %% [markdown]
# To understand this surprising result, we now split the women into age
# classes: 18-34, 34-54, 55-64, and 65 and over.
# %% [markdown]
# `nb_etat_datas_v2` refines the first counting function by also taking the
# age class of each woman into account.

# %%
def nb_etat_datas_v2(data):
    """Count the records of ``data`` by age class, smoking habit and status.

    The age classes are contiguous half-open intervals, so that a fractional
    age such as 54.5 or 64.5 lands in a class:

    * class "18-34": ``18 <= age < 34``
    * class "34-54": ``34 <= age < 55``
    * class "55-64": ``55 <= age < 65``
    * class "65 and over": ``age >= 65``

    For whole-number ages this is exactly the former ``<= 54`` / ``<= 64``
    binning; the former bounds dropped every age strictly between 54 and 55
    or between 64 and 65.

    Parameters
    ----------
    data : mapping
        Anything indexable by ``"Status"``, ``"Smoker"`` and ``"Age"`` that
        yields three iterables read in parallel (a pandas DataFrame or a dict
        of lists). Records younger than 18, with a non-comparable age (NaN),
        or with labels other than ``"Alive"`` / ``"Dead"`` and ``"Yes"`` /
        ``"No"`` are ignored.

    Returns
    -------
    tuple of four tuples of int
        One ``(s_d, s_l, ns_d, ns_l)`` tuple per age class, youngest first:
        smokers dead, smokers alive, non-smokers dead, non-smokers alive.
    """
    # (inclusive lower bound, exclusive upper bound); None means "no upper bound".
    age_classes = ((18, 34), (34, 55), (55, 65), (65, None))

    # Position of each (status, smoker) pair inside a result tuple.
    slot_of = {
        ("Dead", "Yes"): 0,    # smoker and dead
        ("Alive", "Yes"): 1,   # smoker and alive
        ("Dead", "No"): 2,     # not smoker and dead
        ("Alive", "No"): 3,    # not smoker and alive
    }

    counts = [[0, 0, 0, 0] for _ in age_classes]

    for st, sm, age in zip(data["Status"], data["Smoker"], data["Age"]):
        slot = slot_of.get((st, sm))
        if slot is None:
            continue  # unexpected label: not counted
        for row, (low, high) in enumerate(age_classes):
            if age >= low and (high is None or age < high):
                counts[row][slot] += 1
                break

    return tuple(tuple(row) for row in counts)
"data": { + "text/html": [ + "
# %%
classe_age = ["18-34", "34-54", "55-64", "65 et plus"]
data_1834, data_3454, data_5564, data_65_p = nb_etat_datas_v2(datas)

# %%
# Each data_* tuple reads (smoker dead, smoker alive, non-smoker dead,
# non-smoker alive); the table shows "alive" before "dead" for each habit.
df_v2 = pd.DataFrame(
    [data_1834, data_3454, data_5564, data_65_p],
    index=classe_age,
    columns=["smoker-dead", "smoker-alive", "not-smoker-dead", "not-smoker-alive"],
)[["smoker-alive", "smoker-dead", "not-smoker-alive", "not-smoker-dead"]]

# Make silently dropped rows visible (age outside every class, unexpected label).
nb_non_classees = len(datas) - int(df_v2.values.sum())
if nb_non_classees:
    print(f"Attention : {nb_non_classees} ligne(s) non comptée(s) dans les classes d'âge")

# %% [markdown]
# The same counts as before, now broken down by age class.

# %%
df_v2

# %% [markdown]
# Two things stand out in this table. First, mortality in the "65 and over"
# class is far higher than in any other class, and that class contains many
# more non-smokers than smokers. Second, within each age class the mortality
# of the smokers is higher than that of the non-smokers, or about equal to it
# in the oldest class. The lower overall mortality of the smokers therefore
# reflects the age make-up of the two groups, not a benefit of smoking: this
# is Simpson's paradox.
#
# The chart below shows the mortality rates per age class. Each blue/red pair
# is one age class, and the width of a bar is the size of the group.

# %%
# Mortality rate (in %) per age class, for smokers and non-smokers.
effectif_fumeuses = df_v2["smoker-dead"] + df_v2["smoker-alive"]
effectif_non_fumeuses = df_v2["not-smoker-dead"] + df_v2["not-smoker-alive"]

taux_mortalite = pd.DataFrame({
    "fumeuse": 100 * df_v2["smoker-dead"] / effectif_fumeuses,
    "non fumeuse": 100 * df_v2["not-smoker-dead"] / effectif_non_fumeuses,
})
taux_mortalite

# %%
# One rectangle per group, as wide as the group is large, instead of one
# unit-wide bar per woman. Gaps are constant so that pairs are easy to tell
# apart, and every pair gets its age class as a tick label.
GAP_IN_CLASS = 10        # space between the two bars of one age class
GAP_BETWEEN_CLASSES = 40  # space between two age classes

fig, ax = plt.subplots()
left = 0
tick_positions = []
for classe in classe_age:
    n_f = effectif_fumeuses[classe]
    n_nf = effectif_non_fumeuses[classe]

    b_f = ax.bar(left, taux_mortalite.loc[classe, "fumeuse"],
                 width=n_f, align="edge", color="blue")
    b_nf = ax.bar(left + n_f + GAP_IN_CLASS, taux_mortalite.loc[classe, "non fumeuse"],
                  width=n_nf, align="edge", color="red")

    pair_width = n_f + GAP_IN_CLASS + n_nf
    tick_positions.append(left + pair_width / 2)
    left += pair_width + GAP_BETWEEN_CLASSES

ax.set_xlim(0, left)
ax.set_ylim(0, 100)
ax.set_xticks(tick_positions)
ax.set_xticklabels(classe_age)

ax.set_title("Taux de mortalité")
ax.set_xlabel("Classe d'age")
ax.set_ylabel("Pourcentage %")
leg = ax.legend([b_f, b_nf], ['fumeuse', 'non fumeuse'])

# Notebook metadata recorded in the original file: Python 3 kernel
# ("python3"), language version 3.6.4, nbformat 4.2.
"language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.3" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} - -- 2.18.1