{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Première analyse rapide des données" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "# import des packages importants\n", "import numpy as np \n", "import matplotlib.pyplot as plt \n", "import pandas as pd \n", "import seaborn as sns" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "data_url = \"https://gitlab.inria.fr/learninglab/mooc-rr/mooc-rr-ressources/-/raw/master/module3/Practical_session/Subject6_smoking.csv?inline=false\" " ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "scrolled": true }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
SmokerStatusAge
0YesAlive21.0
1YesAlive19.3
2NoDead57.5
3NoAlive47.1
4YesAlive81.4
5NoAlive36.8
6NoAlive23.8
7YesDead57.5
8YesAlive24.8
9YesAlive49.5
\n", "
" ], "text/plain": [ " Smoker Status Age\n", "0 Yes Alive 21.0\n", "1 Yes Alive 19.3\n", "2 No Dead 57.5\n", "3 No Alive 47.1\n", "4 Yes Alive 81.4\n", "5 No Alive 36.8\n", "6 No Alive 23.8\n", "7 Yes Dead 57.5\n", "8 Yes Alive 24.8\n", "9 Yes Alive 49.5" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "raw_data = pd.read_csv(data_url)\n", "raw_data[:10]" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
SmokerStatusAge
\n", "
" ], "text/plain": [ "Empty DataFrame\n", "Columns: [Smoker, Status, Age]\n", "Index: []" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "raw_data[raw_data.isnull().any(axis=1)] " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Donc ici pas de point manquant -> pas besoin de modifier les données." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "On compte les morts, les vivants, les fumeurs et non-fumeurs. Cela permet notamment de vérifier rapidement l'intégrité des données. " ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "number alive = 945\n", "number dead = 369\n", "total number = 1314\n", "number smoker = 582\n", "number non smoker = 732\n", "total number = 1314\n", "Number of data : 1314\n" ] } ], "source": [ "dead = raw_data['Status'].value_counts()['Dead']\n", "alive = raw_data['Status'].value_counts()['Alive']\n", "print(f'number alive = {alive}')\n", "print(f'number dead = {dead}')\n", "print(f'total number = {alive + dead}')\n", "smoker = raw_data['Smoker'].value_counts()['Yes']\n", "non_smoker = raw_data['Smoker'].value_counts()['No']\n", "print(f'number smoker = {smoker}')\n", "print(f'number non smoker = {non_smoker}')\n", "print(f'total number = {smoker + non_smoker}')\n", "\n", "print(f'Number of data : {len(raw_data)}')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Question 1" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
SmokerNoYes
Status
Alive502443
Dead230139
\n", "
" ], "text/plain": [ "Smoker No Yes\n", "Status \n", "Alive 502 443\n", "Dead 230 139" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tab = pd.crosstab(raw_data.Status, raw_data.Smoker) # on a bien le bon nombre de vivants et de morts\n", "tab" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[[502 443]\n", " [230 139]]\n", "ratio_smoker = 0.238832\n", "ratio_non_smoker = 0.314208\n" ] } ], "source": [ "numpy_data = np.array(tab)\n", "print(numpy_data)\n", "ratio_smoker = numpy_data[1,1]/smoker\n", "ratio_non_smoker = numpy_data[1,0]/non_smoker\n", "print(f'ratio_smoker = {ratio_smoker:.6f}')\n", "print(f'ratio_non_smoker = {ratio_non_smoker:.6f}')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "C'est dommage, il semblerait que les gens qui ne fument pas meurent plus que les gens qui fument... embêtant." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Question 2 " ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
SmokerStatusAgeAgeGroup
0YesAlive21.01
1YesAlive19.31
2NoDead57.53
3NoAlive47.12
4YesAlive81.44
5NoAlive36.82
6NoAlive23.81
7YesDead57.53
8YesAlive24.81
9YesAlive49.52
\n", "
" ], "text/plain": [ " Smoker Status Age AgeGroup\n", "0 Yes Alive 21.0 1\n", "1 Yes Alive 19.3 1\n", "2 No Dead 57.5 3\n", "3 No Alive 47.1 2\n", "4 Yes Alive 81.4 4\n", "5 No Alive 36.8 2\n", "6 No Alive 23.8 1\n", "7 Yes Dead 57.5 3\n", "8 Yes Alive 24.8 1\n", "9 Yes Alive 49.5 2" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "raw_data.loc[((raw_data.Age < 34) & (raw_data.Age >= 18) ), 'AgeGroup'] = '1'\n", "raw_data.loc[((raw_data.Age < 55) & (raw_data.Age >= 34) ), 'AgeGroup'] = '2'\n", "raw_data.loc[((raw_data.Age < 65) & (raw_data.Age >= 55) ), 'AgeGroup'] = '3'\n", "raw_data.loc[(raw_data.Age >= 65), 'AgeGroup'] = '4'\n", "\n", "raw_data[:10]" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "##################\n", "Groupe d'age : 1\n", "number alive = 387\n", "number dead = 11\n", "total number = 398\n", "number smoker = 179\n", "number non smoker = 219\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
SmokerNoYes
Status
Alive213174
Dead65
\n", "
" ], "text/plain": [ "Smoker No Yes\n", "Status \n", "Alive 213 174\n", "Dead 6 5" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "ratio_smoker = 0.027933\n", "ratio_non_smoker = 0.027397\n", "##################\n", "\n", "##################\n", "Groupe d'age : 2\n", "number alive = 378\n", "number dead = 60\n", "total number = 438\n", "number smoker = 239\n", "number non smoker = 199\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
SmokerNoYes
Status
Alive180198
Dead1941
\n", "
" ], "text/plain": [ "Smoker No Yes\n", "Status \n", "Alive 180 198\n", "Dead 19 41" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "ratio_smoker = 0.171548\n", "ratio_non_smoker = 0.095477\n", "##################\n", "\n", "##################\n", "Groupe d'age : 3\n", "number alive = 145\n", "number dead = 91\n", "total number = 236\n", "number smoker = 115\n", "number non smoker = 121\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
SmokerNoYes
Status
Alive8164
Dead4051
\n", "
" ], "text/plain": [ "Smoker No Yes\n", "Status \n", "Alive 81 64\n", "Dead 40 51" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "ratio_smoker = 0.443478\n", "ratio_non_smoker = 0.330579\n", "##################\n", "\n", "##################\n", "Groupe d'age : 4\n", "number alive = 35\n", "number dead = 207\n", "total number = 242\n", "number smoker = 49\n", "number non smoker = 193\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
SmokerNoYes
Status
Alive287
Dead16542
\n", "
" ], "text/plain": [ "Smoker No Yes\n", "Status \n", "Alive 28 7\n", "Dead 165 42" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "ratio_smoker = 0.857143\n", "ratio_non_smoker = 0.854922\n", "##################\n", "\n" ] } ], "source": [ "for age_group in ['1', '2', '3', '4']:\n", " print('##################')\n", " print(f'Groupe d\\'age : {age_group}')\n", " tab_class = raw_data.loc[(raw_data.AgeGroup == age_group)]\n", " dead = tab_class['Status'].value_counts()['Dead']\n", " alive = tab_class['Status'].value_counts()['Alive']\n", " print(f'number alive = {alive}')\n", " print(f'number dead = {dead}')\n", " print(f'total number = {alive + dead}')\n", " smoker = tab_class['Smoker'].value_counts()['Yes']\n", " non_smoker = tab_class['Smoker'].value_counts()['No']\n", " print(f'number smoker = {smoker}')\n", " print(f'number non smoker = {non_smoker}')\n", " tab_class = pd.crosstab(tab_class.Status, tab_class.Smoker) # on a bien le bon nombre de vivants et de morts\n", " display(tab_class)\n", " numpy_data = np.array(tab_class)\n", " \n", " ratio_smoker = numpy_data[1,1]/smoker\n", " ratio_non_smoker = numpy_data[1,0]/non_smoker\n", " print(f'ratio_smoker = {ratio_smoker:.6f}')\n", " print(f'ratio_non_smoker = {ratio_non_smoker:.6f}')\n", " print('##################\\n')\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "**Analyse :** \n", "- Dans le groupe 1 : chez les plus jeunes (18-34 ans), le fait de fumer n'influe pas énormément sur le taux de mortalité. \n", "- Dans le groupe 2 : fumer tue, le taux de mortalité est presque 2 fois plus élévé chez les fumeurs. \n", "- Dans le groupe 3 : ici, le tabac semble augmenter un peu le taux de mortalité. \n", "- Dans le groupe 4 : c'est catastrophique, tout le monde meurt. Les fumeurs ont le même taux de mortalité que les non-fumeurs, ce qui est normal puisque c'est la catégorie où l'âge est le plus élevé. " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "C'est donc la catégorie 4 qui semble biaiser les données globales." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Question 3" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [], "source": [ "raw_data.loc[(raw_data.Status =='Dead'),'Death'] = 1\n", "raw_data.loc[(raw_data.Status =='Alive'),'Death'] = 0" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
SmokerStatusAgeAgeGroupDeathIntercept
0YesAlive21.0101
1YesAlive19.3101
2NoDead57.5311
3NoAlive47.1201
4YesAlive81.4401
5NoAlive36.8201
6NoAlive23.8101
7YesDead57.5311
8YesAlive24.8101
9YesAlive49.5201
\n", "
" ], "text/plain": [ " Smoker Status Age AgeGroup Death Intercept\n", "0 Yes Alive 21.0 1 0 1\n", "1 Yes Alive 19.3 1 0 1\n", "2 No Dead 57.5 3 1 1\n", "3 No Alive 47.1 2 0 1\n", "4 Yes Alive 81.4 4 0 1\n", "5 No Alive 36.8 2 0 1\n", "6 No Alive 23.8 1 0 1\n", "7 Yes Dead 57.5 3 1 1\n", "8 Yes Alive 24.8 1 0 1\n", "9 Yes Alive 49.5 2 0 1" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "display(raw_data[:10])" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.4" } }, "nbformat": 4, "nbformat_minor": 2 }