{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Première analyse rapide des données"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"# import des packages importants\n",
"import numpy as np \n",
"import matplotlib.pyplot as plt \n",
"import pandas as pd \n",
"import seaborn as sns"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"data_url = \"https://gitlab.inria.fr/learninglab/mooc-rr/mooc-rr-ressources/-/raw/master/module3/Practical_session/Subject6_smoking.csv?inline=false\" "
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Smoker | \n",
" Status | \n",
" Age | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" Yes | \n",
" Alive | \n",
" 21.0 | \n",
"
\n",
" \n",
" 1 | \n",
" Yes | \n",
" Alive | \n",
" 19.3 | \n",
"
\n",
" \n",
" 2 | \n",
" No | \n",
" Dead | \n",
" 57.5 | \n",
"
\n",
" \n",
" 3 | \n",
" No | \n",
" Alive | \n",
" 47.1 | \n",
"
\n",
" \n",
" 4 | \n",
" Yes | \n",
" Alive | \n",
" 81.4 | \n",
"
\n",
" \n",
" 5 | \n",
" No | \n",
" Alive | \n",
" 36.8 | \n",
"
\n",
" \n",
" 6 | \n",
" No | \n",
" Alive | \n",
" 23.8 | \n",
"
\n",
" \n",
" 7 | \n",
" Yes | \n",
" Dead | \n",
" 57.5 | \n",
"
\n",
" \n",
" 8 | \n",
" Yes | \n",
" Alive | \n",
" 24.8 | \n",
"
\n",
" \n",
" 9 | \n",
" Yes | \n",
" Alive | \n",
" 49.5 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Smoker Status Age\n",
"0 Yes Alive 21.0\n",
"1 Yes Alive 19.3\n",
"2 No Dead 57.5\n",
"3 No Alive 47.1\n",
"4 Yes Alive 81.4\n",
"5 No Alive 36.8\n",
"6 No Alive 23.8\n",
"7 Yes Dead 57.5\n",
"8 Yes Alive 24.8\n",
"9 Yes Alive 49.5"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"raw_data = pd.read_csv(data_url)\n",
"raw_data[:10]"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Smoker | \n",
" Status | \n",
" Age | \n",
"
\n",
" \n",
" \n",
" \n",
"
\n",
"
"
],
"text/plain": [
"Empty DataFrame\n",
"Columns: [Smoker, Status, Age]\n",
"Index: []"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"raw_data[raw_data.isnull().any(axis=1)] "
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Donc ici pas de point manquant -> pas besoin de modifier les données."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"On compte les morts, les vivants, les fumeurs et non-fumeurs. Cela permet notamment de vérifier rapidement l'intégrité des données. "
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"number alive = 945\n",
"number dead = 369\n",
"total number = 1314\n",
"number smoker = 582\n",
"number non smoker = 732\n",
"total number = 1314\n",
"Number of data : 1314\n"
]
}
],
"source": [
"dead = raw_data['Status'].value_counts()['Dead']\n",
"alive = raw_data['Status'].value_counts()['Alive']\n",
"print(f'number alive = {alive}')\n",
"print(f'number dead = {dead}')\n",
"print(f'total number = {alive + dead}')\n",
"smoker = raw_data['Smoker'].value_counts()['Yes']\n",
"non_smoker = raw_data['Smoker'].value_counts()['No']\n",
"print(f'number smoker = {smoker}')\n",
"print(f'number non smoker = {non_smoker}')\n",
"print(f'total number = {smoker + non_smoker}')\n",
"\n",
"print(f'Number of data : {len(raw_data)}')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Question 1"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" Smoker | \n",
" No | \n",
" Yes | \n",
"
\n",
" \n",
" Status | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" Alive | \n",
" 502 | \n",
" 443 | \n",
"
\n",
" \n",
" Dead | \n",
" 230 | \n",
" 139 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
"Smoker No Yes\n",
"Status \n",
"Alive 502 443\n",
"Dead 230 139"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"tab = pd.crosstab(raw_data.Status, raw_data.Smoker) # on a bien le bon nombre de vivants et de morts\n",
"tab"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[[502 443]\n",
" [230 139]]\n",
"ratio_smoker = 0.238832\n",
"ratio_non_smoker = 0.314208\n"
]
}
],
"source": [
"numpy_data = np.array(tab)\n",
"print(numpy_data)\n",
"ratio_smoker = numpy_data[1,1]/smoker\n",
"ratio_non_smoker = numpy_data[1,0]/non_smoker\n",
"print(f'ratio_smoker = {ratio_smoker:.6f}')\n",
"print(f'ratio_non_smoker = {ratio_non_smoker:.6f}')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"C'est dommage, il semblerait que les gens qui ne fument pas meurent plus que les gens qui fument... embêtant."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Question 2 "
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Smoker | \n",
" Status | \n",
" Age | \n",
" AgeGroup | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" Yes | \n",
" Alive | \n",
" 21.0 | \n",
" 1 | \n",
"
\n",
" \n",
" 1 | \n",
" Yes | \n",
" Alive | \n",
" 19.3 | \n",
" 1 | \n",
"
\n",
" \n",
" 2 | \n",
" No | \n",
" Dead | \n",
" 57.5 | \n",
" 3 | \n",
"
\n",
" \n",
" 3 | \n",
" No | \n",
" Alive | \n",
" 47.1 | \n",
" 2 | \n",
"
\n",
" \n",
" 4 | \n",
" Yes | \n",
" Alive | \n",
" 81.4 | \n",
" 4 | \n",
"
\n",
" \n",
" 5 | \n",
" No | \n",
" Alive | \n",
" 36.8 | \n",
" 2 | \n",
"
\n",
" \n",
" 6 | \n",
" No | \n",
" Alive | \n",
" 23.8 | \n",
" 1 | \n",
"
\n",
" \n",
" 7 | \n",
" Yes | \n",
" Dead | \n",
" 57.5 | \n",
" 3 | \n",
"
\n",
" \n",
" 8 | \n",
" Yes | \n",
" Alive | \n",
" 24.8 | \n",
" 1 | \n",
"
\n",
" \n",
" 9 | \n",
" Yes | \n",
" Alive | \n",
" 49.5 | \n",
" 2 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Smoker Status Age AgeGroup\n",
"0 Yes Alive 21.0 1\n",
"1 Yes Alive 19.3 1\n",
"2 No Dead 57.5 3\n",
"3 No Alive 47.1 2\n",
"4 Yes Alive 81.4 4\n",
"5 No Alive 36.8 2\n",
"6 No Alive 23.8 1\n",
"7 Yes Dead 57.5 3\n",
"8 Yes Alive 24.8 1\n",
"9 Yes Alive 49.5 2"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"raw_data.loc[((raw_data.Age < 34) & (raw_data.Age >= 18) ), 'AgeGroup'] = '1'\n",
"raw_data.loc[((raw_data.Age < 55) & (raw_data.Age >= 34) ), 'AgeGroup'] = '2'\n",
"raw_data.loc[((raw_data.Age < 65) & (raw_data.Age >= 55) ), 'AgeGroup'] = '3'\n",
"raw_data.loc[(raw_data.Age >= 65), 'AgeGroup'] = '4'\n",
"\n",
"raw_data[:10]"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"##################\n",
"Groupe d'age : 1\n",
"number alive = 387\n",
"number dead = 11\n",
"total number = 398\n",
"number smoker = 179\n",
"number non smoker = 219\n"
]
},
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" Smoker | \n",
" No | \n",
" Yes | \n",
"
\n",
" \n",
" Status | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" Alive | \n",
" 213 | \n",
" 174 | \n",
"
\n",
" \n",
" Dead | \n",
" 6 | \n",
" 5 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
"Smoker No Yes\n",
"Status \n",
"Alive 213 174\n",
"Dead 6 5"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"ratio_smoker = 0.027933\n",
"ratio_non_smoker = 0.027397\n",
"##################\n",
"\n",
"##################\n",
"Groupe d'age : 2\n",
"number alive = 378\n",
"number dead = 60\n",
"total number = 438\n",
"number smoker = 239\n",
"number non smoker = 199\n"
]
},
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" Smoker | \n",
" No | \n",
" Yes | \n",
"
\n",
" \n",
" Status | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" Alive | \n",
" 180 | \n",
" 198 | \n",
"
\n",
" \n",
" Dead | \n",
" 19 | \n",
" 41 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
"Smoker No Yes\n",
"Status \n",
"Alive 180 198\n",
"Dead 19 41"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"ratio_smoker = 0.171548\n",
"ratio_non_smoker = 0.095477\n",
"##################\n",
"\n",
"##################\n",
"Groupe d'age : 3\n",
"number alive = 145\n",
"number dead = 91\n",
"total number = 236\n",
"number smoker = 115\n",
"number non smoker = 121\n"
]
},
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" Smoker | \n",
" No | \n",
" Yes | \n",
"
\n",
" \n",
" Status | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" Alive | \n",
" 81 | \n",
" 64 | \n",
"
\n",
" \n",
" Dead | \n",
" 40 | \n",
" 51 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
"Smoker No Yes\n",
"Status \n",
"Alive 81 64\n",
"Dead 40 51"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"ratio_smoker = 0.443478\n",
"ratio_non_smoker = 0.330579\n",
"##################\n",
"\n",
"##################\n",
"Groupe d'age : 4\n",
"number alive = 35\n",
"number dead = 207\n",
"total number = 242\n",
"number smoker = 49\n",
"number non smoker = 193\n"
]
},
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" Smoker | \n",
" No | \n",
" Yes | \n",
"
\n",
" \n",
" Status | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" Alive | \n",
" 28 | \n",
" 7 | \n",
"
\n",
" \n",
" Dead | \n",
" 165 | \n",
" 42 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
"Smoker No Yes\n",
"Status \n",
"Alive 28 7\n",
"Dead 165 42"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"ratio_smoker = 0.857143\n",
"ratio_non_smoker = 0.854922\n",
"##################\n",
"\n"
]
}
],
"source": [
"for age_group in ['1', '2', '3', '4']:\n",
" print('##################')\n",
" print(f'Groupe d\\'age : {age_group}')\n",
" tab_class = raw_data.loc[(raw_data.AgeGroup == age_group)]\n",
" dead = tab_class['Status'].value_counts()['Dead']\n",
" alive = tab_class['Status'].value_counts()['Alive']\n",
" print(f'number alive = {alive}')\n",
" print(f'number dead = {dead}')\n",
" print(f'total number = {alive + dead}')\n",
" smoker = tab_class['Smoker'].value_counts()['Yes']\n",
" non_smoker = tab_class['Smoker'].value_counts()['No']\n",
" print(f'number smoker = {smoker}')\n",
" print(f'number non smoker = {non_smoker}')\n",
" tab_class = pd.crosstab(tab_class.Status, tab_class.Smoker) # on a bien le bon nombre de vivants et de morts\n",
" display(tab_class)\n",
" numpy_data = np.array(tab_class)\n",
" \n",
" ratio_smoker = numpy_data[1,1]/smoker\n",
" ratio_non_smoker = numpy_data[1,0]/non_smoker\n",
" print(f'ratio_smoker = {ratio_smoker:.6f}')\n",
" print(f'ratio_non_smoker = {ratio_non_smoker:.6f}')\n",
" print('##################\\n')\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**Analyse :** \n",
"- Dans le groupe 1 : chez les plus jeunes (18-34 ans), le fait de fumer n'influe pas énormément sur le taux de mortalité. \n",
"- Dans le groupe 2 : fumer tue, le taux de mortalité est presque 2 fois plus élévé chez les fumeurs. \n",
"- Dans le groupe 3 : ici, le tabac semble augmenter un peu le taux de mortalité. \n",
"- Dans le groupe 4 : c'est catastrophique, tout le monde meurt. Les fumeurs ont le même taux de mortalité que les non-fumeurs, ce qui est normal puisque c'est la catégorie où l'âge est le plus élevé. "
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"C'est donc la catégorie 4 qui semble biaiser les données globales."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Question 3"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
"raw_data.loc[(raw_data.Status =='Dead'),'Death'] = 1\n",
"raw_data.loc[(raw_data.Status =='Alive'),'Death'] = 0"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Smoker | \n",
" Status | \n",
" Age | \n",
" AgeGroup | \n",
" Death | \n",
" Intercept | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" Yes | \n",
" Alive | \n",
" 21.0 | \n",
" 1 | \n",
" 0 | \n",
" 1 | \n",
"
\n",
" \n",
" 1 | \n",
" Yes | \n",
" Alive | \n",
" 19.3 | \n",
" 1 | \n",
" 0 | \n",
" 1 | \n",
"
\n",
" \n",
" 2 | \n",
" No | \n",
" Dead | \n",
" 57.5 | \n",
" 3 | \n",
" 1 | \n",
" 1 | \n",
"
\n",
" \n",
" 3 | \n",
" No | \n",
" Alive | \n",
" 47.1 | \n",
" 2 | \n",
" 0 | \n",
" 1 | \n",
"
\n",
" \n",
" 4 | \n",
" Yes | \n",
" Alive | \n",
" 81.4 | \n",
" 4 | \n",
" 0 | \n",
" 1 | \n",
"
\n",
" \n",
" 5 | \n",
" No | \n",
" Alive | \n",
" 36.8 | \n",
" 2 | \n",
" 0 | \n",
" 1 | \n",
"
\n",
" \n",
" 6 | \n",
" No | \n",
" Alive | \n",
" 23.8 | \n",
" 1 | \n",
" 0 | \n",
" 1 | \n",
"
\n",
" \n",
" 7 | \n",
" Yes | \n",
" Dead | \n",
" 57.5 | \n",
" 3 | \n",
" 1 | \n",
" 1 | \n",
"
\n",
" \n",
" 8 | \n",
" Yes | \n",
" Alive | \n",
" 24.8 | \n",
" 1 | \n",
" 0 | \n",
" 1 | \n",
"
\n",
" \n",
" 9 | \n",
" Yes | \n",
" Alive | \n",
" 49.5 | \n",
" 2 | \n",
" 0 | \n",
" 1 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Smoker Status Age AgeGroup Death Intercept\n",
"0 Yes Alive 21.0 1 0 1\n",
"1 Yes Alive 19.3 1 0 1\n",
"2 No Dead 57.5 3 1 1\n",
"3 No Alive 47.1 2 0 1\n",
"4 Yes Alive 81.4 4 0 1\n",
"5 No Alive 36.8 2 0 1\n",
"6 No Alive 23.8 1 0 1\n",
"7 Yes Dead 57.5 3 1 1\n",
"8 Yes Alive 24.8 1 0 1\n",
"9 Yes Alive 49.5 2 0 1"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"display(raw_data[:10])"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.4"
}
},
"nbformat": 4,
"nbformat_minor": 2
}