{ "cells": [ { "cell_type": "markdown", "metadata": { "hideCode": false, "hidePrompt": true }, "source": [ "# Autour du paradoxe de Simpson" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "hideCode": false, "hidePrompt": true }, "outputs": [], "source": [ "import pandas as pd\n", "import matplotlib.pyplot as plt" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "hideCode": false, "hidePrompt": true }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
SmokerStatusAge
0YesAlive21.0
1YesAlive19.3
2NoDead57.5
3NoAlive47.1
4YesAlive81.4
5NoAlive36.8
6NoAlive23.8
7YesDead57.5
8YesAlive24.8
9YesAlive49.5
10YesAlive30.0
11NoDead66.0
12YesAlive49.2
13NoAlive58.4
14NoDead60.6
15NoAlive25.1
16NoAlive43.5
17NoAlive27.1
18NoAlive58.3
19YesAlive65.7
20NoDead73.2
21YesAlive38.3
22NoAlive33.4
23YesDead62.3
24NoAlive18.0
25NoAlive56.2
26YesAlive59.2
27NoAlive25.8
28NoDead36.9
29NoAlive20.2
............
1284YesDead36.0
1285YesAlive48.3
1286NoAlive63.1
1287NoAlive60.8
1288YesDead39.3
1289NoAlive36.7
1290NoAlive63.8
1291NoDead71.3
1292NoAlive57.7
1293NoAlive63.2
1294NoAlive46.6
1295YesDead82.4
1296YesAlive38.3
1297YesAlive32.7
1298NoAlive39.7
1299YesDead60.0
1300NoDead71.0
1301NoAlive20.5
1302NoAlive44.4
1303YesAlive31.2
1304YesAlive47.8
1305YesAlive60.9
1306NoDead61.4
1307YesAlive43.0
1308NoAlive42.1
1309YesAlive35.9
1310NoAlive22.3
1311YesDead62.1
1312NoDead88.6
1313NoAlive39.1
\n", "

1314 rows × 3 columns

\n", "
" ], "text/plain": [ " Smoker Status Age\n", "0 Yes Alive 21.0\n", "1 Yes Alive 19.3\n", "2 No Dead 57.5\n", "3 No Alive 47.1\n", "4 Yes Alive 81.4\n", "5 No Alive 36.8\n", "6 No Alive 23.8\n", "7 Yes Dead 57.5\n", "8 Yes Alive 24.8\n", "9 Yes Alive 49.5\n", "10 Yes Alive 30.0\n", "11 No Dead 66.0\n", "12 Yes Alive 49.2\n", "13 No Alive 58.4\n", "14 No Dead 60.6\n", "15 No Alive 25.1\n", "16 No Alive 43.5\n", "17 No Alive 27.1\n", "18 No Alive 58.3\n", "19 Yes Alive 65.7\n", "20 No Dead 73.2\n", "21 Yes Alive 38.3\n", "22 No Alive 33.4\n", "23 Yes Dead 62.3\n", "24 No Alive 18.0\n", "25 No Alive 56.2\n", "26 Yes Alive 59.2\n", "27 No Alive 25.8\n", "28 No Dead 36.9\n", "29 No Alive 20.2\n", "... ... ... ...\n", "1284 Yes Dead 36.0\n", "1285 Yes Alive 48.3\n", "1286 No Alive 63.1\n", "1287 No Alive 60.8\n", "1288 Yes Dead 39.3\n", "1289 No Alive 36.7\n", "1290 No Alive 63.8\n", "1291 No Dead 71.3\n", "1292 No Alive 57.7\n", "1293 No Alive 63.2\n", "1294 No Alive 46.6\n", "1295 Yes Dead 82.4\n", "1296 Yes Alive 38.3\n", "1297 Yes Alive 32.7\n", "1298 No Alive 39.7\n", "1299 Yes Dead 60.0\n", "1300 No Dead 71.0\n", "1301 No Alive 20.5\n", "1302 No Alive 44.4\n", "1303 Yes Alive 31.2\n", "1304 Yes Alive 47.8\n", "1305 Yes Alive 60.9\n", "1306 No Dead 61.4\n", "1307 Yes Alive 43.0\n", "1308 No Alive 42.1\n", "1309 Yes Alive 35.9\n", "1310 No Alive 22.3\n", "1311 Yes Dead 62.1\n", "1312 No Dead 88.6\n", "1313 No Alive 39.1\n", "\n", "[1314 rows x 3 columns]" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "donnees = pd.read_csv('Subject6_smoking.csv')\n", "donnees" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "hideCode": false, "hidePrompt": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "RangeIndex: 1314 entries, 0 to 1313\n", "Data columns (total 3 columns):\n", "Smoker 1314 non-null object\n", "Status 1314 non-null object\n", "Age 1314 non-null float64\n", "dtypes: 
float64(1), object(2)\n", "memory usage: 30.9+ KB\n" ] } ], "source": [ "donnees.info()" ] }, { "cell_type": "markdown", "metadata": { "hideCode": false, "hidePrompt": true }, "source": [ "Pour faire un peu de statistiques sur ces données, on peut représenter `Yes` par `1` et `No` par `0`, et `Alive` par `1` et `Dead` par `0`. On va en fait procéder autrement mais c'était ma première approche naïve, je laisse donc mon code initial ainsi que les résultats obtenus. Au lieu d'utiliser la méthode `apply` j'aurais également pu utiliser la méthode `replace`." ] }, { "cell_type": "markdown", "metadata": { "hideCode": false, "hidePrompt": true }, "source": [ "``` \n", "def convert(x):\n", " if (x=='Yes') | (x==\"Alive\"):\n", " return 1\n", " elif (x=='No') | (x=='Dead'):\n", " return 0\n", "\n", "donnees['Smoker'] = donnees['Smoker'].apply(convert)\n", "donnees['Status'] = donnees['Status'].apply(convert)\n", "donnees[['Smoker','Status']].sum()\n", "```" ] }, { "cell_type": "markdown", "metadata": { "hideCode": false, "hidePrompt": true }, "source": [ "Parmi les 1314 femmes sondées, il y a donc 582 fumeuses, et 945 des femmes (fumeuses et non fumeuses) sont encore vivantes 20 ans après." ] }, { "cell_type": "markdown", "metadata": { "hideCode": false, "hidePrompt": true }, "source": [ "Ré-obtenons ces informations à l'aide des méthodes de regroupement." 
] }, { "cell_type": "code", "execution_count": 4, "metadata": { "hideCode": false, "hidePrompt": true }, "outputs": [ { "data": { "text/plain": [ "No 732\n", "Yes 582\n", "Name: Smoker, dtype: int64" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "donnees['Smoker'].value_counts()" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "hideCode": false, "hidePrompt": true, "scrolled": true }, "outputs": [ { "data": { "text/plain": [ "Alive 945\n", "Dead 369\n", "Name: Status, dtype: int64" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "donnees['Status'].value_counts()" ] }, { "cell_type": "markdown", "metadata": { "hideCode": false, "hidePrompt": true }, "source": [ "Maintenant regardons les infos jointes, pour essayer de comprendre les dépendances. Pour ça, on pourrait écrire \n", "```\n", "donnees[['Smoker','Status']].value_counts()\n", "```\n", "pour avoir le tableau souhaité, mais ce notebook Jupyter utilise une version de pandas qui n'est pas à jour, dans laquelle on ne peut pas utiliser `value_counts` sur un `DataFrame`, donc on ruse." ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "hideCode": false, "hidePrompt": true }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Total
SmokerStatus
NoAlive502
Dead230
YesAlive443
Dead139
\n", "
" ], "text/plain": [ " Total\n", "Smoker Status \n", "No Alive 502\n", " Dead 230\n", "Yes Alive 443\n", " Dead 139" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tableau = donnees.groupby(['Smoker','Status']).count()\n", "tableau = tableau.rename(columns={'Age':'Total'})\n", "tableau" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "hideCode": false, "hidePrompt": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "MultiIndex: 4 entries, (No, Alive) to (Yes, Dead)\n", "Data columns (total 1 columns):\n", "Total 4 non-null int64\n", "dtypes: int64(1)\n", "memory usage: 238.0+ bytes\n" ] } ], "source": [ "tableau.info()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Un autre moyen d'obtenir le même tableau (toujours avec le même souci de nom)." ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Age
SmokerStatus
NoAlive502
Dead230
YesAlive443
Dead139
All1314
\n", "
" ], "text/plain": [ " Age\n", "Smoker Status \n", "No Alive 502\n", " Dead 230\n", "Yes Alive 443\n", " Dead 139\n", "All 1314" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "donnees.pivot_table(index = ['Smoker','Status'], aggfunc='count', margins=True)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Une manière plus agréable visuellement d'avoir les mêmes données, et en supprimant le problème de nom :" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "scrolled": true }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
StatusAliveDeadAll
Smoker
No502230732
Yes443139582
All9453691314
\n", "
" ], "text/plain": [ "Status Alive Dead All\n", "Smoker \n", "No 502 230 732\n", "Yes 443 139 582\n", "All 945 369 1314" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "table2 = donnees.pivot_table(index = 'Smoker',values='Age', columns='Status', aggfunc='count', margins=True) \n", "table2" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "Index: 3 entries, No to All\n", "Data columns (total 3 columns):\n", "Alive 3 non-null int64\n", "Dead 3 non-null int64\n", "All 3 non-null int64\n", "dtypes: int64(3)\n", "memory usage: 96.0+ bytes\n" ] } ], "source": [ "table2.info()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Remarquons que si on ne précise pas `values='Age'`, on obtient quelque chose de proche mais avec un souci de nom à nouveau." ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Age
StatusAliveDeadAll
Smoker
No502230732
Yes443139582
All9453691314
\n", "
" ], "text/plain": [ " Age \n", "Status Alive Dead All\n", "Smoker \n", "No 502 230 732\n", "Yes 443 139 582\n", "All 945 369 1314" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "table3 = donnees.pivot_table(index = 'Smoker', columns='Status', aggfunc='count', margins=True) \n", "table3" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "Index: 3 entries, No to All\n", "Data columns (total 3 columns):\n", "(Age, Alive) 3 non-null int64\n", "(Age, Dead) 3 non-null int64\n", "(Age, All) 3 non-null int64\n", "dtypes: int64(3)\n", "memory usage: 96.0+ bytes\n" ] } ], "source": [ "table3.info()" ] }, { "cell_type": "markdown", "metadata": { "hideCode": false, "hidePrompt": true }, "source": [ "Evaluons maintenant le taux de mortalité selon si on fume ou non." ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
StatusAliveDeadAllratio_deces
Smoker
No5022307320.314208
Yes4431395820.238832
All94536913140.280822
\n", "
" ], "text/plain": [ "Status Alive Dead All ratio_deces\n", "Smoker \n", "No 502 230 732 0.314208\n", "Yes 443 139 582 0.238832\n", "All 945 369 1314 0.280822" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "table2['ratio_deces'] = table2['Dead']/table2['All']\n", "table2" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "On observe donc que les fumeuses ont eu un taux de décès plus faible !\n", "\n", "Le problème avec notre étude, c'est que l'âge (qui est clairement un facteur dans la mort des individus) des participantes n'a pas été pris en compte, or le fait de fumer (ou non) pour une femme dans les années 70 est corrélé avec l'âge et nos groupes ne sont pas équivalents du point de vue de l'âge. On peut facilement le vérifier ici :" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
StatusAliveDeadAll
Smoker
No40.34741070.48173949.815847
Yes39.64898458.99640344.269759
All40.02000066.15528547.359361
\n", "
" ], "text/plain": [ "Status Alive Dead All\n", "Smoker \n", "No 40.347410 70.481739 49.815847\n", "Yes 39.648984 58.996403 44.269759\n", "All 40.020000 66.155285 47.359361" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "donnees.pivot_table(index = 'Smoker',values='Age', columns='Status', aggfunc='mean', margins=True)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Ajoutons des tranches d'ages pour prendre en compte cette composante." ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [], "source": [ "def classifie(x):\n", " if x <= 34:\n", " return '18-34 ans'\n", " elif x <= 54:\n", " return '35-54 ans'\n", " elif x <= 65:\n", " return '55-65 ans'\n", " else:\n", " return '>65 ans'\n", " \n", "donnees['tranche']=donnees['Age'].apply(classifie)" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
SmokerStatusAgetranche
0YesAlive21.018-34 ans
1YesAlive19.318-34 ans
2NoDead57.555-65 ans
3NoAlive47.135-54 ans
4YesAlive81.4>65 ans
5NoAlive36.835-54 ans
6NoAlive23.818-34 ans
7YesDead57.555-65 ans
8YesAlive24.818-34 ans
9YesAlive49.535-54 ans
10YesAlive30.018-34 ans
11NoDead66.0>65 ans
12YesAlive49.235-54 ans
13NoAlive58.455-65 ans
14NoDead60.655-65 ans
15NoAlive25.118-34 ans
16NoAlive43.535-54 ans
17NoAlive27.118-34 ans
18NoAlive58.355-65 ans
19YesAlive65.7>65 ans
20NoDead73.2>65 ans
21YesAlive38.335-54 ans
22NoAlive33.418-34 ans
23YesDead62.355-65 ans
24NoAlive18.018-34 ans
25NoAlive56.255-65 ans
26YesAlive59.255-65 ans
27NoAlive25.818-34 ans
28NoDead36.935-54 ans
29NoAlive20.218-34 ans
...............
1284YesDead36.035-54 ans
1285YesAlive48.335-54 ans
1286NoAlive63.155-65 ans
1287NoAlive60.855-65 ans
1288YesDead39.335-54 ans
1289NoAlive36.735-54 ans
1290NoAlive63.855-65 ans
1291NoDead71.3>65 ans
1292NoAlive57.755-65 ans
1293NoAlive63.255-65 ans
1294NoAlive46.635-54 ans
1295YesDead82.4>65 ans
1296YesAlive38.335-54 ans
1297YesAlive32.718-34 ans
1298NoAlive39.735-54 ans
1299YesDead60.055-65 ans
1300NoDead71.0>65 ans
1301NoAlive20.518-34 ans
1302NoAlive44.435-54 ans
1303YesAlive31.218-34 ans
1304YesAlive47.835-54 ans
1305YesAlive60.955-65 ans
1306NoDead61.455-65 ans
1307YesAlive43.035-54 ans
1308NoAlive42.135-54 ans
1309YesAlive35.935-54 ans
1310NoAlive22.318-34 ans
1311YesDead62.155-65 ans
1312NoDead88.6>65 ans
1313NoAlive39.135-54 ans
\n", "

1314 rows × 4 columns

\n", "
" ], "text/plain": [ " Smoker Status Age tranche\n", "0 Yes Alive 21.0 18-34 ans\n", "1 Yes Alive 19.3 18-34 ans\n", "2 No Dead 57.5 55-65 ans\n", "3 No Alive 47.1 35-54 ans\n", "4 Yes Alive 81.4 >65 ans\n", "5 No Alive 36.8 35-54 ans\n", "6 No Alive 23.8 18-34 ans\n", "7 Yes Dead 57.5 55-65 ans\n", "8 Yes Alive 24.8 18-34 ans\n", "9 Yes Alive 49.5 35-54 ans\n", "10 Yes Alive 30.0 18-34 ans\n", "11 No Dead 66.0 >65 ans\n", "12 Yes Alive 49.2 35-54 ans\n", "13 No Alive 58.4 55-65 ans\n", "14 No Dead 60.6 55-65 ans\n", "15 No Alive 25.1 18-34 ans\n", "16 No Alive 43.5 35-54 ans\n", "17 No Alive 27.1 18-34 ans\n", "18 No Alive 58.3 55-65 ans\n", "19 Yes Alive 65.7 >65 ans\n", "20 No Dead 73.2 >65 ans\n", "21 Yes Alive 38.3 35-54 ans\n", "22 No Alive 33.4 18-34 ans\n", "23 Yes Dead 62.3 55-65 ans\n", "24 No Alive 18.0 18-34 ans\n", "25 No Alive 56.2 55-65 ans\n", "26 Yes Alive 59.2 55-65 ans\n", "27 No Alive 25.8 18-34 ans\n", "28 No Dead 36.9 35-54 ans\n", "29 No Alive 20.2 18-34 ans\n", "... ... ... ... 
...\n", "1284 Yes Dead 36.0 35-54 ans\n", "1285 Yes Alive 48.3 35-54 ans\n", "1286 No Alive 63.1 55-65 ans\n", "1287 No Alive 60.8 55-65 ans\n", "1288 Yes Dead 39.3 35-54 ans\n", "1289 No Alive 36.7 35-54 ans\n", "1290 No Alive 63.8 55-65 ans\n", "1291 No Dead 71.3 >65 ans\n", "1292 No Alive 57.7 55-65 ans\n", "1293 No Alive 63.2 55-65 ans\n", "1294 No Alive 46.6 35-54 ans\n", "1295 Yes Dead 82.4 >65 ans\n", "1296 Yes Alive 38.3 35-54 ans\n", "1297 Yes Alive 32.7 18-34 ans\n", "1298 No Alive 39.7 35-54 ans\n", "1299 Yes Dead 60.0 55-65 ans\n", "1300 No Dead 71.0 >65 ans\n", "1301 No Alive 20.5 18-34 ans\n", "1302 No Alive 44.4 35-54 ans\n", "1303 Yes Alive 31.2 18-34 ans\n", "1304 Yes Alive 47.8 35-54 ans\n", "1305 Yes Alive 60.9 55-65 ans\n", "1306 No Dead 61.4 55-65 ans\n", "1307 Yes Alive 43.0 35-54 ans\n", "1308 No Alive 42.1 35-54 ans\n", "1309 Yes Alive 35.9 35-54 ans\n", "1310 No Alive 22.3 18-34 ans\n", "1311 Yes Dead 62.1 55-65 ans\n", "1312 No Dead 88.6 >65 ans\n", "1313 No Alive 39.1 35-54 ans\n", "\n", "[1314 rows x 4 columns]" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "donnees" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
StatusAliveDeadAllratio_deces
Smoker
No22162270.026432
Yes18271890.037037
All403134160.031250
\n", "
" ], "text/plain": [ "Status Alive Dead All ratio_deces\n", "Smoker \n", "No 221 6 227 0.026432\n", "Yes 182 7 189 0.037037\n", "All 403 13 416 0.031250" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "table18_34 = pd.pivot_table(donnees[donnees['Age']<35],index = 'Smoker',values='Age', columns='Status', aggfunc='count', margins=True)\n", "table18_34['ratio_deces'] = table18_34['Dead']/table18_34['All']\n", "table18_34" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
StatusAliveDeadAllratio_deces
Smoker
No172191910.099476
Yes190392290.170306
All362584200.138095
\n", "
" ], "text/plain": [ "Status Alive Dead All ratio_deces\n", "Smoker \n", "No 172 19 191 0.099476\n", "Yes 190 39 229 0.170306\n", "All 362 58 420 0.138095" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "table35_54 = pd.pivot_table(donnees[(donnees['Age']>=35) & (donnees['Age']<55)],index = 'Smoker',values='Age', columns='Status', aggfunc='count', margins=True)\n", "table35_54['ratio_deces'] = table35_54['Dead']/table35_54['All']\n", "table35_54" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
StatusAliveDeadAllratio_deces
Smoker
No82491310.374046
Yes65531180.449153
All1471022490.409639
\n", "
" ], "text/plain": [ "Status Alive Dead All ratio_deces\n", "Smoker \n", "No 82 49 131 0.374046\n", "Yes 65 53 118 0.449153\n", "All 147 102 249 0.409639" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "table55_65 = pd.pivot_table(donnees[(donnees['Age']>=55) & (donnees['Age']<66)],index = 'Smoker',values='Age', columns='Status', aggfunc='count', margins=True)\n", "table55_65['ratio_deces'] = table55_65['Dead']/table55_65['All']\n", "table55_65" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
StatusAliveDeadAllratio_deces
Smoker
No271651920.859375
Yes742490.857143
All342072410.858921
\n", "
" ], "text/plain": [ "Status Alive Dead All ratio_deces\n", "Smoker \n", "No 27 165 192 0.859375\n", "Yes 7 42 49 0.857143\n", "All 34 207 241 0.858921" ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "table65plus = pd.pivot_table(donnees[(donnees['Age']>65)],index = 'Smoker',values='Age', columns='Status', aggfunc='count', margins=True)\n", "table65plus['ratio_deces'] = table65plus['Dead']/table65plus['All']\n", "table65plus" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "On remarque que dans les tranches d'âge de 18 à 65 ans, les fumeuses ont un taux de décès plus fort ; chez les plus de 65 ans, les deux taux sont quasiment identiques (environ 86 %). La tendance s'inverse quand on ne regarde plus l'âge car dans nos données les fumeuses sont plutôt plus jeunes que les non fumeuses." ] }, { "cell_type": "code", "execution_count": 43, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 43, "metadata": {}, "output_type": "execute_result" }, { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "tableAge = donnees.pivot_table(index = 'tranche',values='Age', columns=['Smoker','Status'], aggfunc='count')\n", "tableAge.plot.bar(stacked=False,figsize=(15,7))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "On peut sinon simplement calculer la proportion de fumeuses par tranche d'âge et voir que cette proportion s'effondre pour les personnes de plus de 65 ans." ] }, { "cell_type": "code", "execution_count": 48, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
SmokerNoYesAllratio
tranche
18-34 ans2191814000.452500
35-54 ans1992374360.543578
55-65 ans1221152370.485232
>65 ans192492410.203320
All73258213140.442922
\n", "
" ], "text/plain": [ "Smoker No Yes All ratio\n", "tranche \n", "18-34 ans 219 181 400 0.452500\n", "35-54 ans 199 237 436 0.543578\n", "55-65 ans 122 115 237 0.485232\n", ">65 ans 192 49 241 0.203320\n", "All 732 582 1314 0.442922" ] }, "execution_count": 48, "metadata": {}, "output_type": "execute_result" } ], "source": [ "fumeuseAge = donnees.pivot_table(index = 'tranche',values='Age', columns='Smoker', aggfunc='count',margins=True)\n", "fumeuseAge['ratio']=fumeuseAge['Yes']/fumeuseAge['All']\n", "fumeuseAge" ] } ], "metadata": { "hide_code_all_hidden": true, "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.4" } }, "nbformat": 4, "nbformat_minor": 2 }