"Ce document présente une analyse de données pour le MOOC recherche reproductible. Le but est d'analyser les données autour du paradoxe de Simpson, qui donne l'impression - en premier lieu - de donner des conclusions surprenantes sur l'effet du tabagisme sur la santé."
]
]
},
},
{
{
...
@@ -23,7 +23,7 @@
...
@@ -23,7 +23,7 @@
},
},
{
{
"cell_type": "code",
"cell_type": "code",
"execution_count": 1,
"execution_count": 2,
"metadata": {},
"metadata": {},
"outputs": [],
"outputs": [],
"source": [
"source": [
...
@@ -41,7 +41,7 @@
...
@@ -41,7 +41,7 @@
},
},
{
{
"cell_type": "code",
"cell_type": "code",
"execution_count": 2,
"execution_count": 3,
"metadata": {},
"metadata": {},
"outputs": [],
"outputs": [],
"source": [
"source": [
...
@@ -50,7 +50,7 @@
...
@@ -50,7 +50,7 @@
},
},
{
{
"cell_type": "code",
"cell_type": "code",
"execution_count": 3,
"execution_count": 4,
"metadata": {},
"metadata": {},
"outputs": [],
"outputs": [],
"source": [
"source": [
...
@@ -66,7 +66,7 @@
...
@@ -66,7 +66,7 @@
},
},
{
{
"cell_type": "code",
"cell_type": "code",
"execution_count": 4,
"execution_count": 5,
"metadata": {},
"metadata": {},
"outputs": [
"outputs": [
{
{
...
@@ -534,7 +534,7 @@
...
@@ -534,7 +534,7 @@
"[1314 rows x 3 columns]"
"[1314 rows x 3 columns]"
]
]
},
},
"execution_count": 4,
"execution_count": 5,
"metadata": {},
"metadata": {},
"output_type": "execute_result"
"output_type": "execute_result"
}
}
...
@@ -552,7 +552,7 @@
...
@@ -552,7 +552,7 @@
},
},
{
{
"cell_type": "code",
"cell_type": "code",
"execution_count": 5,
"execution_count": 6,
"metadata": {},
"metadata": {},
"outputs": [
"outputs": [
{
{
...
@@ -592,7 +592,7 @@
...
@@ -592,7 +592,7 @@
"Index: []"
"Index: []"
]
]
},
},
"execution_count": 5,
"execution_count": 6,
"metadata": {},
"metadata": {},
"output_type": "execute_result"
"output_type": "execute_result"
}
}
...
@@ -603,7 +603,7 @@
...
@@ -603,7 +603,7 @@
},
},
{
{
"cell_type": "code",
"cell_type": "code",
"execution_count": 17,
"execution_count": 7,
"metadata": {},
"metadata": {},
"outputs": [
"outputs": [
{
{
...
@@ -643,7 +643,7 @@
...
@@ -643,7 +643,7 @@
"Index: []"
"Index: []"
]
]
},
},
"execution_count": 17,
"execution_count": 7,
"metadata": {},
"metadata": {},
"output_type": "execute_result"
"output_type": "execute_result"
}
}
...
@@ -654,7 +654,7 @@
...
@@ -654,7 +654,7 @@
},
},
{
{
"cell_type": "code",
"cell_type": "code",
"execution_count": 19,
"execution_count": 8,
"metadata": {},
"metadata": {},
"outputs": [
"outputs": [
{
{
...
@@ -694,7 +694,7 @@
...
@@ -694,7 +694,7 @@
"Index: []"
"Index: []"
]
]
},
},
"execution_count": 19,
"execution_count": 8,
"metadata": {},
"metadata": {},
"output_type": "execute_result"
"output_type": "execute_result"
}
}
...
@@ -712,7 +712,7 @@
...
@@ -712,7 +712,7 @@
},
},
{
{
"cell_type": "code",
"cell_type": "code",
"execution_count": 20,
"execution_count": 9,
"metadata": {},
"metadata": {},
"outputs": [],
"outputs": [],
"source": [
"source": [
...
@@ -735,7 +735,7 @@
...
@@ -735,7 +735,7 @@
},
},
{
{
"cell_type": "code",
"cell_type": "code",
"execution_count": 36,
"execution_count": 10,
"metadata": {},
"metadata": {},
"outputs": [
"outputs": [
{
{
...
@@ -768,7 +768,7 @@
...
@@ -768,7 +768,7 @@
"cell_type": "markdown",
"cell_type": "markdown",
"metadata": {},
"metadata": {},
"source": [
"source": [
"## Taux de mortalité par classes d'âge"
"## Taux de mortalité par classes d'âge (question 2)"
]
]
},
},
{
{
...
@@ -780,7 +780,7 @@
...
@@ -780,7 +780,7 @@
},
},
{
{
"cell_type": "code",
"cell_type": "code",
"execution_count": 49,
"execution_count": 11,
"metadata": {},
"metadata": {},
"outputs": [
"outputs": [
{
{
...
@@ -822,7 +822,7 @@
...
@@ -822,7 +822,7 @@
"cell_type": "markdown",
"cell_type": "markdown",
"metadata": {},
"metadata": {},
"source": [
"source": [
"## Vérification de l'hypothèse - régression logistique"
"## Vérification de l'hypothèse - régression logistique (question 3)"
]
]
},
},
{
{
...
@@ -834,20 +834,570 @@
...
@@ -834,20 +834,570 @@
},
},
{
{
"cell_type": "code",
"cell_type": "code",
"execution_count": 56,
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
"dead_bool = [(data['Status'][i] == \"Dead\") for i in range(len(data))]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"data.insert(4, \"Dead?\",data)"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
"data['Dead?'] = dead_bool"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"metadata": {},
"outputs": [
"outputs": [
{
{
"ename": "SyntaxError",
"data": {
"evalue": "invalid syntax (<ipython-input-56-12aabec9006d>, line 1)",
"text/html": [
"output_type": "error",
"<div>\n",
"traceback": [
"<style scoped>\n",
"\u001b[0;36m File \u001b[0;32m\"<ipython-input-56-12aabec9006d>\"\u001b[0;36m, line \u001b[0;32m1\u001b[0m\n\u001b[0;31m dead_bool = [(col[i]['Status'] == \"Dead\") for i in 1:len(data)]\u001b[0m\n\u001b[0m ^\u001b[0m\n\u001b[0;31mSyntaxError\u001b[0m\u001b[0;31m:\u001b[0m invalid syntax\n"
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Smoker</th>\n",
" <th>Status</th>\n",
" <th>Age</th>\n",
" <th>Dead?</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>Yes</td>\n",
" <td>Alive</td>\n",
" <td>21.0</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Yes</td>\n",
" <td>Alive</td>\n",
" <td>19.3</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>No</td>\n",
" <td>Dead</td>\n",
" <td>57.5</td>\n",
" <td>True</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>No</td>\n",
" <td>Alive</td>\n",
" <td>47.1</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>Yes</td>\n",
" <td>Alive</td>\n",
" <td>81.4</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>No</td>\n",
" <td>Alive</td>\n",
" <td>36.8</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>No</td>\n",
" <td>Alive</td>\n",
" <td>23.8</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>Yes</td>\n",
" <td>Dead</td>\n",
" <td>57.5</td>\n",
" <td>True</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>Yes</td>\n",
" <td>Alive</td>\n",
" <td>24.8</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>Yes</td>\n",
" <td>Alive</td>\n",
" <td>49.5</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>Yes</td>\n",
" <td>Alive</td>\n",
" <td>30.0</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11</th>\n",
" <td>No</td>\n",
" <td>Dead</td>\n",
" <td>66.0</td>\n",
" <td>True</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12</th>\n",
" <td>Yes</td>\n",
" <td>Alive</td>\n",
" <td>49.2</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13</th>\n",
" <td>No</td>\n",
" <td>Alive</td>\n",
" <td>58.4</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>14</th>\n",
" <td>No</td>\n",
" <td>Dead</td>\n",
" <td>60.6</td>\n",
" <td>True</td>\n",
" </tr>\n",
" <tr>\n",
" <th>15</th>\n",
" <td>No</td>\n",
" <td>Alive</td>\n",
" <td>25.1</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>16</th>\n",
" <td>No</td>\n",
" <td>Alive</td>\n",
" <td>43.5</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>17</th>\n",
" <td>No</td>\n",
" <td>Alive</td>\n",
" <td>27.1</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18</th>\n",
" <td>No</td>\n",
" <td>Alive</td>\n",
" <td>58.3</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19</th>\n",
" <td>Yes</td>\n",
" <td>Alive</td>\n",
" <td>65.7</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>20</th>\n",
" <td>No</td>\n",
" <td>Dead</td>\n",
" <td>73.2</td>\n",
" <td>True</td>\n",
" </tr>\n",
" <tr>\n",
" <th>21</th>\n",
" <td>Yes</td>\n",
" <td>Alive</td>\n",
" <td>38.3</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>22</th>\n",
" <td>No</td>\n",
" <td>Alive</td>\n",
" <td>33.4</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>23</th>\n",
" <td>Yes</td>\n",
" <td>Dead</td>\n",
" <td>62.3</td>\n",
" <td>True</td>\n",
" </tr>\n",
" <tr>\n",
" <th>24</th>\n",
" <td>No</td>\n",
" <td>Alive</td>\n",
" <td>18.0</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>25</th>\n",
" <td>No</td>\n",
" <td>Alive</td>\n",
" <td>56.2</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>26</th>\n",
" <td>Yes</td>\n",
" <td>Alive</td>\n",
" <td>59.2</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>27</th>\n",
" <td>No</td>\n",
" <td>Alive</td>\n",
" <td>25.8</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>28</th>\n",
" <td>No</td>\n",
" <td>Dead</td>\n",
" <td>36.9</td>\n",
" <td>True</td>\n",
" </tr>\n",
" <tr>\n",
" <th>29</th>\n",
" <td>No</td>\n",
" <td>Alive</td>\n",
" <td>20.2</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1284</th>\n",
" <td>Yes</td>\n",
" <td>Dead</td>\n",
" <td>36.0</td>\n",
" <td>True</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1285</th>\n",
" <td>Yes</td>\n",
" <td>Alive</td>\n",
" <td>48.3</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1286</th>\n",
" <td>No</td>\n",
" <td>Alive</td>\n",
" <td>63.1</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1287</th>\n",
" <td>No</td>\n",
" <td>Alive</td>\n",
" <td>60.8</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1288</th>\n",
" <td>Yes</td>\n",
" <td>Dead</td>\n",
" <td>39.3</td>\n",
" <td>True</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1289</th>\n",
" <td>No</td>\n",
" <td>Alive</td>\n",
" <td>36.7</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1290</th>\n",
" <td>No</td>\n",
" <td>Alive</td>\n",
" <td>63.8</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1291</th>\n",
" <td>No</td>\n",
" <td>Dead</td>\n",
" <td>71.3</td>\n",
" <td>True</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1292</th>\n",
" <td>No</td>\n",
" <td>Alive</td>\n",
" <td>57.7</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1293</th>\n",
" <td>No</td>\n",
" <td>Alive</td>\n",
" <td>63.2</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1294</th>\n",
" <td>No</td>\n",
" <td>Alive</td>\n",
" <td>46.6</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1295</th>\n",
" <td>Yes</td>\n",
" <td>Dead</td>\n",
" <td>82.4</td>\n",
" <td>True</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1296</th>\n",
" <td>Yes</td>\n",
" <td>Alive</td>\n",
" <td>38.3</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1297</th>\n",
" <td>Yes</td>\n",
" <td>Alive</td>\n",
" <td>32.7</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1298</th>\n",
" <td>No</td>\n",
" <td>Alive</td>\n",
" <td>39.7</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1299</th>\n",
" <td>Yes</td>\n",
" <td>Dead</td>\n",
" <td>60.0</td>\n",
" <td>True</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1300</th>\n",
" <td>No</td>\n",
" <td>Dead</td>\n",
" <td>71.0</td>\n",
" <td>True</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1301</th>\n",
" <td>No</td>\n",
" <td>Alive</td>\n",
" <td>20.5</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1302</th>\n",
" <td>No</td>\n",
" <td>Alive</td>\n",
" <td>44.4</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1303</th>\n",
" <td>Yes</td>\n",
" <td>Alive</td>\n",
" <td>31.2</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1304</th>\n",
" <td>Yes</td>\n",
" <td>Alive</td>\n",
" <td>47.8</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1305</th>\n",
" <td>Yes</td>\n",
" <td>Alive</td>\n",
" <td>60.9</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1306</th>\n",
" <td>No</td>\n",
" <td>Dead</td>\n",
" <td>61.4</td>\n",
" <td>True</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1307</th>\n",
" <td>Yes</td>\n",
" <td>Alive</td>\n",
" <td>43.0</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1308</th>\n",
" <td>No</td>\n",
" <td>Alive</td>\n",
" <td>42.1</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1309</th>\n",
" <td>Yes</td>\n",
" <td>Alive</td>\n",
" <td>35.9</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1310</th>\n",
" <td>No</td>\n",
" <td>Alive</td>\n",
" <td>22.3</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1311</th>\n",
" <td>Yes</td>\n",
" <td>Dead</td>\n",
" <td>62.1</td>\n",
" <td>True</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1312</th>\n",
" <td>No</td>\n",
" <td>Dead</td>\n",
" <td>88.6</td>\n",
" <td>True</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1313</th>\n",
" <td>No</td>\n",
" <td>Alive</td>\n",
" <td>39.1</td>\n",
" <td>False</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>1314 rows × 4 columns</p>\n",
"</div>"
],
"text/plain": [
" Smoker Status Age Dead?\n",
"0 Yes Alive 21.0 False\n",
"1 Yes Alive 19.3 False\n",
"2 No Dead 57.5 True\n",
"3 No Alive 47.1 False\n",
"4 Yes Alive 81.4 False\n",
"5 No Alive 36.8 False\n",
"6 No Alive 23.8 False\n",
"7 Yes Dead 57.5 True\n",
"8 Yes Alive 24.8 False\n",
"9 Yes Alive 49.5 False\n",
"10 Yes Alive 30.0 False\n",
"11 No Dead 66.0 True\n",
"12 Yes Alive 49.2 False\n",
"13 No Alive 58.4 False\n",
"14 No Dead 60.6 True\n",
"15 No Alive 25.1 False\n",
"16 No Alive 43.5 False\n",
"17 No Alive 27.1 False\n",
"18 No Alive 58.3 False\n",
"19 Yes Alive 65.7 False\n",
"20 No Dead 73.2 True\n",
"21 Yes Alive 38.3 False\n",
"22 No Alive 33.4 False\n",
"23 Yes Dead 62.3 True\n",
"24 No Alive 18.0 False\n",
"25 No Alive 56.2 False\n",
"26 Yes Alive 59.2 False\n",
"27 No Alive 25.8 False\n",
"28 No Dead 36.9 True\n",
"29 No Alive 20.2 False\n",
"... ... ... ... ...\n",
"1284 Yes Dead 36.0 True\n",
"1285 Yes Alive 48.3 False\n",
"1286 No Alive 63.1 False\n",
"1287 No Alive 60.8 False\n",
"1288 Yes Dead 39.3 True\n",
"1289 No Alive 36.7 False\n",
"1290 No Alive 63.8 False\n",
"1291 No Dead 71.3 True\n",
"1292 No Alive 57.7 False\n",
"1293 No Alive 63.2 False\n",
"1294 No Alive 46.6 False\n",
"1295 Yes Dead 82.4 True\n",
"1296 Yes Alive 38.3 False\n",
"1297 Yes Alive 32.7 False\n",
"1298 No Alive 39.7 False\n",
"1299 Yes Dead 60.0 True\n",
"1300 No Dead 71.0 True\n",
"1301 No Alive 20.5 False\n",
"1302 No Alive 44.4 False\n",
"1303 Yes Alive 31.2 False\n",
"1304 Yes Alive 47.8 False\n",
"1305 Yes Alive 60.9 False\n",
"1306 No Dead 61.4 True\n",
"1307 Yes Alive 43.0 False\n",
"1308 No Alive 42.1 False\n",
"1309 Yes Alive 35.9 False\n",
"1310 No Alive 22.3 False\n",
"1311 Yes Dead 62.1 True\n",
"1312 No Dead 88.6 True\n",
"1313 No Alive 39.1 False\n",
"\n",
"[1314 rows x 4 columns]"
]
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
}
],
],
"source": [
"source": [
"dead_bool = [(col[i]['Status'] == \"Dead\") for i in 1:len(data)]"