diff --git a/module3/exo3/exercice.ipynb b/module3/exo3/exercice.ipynb index e722915fc006682ad4fe5cf5b97f8f062c963177..5fe6985255029b72a0d36da6d897fcc8daff8d88 100644 --- a/module3/exo3/exercice.ipynb +++ b/module3/exo3/exercice.ipynb @@ -23,7 +23,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ @@ -41,7 +41,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -50,7 +50,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ @@ -66,7 +66,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 4, "metadata": {}, "outputs": [ { @@ -534,7 +534,7 @@ "[1314 rows x 3 columns]" ] }, - "execution_count": 5, + "execution_count": 4, "metadata": {}, "output_type": "execute_result" } @@ -552,7 +552,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 5, "metadata": {}, "outputs": [ { @@ -592,7 +592,7 @@ "Index: []" ] }, - "execution_count": 6, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } @@ -603,7 +603,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 6, "metadata": {}, "outputs": [ { @@ -643,7 +643,7 @@ "Index: []" ] }, - "execution_count": 7, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } @@ -654,7 +654,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 7, "metadata": {}, "outputs": [ { @@ -694,7 +694,7 @@ "Index: []" ] }, - "execution_count": 8, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } @@ -707,12 +707,12 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "There seems to be no error in the dataset." + "Le dataset paraît correct, et les données brutes sont ainsi utilisées pour l'analyse." 
] }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 8, "metadata": {}, "outputs": [], "source": [ @@ -735,7 +735,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 9, "metadata": {}, "outputs": [ { @@ -761,7 +761,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "On remarque que le taux de mortalité est - nettement - plus élevé dans le groupe des non fumeuses, ce qui constitue le paradoxe de Simpson" + "On remarque que le taux de mortalité est - nettement - plus élevé dans le groupe des non fumeuses, ce qui constitue le paradoxe de Simpson, vu que le sens commun laisserait attendre la conclusion inverse." ] }, { @@ -780,7 +780,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 10, "metadata": {}, "outputs": [ { @@ -815,7 +815,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "blabla il y a plus de vieilles non fumeuses, donc plus de morts" + "Une explication possible de ce paradoxe est que le groupe des non fumeuses contient plus de personnes âgées (proportionnellement), vu que les non fumeuses vivent plus longtemps, et il présente donc également un taux de mortalité plus élevé, vu que l'âge est la principale variable explicative du taux de décès." 
] }, { @@ -834,7 +834,7 @@ }, { "cell_type": "code", - "execution_count": 45, + "execution_count": 11, "metadata": {}, "outputs": [], "source": [ @@ -853,74 +853,7 @@ }, { "cell_type": "code", - "execution_count": 19, - "metadata": {}, - "outputs": [], - "source": [ - "import numpy as np\n", - "from sklearn.linear_model import LogisticRegression\n", - "from sklearn.metrics import classification_report, confusion_matrix" - ] - }, - { - "cell_type": "code", - "execution_count": 30, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "LogisticRegression(C=10.0, class_weight=None, dual=False, fit_intercept=True,\n", - " intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,\n", - " penalty='l2', random_state=0, solver='liblinear', tol=0.0001,\n", - " verbose=0, warm_start=False)" - ] - }, - "execution_count": 30, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "model = LogisticRegression(solver='liblinear', C=10.0, random_state=0)\n", - "model.fit(data[data['Smoker'] == \"Yes\"]['Age'].values.reshape(-1,1), data[data['Smoker'] == \"Yes\"]['Dead?'])\n" - ] - }, - { - "cell_type": "code", - "execution_count": 25, - "metadata": {}, - "outputs": [], - "source": [ - "p_pred = model.predict_proba(data['Age'].values.reshape(-1,1))" - ] - }, - { - "cell_type": "code", - "execution_count": 26, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[[0.98225578 0.01774422]\n", - " [0.98490288 0.01509712]\n", - " [0.61947454 0.38052546]\n", - " ...\n", - " [0.51071991 0.48928009]\n", - " [0.07464525 0.92535475]\n", - " [0.90594064 0.09405936]]\n" - ] - } - ], - "source": [ - "print(p_pred)" - ] - }, - { - "cell_type": "code", - "execution_count": 34, + "execution_count": 12, "metadata": {}, "outputs": [], "source": [ @@ -930,125 +863,7 @@ }, { "cell_type": "code", - "execution_count": 41, - "metadata": {}, - "outputs": [ - { - "ename": "AttributeError", - "evalue": "'list' object 
has no attribute 'reshape'", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mx\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'Age'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'Smoker'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mreshape\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m-\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2\u001b[0m \u001b[0mx\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0msm\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0madd_constant\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mx\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0my\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdata\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'Dead?'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;31mAttributeError\u001b[0m: 'list' object has no attribute 'reshape'" - ] - } - ], - "source": [ - "x1 = data['Age'].values.reshape(-1,1)\n", - "x2 = data['Smoke?'].values.reshape(-1,1)\n", - "x = sm.add_constant(x)\n", - "y = data['Dead?']" - ] - }, - { - "cell_type": "code", - "execution_count": 36, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Optimization terminated successfully.\n", - " Current function value: 0.382339\n", - " Iterations 7\n" - ] - } - ], - "source": [ - "model = sm.Logit(y, x)\n", - "result = model.fit(method='newton')" - ] - }, - { - "cell_type": "code", - "execution_count": 38, - "metadata": {}, - "outputs": [ - 
{ - "data": { - "text/html": [ - "\n", - "\n", - "\n", - " \n", - "\n", - "\n", - " \n", - "\n", - "\n", - " \n", - "\n", - "\n", - " \n", - "\n", - "\n", - " \n", - "\n", - "\n", - " \n", - "\n", - "\n", - " \n", - "\n", - "
Logit Regression Results
Dep. Variable: Dead? No. Observations: 1314
Model: Logit Df Residuals: 1312
Method: MLE Df Model: 1
Date: Mon, 31 Aug 2020 Pseudo R-squ.: 0.3560
Time: 15:21:58 Log-Likelihood: -502.39
converged: True LL-Null: -780.16
LLR p-value: 7.883e-123
\n", - "\n", - "\n", - " \n", - "\n", - "\n", - " \n", - "\n", - "\n", - " \n", - "\n", - "
coef std err z P>|z| [0.025 0.975]
const -6.1045 0.321 -18.992 0.000 -6.735 -5.475
x1 0.0977 0.006 17.578 0.000 0.087 0.109
" - ], - "text/plain": [ - "\n", - "\"\"\"\n", - " Logit Regression Results \n", - "==============================================================================\n", - "Dep. Variable: Dead? No. Observations: 1314\n", - "Model: Logit Df Residuals: 1312\n", - "Method: MLE Df Model: 1\n", - "Date: Mon, 31 Aug 2020 Pseudo R-squ.: 0.3560\n", - "Time: 15:21:58 Log-Likelihood: -502.39\n", - "converged: True LL-Null: -780.16\n", - " LLR p-value: 7.883e-123\n", - "==============================================================================\n", - " coef std err z P>|z| [0.025 0.975]\n", - "------------------------------------------------------------------------------\n", - "const -6.1045 0.321 -18.992 0.000 -6.735 -5.475\n", - "x1 0.0977 0.006 17.578 0.000 0.087 0.109\n", - "==============================================================================\n", - "\"\"\"" - ] - }, - "execution_count": 38, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "result.summary()" - ] - }, - { - "cell_type": "code", - "execution_count": 60, + "execution_count": 13, "metadata": {}, "outputs": [], "source": [ @@ -1061,7 +876,7 @@ }, { "cell_type": "code", - "execution_count": 61, + "execution_count": 14, "metadata": {}, "outputs": [ { @@ -1081,7 +896,7 @@ }, { "cell_type": "code", - "execution_count": 62, + "execution_count": 15, "metadata": {}, "outputs": [ { @@ -1099,10 +914,10 @@ " Method: MLE Df Model: 2 \n", "\n", "\n", - " Date: Mon, 31 Aug 2020 Pseudo R-squ.: 0.3579 \n", + " Date: Tue, 01 Sep 2020 Pseudo R-squ.: 0.3579 \n", "\n", "\n", - " Time: 15:35:59 Log-Likelihood: -500.95 \n", + " Time: 10:16:00 Log-Likelihood: -500.95 \n", "\n", "\n", " converged: True LL-Null: -780.16 \n", @@ -1134,8 +949,8 @@ "Dep. Variable: Dead? No. 
Observations: 1314\n", "Model: Logit Df Residuals: 1311\n", "Method: MLE Df Model: 2\n", - "Date: Mon, 31 Aug 2020 Pseudo R-squ.: 0.3579\n", - "Time: 15:35:59 Log-Likelihood: -500.95\n", + "Date: Tue, 01 Sep 2020 Pseudo R-squ.: 0.3579\n", + "Time: 10:16:00 Log-Likelihood: -500.95\n", "converged: True LL-Null: -780.16\n", " LLR p-value: 5.534e-122\n", "==============================================================================\n", @@ -1148,7 +963,7 @@ "\"\"\"" ] }, - "execution_count": 62, + "execution_count": 15, "metadata": {}, "output_type": "execute_result" } @@ -1156,6 +971,20 @@ "source": [ "result.summary()" ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Le tableau précédent donne les résultats de la régression logistique, avec x1 qui représente l'âge, et x2 qui représente le statut (fumeuse ou non fumeuse). Le modèle cherche donc à expliquer la variable \"Dead?\" à l'aide des variables âge et statut de fumeuse. Les résultats montrent une p-value de 0.09 pour le statut de fumeuse, ce qui signifie que l'on peut rejeter l'hypothèse nulle au seuil de signification de 10%, mais pas à celui de 5%. De plus le coefficient associé est positif, on peut donc en conclure que le fait de fumer impacte négativement l'espérance de vie, pour un seuil de signification de 10%. Des recherches plus approfondies seraient nécessaires pour établir des conclusions plus claires." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": {