From 9a0a462000cf37a95cb2e9d4ba59ce3208d10789 Mon Sep 17 00:00:00 2001 From: 86d2379a8cd828206f6e8576c862739f <86d2379a8cd828206f6e8576c862739f@app-learninglab.inria.fr> Date: Mon, 31 Aug 2020 15:37:56 +0000 Subject: [PATCH] no commit message --- module3/exo3/exercice.ipynb | 810 +++++++++++++----------------------- 1 file changed, 280 insertions(+), 530 deletions(-) diff --git a/module3/exo3/exercice.ipynb b/module3/exo3/exercice.ipynb index d2bd5bc..e722915 100644 --- a/module3/exo3/exercice.ipynb +++ b/module3/exo3/exercice.ipynb @@ -834,578 +834,328 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 45, "metadata": {}, "outputs": [], "source": [ - "dead_bool = [(data['Status'][i] == \"Dead\") for i in range(len(data))]" + "dead_bool = [(data['Status'][i] == \"Dead\") for i in range(len(data))]\n", + "data['Dead?'] = dead_bool\n", + "smoke_bool = [(data['Smoker'][i] == \"Yes\") for i in range(len(data))]\n", + "data['Smoke?'] = smoke_bool" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + " Nous allons tester les hypothèses par régression logistique." ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 19, "metadata": {}, "outputs": [], "source": [ - "data.insert(4, \"Dead?\",data)" + "import numpy as np\n", + "from sklearn.linear_model import LogisticRegression\n", + "from sklearn.metrics import classification_report, confusion_matrix" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "LogisticRegression(C=10.0, class_weight=None, dual=False, fit_intercept=True,\n", + " intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,\n", + " penalty='l2', random_state=0, solver='liblinear', tol=0.0001,\n", + " verbose=0, warm_start=False)" + ] + }, + "execution_count": 30, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "model = LogisticRegression(solver='liblinear', C=10.0, random_state=0)\n", + "model.fit(data[data['Smoker'] == \"Yes\"]['Age'].values.reshape(-1,1), data[data['Smoker'] == \"Yes\"]['Dead?'])\n" ] }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 25, "metadata": {}, "outputs": [], "source": [ - "data['Dead?'] = dead_bool" + "p_pred = model.predict_proba(data['Age'].values.reshape(-1,1))" ] }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 26, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[[0.98225578 0.01774422]\n", + " [0.98490288 0.01509712]\n", + " [0.61947454 0.38052546]\n", + " ...\n", + " [0.51071991 0.48928009]\n", + " [0.07464525 0.92535475]\n", + " [0.90594064 0.09405936]]\n" + ] + } + ], + "source": [ + "print(p_pred)" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import statsmodels.api as sm\n" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "metadata": {}, + "outputs": [ + { + "ename": "AttributeError", + "evalue": "'list' object has no attribute 'reshape'", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mx\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'Age'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'Smoker'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mreshape\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m-\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2\u001b[0m \u001b[0mx\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0msm\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0madd_constant\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mx\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0my\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdata\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'Dead?'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mAttributeError\u001b[0m: 'list' object has no attribute 'reshape'" + ] + } + ], + "source": [ + "x1 = data['Age'].values.reshape(-1,1)\n", + "x2 = data['Smoke?'].values.reshape(-1,1)\n", + "x = sm.add_constant(x)\n", + "y = data['Dead?']" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Optimization terminated successfully.\n", + " Current function value: 0.382339\n", + " Iterations 7\n" + ] + } + ], + "source": [ + "model = sm.Logit(y, x)\n", + "result = model.fit(method='newton')" + ] + }, + { + "cell_type": "code", + "execution_count": 38, "metadata": {}, "outputs": [ { "data": { "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + "
SmokerStatusAgeDead?
0YesAlive21.0False
1YesAlive19.3False
2NoDead57.5True
3NoAlive47.1False
4YesAlive81.4False
5NoAlive36.8False
6NoAlive23.8False
7YesDead57.5True
8YesAlive24.8False
9YesAlive49.5False
10YesAlive30.0False
11NoDead66.0True
12YesAlive49.2False
13NoAlive58.4False
14NoDead60.6True
15NoAlive25.1False
16NoAlive43.5False
17NoAlive27.1False
18NoAlive58.3False
19YesAlive65.7False
20NoDead73.2True
21YesAlive38.3False
22NoAlive33.4False
23YesDead62.3True
24NoAlive18.0False
25NoAlive56.2False
26YesAlive59.2False
27NoAlive25.8False
28NoDead36.9True
29NoAlive20.2False
...............
1284YesDead36.0True
1285YesAlive48.3False
1286NoAlive63.1False
1287NoAlive60.8False
1288YesDead39.3True
1289NoAlive36.7False
1290NoAlive63.8False
1291NoDead71.3True
1292NoAlive57.7False
1293NoAlive63.2False
1294NoAlive46.6False
1295YesDead82.4True
1296YesAlive38.3False
1297YesAlive32.7False
1298NoAlive39.7False
1299YesDead60.0True
1300NoDead71.0True
1301NoAlive20.5False
1302NoAlive44.4False
1303YesAlive31.2False
1304YesAlive47.8False
1305YesAlive60.9False
1306NoDead61.4True
1307YesAlive43.0False
1308NoAlive42.1False
1309YesAlive35.9False
1310NoAlive22.3False
1311YesDead62.1True
1312NoDead88.6True
1313NoAlive39.1False
\n", + "\n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", "
Logit Regression Results
Dep. Variable: Dead? No. Observations: 1314
Model: Logit Df Residuals: 1312
Method: MLE Df Model: 1
Date: Mon, 31 Aug 2020 Pseudo R-squ.: 0.3560
Time: 15:21:58 Log-Likelihood: -502.39
converged: True LL-Null: -780.16
LLR p-value: 7.883e-123
\n", - "

1314 rows × 4 columns

\n", - "
" + "\n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + "
coef std err z P>|z| [0.025 0.975]
const -6.1045 0.321 -18.992 0.000 -6.735 -5.475
x1 0.0977 0.006 17.578 0.000 0.087 0.109
" ], "text/plain": [ - " Smoker Status Age Dead?\n", - "0 Yes Alive 21.0 False\n", - "1 Yes Alive 19.3 False\n", - "2 No Dead 57.5 True\n", - "3 No Alive 47.1 False\n", - "4 Yes Alive 81.4 False\n", - "5 No Alive 36.8 False\n", - "6 No Alive 23.8 False\n", - "7 Yes Dead 57.5 True\n", - "8 Yes Alive 24.8 False\n", - "9 Yes Alive 49.5 False\n", - "10 Yes Alive 30.0 False\n", - "11 No Dead 66.0 True\n", - "12 Yes Alive 49.2 False\n", - "13 No Alive 58.4 False\n", - "14 No Dead 60.6 True\n", - "15 No Alive 25.1 False\n", - "16 No Alive 43.5 False\n", - "17 No Alive 27.1 False\n", - "18 No Alive 58.3 False\n", - "19 Yes Alive 65.7 False\n", - "20 No Dead 73.2 True\n", - "21 Yes Alive 38.3 False\n", - "22 No Alive 33.4 False\n", - "23 Yes Dead 62.3 True\n", - "24 No Alive 18.0 False\n", - "25 No Alive 56.2 False\n", - "26 Yes Alive 59.2 False\n", - "27 No Alive 25.8 False\n", - "28 No Dead 36.9 True\n", - "29 No Alive 20.2 False\n", - "... ... ... ... ...\n", - "1284 Yes Dead 36.0 True\n", - "1285 Yes Alive 48.3 False\n", - "1286 No Alive 63.1 False\n", - "1287 No Alive 60.8 False\n", - "1288 Yes Dead 39.3 True\n", - "1289 No Alive 36.7 False\n", - "1290 No Alive 63.8 False\n", - "1291 No Dead 71.3 True\n", - "1292 No Alive 57.7 False\n", - "1293 No Alive 63.2 False\n", - "1294 No Alive 46.6 False\n", - "1295 Yes Dead 82.4 True\n", - "1296 Yes Alive 38.3 False\n", - "1297 Yes Alive 32.7 False\n", - "1298 No Alive 39.7 False\n", - "1299 Yes Dead 60.0 True\n", - "1300 No Dead 71.0 True\n", - "1301 No Alive 20.5 False\n", - "1302 No Alive 44.4 False\n", - "1303 Yes Alive 31.2 False\n", - "1304 Yes Alive 47.8 False\n", - "1305 Yes Alive 60.9 False\n", - "1306 No Dead 61.4 True\n", - "1307 Yes Alive 43.0 False\n", - "1308 No Alive 42.1 False\n", - "1309 Yes Alive 35.9 False\n", - "1310 No Alive 22.3 False\n", - "1311 Yes Dead 62.1 True\n", - "1312 No Dead 88.6 True\n", - "1313 No Alive 39.1 False\n", - "\n", - "[1314 rows x 4 columns]" + "\n", + "\"\"\"\n", + " Logit Regression Results \n", + "==============================================================================\n", + "Dep. Variable: Dead? No. Observations: 1314\n", + "Model: Logit Df Residuals: 1312\n", + "Method: MLE Df Model: 1\n", + "Date: Mon, 31 Aug 2020 Pseudo R-squ.: 0.3560\n", + "Time: 15:21:58 Log-Likelihood: -502.39\n", + "converged: True LL-Null: -780.16\n", + " LLR p-value: 7.883e-123\n", + "==============================================================================\n", + " coef std err z P>|z| [0.025 0.975]\n", + "------------------------------------------------------------------------------\n", + "const -6.1045 0.321 -18.992 0.000 -6.735 -5.475\n", + "x1 0.0977 0.006 17.578 0.000 0.087 0.109\n", + "==============================================================================\n", + "\"\"\"" ] }, - "execution_count": 17, + "execution_count": 38, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "data" + "result.summary()" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 60, "metadata": {}, "outputs": [], - "source": [] + "source": [ + "x1 = data['Age'].values.reshape(-1,1)\n", + "x2 = data['Smoke?'].values.reshape(-1,1)\n", + "x = np.hstack((x1,x2))\n", + "x = sm.add_constant(x)\n", + "y = data['Dead?']" + ] + }, + { + "cell_type": "code", + "execution_count": 61, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Optimization terminated successfully.\n", + " Current function value: 0.381244\n", + " Iterations 7\n" + ] + } + ], + "source": [ + "model = sm.Logit(y, x)\n", + "result = model.fit(method='newton')" + ] + }, + { + "cell_type": "code", + "execution_count": 62, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + "
Logit Regression Results
Dep. Variable: Dead? No. Observations: 1314
Model: Logit Df Residuals: 1311
Method: MLE Df Model: 2
Date: Mon, 31 Aug 2020 Pseudo R-squ.: 0.3579
Time: 15:35:59 Log-Likelihood: -500.95
converged: True LL-Null: -780.16
LLR p-value: 5.534e-122
\n", + "\n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + "
coef std err z P>|z| [0.025 0.975]
const -6.3519 0.360 -17.637 0.000 -7.058 -5.646
x1 0.0998 0.006 17.290 0.000 0.089 0.111
x2 0.2787 0.165 1.689 0.091 -0.045 0.602
" + ], + "text/plain": [ + "\n", + "\"\"\"\n", + " Logit Regression Results \n", + "==============================================================================\n", + "Dep. Variable: Dead? No. Observations: 1314\n", + "Model: Logit Df Residuals: 1311\n", + "Method: MLE Df Model: 2\n", + "Date: Mon, 31 Aug 2020 Pseudo R-squ.: 0.3579\n", + "Time: 15:35:59 Log-Likelihood: -500.95\n", + "converged: True LL-Null: -780.16\n", + " LLR p-value: 5.534e-122\n", + "==============================================================================\n", + " coef std err z P>|z| [0.025 0.975]\n", + "------------------------------------------------------------------------------\n", + "const -6.3519 0.360 -17.637 0.000 -7.058 -5.646\n", + "x1 0.0998 0.006 17.290 0.000 0.089 0.111\n", + "x2 0.2787 0.165 1.689 0.091 -0.045 0.602\n", + "==============================================================================\n", + "\"\"\"" + ] + }, + "execution_count": 62, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "result.summary()" + ] } ], "metadata": { -- 2.18.1