diff --git a/module4/MOOC_challenger.ipynb b/module4/MOOC_challenger.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..c5ef22fcc55d3c316f5ed3280545d1423eb48723 --- /dev/null +++ b/module4/MOOC_challenger.ipynb @@ -0,0 +1,754 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "613c606b-78ea-4262-9700-057e46b188cd", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "3.9.15 | packaged by conda-forge | (main, Nov 22 2022, 08:41:22) [MSC v.1929 64 bit (AMD64)]\n", + "uname_result(system='Windows', node='mobilis', release='10', version='10.0.14393', machine='AMD64')\n", + "IPython 8.18.1\n", + "IPython.core.release 8.18.1\n", + "PIL 10.3.0\n", + "PIL.Image 10.3.0\n", + "PIL._deprecate 10.3.0\n", + "PIL._version 10.3.0\n", + "_csv 1.0\n", + "_ctypes 1.1.0\n", + "decimal 1.70\n", + "_pydev_bundle.fsnotify 0.1.5\n", + "_pydevd_frame_eval.vendored.bytecode 0.13.0.dev\n", + "argparse 1.1\n", + "bottleneck 1.3.7\n", + "cffi 1.16.0\n", + "colorama 0.4.6\n", + "comm 0.1.4\n", + "csv 1.0\n", + "ctypes 1.1.0\n", + "cycler 0.12.1\n", + "dateutil 2.8.2\n", + "debugpy 1.8.0\n", + "debugpy.public_api 1.8.0\n", + "decimal 1.70\n", + "decorator 5.1.1\n", + "defusedxml 0.7.1\n", + "exceptiongroup 1.2.0\n", + "exceptiongroup._version 1.2.0\n", + "executing 2.0.1\n", + "executing.version 2.0.1\n", + "http.server 0.6\n", + "ipykernel 6.26.0\n", + "ipykernel._version 6.26.0\n", + "ipywidgets 8.1.1\n", + "ipywidgets._version 8.1.1\n", + "jedi 0.19.1\n", + "joblib 1.3.2\n", + "joblib.externals.cloudpickle 2.2.0\n", + "joblib.externals.loky 3.4.1\n", + "json 2.0.9\n", + "jupyter_client 8.6.0\n", + "jupyter_client._version 8.6.0\n", + "jupyter_core 5.5.0\n", + "jupyter_core.version 5.5.0\n", + "kiwisolver 1.4.5\n", + "kiwisolver._cext 1.4.5\n", + "logging 0.5.1.2\n", + "matplotlib 3.8.2\n", + "matplotlib._version 3.8.2\n", + "mkl 2.4.1\n", + "numexpr 2.9.0\n", + "numpy 1.26.4\n", + "numpy.core 
1.26.4\n", + "numpy.core._multiarray_umath 3.1\n", + "numpy.linalg._umath_linalg 0.1.5\n", + "numpy.version 1.26.4\n", + "packaging 23.2\n", + "pandas 2.1.3\n", + "pandas._version_meson 2.1.3\n", + "parso 0.8.3\n", + "patsy 0.5.3\n", + "patsy.version 0.5.3\n", + "pickleshare 0.7.5\n", + "platform 1.0.8\n", + "platformdirs 4.0.0\n", + "platformdirs.version 4.0.0\n", + "prompt_toolkit 3.0.41\n", + "psutil 5.9.5\n", + "pure_eval 0.2.2\n", + "pure_eval.version 0.2.2\n", + "pydevd 2.9.5\n", + "pygments 2.17.2\n", + "pyparsing 3.1.1\n", + "pytz 2023.3.post1\n", + "re 2.2.1\n", + "scipy 1.11.4\n", + "scipy._lib._uarray 0.8.8.dev0+aa94c5a4.scipy\n", + "scipy._lib.decorator 4.0.5\n", + "scipy.integrate._dop 1.22.4\n", + "scipy.integrate._lsoda 1.22.4\n", + "scipy.integrate._vode 1.22.4\n", + "scipy.interpolate.dfitpack 1.22.4\n", + "scipy.linalg._fblas 1.22.4\n", + "scipy.linalg._flapack 1.22.4\n", + "scipy.linalg._flinalg 1.22.4\n", + "scipy.linalg._interpolative 1.22.4\n", + "scipy.optimize.__nnls 1.22.4\n", + "scipy.optimize._cobyla 1.22.4\n", + "scipy.optimize._lbfgsb 1.22.4\n", + "scipy.optimize._minpack2 1.22.4\n", + "scipy.optimize._slsqp 1.22.4\n", + "scipy.sparse.linalg._eigen.arpack._arpack 1.22.4\n", + "scipy.sparse.linalg._isolve._iterative 1.22.4\n", + "scipy.special._specfun 1.22.4\n", + "scipy.stats._mvn 1.22.4\n", + "scipy.stats._statlib 1.22.4\n", + "seaborn 0.13.0\n", + "seaborn.external.appdirs 1.4.4\n", + "seaborn.external.husl 2.1.0\n", + "six 1.16.0\n", + "socketserver 0.4\n", + "stack_data 0.6.2\n", + "stack_data.version 0.6.2\n", + "statsmodels 0.14.0\n", + "statsmodels.__init__ 0.14.0\n", + "statsmodels._version 0.14.0\n", + "statsmodels.api 0.14.0\n", + "statsmodels.tools.web 0.14.0\n", + "traitlets 5.14.0\n", + "traitlets._version 5.14.0\n", + "urllib.request 3.9\n", + "wcwidth 0.2.12\n", + "xmlrpc.client 3.9\n", + "zlib 1.0\n", + "zmq 25.1.1\n", + "zmq.sugar 25.1.1\n", + "zmq.sugar.version 25.1.1\n" + ] + } + ], + "source": [ + " def 
print_imported_modules():\n", + "    \"\"\"Print the name and version of each loaded module that exposes `__version__`.\"\"\"\n", + "    import sys\n", + "    for _key, module in sorted(sys.modules.items()):\n", + "        if not hasattr(module, '__version__'):\n", + "            continue  # modules without a version attribute are left out of the listing\n", + "        print(module.__name__, module.__version__)\n", + "\n", + "\n", + "def print_sys_info():\n", + "    \"\"\"Print the interpreter version string and the platform description.\"\"\"\n", + "    import platform\n", + "    import sys\n", + "    print(sys.version)\n", + "    print(platform.uname())\n", + "\n", + "\n", + "import numpy as np\n", + "import pandas as pd\n", + "import matplotlib.pyplot as plt\n", + "import statsmodels.api as sm\n", + "import seaborn as sns\n", + "\n", + "print_sys_info()\n", + "print_imported_modules()" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "ec5f8965-490f-4db3-b033-12abac36703e", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
DateCountTemperaturePressureMalfunction
04/12/81666500
111/12/81670501
23/22/82669500
311/11/82668500
44/04/83667500
56/18/82672500
68/30/836731000
711/28/836701000
82/03/846572001
94/06/846632001
108/30/846702001
1110/05/846782000
1211/08/846672000
131/24/856532002
144/12/856672000
154/29/856752000
166/17/856702000
177/29/856812000
188/27/856762000
1910/03/856792000
2010/30/856752002
2111/26/856762000
221/12/866582001
\n", + "
" + ], + "text/plain": [ + " Date Count Temperature Pressure Malfunction\n", + "0 4/12/81 6 66 50 0\n", + "1 11/12/81 6 70 50 1\n", + "2 3/22/82 6 69 50 0\n", + "3 11/11/82 6 68 50 0\n", + "4 4/04/83 6 67 50 0\n", + "5 6/18/82 6 72 50 0\n", + "6 8/30/83 6 73 100 0\n", + "7 11/28/83 6 70 100 0\n", + "8 2/03/84 6 57 200 1\n", + "9 4/06/84 6 63 200 1\n", + "10 8/30/84 6 70 200 1\n", + "11 10/05/84 6 78 200 0\n", + "12 11/08/84 6 67 200 0\n", + "13 1/24/85 6 53 200 2\n", + "14 4/12/85 6 67 200 0\n", + "15 4/29/85 6 75 200 0\n", + "16 6/17/85 6 70 200 0\n", + "17 7/29/85 6 81 200 0\n", + "18 8/27/85 6 76 200 0\n", + "19 10/03/85 6 79 200 0\n", + "20 10/30/85 6 75 200 2\n", + "21 11/26/85 6 76 200 0\n", + "22 1/12/86 6 58 200 1" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Read the shuttle dataset; the path is relative to the working directory.\n", + "data = pd.read_csv(\"module2_exo5_shuttle.csv\")\n", + "data" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "e092983b-fa43-485c-bcd2-870f8f4cc5b2", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "%matplotlib inline\n", + "# Silence pandas' chained-assignment warning before adding a column to `data`.\n", + "pd.set_option('mode.chained_assignment', None)\n", + "import matplotlib.pyplot as plt\n", + "\n", + "# Fraction of malfunctions per row: Malfunction divided by Count.\n", + "data[\"Frequency\"] = data[\"Malfunction\"] / data[\"Count\"]\n", + "data.plot.scatter(x=\"Temperature\", y=\"Frequency\", ylim=[0, 1])\n", + "plt.grid(True)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "5ae87d39-9f53-493e-a64a-ce5d0420ab9e", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + "
Generalized Linear Model Regression Results
Dep. Variable: Frequency No. Observations: 23
Model: GLM Df Residuals: 21
Model Family: Binomial Df Model: 1
Link Function: Logit Scale: 1.0000
Method: IRLS Log-Likelihood: -3.9210
Date: Mon, 16 Sep 2024 Deviance: 3.0144
Time: 11:15:44 Pearson chi2: 5.00
No. Iterations: 6 Pseudo R-squ. (CS): 0.04355
Covariance Type: nonrobust
\n", + "\n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + "
coef std err z P>|z| [0.025 0.975]
Intercept 5.0850 7.477 0.680 0.496 -9.570 19.740
Temperature -0.1156 0.115 -1.004 0.316 -0.341 0.110
" + ], + "text/latex": [ + "\\begin{center}\n", + "\\begin{tabular}{lclc}\n", + "\\toprule\n", + "\\textbf{Dep. Variable:} & Frequency & \\textbf{ No. Observations: } & 23 \\\\\n", + "\\textbf{Model:} & GLM & \\textbf{ Df Residuals: } & 21 \\\\\n", + "\\textbf{Model Family:} & Binomial & \\textbf{ Df Model: } & 1 \\\\\n", + "\\textbf{Link Function:} & Logit & \\textbf{ Scale: } & 1.0000 \\\\\n", + "\\textbf{Method:} & IRLS & \\textbf{ Log-Likelihood: } & -3.9210 \\\\\n", + "\\textbf{Date:} & Mon, 16 Sep 2024 & \\textbf{ Deviance: } & 3.0144 \\\\\n", + "\\textbf{Time:} & 11:15:44 & \\textbf{ Pearson chi2: } & 5.00 \\\\\n", + "\\textbf{No. Iterations:} & 6 & \\textbf{ Pseudo R-squ. (CS):} & 0.04355 \\\\\n", + "\\textbf{Covariance Type:} & nonrobust & \\textbf{ } & \\\\\n", + "\\bottomrule\n", + "\\end{tabular}\n", + "\\begin{tabular}{lcccccc}\n", + " & \\textbf{coef} & \\textbf{std err} & \\textbf{z} & \\textbf{P$> |$z$|$} & \\textbf{[0.025} & \\textbf{0.975]} \\\\\n", + "\\midrule\n", + "\\textbf{Intercept} & 5.0850 & 7.477 & 0.680 & 0.496 & -9.570 & 19.740 \\\\\n", + "\\textbf{Temperature} & -0.1156 & 0.115 & -1.004 & 0.316 & -0.341 & 0.110 \\\\\n", + "\\bottomrule\n", + "\\end{tabular}\n", + "%\\caption{Generalized Linear Model Regression Results}\n", + "\\end{center}" + ], + "text/plain": [ + "\n", + "\"\"\"\n", + " Generalized Linear Model Regression Results \n", + "==============================================================================\n", + "Dep. Variable: Frequency No. Observations: 23\n", + "Model: GLM Df Residuals: 21\n", + "Model Family: Binomial Df Model: 1\n", + "Link Function: Logit Scale: 1.0000\n", + "Method: IRLS Log-Likelihood: -3.9210\n", + "Date: Mon, 16 Sep 2024 Deviance: 3.0144\n", + "Time: 11:15:44 Pearson chi2: 5.00\n", + "No. Iterations: 6 Pseudo R-squ. 
(CS): 0.04355\n", + "Covariance Type: nonrobust \n", + "===============================================================================\n", + " coef std err z P>|z| [0.025 0.975]\n", + "-------------------------------------------------------------------------------\n", + "Intercept 5.0850 7.477 0.680 0.496 -9.570 19.740\n", + "Temperature -0.1156 0.115 -1.004 0.316 -0.341 0.110\n", + "===============================================================================\n", + "\"\"\"" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import statsmodels.api as sm\n", + "\n", + "# Derived columns: non-malfunctioning count per row and a constant regressor for the intercept.\n", + "data[\"Success\"] = data[\"Count\"] - data[\"Malfunction\"]\n", + "data[\"Intercept\"] = 1\n", + "\n", + "# Binomial GLM on the observed frequencies; no var_weights are passed in this fit.\n", + "logmodel = sm.GLM(\n", + "    data[\"Frequency\"],\n", + "    data[[\"Intercept\", \"Temperature\"]],\n", + "    family=sm.families.Binomial(),\n", + ").fit()\n", + "\n", + "logmodel.summary()" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "29bfd3cd-ffcd-4411-841c-f89c69e167e1", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + "
Generalized Linear Model Regression Results
Dep. Variable: Frequency No. Observations: 23
Model: GLM Df Residuals: 21
Model Family: Binomial Df Model: 1
Link Function: Logit Scale: 1.0000
Method: IRLS Log-Likelihood: -23.526
Date: Mon, 16 Sep 2024 Deviance: 18.086
Time: 11:16:03 Pearson chi2: 30.0
No. Iterations: 6 Pseudo R-squ. (CS): 0.2344
Covariance Type: nonrobust
\n", + "\n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + "
coef std err z P>|z| [0.025 0.975]
Intercept 5.0850 3.052 1.666 0.096 -0.898 11.068
Temperature -0.1156 0.047 -2.458 0.014 -0.208 -0.023
" + ], + "text/latex": [ + "\\begin{center}\n", + "\\begin{tabular}{lclc}\n", + "\\toprule\n", + "\\textbf{Dep. Variable:} & Frequency & \\textbf{ No. Observations: } & 23 \\\\\n", + "\\textbf{Model:} & GLM & \\textbf{ Df Residuals: } & 21 \\\\\n", + "\\textbf{Model Family:} & Binomial & \\textbf{ Df Model: } & 1 \\\\\n", + "\\textbf{Link Function:} & Logit & \\textbf{ Scale: } & 1.0000 \\\\\n", + "\\textbf{Method:} & IRLS & \\textbf{ Log-Likelihood: } & -23.526 \\\\\n", + "\\textbf{Date:} & Mon, 16 Sep 2024 & \\textbf{ Deviance: } & 18.086 \\\\\n", + "\\textbf{Time:} & 11:16:03 & \\textbf{ Pearson chi2: } & 30.0 \\\\\n", + "\\textbf{No. Iterations:} & 6 & \\textbf{ Pseudo R-squ. (CS):} & 0.2344 \\\\\n", + "\\textbf{Covariance Type:} & nonrobust & \\textbf{ } & \\\\\n", + "\\bottomrule\n", + "\\end{tabular}\n", + "\\begin{tabular}{lcccccc}\n", + " & \\textbf{coef} & \\textbf{std err} & \\textbf{z} & \\textbf{P$> |$z$|$} & \\textbf{[0.025} & \\textbf{0.975]} \\\\\n", + "\\midrule\n", + "\\textbf{Intercept} & 5.0850 & 3.052 & 1.666 & 0.096 & -0.898 & 11.068 \\\\\n", + "\\textbf{Temperature} & -0.1156 & 0.047 & -2.458 & 0.014 & -0.208 & -0.023 \\\\\n", + "\\bottomrule\n", + "\\end{tabular}\n", + "%\\caption{Generalized Linear Model Regression Results}\n", + "\\end{center}" + ], + "text/plain": [ + "\n", + "\"\"\"\n", + " Generalized Linear Model Regression Results \n", + "==============================================================================\n", + "Dep. Variable: Frequency No. Observations: 23\n", + "Model: GLM Df Residuals: 21\n", + "Model Family: Binomial Df Model: 1\n", + "Link Function: Logit Scale: 1.0000\n", + "Method: IRLS Log-Likelihood: -23.526\n", + "Date: Mon, 16 Sep 2024 Deviance: 18.086\n", + "Time: 11:16:03 Pearson chi2: 30.0\n", + "No. Iterations: 6 Pseudo R-squ. 
(CS): 0.2344\n", + "Covariance Type: nonrobust \n", + "===============================================================================\n", + " coef std err z P>|z| [0.025 0.975]\n", + "-------------------------------------------------------------------------------\n", + "Intercept 5.0850 3.052 1.666 0.096 -0.898 11.068\n", + "Temperature -0.1156 0.047 -2.458 0.014 -0.208 -0.023\n", + "===============================================================================\n", + "\"\"\"" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Same binomial GLM, now weighting each row by its Count column.\n", + "logmodel = sm.GLM(\n", + "    data['Frequency'],\n", + "    data[['Intercept', 'Temperature']],\n", + "    family=sm.families.Binomial(),\n", + "    var_weights=data['Count'],\n", + ").fit()\n", + "\n", + "logmodel.summary()" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "776aaeb4-e1dd-4dd5-85ef-7db52cd93912", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "%matplotlib inline\n", + "data_pred = pd.DataFrame({'Temperature': np.linspace(start=30, stop=90, num=121), 'Intercept': 1})\n", + "# The model was fitted on plain columns (no formula), so predict() reads exog by position.\n", + "# Pass the columns in the fitting order ['Intercept', 'Temperature'], not the frame's own order.\n", + "data_pred['Frequency'] = logmodel.predict(data_pred[['Intercept', 'Temperature']])\n", + "data_pred.plot(x=\"Temperature\", y=\"Frequency\", kind=\"line\", ylim=[0, 1])\n", + "plt.scatter(x=data[\"Temperature\"], y=data[\"Frequency\"])\n", + "plt.grid(True)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "7ddbe776-9979-4891-b615-8e5baaec5f90", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "sns.set(color_codes=True)\n", + "plt.xlim(30, 90)\n", + "plt.ylim(0, 1)\n", + "# regplot builds its confidence band by bootstrap resampling; a fixed seed makes the figure reproducible.\n", + "sns.regplot(x='Temperature', y='Frequency', data=data, logistic=True, seed=0)\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cf250191-a2b3-4148-adde-3442cdc86e4b", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.15" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}