{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Analyse de risque : Navette Challenger" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "D'après l'article servant de base à cet exercice, les auteurs estiment plusieurs paramètres : $s_{\\hat{\\alpha}} = 3.052$ et $s_{\\hat{\\beta}} = 0.047$. La qualité de l'ajustement est caractérisée par un coefficient $G^2 = 18.086$ avec 21 degrés de liberté." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Informations techniques sur la machine et l'installation python" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Importation des librairies" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "%matplotlib inline\n", "import sys\n", "import platform\n", "import numpy as np\n", "import pandas as pd\n", "import matplotlib.pyplot as plt\n", "import statsmodels.api as sm\n", "import seaborn as sn" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Détermination de la version des différentes librairies ainsi que de l'OS" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "3.6.4 |Anaconda, Inc.| (default, Mar 13 2018, 01:15:57) \n", "[GCC 7.2.0]\n", "uname_result(system='Linux', node='d8fc5e21ebcb', release='4.4.0-164-generic', version='#192-Ubuntu SMP Fri Sep 13 12:02:50 UTC 2019', machine='x86_64', processor='x86_64')\n", "IPython \t 7.12.0\n", "IPython.core.release \t 7.12.0\n", "PIL \t 7.0.0\n", "PIL.Image \t 7.0.0\n", "PIL._version \t 7.0.0\n", "_csv \t 1.0\n", "_ctypes \t 1.1.0\n", "_curses \t b'2.2'\n", "decimal \t 1.70\n", "argparse \t 1.1\n", "backcall \t 0.1.0\n", "cffi \t 1.13.2\n", "csv \t 1.0\n", "ctypes \t 1.1.0\n", "cycler \t 0.10.0\n", "dateutil \t 2.8.1\n", "decimal \t 1.70\n", "decorator \t 4.4.1\n", "distutils \t 3.6.4\n", "ipaddress \t 1.0\n", "ipykernel \t 5.1.4\n", "ipykernel._version \t 5.1.4\n", 
"ipython_genutils \t 0.2.0\n", "ipython_genutils._version \t 0.2.0\n", "ipywidgets \t 7.2.1\n", "ipywidgets._version \t 7.2.1\n", "jedi \t 0.16.0\n", "json \t 2.0.9\n", "jupyter_client \t 6.0.0\n", "jupyter_client._version \t 6.0.0\n", "jupyter_core \t 4.6.3\n", "jupyter_core.version \t 4.6.3\n", "kiwisolver \t 1.1.0\n", "logging \t 0.5.1.2\n", "matplotlib \t 2.2.3\n", "matplotlib.backends.backend_agg \t 2.2.3\n", "numpy \t 1.15.2\n", "numpy.core \t 1.15.2\n", "numpy.core.multiarray \t 3.1\n", "numpy.lib \t 1.15.2\n", "numpy.linalg._umath_linalg \t b'0.1.5'\n", "numpy.matlib \t 1.15.2\n", "optparse \t 1.5.3\n", "pandas \t 0.22.0\n", "_libjson \t 1.33\n", "parso \t 0.6.0\n", "patsy \t 0.5.1\n", "patsy.version \t 0.5.1\n", "pexpect \t 4.8.0\n", "pickleshare \t 0.7.5\n", "platform \t 1.0.8\n", "prompt_toolkit \t 3.0.3\n", "ptyprocess \t 0.6.0\n", "pygments \t 2.5.2\n", "pyparsing \t 2.4.6\n", "pytz \t 2019.3\n", "re \t 2.2.1\n", "scipy \t 1.1.0\n", "scipy._lib.decorator \t 4.0.5\n", "scipy._lib.six \t 1.2.0\n", "scipy.fftpack._fftpack \t b'$Revision: $'\n", "scipy.fftpack.convolve \t b'$Revision: $'\n", "scipy.integrate._dop \t b'$Revision: $'\n", "scipy.integrate._ode \t $Id$\n", "scipy.integrate._odepack \t 1.9 \n", "scipy.integrate._quadpack \t 1.13 \n", "scipy.integrate.lsoda \t b'$Revision: $'\n", "scipy.integrate.vode \t b'$Revision: $'\n", "scipy.interpolate._fitpack \t 1.7 \n", "scipy.interpolate.dfitpack \t b'$Revision: $'\n", "scipy.linalg \t 0.4.9\n", "scipy.linalg._fblas \t b'$Revision: $'\n", "scipy.linalg._flapack \t b'$Revision: $'\n", "scipy.linalg._flinalg \t b'$Revision: $'\n", "scipy.ndimage \t 2.0\n", "scipy.optimize._cobyla \t b'$Revision: $'\n", "scipy.optimize._lbfgsb \t b'$Revision: $'\n", "scipy.optimize._minpack \t 1.10 \n", "scipy.optimize._nnls \t b'$Revision: $'\n", "scipy.optimize._slsqp \t b'$Revision: $'\n", "scipy.optimize.minpack2 \t b'$Revision: $'\n", "scipy.signal.spline \t 0.2\n", "scipy.sparse.linalg.eigen.arpack._arpack \t 
b'$Revision: $'\n", "scipy.sparse.linalg.isolve._iterative \t b'$Revision: $'\n", "scipy.special.specfun \t b'$Revision: $'\n", "scipy.stats.mvn \t b'$Revision: $'\n", "scipy.stats.statlib \t b'$Revision: $'\n", "seaborn \t 0.8.1\n", "seaborn.external.husl \t 2.1.0\n", "seaborn.external.six \t 1.10.0\n", "six \t 1.14.0\n", "statsmodels \t 0.9.0\n", "statsmodels.__init__ \t 0.9.0\n", "traitlets \t 4.3.3\n", "traitlets._version \t 4.3.3\n", "urllib.request \t 3.6\n", "zlib \t 1.0\n", "zmq \t 17.1.2\n", "zmq.sugar \t 17.1.2\n", "zmq.sugar.version \t 17.1.2\n" ] } ], "source": [ "# Record the interpreter build and the host OS of this run.\n", "print(sys.version)\n", "print(platform.uname())\n", "\n", "# Print name and version of every loaded module that exposes `__version__`.\n", "# sorted() takes a snapshot of sys.modules, so the loop is safe even if\n", "# attribute access ends up importing further modules.\n", "for module_name, module in sorted(sys.modules.items()):\n", "    try:\n", "        print(module.__name__, \"\\t\", module.__version__)\n", "    except Exception:\n", "        # Deliberate best effort: entries without a readable\n", "        # __name__/__version__ are skipped instead of aborting the listing.\n", "        pass" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Début de l'étude" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Chargement des données" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Nous commençons par charger les données provenant du lien [https://app-learninglab.inria.fr/moocrr/gitlab/moocrr-session3/moocrr-reproducibility-study/raw/master/data/shuttle.csv](https://app-learninglab.inria.fr/moocrr/gitlab/moocrr-session3/moocrr-reproducibility-study/raw/master/data/shuttle.csv). __Soulignons que le lien donné par [l'exemple](https://app-learninglab.inria.fr/moocrr/gitlab/moocrr-session3/moocrr-reproducibility-study/blob/master/src/Python3/challenger.ipynb) ne peut pas être lu correctement par la librairie pandas__." 
] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " Date Count Temperature Pressure Malfunction\n", "0 4/12/81 6 66 50 0\n", "1 11/12/81 6 70 50 1\n", "2 3/22/82 6 69 50 0\n", "3 11/11/82 6 68 50 0\n", "4 4/04/83 6 67 50 0\n", "5 6/18/82 6 72 50 0\n", "6 8/30/83 6 73 100 0\n", "7 11/28/83 6 70 100 0\n", "8 2/03/84 6 57 200 1\n", "9 4/06/84 6 63 200 1\n", "10 8/30/84 6 70 200 1\n", "11 10/05/84 6 78 200 0\n", "12 11/08/84 6 67 200 0\n", "13 1/24/85 6 53 200 2\n", "14 4/12/85 6 67 200 0\n", "15 4/29/85 6 75 200 0\n", "16 6/17/85 6 70 200 0\n", "17 7/2903/85 6 81 200 0\n", "18 8/27/85 6 76 200 0\n", "19 10/03/85 6 79 200 0\n", "20 10/30/85 6 75 200 2\n", "21 11/26/85 6 76 200 0\n", "22 1/12/86 6 58 200 1\n" ] } ], "source": [ "d = pd.read_csv(\"https://app-learninglab.inria.fr/moocrr/gitlab/moocrr-session3/moocrr-reproducibility-study/raw/master/data/shuttle.csv\")\n", "print(d)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Visualisation graphique des données" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Le fichier ne possédant aucune ligne _nulle_ , nous pouvons continuer en toute tranquillité. Nous voulons obtenir un aperçu graphique du nombre d'accidents comme une fonction de la température ambiante." ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "d['Occurence'] = d.Malfunction / d.Count\n", "d.plot(x='Temperature', y='Occurence', kind='scatter')\n", "plt.grid(True)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Analyse statistique des risques" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Nous cherchons maintenant à déterminer la probabilité qu'un joint soit détruit. Pour cela, nous utilisons un outil de régression logistique disponible dans la librairie statsmodels." ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", "\n", "\n", " \n", "\n", "\n", " \n", "\n", "\n", " \n", "\n", "\n", " \n", "\n", "\n", " \n", "\n", "\n", " \n", "\n", "\n", " \n", "\n", "\n", " \n", "\n", "
Generalized Linear Model Regression Results
Dep. Variable: Occurence No. Observations: 23
Model: GLM Df Residuals: 21
Model Family: Binomial Df Model: 1
Link Function: logit Scale: 1.0000
Method: IRLS Log-Likelihood: -23.526
Date: Thu, 24 Sep 2020 Deviance: 18.086
Time: 12:33:06 Pearson chi2: 30.0
No. Iterations: 6 Covariance Type: nonrobust
\n", "\n", "\n", " \n", "\n", "\n", " \n", "\n", "\n", " \n", "\n", "
coef std err z P>|z| [0.025 0.975]
Intercept 5.0850 3.052 1.666 0.096 -0.898 11.068
Temperature -0.1156 0.047 -2.458 0.014 -0.208 -0.023
" ], "text/plain": [ "\n", "\"\"\"\n", " Generalized Linear Model Regression Results \n", "==============================================================================\n", "Dep. Variable: Occurence No. Observations: 23\n", "Model: GLM Df Residuals: 21\n", "Model Family: Binomial Df Model: 1\n", "Link Function: logit Scale: 1.0000\n", "Method: IRLS Log-Likelihood: -23.526\n", "Date: Thu, 24 Sep 2020 Deviance: 18.086\n", "Time: 12:33:06 Pearson chi2: 30.0\n", "No. Iterations: 6 Covariance Type: nonrobust\n", "===============================================================================\n", " coef std err z P>|z| [0.025 0.975]\n", "-------------------------------------------------------------------------------\n", "Intercept 5.0850 3.052 1.666 0.096 -0.898 11.068\n", "Temperature -0.1156 0.047 -2.458 0.014 -0.208 -0.023\n", "===============================================================================\n", "\"\"\"" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Constant regressor so that the GLM estimates an intercept term.\n", "d['Intercept'] = 1\n", "\n", "# Binomial GLM with a logit link, fitted on the per-row failure fraction\n", "# (Occurence = Malfunction / Count); var_weights=d['Count'] weights each\n", "# fraction by the Count it was divided by.\n", "LogisticModel = sm.GLM(d['Occurence'], d[['Intercept','Temperature']], family=sm.families.Binomial(sm.families.links.logit()), var_weights=d['Count']).fit()\n", "LogisticModel.summary()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Nous obtenons $G^2 = 18.086$ ainsi que les erreurs standards sur les coefficients $s_{\\hat{\\alpha}} = 3.052$ et $s_{\\hat{\\beta}} = 0.047$. Les résultats de l'article sont donc correctement reproductibles." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Probabilité d'accident" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Nous utilisons le modèle logistique afin de prévoir la probabilité d'occurrence d'un accident en fonction de la température initiale." 
] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Prediction grid: 61 evenly spaced temperatures from 30 to 90 (step 1),\n", "# plus the constant column the model was fitted with.\n", "dLogistic = pd.DataFrame({'Temperature': np.linspace(start=30, stop=90, num=61), 'Intercept': 1})\n", "\n", "# Predict on the grid's own rows, with the columns in the same order as the\n", "# design matrix used for the fit (Intercept, Temperature), so that every\n", "# probability is aligned with the temperature it was computed for.\n", "dLogistic['Occurence'] = LogisticModel.predict(dLogistic[['Intercept', 'Temperature']])\n", "print(dLogistic)" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "image/png": "\n", 
"text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "dLogistic.plot(x='Temperature', y='Occurence', kind='line')\n", "plt.scatter(x=d['Temperature'], y=d['Occurence'])\n", "plt.grid(True)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Bien que l'utilisation d'un tel modèle pour prédire la probabilité d'un accident soit hautement discutable (trop peu de points et plusieurs points pathologiques), il en résulte sûrement un résultat surestimé. Ainsi, pour une valeur de $T = 31$ °F, la probabilité d'accident serait très importante." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.4" } }, "nbformat": 4, "nbformat_minor": 4 }