diff --git a/src_Python3_challenger_Python_org.org b/src_Python3_challenger_Python_org.org new file mode 100644 index 0000000000000000000000000000000000000000..dfc1c784879aba4542bb138eb97f554fe032ccf4 --- /dev/null +++ b/src_Python3_challenger_Python_org.org @@ -0,0 +1,410 @@ +# -*- coding: utf-8 -*- +# -*- mode: org -*- +#+STARTUP: overview indent inlineimages logdrawer +#+LANGUAGE: en +#+OPTIONS: num:nil toc:t \n:nil @:t ::t |:t ^:nil -:t f:t *:t <:t +#+OPTIONS: TeX:t LaTeX:t skip:nil d:nil todo:t pri:nil tags:not-in-toc +#+OPTIONS: email:nil creator:nil timestamp:t +#+OPTIONS: title:nil + +#+TAGS: noexport(n) deprecated(d) nofinalexport(f) +#+EXPORT_SELECT_TAGS: export +#+EXPORT_EXCLUDE_TAGS: noexport +#+LATEX_HEADER: \usepackage{listings} + +#+TITLE: Challenger - Python - Emacs - Linux + +# # Define listing style for latex +#+begin_export latex +\lstdefinestyle{customc}{ + belowcaptionskip=1\baselineskip, + breaklines=true, + frame=L, + xleftmargin=\parindent, + language=python, + showstringspaces=false, + basicstyle=\small\ttfamily, + keywordstyle=\bfseries\color{green!40!black}, + commentstyle=\itshape\color{purple!40!black}, + identifierstyle=\color{blue}, + stringstyle=\color{orange}, +} + +\lstdefinestyle{customasm}{ + belowcaptionskip=1\baselineskip, + frame=L, + xleftmargin=\parindent, + language=[x86masm]Assembler, + basicstyle=\footnotesize\ttfamily, + commentstyle=\itshape\color{purple!40!black}, +} + +\lstset{escapechar=@,style=customc} +#+end_export + +* Risk Analysis of the Space Shuttle: Pre-Challenger Prediction of Failure + +In this document we reperform some of the analysis provided in +/Risk Analysis of the Space Shuttle: Pre-Challenger Prediction of +Failure/ by /Siddhartha R. Dalal, Edward B. Fowlkes, Bruce Hoadley/ +published in /Journal of the American Statistical Association/, Vol. 84, +No. 408 (Dec., 1989), pp. 945-957 and available at +http://www.jstor.org/stable/2290069. + +On the fourth page of this article, they indicate that the maximum +likelihood estimates of the logistic regression using only temperature +are: *$\hat{\alpha}$ = 5.085* and *$\hat{\beta}$ = -0.1156* and their +asymptotic standard errors are *$s_{\hat{\alpha}}$ = 3.052* and +*$s_{\hat{\beta}}$ = 0.047*. The Goodness of fit indicated for this model was +*$G^2$ = 18.086* with *21* degrees of freedom. Our goal is to reproduce +the computation behind these values and the Figure 4 of this article, +possibly in a nicer looking way. + +** Technical information on the computer on which the analysis is run + +We will be using the Python 3 language using the pandas, statsmodels, +and numpy library. + +#+begin_src python :results output :session :exports both +def print_imported_modules(): + import sys + for name, val in sorted(sys.modules.items()): + if(hasattr(val, '__version__')): + print(val.__name__, val.__version__) +# else: +# print(val.__name__, "(unknown version)") +def print_sys_info(): + import sys + import platform + print(sys.version) + print(platform.uname()) + +import numpy as np +import pandas as pd +import matplotlib.pyplot as plt +import statsmodels.api as sm + +print_sys_info() +print_imported_modules() +#+end_src + +#+RESULTS: +#+begin_example +/usr/lib/python3.10/site-packages/statsmodels/compat/pandas.py:65: FutureWarning: pandas.Int64Index is deprecated and will be removed from pandas in a future version. Use pandas.Index with the appropriate dtype instead. + from pandas import Int64Index as NumericIndex +3.10.2 (main, Jan 15 2022, 19:56:27) [GCC 11.1.0] +uname_result(system='Linux', node='dellarch', release='5.15.24-1-lts', version='#1 SMP Wed, 16 Feb 2022 16:04:21 +0000', machine='x86_64') +IPython 8.0.1 +IPython.core.release 8.0.1 +PIL 9.0.1 +PIL.Image 9.0.1 +PIL._version 9.0.1 +_csv 1.0 +_ctypes 1.1.0 +decimal 1.70 +argparse 1.1 +backcall 0.2.0 +bottleneck 1.3.2 +cffi 1.15.0 +colorama 0.4.4 +csv 1.0 +ctypes 1.1.0 +cycler 0.10.0 +dateutil 2.8.2 +decimal 1.70 +decorator 5.1.1 +defusedxml 0.7.1 +distutils 3.10.2 +executing 0.8.2 +executing.version 0.8.2 +jedi 0.17.2 +joblib 1.1.0 +joblib.externals.cloudpickle 2.0.0 +joblib.externals.loky 3.0.0 +json 2.0.9 +kiwisolver 1.3.2 +logging 0.5.1.2 +matplotlib 3.5.1 +matplotlib.backends.backend_qt 5.15.6 +matplotlib.backends.qt_compat 5.15.6 +matplotlib.backends.qt_editor._formlayout 1.0.10 +numexpr 2.7.3 +numpy 1.22.2 +numpy.core 1.22.2 +numpy.core._multiarray_umath 3.1 +numpy.lib 1.22.2 +numpy.linalg._umath_linalg 0.1.5 +numpy.version 1.22.2 +packaging 20.9 +packaging.__about__ 20.9 +pandas 1.4.1 +parso 0.7.1 +patsy 0.5.2 +patsy.version 0.5.2 +pexpect 4.8.0 +pickleshare 0.7.5 +platform 1.0.8 +prompt_toolkit 3.0.28 +psutil 5.9.0 +ptyprocess 0.7.0 +pure_eval 0.2.2 +pure_eval.version 0.2.2 +pygments 2.11.2 +pyparsing 2.4.7 +pytz 2021.3 +re 2.2.1 +scipy 1.8.0 +scipy._lib._uarray 0.8.2+14.gaf53966.scipy +scipy._lib.decorator 4.0.5 +scipy.integrate._dop 1.21.5 +scipy.integrate._lsoda 1.21.5 +scipy.integrate._ode $Id$ +scipy.integrate._odepack 1.9 +scipy.integrate._quadpack 1.13 +scipy.integrate._vode 1.21.5 +scipy.interpolate._fitpack 1.7 +scipy.interpolate.dfitpack 1.21.5 +scipy.linalg._fblas 1.21.5 +scipy.linalg._flapack 1.21.5 +scipy.linalg._flinalg 1.21.5 +scipy.linalg._interpolative 1.21.5 +scipy.optimize.__nnls 1.21.5 +scipy.optimize._cobyla 1.21.5 +scipy.optimize._lbfgsb 1.21.5 +scipy.optimize._minpack 1.10 +scipy.optimize._minpack2 1.21.5 +scipy.optimize._slsqp 1.21.5 +scipy.signal._spline 0.2 +scipy.sparse.linalg._eigen.arpack._arpack 1.21.5 +scipy.sparse.linalg._isolve._iterative 1.21.5 +scipy.special._specfun 1.21.5 +scipy.stats._mvn 1.21.5 +scipy.stats._statlib 1.21.5 +seaborn 0.11.2 +seaborn.external.husl 2.1.0 +six 1.16.0 +stack_data 0.2.0 +stack_data.version 0.2.0 +statsmodels 0.13.1 +statsmodels.__init__ 0.13.1 +statsmodels.api 0.13.1 +statsmodels.tools.web 0.13.1 +traitlets 5.1.1 +traitlets._version 5.1.1 +urllib.request 3.10 +wcwidth 0.2.5 +zlib 1.0 +#+end_example + +** Loading and inspecting data + +Let's start by reading data. + +#+begin_src python :results output :session :exports both +data = pd.read_csv("https://app-learninglab.inria.fr/moocrr/gitlab/moocrr-session3/moocrr-reproducibility-study/raw/master/data/shuttle.csv") +print(data) +#+end_src + +#+RESULTS: +#+begin_example + Date Count Temperature Pressure Malfunction +0 4/12/81 6 66 50 0 +1 11/12/81 6 70 50 1 +2 3/22/82 6 69 50 0 +3 11/11/82 6 68 50 0 +4 4/04/83 6 67 50 0 +5 6/18/82 6 72 50 0 +6 8/30/83 6 73 100 0 +7 11/28/83 6 70 100 0 +8 2/03/84 6 57 200 1 +9 4/06/84 6 63 200 1 +10 8/30/84 6 70 200 1 +11 10/05/84 6 78 200 0 +12 11/08/84 6 67 200 0 +13 1/24/85 6 53 200 2 +14 4/12/85 6 67 200 0 +15 4/29/85 6 75 200 0 +16 6/17/85 6 70 200 0 +17 7/2903/85 6 81 200 0 +18 8/27/85 6 76 200 0 +19 10/03/85 6 79 200 0 +20 10/30/85 6 75 200 2 +21 11/26/85 6 76 200 0 +22 1/12/86 6 58 200 1 +#+end_example + +** Remark :noexport: + here I had to change the url, the proposed url was not raw and + tried to download the html. + +We know from our previous experience on this data set that filtering +data is a really bad idea. We will therefore process it as such. + +#+begin_src python :results file :session :var matplot_lib_filename=(org-babel-temp-file "figure" ".png") :exports both +pd.set_option('mode.chained_assignment',None) # this removes a useless warning from pandas + +data["Frequency"]=data.Malfunction/data.Count +data.plot(x="Temperature",y="Frequency",kind="scatter",ylim=[0,1]) +plt.grid(True) +plt.tight_layout() + +plt.savefig(matplot_lib_filename) +matplot_lib_filename +#+end_src + +#+RESULTS: +[[file:/tmp/babel-3OurQA/figureQCD7m9.png]] + +** Logistic regression + +Let's assume O-rings independently fail with the same probability +which solely depends on temperature. A logistic regression should +allow us to estimate the influence of temperature. + +#+begin_src python :results output :session :exports both +import statsmodels.api as sm + +data["Success"]=data.Count-data.Malfunction +data["Intercept"]=1 + +logmodel=sm.GLM(data['Frequency'], data[['Intercept','Temperature']], + family=sm.families.Binomial(sm.families.links.logit())).fit() + +print(logmodel.summary()) +#+end_src + +#+RESULTS: +#+begin_example + Generalized Linear Model Regression Results +============================================================================== +Dep. Variable: Frequency No. Observations: 23 +Model: GLM Df Residuals: 21 +Model Family: Binomial Df Model: 1 +Link Function: logit Scale: 1.0000 +Method: IRLS Log-Likelihood: -3.9210 +Date: dim., 27 mars 2022 Deviance: 3.0144 +Time: 11:33:54 Pearson chi2: 5.00 +No. Iterations: 6 Pseudo R-squ. (CS): 0.04355 +Covariance Type: nonrobust +=============================================================================== + coef std err z P>|z| [0.025 0.975] +------------------------------------------------------------------------------- +Intercept 5.0850 7.477 0.680 0.496 -9.570 19.740 +Temperature -0.1156 0.115 -1.004 0.316 -0.341 0.110 +=============================================================================== +#+end_example + +** Remark :noexport: + I changed the code in the last cell because statsmodel API + changed. Now the link function has to be called before being given + as argument. + +The maximum likelyhood estimator of the intercept and of Temperature +are thus *$\hat{\alpha}$ = 5.0850* and *$\hat{\beta}$ = -0.1156*. This *corresponds* +to the values from the article of Dalal /et al./ The standard errors are +/$s_{\hat{\alpha}}$ = 7.477/ and /$s_{\hat{\beta}}$ = 0.115/, which is *different* from +the *3.052* and *0.04702* reported by Dallal /et al./ The deviance is +/3.01444/ with *21* degrees of freedom. I cannot find any value similar +to the Goodness of fit (*$G^2$ = 18.086*) reported by Dalal /et al./ There +seems to be something wrong. Oh I know, I haven't indicated that my +observations are actually the result of 6 observations for each rocket +launch. Let's indicate these weights (since the weights are always the +same throughout all experiments, it does not change the estimates of +the fit but it does influence the variance estimates). + +#+begin_src python :results output :session :exports both +logmodel=sm.GLM(data['Frequency'], data[['Intercept','Temperature']], + family=sm.families.Binomial(sm.families.links.logit()), + var_weights=data['Count']).fit() + +print(logmodel.summary()) +#+end_src + +#+RESULTS: +#+begin_example + Generalized Linear Model Regression Results +============================================================================== +Dep. Variable: Frequency No. Observations: 23 +Model: GLM Df Residuals: 21 +Model Family: Binomial Df Model: 1 +Link Function: logit Scale: 1.0000 +Method: IRLS Log-Likelihood: -23.526 +Date: dim., 27 mars 2022 Deviance: 18.086 +Time: 12:07:08 Pearson chi2: 30.0 +No. Iterations: 6 Pseudo R-squ. (CS): 0.2344 +Covariance Type: nonrobust +=============================================================================== + coef std err z P>|z| [0.025 0.975] +------------------------------------------------------------------------------- +Intercept 5.0850 3.052 1.666 0.096 -0.898 11.068 +Temperature -0.1156 0.047 -2.458 0.014 -0.208 -0.023 +=============================================================================== +#+end_example + +** Remark :noexport: + I changed the code in the last cell because statsmodel API. + +Good, now I have recovered the asymptotic standard errors +*$s_{\hat{\alpha}}$ = 3.052* and *$s_{\hat{\beta}}$ = 0.047*. The Goodness of fit +(Deviance) indicated for this model is *$G^2$ = 18.086* with *21* degrees +of freedom (Df Residuals). + +*I have therefore managed to fully replicate the results of the Dalal +/et al./ article*. + +** Predicting failure probability + +The temperature when launching the shuttle was 31°F. Let's try to +estimate the failure probability for such temperature using our model: + +#+begin_src python :results output :session :exports both +X = np.linspace(start=30, stop=90, num=121) +res =logmodel.get_prediction(sm.add_constant(X)) +predictions = res.summary_frame(alpha=0.05) +print(predictions.head()) +#+end_src + +#+RESULTS: +: mean mean_se mean_ci_lower mean_ci_upper +: 0 0.834373 0.229330 0.163069 0.992381 +: 1 0.826230 0.234961 0.161330 0.991563 +: 2 0.817774 0.240453 0.159601 0.990658 +: 3 0.809002 0.245782 0.157884 0.989658 +: 4 0.799911 0.250921 0.156176 0.988552 + +#+begin_src python :results file :session :var matplot_lib_filename=(org-babel-temp-file "figure" ".png") :exports both + +import matplotlib.pyplot as plt + +plt.scatter(x=data["Temperature"],y=data["Frequency"]) +plt.plot(X,predictions['mean'], label='Estimation') +plt.plot(X,predictions['mean_ci_lower'], color="red", label='Ci', linewidth=0.5) +plt.plot(X,predictions['mean_ci_upper'], color="red", linewidth=0.5) + +plt.legend() +plt.grid(True) + +plt.savefig(matplot_lib_filename) +matplot_lib_filename +#+end_src + +#+RESULTS: +[[file:/tmp/babel-3OurQA/figurehOIHB0.png]] + +This figure is very similar to the Figure 4 of Dalal /et al./ *I have +managed to replicate the Figure 4 of the Dalal /et al./ article.* + + +**I think I have managed to correctly compute and plot the uncertainty + of my prediction.** + Contrary to the solution proposed by the mooc, the point 63 is not + outside the confidence interval for me, I have a closer match to + what is obtained with R. This is likely due to the difference + between parametric confidence intervals (stasmodel and R confint) + versus non-parametric confidence intervals (seaborn confidence + interval). It seems logical that the non-parametric version is a + little wider. + + + + diff --git a/src_Python3_challenger_Python_org.pdf b/src_Python3_challenger_Python_org.pdf new file mode 100644 index 0000000000000000000000000000000000000000..3485c678861d8bff3079151feb4f3f030bcb1038 Binary files /dev/null and b/src_Python3_challenger_Python_org.pdf differ