# -*- coding: utf-8 -*-
# -*- mode: org -*-
#+STARTUP: overview indent inlineimages logdrawer
#+LANGUAGE: en
#+OPTIONS: num:nil toc:t \n:nil @:t ::t |:t ^:nil -:t f:t *:t <:t
#+OPTIONS: TeX:t LaTeX:t skip:nil d:nil todo:t pri:nil tags:not-in-toc
#+OPTIONS: email:nil creator:nil timestamp:t
#+OPTIONS: title:nil
#+TAGS: noexport(n) deprecated(d) nofinalexport(f)
#+EXPORT_SELECT_TAGS: export
#+EXPORT_EXCLUDE_TAGS: noexport
#+LATEX_HEADER: \usepackage{listings}
#+TITLE: Challenger - Python - Emacs - Linux
# Define listing style for LaTeX
#+begin_export latex
\lstdefinestyle{customc}{
belowcaptionskip=1\baselineskip,
breaklines=true,
frame=L,
xleftmargin=\parindent,
language=python,
showstringspaces=false,
basicstyle=\small\ttfamily,
keywordstyle=\bfseries\color{green!40!black},
commentstyle=\itshape\color{purple!40!black},
identifierstyle=\color{blue},
stringstyle=\color{orange},
}
\lstdefinestyle{customasm}{
belowcaptionskip=1\baselineskip,
frame=L,
xleftmargin=\parindent,
language=[x86masm]Assembler,
basicstyle=\footnotesize\ttfamily,
commentstyle=\itshape\color{purple!40!black},
}
\lstset{escapechar=@,style=customc}
#+end_export
* Risk Analysis of the Space Shuttle: Pre-Challenger Prediction of Failure
In this document we redo part of the analysis provided in
/Risk Analysis of the Space Shuttle: Pre-Challenger Prediction of
Failure/ by /Siddhartha R. Dalal, Edward B. Fowlkes, and Bruce Hoadley/,
published in the /Journal of the American Statistical Association/,
Vol. 84, No. 408 (Dec. 1989), pp. 945-957, and available at
http://www.jstor.org/stable/2290069.
On the fourth page of this article, the authors indicate that the
maximum likelihood estimates of the logistic regression using only
temperature are *$\hat{\alpha}$ = 5.085* and *$\hat{\beta}$ = -0.1156*, and that their
asymptotic standard errors are *$s_{\hat{\alpha}}$ = 3.052* and
*$s_{\hat{\beta}}$ = 0.047*. The goodness of fit indicated for this model is
*$G^2$ = 18.086* with *21* degrees of freedom. Our goal is to reproduce
the computation behind these values as well as Figure 4 of this
article, possibly in a nicer-looking way.
** Technical information on the computer on which the analysis is run
We will be using Python 3 with the numpy, pandas, statsmodels,
matplotlib, and seaborn libraries.
#+begin_src python :results output :session :exports both
def print_imported_modules():
    import sys
    for name, val in sorted(sys.modules.items()):
        if hasattr(val, '__version__'):
            print(val.__name__, val.__version__)
        # else:
        #     print(val.__name__, "(unknown version)")

def print_sys_info():
    import sys
    import platform
    print(sys.version)
    print(platform.uname())

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm
import seaborn as sns

print_sys_info()
print_imported_modules()
#+end_src
#+RESULTS:
#+begin_example
/usr/lib/python3.10/site-packages/statsmodels/compat/pandas.py:65: FutureWarning: pandas.Int64Index is deprecated and will be removed from pandas in a future version. Use pandas.Index with the appropriate dtype instead.
from pandas import Int64Index as NumericIndex
3.10.2 (main, Jan 15 2022, 19:56:27) [GCC 11.1.0]
uname_result(system='Linux', node='dellarch', release='5.15.24-1-lts', version='#1 SMP Wed, 16 Feb 2022 16:04:21 +0000', machine='x86_64')
IPython 8.0.1
IPython.core.release 8.0.1
PIL 9.0.1
PIL.Image 9.0.1
PIL._version 9.0.1
_csv 1.0
_ctypes 1.1.0
decimal 1.70
argparse 1.1
backcall 0.2.0
bottleneck 1.3.2
cffi 1.15.0
colorama 0.4.4
csv 1.0
ctypes 1.1.0
cycler 0.10.0
dateutil 2.8.2
decimal 1.70
decorator 5.1.1
defusedxml 0.7.1
distutils 3.10.2
executing 0.8.2
executing.version 0.8.2
jedi 0.17.2
joblib 1.1.0
joblib.externals.cloudpickle 2.0.0
joblib.externals.loky 3.0.0
json 2.0.9
kiwisolver 1.3.2
logging 0.5.1.2
matplotlib 3.5.1
matplotlib.backends.backend_qt 5.15.6
matplotlib.backends.qt_compat 5.15.6
matplotlib.backends.qt_editor._formlayout 1.0.10
numexpr 2.7.3
numpy 1.22.2
numpy.core 1.22.2
numpy.core._multiarray_umath 3.1
numpy.lib 1.22.2
numpy.linalg._umath_linalg 0.1.5
numpy.version 1.22.2
packaging 20.9
packaging.__about__ 20.9
pandas 1.4.1
parso 0.7.1
patsy 0.5.2
patsy.version 0.5.2
pexpect 4.8.0
pickleshare 0.7.5
platform 1.0.8
prompt_toolkit 3.0.28
psutil 5.9.0
ptyprocess 0.7.0
pure_eval 0.2.2
pure_eval.version 0.2.2
pygments 2.11.2
pyparsing 2.4.7
pytz 2021.3
re 2.2.1
scipy 1.8.0
scipy._lib._uarray 0.8.2+14.gaf53966.scipy
scipy._lib.decorator 4.0.5
scipy.integrate._dop 1.21.5
scipy.integrate._lsoda 1.21.5
scipy.integrate._ode $Id$
scipy.integrate._odepack 1.9
scipy.integrate._quadpack 1.13
scipy.integrate._vode 1.21.5
scipy.interpolate._fitpack 1.7
scipy.interpolate.dfitpack 1.21.5
scipy.linalg._fblas 1.21.5
scipy.linalg._flapack 1.21.5
scipy.linalg._flinalg 1.21.5
scipy.linalg._interpolative 1.21.5
scipy.optimize.__nnls 1.21.5
scipy.optimize._cobyla 1.21.5
scipy.optimize._lbfgsb 1.21.5
scipy.optimize._minpack 1.10
scipy.optimize._minpack2 1.21.5
scipy.optimize._slsqp 1.21.5
scipy.signal._spline 0.2
scipy.sparse.linalg._eigen.arpack._arpack 1.21.5
scipy.sparse.linalg._isolve._iterative 1.21.5
scipy.special._specfun 1.21.5
scipy.stats._mvn 1.21.5
scipy.stats._statlib 1.21.5
seaborn 0.11.2
seaborn.external.husl 2.1.0
six 1.16.0
stack_data 0.2.0
stack_data.version 0.2.0
statsmodels 0.13.1
statsmodels.__init__ 0.13.1
statsmodels.api 0.13.1
statsmodels.tools.web 0.13.1
traitlets 5.1.1
traitlets._version 5.1.1
urllib.request 3.10
wcwidth 0.2.5
zlib 1.0
#+end_example
** Loading and inspecting data
Let's start by reading the data.
#+begin_src python :results output :session :exports both
data = pd.read_csv("https://app-learninglab.inria.fr/moocrr/gitlab/moocrr-session3/moocrr-reproducibility-study/raw/master/data/shuttle.csv")
print(data)
#+end_src
#+RESULTS:
#+begin_example
Date Count Temperature Pressure Malfunction
0 4/12/81 6 66 50 0
1 11/12/81 6 70 50 1
2 3/22/82 6 69 50 0
3 11/11/82 6 68 50 0
4 4/04/83 6 67 50 0
5 6/18/82 6 72 50 0
6 8/30/83 6 73 100 0
7 11/28/83 6 70 100 0
8 2/03/84 6 57 200 1
9 4/06/84 6 63 200 1
10 8/30/84 6 70 200 1
11 10/05/84 6 78 200 0
12 11/08/84 6 67 200 0
13 1/24/85 6 53 200 2
14 4/12/85 6 67 200 0
15 4/29/85 6 75 200 0
16 6/17/85 6 70 200 0
17 7/2903/85 6 81 200 0
18 8/27/85 6 76 200 0
19 10/03/85 6 79 200 0
20 10/30/85 6 75 200 2
21 11/26/85 6 76 200 0
22 1/12/86 6 58 200 1
#+end_example
** Remark :noexport:
Here I had to change the URL: the one proposed was not the raw file
and retrieved the HTML page instead.
We know from our previous experience with this data set that
filtering the data is a really bad idea. We will therefore process
it in full.
#+begin_src python :results file :session :var matplot_lib_filename=(org-babel-temp-file "figure" ".png") :exports both
pd.set_option('mode.chained_assignment', None)  # removes a useless warning from pandas
data["Frequency"] = data.Malfunction / data.Count
data.plot(x="Temperature", y="Frequency", kind="scatter", ylim=[0, 1])
plt.grid(True)
plt.tight_layout()
plt.savefig(matplot_lib_filename)
matplot_lib_filename
#+end_src
#+RESULTS:
[[file:/tmp/babel-3OurQA/figureQCD7m9.png]]
** Logistic regression
Let's assume O-rings independently fail with the same probability
which solely depends on temperature. A logistic regression should
allow us to estimate the influence of temperature.
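Under this assumption, the number of failing O-rings among the $n = 6$
of a launch at temperature $t$ follows a binomial distribution whose
failure probability $p(t)$ depends on temperature through the logit
link:
\[
\log\frac{p(t)}{1 - p(t)} = \alpha + \beta t,
\qquad\text{i.e.}\qquad
p(t) = \frac{1}{1 + e^{-(\alpha + \beta t)}}.
\]
The $\hat{\alpha}$ and $\hat{\beta}$ reported by Dalal /et al./ are the
maximum likelihood estimates of these two coefficients.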
#+begin_src python :results output :session :exports both
import statsmodels.api as sm
data["Success"] = data.Count - data.Malfunction
data["Intercept"] = 1
logmodel = sm.GLM(data['Frequency'], data[['Intercept', 'Temperature']],
                  family=sm.families.Binomial(sm.families.links.logit())).fit()
print(logmodel.summary())
#+end_src
#+RESULTS:
#+begin_example
                 Generalized Linear Model Regression Results
==============================================================================
Dep. Variable:              Frequency   No. Observations:                   23
Model:                            GLM   Df Residuals:                       21
Model Family:                Binomial   Df Model:                            1
Link Function:                  logit   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:                -3.9210
Date:              dim., 27 mars 2022   Deviance:                       3.0144
Time:                        11:33:54   Pearson chi2:                     5.00
No. Iterations:                     6   Pseudo R-squ. (CS):            0.04355
Covariance Type:            nonrobust
===============================================================================
                  coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------
Intercept       5.0850      7.477      0.680      0.496      -9.570      19.740
Temperature    -0.1156      0.115     -1.004      0.316      -0.341       0.110
===============================================================================
#+end_example
** Remark :noexport:
I changed the code in the previous cell because the statsmodels API
changed: the link function now has to be instantiated (called) before
being passed as an argument.
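A minimal sketch of the difference (the old form is my reconstruction
of the pre-change API, so treat it as an assumption rather than a
quote of the old documentation):
#+begin_src python :results output :session :exports both
# Old form (no longer accepted): pass the link class itself.
# family = sm.families.Binomial(sm.families.links.logit)
# New form: instantiate the link before passing it.
family = sm.families.Binomial(sm.families.links.logit())
print(family.link)
#+end_src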
The maximum likelihood estimates of the intercept and of the
Temperature coefficient are thus *$\hat{\alpha}$ = 5.0850* and *$\hat{\beta}$ = -0.1156*.
This *corresponds* to the values from the article of Dalal /et al./
The standard errors, however, are /$s_{\hat{\alpha}}$ = 7.477/ and
/$s_{\hat{\beta}}$ = 0.115/, which is *different* from the *3.052* and
*0.04702* reported by Dalal /et al./ The deviance is /3.01444/ with
*21* degrees of freedom; I cannot find any value similar to the
goodness of fit (*$G^2$ = 18.086*) reported by Dalal /et al./ There
seems to be something wrong. Oh I know: I haven't indicated that each
of my observations is actually the aggregate of 6 O-ring observations
per rocket launch. Let's indicate these weights (since the weights
are the same across all launches, they do not change the estimates of
the fit, but they do influence the variance estimates).
#+begin_src python :results output :session :exports both
logmodel = sm.GLM(data['Frequency'], data[['Intercept', 'Temperature']],
                  family=sm.families.Binomial(sm.families.links.logit()),
                  var_weights=data['Count']).fit()
print(logmodel.summary())
#+end_src
#+RESULTS:
#+begin_example
                 Generalized Linear Model Regression Results
==============================================================================
Dep. Variable:              Frequency   No. Observations:                   23
Model:                            GLM   Df Residuals:                       21
Model Family:                Binomial   Df Model:                            1
Link Function:                  logit   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:                -23.526
Date:              dim., 27 mars 2022   Deviance:                       18.086
Time:                        12:07:08   Pearson chi2:                     30.0
No. Iterations:                     6   Pseudo R-squ. (CS):             0.2344
Covariance Type:            nonrobust
===============================================================================
                  coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------
Intercept       5.0850      3.052      1.666      0.096      -0.898      11.068
Temperature    -0.1156      0.047     -2.458      0.014      -0.208      -0.023
===============================================================================
#+end_example
** Remark :noexport:
I changed the code in the previous cell for the same reason: the
statsmodels API changed.
Good, now I have recovered the asymptotic standard errors
*$s_{\hat{\alpha}}$ = 3.052* and *$s_{\hat{\beta}}$ = 0.047*. The Goodness of fit
(Deviance) indicated for this model is *$G^2$ = 18.086* with *21* degrees
of freedom (Df Residuals).
*I have therefore managed to fully replicate the results of the Dalal
/et al./ article*.
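As a cross-check, the same fit can be obtained without weights by
handing the raw counts to statsmodels, since a binomial GLM also
accepts a two-column response of the form (events, non-events). This
is only a sanity-check sketch; it should reproduce the coefficients
and the deviance of the weighted model above.
#+begin_src python :results output :session :exports both
# Binomial GLM on raw counts: first column = malfunctions (events),
# second column = successes (non-events). Should match the weighted fit.
countmodel = sm.GLM(data[['Malfunction', 'Success']],
                    data[['Intercept', 'Temperature']],
                    family=sm.families.Binomial(sm.families.links.logit())).fit()
print(countmodel.params)
print(countmodel.deviance)
#+end_src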
** Predicting failure probability
The temperature when launching the shuttle was 31°F. Let's try to
estimate the failure probability for such a temperature using our model:
#+begin_src python :results output :session :exports both
X = np.linspace(start=30, stop=90, num=121)
res = logmodel.get_prediction(sm.add_constant(X))
predictions = res.summary_frame(alpha=0.05)
print(predictions.head())
#+end_src
#+RESULTS:
: mean mean_se mean_ci_lower mean_ci_upper
: 0 0.834373 0.229330 0.163069 0.992381
: 1 0.826230 0.234961 0.161330 0.991563
: 2 0.817774 0.240453 0.159601 0.990658
: 3 0.809002 0.245782 0.157884 0.989658
: 4 0.799911 0.250921 0.156176 0.988552
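The grid starts at 30°F with a step of 0.5°F, so the launch
temperature of 31°F corresponds to row 2 of the table above. Here is
a minimal sketch to read this value off explicitly (the direct
=predict= call is equivalent); both should print the 0.8178 visible
in row 2, i.e., each O-ring is predicted to fail with probability of
roughly 82% at 31°F.
#+begin_src python :results output :session :exports both
# Predicted failure probability at the launch temperature of 31°F.
# Row 2 of the prediction grid is X = 31 (grid starts at 30, step 0.5).
print(predictions['mean'][2])
# Equivalent direct evaluation of the fitted model at t = 31:
print(logmodel.predict([[1, 31]]))  # columns: [Intercept, Temperature]
#+end_src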
#+begin_src python :results file :session :var matplot_lib_filename=(org-babel-temp-file "figure" ".png") :exports both
import matplotlib.pyplot as plt
plt.scatter(x=data["Temperature"], y=data["Frequency"])
plt.plot(X, predictions['mean'], label='Estimation')
plt.plot(X, predictions['mean_ci_lower'], color="red", label='Ci', linewidth=0.5)
plt.plot(X, predictions['mean_ci_upper'], color="red", linewidth=0.5)
plt.legend()
plt.grid(True)
plt.savefig(matplot_lib_filename)
matplot_lib_filename
#+end_src
#+RESULTS:
[[file:/tmp/babel-3OurQA/figurehOIHB0.png]]
This figure is very similar to Figure 4 of Dalal /et al./ *I have
managed to replicate the Figure 4 of the Dalal /et al./ article.*
*I think I have managed to correctly compute and plot the uncertainty
of my prediction.*
Contrary to the solution proposed by the MOOC, the point at 63°F is
not outside the confidence interval for me; I get a closer match to
what is obtained with R. This is likely due to the difference between
parametric confidence intervals (statsmodels and R's =confint=) and
non-parametric confidence intervals (seaborn's bootstrap confidence
band). It seems logical that the non-parametric version is a little
wider.
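For comparison, here is a minimal sketch of the non-parametric
version, assuming seaborn's =regplot= with =logistic=True= (which
bootstraps the confidence band; =n_boot= and =seed= are arbitrary
choices of mine, not values from the MOOC solution):
#+begin_src python :results file :session :var matplot_lib_filename=(org-babel-temp-file "figure" ".png") :exports both
# Bootstrap (non-parametric) confidence band around the logistic fit,
# as computed by seaborn; expected to be a little wider than the
# parametric band obtained with statsmodels above.
sns.regplot(x=data["Temperature"], y=data["Frequency"],
            logistic=True, n_boot=1000, seed=42)
plt.ylim(0, 1)
plt.grid(True)
plt.tight_layout()
plt.savefig(matplot_lib_filename)
matplot_lib_filename
#+end_src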