From 972e5ef11d1e55a0f00557eb1dbb7e1dc5b246e1 Mon Sep 17 00:00:00 2001 From: Lisa Fourtune Date: Wed, 12 Jan 2022 16:53:12 +0100 Subject: [PATCH] Version html de l'analyse --- module4/MonAnalyse.html | 434 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 434 insertions(+) create mode 100644 module4/MonAnalyse.html diff --git a/module4/MonAnalyse.html b/module4/MonAnalyse.html new file mode 100644 index 0000000..49f2909 --- /dev/null +++ b/module4/MonAnalyse.html @@ -0,0 +1,434 @@ + + + + + + + + + + + + + + + +Risk Analysis of the Space Shuttle: Pre-Challenger Prediction of Failure + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + + + + +

In this document we reperform some of the analysis provided in Risk Analysis of the Space Shuttle: Pre-Challenger Prediction of Failure by Siddhartha R. Dalal, Edward B. Fowlkes, Bruce Hoadley published in Journal of the American Statistical Association, Vol. 84, No. 408 (Dec., 1989), pp. 945-957 and available here.

+

On the fourth page of this article, they indicate that the maximum likelihood estimates of the logistic regression using only temperature are: \(\hat{\alpha}=5.085\) and \(\hat{\beta}=-0.1156\) and their asymptotic standard errors are \(s_{\hat{\alpha}}=3.052\) and \(s_{\hat{\beta}}=0.047\). The Goodness of fit indicated for this model was \(G^2=18.086\) with 21 degrees of freedom. Our goal is to reproduce the computation behind these values and the Figure 4 of this article, possibly in a nicer looking way.

+
+

Technical information on the computer on which the analysis is run

+

We will be using the R language using the ggplot2 library.

+
library(ggplot2)
+sessionInfo()
+
## R version 4.1.2 (2021-11-01)
+## Platform: x86_64-w64-mingw32/x64 (64-bit)
+## Running under: Windows 10 x64 (build 19043)
+## 
+## Matrix products: default
+## 
+## locale:
+## [1] LC_COLLATE=French_France.1252  LC_CTYPE=French_France.1252   
+## [3] LC_MONETARY=French_France.1252 LC_NUMERIC=C                  
+## [5] LC_TIME=French_France.1252    
+## 
+## attached base packages:
+## [1] stats     graphics  grDevices utils     datasets  methods   base     
+## 
+## other attached packages:
+## [1] ggplot2_3.3.5
+## 
+## loaded via a namespace (and not attached):
+##  [1] knitr_1.37       magrittr_2.0.1   munsell_0.5.0    colorspace_2.0-2
+##  [5] R6_2.5.1         rlang_0.4.12     fastmap_1.1.0    fansi_0.5.0     
+##  [9] stringr_1.4.0    tools_4.1.2      grid_4.1.2       gtable_0.3.0    
+## [13] xfun_0.29        utf8_1.2.2       withr_2.4.3      jquerylib_0.1.4 
+## [17] htmltools_0.5.2  ellipsis_0.3.2   yaml_2.2.1       digest_0.6.29   
+## [21] tibble_3.1.6     lifecycle_1.0.1  crayon_1.4.2     vctrs_0.3.8     
+## [25] glue_1.6.0       evaluate_0.14    rmarkdown_2.11   stringi_1.7.6   
+## [29] compiler_4.1.2   pillar_1.6.4     scales_1.1.1     pkgconfig_2.0.3
+

Here are the available libraries

+
devtools::session_info()
+
## - Session info ---------------------------------------------------------------
+##  setting  value
+##  version  R version 4.1.2 (2021-11-01)
+##  os       Windows 10 x64 (build 19043)
+##  system   x86_64, mingw32
+##  ui       RTerm
+##  language (EN)
+##  collate  French_France.1252
+##  ctype    French_France.1252
+##  tz       Europe/Paris
+##  date     2022-01-12
+##  pandoc   2.14.0.3 @ C:/Program Files/RStudio/bin/pandoc/ (via rmarkdown)
+## 
+## - Packages -------------------------------------------------------------------
+##  package     * version date (UTC) lib source
+##  cachem        1.0.6   2021-08-19 [1] CRAN (R 4.1.2)
+##  callr         3.7.0   2021-04-20 [1] CRAN (R 4.1.2)
+##  cli           3.1.0   2021-10-27 [1] CRAN (R 4.1.2)
+##  colorspace    2.0-2   2021-06-24 [1] CRAN (R 4.1.2)
+##  crayon        1.4.2   2021-10-29 [1] CRAN (R 4.1.2)
+##  desc          1.4.0   2021-09-28 [1] CRAN (R 4.1.2)
+##  devtools      2.4.3   2021-11-30 [1] CRAN (R 4.1.2)
+##  digest        0.6.29  2021-12-01 [1] CRAN (R 4.1.2)
+##  ellipsis      0.3.2   2021-04-29 [1] CRAN (R 4.1.2)
+##  evaluate      0.14    2019-05-28 [1] CRAN (R 4.1.2)
+##  fansi         0.5.0   2021-05-25 [1] CRAN (R 4.1.2)
+##  fastmap       1.1.0   2021-01-25 [1] CRAN (R 4.1.2)
+##  fs            1.5.2   2021-12-08 [1] CRAN (R 4.1.2)
+##  ggplot2     * 3.3.5   2021-06-25 [1] CRAN (R 4.1.2)
+##  glue          1.6.0   2021-12-17 [1] CRAN (R 4.1.2)
+##  gtable        0.3.0   2019-03-25 [1] CRAN (R 4.1.2)
+##  htmltools     0.5.2   2021-08-25 [1] CRAN (R 4.1.2)
+##  jquerylib     0.1.4   2021-04-26 [1] CRAN (R 4.1.2)
+##  knitr         1.37    2021-12-16 [1] CRAN (R 4.1.2)
+##  lifecycle     1.0.1   2021-09-24 [1] CRAN (R 4.1.2)
+##  magrittr      2.0.1   2020-11-17 [1] CRAN (R 4.1.2)
+##  memoise       2.0.1   2021-11-26 [1] CRAN (R 4.1.2)
+##  munsell       0.5.0   2018-06-12 [1] CRAN (R 4.1.2)
+##  pillar        1.6.4   2021-10-18 [1] CRAN (R 4.1.2)
+##  pkgbuild      1.3.1   2021-12-20 [1] CRAN (R 4.1.2)
+##  pkgconfig     2.0.3   2019-09-22 [1] CRAN (R 4.1.2)
+##  pkgload       1.2.4   2021-11-30 [1] CRAN (R 4.1.2)
+##  prettyunits   1.1.1   2020-01-24 [1] CRAN (R 4.1.2)
+##  processx      3.5.2   2021-04-30 [1] CRAN (R 4.1.2)
+##  ps            1.6.0   2021-02-28 [1] CRAN (R 4.1.2)
+##  purrr         0.3.4   2020-04-17 [1] CRAN (R 4.1.2)
+##  R6            2.5.1   2021-08-19 [1] CRAN (R 4.1.2)
+##  remotes       2.4.2   2021-11-30 [1] CRAN (R 4.1.2)
+##  rlang         0.4.12  2021-10-18 [1] CRAN (R 4.1.2)
+##  rmarkdown     2.11    2021-09-14 [1] CRAN (R 4.1.2)
+##  rprojroot     2.0.2   2020-11-15 [1] CRAN (R 4.1.2)
+##  rstudioapi    0.13    2020-11-12 [1] CRAN (R 4.1.2)
+##  scales        1.1.1   2020-05-11 [1] CRAN (R 4.1.2)
+##  sessioninfo   1.2.2   2021-12-06 [1] CRAN (R 4.1.2)
+##  stringi       1.7.6   2021-11-29 [1] CRAN (R 4.1.2)
+##  stringr       1.4.0   2019-02-10 [1] CRAN (R 4.1.2)
+##  testthat      3.1.1   2021-12-03 [1] CRAN (R 4.1.2)
+##  tibble        3.1.6   2021-11-07 [1] CRAN (R 4.1.2)
+##  usethis       2.1.5   2021-12-09 [1] CRAN (R 4.1.2)
+##  utf8          1.2.2   2021-07-24 [1] CRAN (R 4.1.2)
+##  vctrs         0.3.8   2021-04-29 [1] CRAN (R 4.1.2)
+##  withr         2.4.3   2021-11-30 [1] CRAN (R 4.1.2)
+##  xfun          0.29    2021-12-14 [1] CRAN (R 4.1.2)
+##  yaml          2.2.1   2020-02-01 [1] CRAN (R 4.1.1)
+## 
+##  [1] D:/docs_nfourtun/R/win-library/4.1
+##  [2] C:/Program Files/R/R-4.1.2/library
+## 
+## ------------------------------------------------------------------------------
+
+
+

Loading and inspecting data

+

Let’s start by reading data:

+
data = read.csv("https://app-learninglab.inria.fr/moocrr/gitlab/moocrr-session3/moocrr-reproducibility-study/raw/master/data/shuttle.csv?inline=false",header=T)
+data
+
+ +
+

Let’s visually inspect how temperature affects malfunction:

+
plot(data=data, Malfunction/Count ~ Temperature, ylim=c(0,1))
+

+
+
+

Logistic regression

+

I’ll only work on the raw data, because I find it more elegant than working on ratios. Let’s provide the “raw” data to R.

+
data_flat=data.frame()
+for(i in 1:nrow(data)) {
+  temperature = data[i,"Temperature"];
+  malfunction = data[i,"Malfunction"];
+  d = data.frame(Temperature=temperature,Malfunction=rep(0,times = data[i,"Count"]))
+  if(malfunction>0) {
+      d[1:malfunction, "Malfunction"]=1;
+  }
+  data_flat=rbind(data_flat,d)
+}
+dim(data_flat)
+
## [1] 138   2
+
str(data_flat)
+
## 'data.frame':    138 obs. of  2 variables:
+##  $ Temperature: int  66 66 66 66 66 66 70 70 70 70 ...
+##  $ Malfunction: num  0 0 0 0 0 0 1 0 0 0 ...
+

Computing regression parameters.

+
logistic_reg_flat = glm(data=data_flat, Malfunction ~ Temperature, family=binomial(link='logit'))
+summary(logistic_reg_flat)
+
## 
+## Call:
+## glm(formula = Malfunction ~ Temperature, family = binomial(link = "logit"), 
+##     data = data_flat)
+## 
+## Deviance Residuals: 
+##     Min       1Q   Median       3Q      Max  
+## -0.7774  -0.3677  -0.3106  -0.2209   2.6879  
+## 
+## Coefficients:
+##             Estimate Std. Error z value Pr(>|z|)  
+## (Intercept)  5.08498    3.05248   1.666   0.0957 .
+## Temperature -0.11560    0.04702  -2.458   0.0140 *
+## ---
+## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
+## 
+## (Dispersion parameter for binomial family taken to be 1)
+## 
+##     Null deviance: 66.540  on 137  degrees of freedom
+## Residual deviance: 60.396  on 136  degrees of freedom
+## AIC: 64.396
+## 
+## Number of Fisher Scoring iterations: 6
+

Perfect. The estimates and the standard errors are the same as in the 1989 paper.

+
ggplot(data=data_flat, aes(y=Malfunction, x=Temperature)) +
+  geom_smooth(method = "glm", method.args = list(family = "binomial"), fullrange=T) +
+  geom_point(alpha=.5, size = .5) +
+  xlim(30,90) + ylim(0,1) + theme_bw()
+
## `geom_smooth()` using formula 'y ~ x'
+

+

Let’s check whether this confidence interval corresponds to the prediction obtained when calling directly predict. Obtaining the prediction can be done directly or through the link function.

+

Here is the “direct” (response) version I used in my very first plot:

+
pred = predict(logistic_reg_flat,list(Temperature=30),type="response",se.fit = T)
+pred
+
## $fit
+##        1 
+## 0.834373 
+## 
+## $se.fit
+##         1 
+## 0.2293304 
+## 
+## $residual.scale
+## [1] 1
+

The estimated Failure probability for 30° is thus \(0.834\). However, the \(se.fit\) value seems pretty hard to use as I can obviously not simply add \(\pm 2 se.fit\) to \(fit\) to compute a confidence interval.

+

Here is the “link” version:

+
pred_link = predict(logistic_reg_flat,list(Temperature=30), type="link", se.fit = T)
+pred_link
+
## $fit
+##        1 
+## 1.616942 
+## 
+## $se.fit
+## [1] 1.659473
+## 
+## $residual.scale
+## [1] 1
+
logistic_reg_flat$family$linkinv(pred_link$fit)
+
##        1 
+## 0.834373
+

I recover \(0.834\) for the estimated Failure probability at 30°. But now, going through the linkinv function, we can use \(se.fit\):

+
critval = 1.96
+logistic_reg_flat$family$linkinv(c(pred_link$fit-critval*pred_link$se.fit,
+                              pred_link$fit+critval*pred_link$se.fit))
+
##         1         1 
+## 0.1630612 0.9923814
+

The 95% confidence interval for our estimation is thus [0.163,0.992]. This is what ggplot2 just plotted me. This seems coherent.

+

I am now rather confident that I have managed to correctly compute and plot the uncertainty of my prediction.

+
+ + + + +
+ + + + + + + + + + + + + + + -- 2.18.1