Information technique pour lesquelles l’analyse est executé:

Librairie ggplot2

library(ggplot2)
sessionInfo()
## R version 4.1.1 (2021-08-10)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 10 x64 (build 19041)
## 
## Matrix products: default
## 
## locale:
## [1] LC_COLLATE=French_France.1252  LC_CTYPE=French_France.1252   
## [3] LC_MONETARY=French_France.1252 LC_NUMERIC=C                  
## [5] LC_TIME=French_France.1252    
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] ggplot2_3.3.5
## 
## loaded via a namespace (and not attached):
##  [1] pillar_1.6.2     compiler_4.1.1   jquerylib_0.1.4  tools_4.1.1     
##  [5] digest_0.6.27    evaluate_0.14    lifecycle_1.0.0  tibble_3.1.4    
##  [9] gtable_0.3.0     pkgconfig_2.0.3  rlang_0.4.11     DBI_1.1.1       
## [13] yaml_2.2.1       xfun_0.25        fastmap_1.1.0    withr_2.4.2     
## [17] stringr_1.4.0    dplyr_1.0.7      knitr_1.34       generics_0.1.0  
## [21] vctrs_0.3.8      grid_4.1.1       tidyselect_1.1.1 glue_1.4.2      
## [25] R6_2.5.1         fansi_0.5.0      rmarkdown_2.11   purrr_0.3.4     
## [29] magrittr_2.0.1   scales_1.1.1     ellipsis_0.3.2   htmltools_0.5.2 
## [33] assertthat_0.2.1 colorspace_2.0-2 utf8_1.2.2       stringi_1.7.4   
## [37] munsell_0.5.0    crayon_1.4.1

Chargement et inspection des données

data = read.csv("https://app-learninglab.inria.fr/moocrr/gitlab/moocrr-session3/moocrr-reproducibility-study/raw/master/data/shuttle.csv?inline=false")
data
##         Date Count Temperature Pressure Malfunction
## 1    4/12/81     6          66       50           0
## 2   11/12/81     6          70       50           1
## 3    3/22/82     6          69       50           0
## 4   11/11/82     6          68       50           0
## 5    4/04/83     6          67       50           0
## 6    6/18/82     6          72       50           0
## 7    8/30/83     6          73      100           0
## 8   11/28/83     6          70      100           0
## 9    2/03/84     6          57      200           1
## 10   4/06/84     6          63      200           1
## 11   8/30/84     6          70      200           1
## 12  10/05/84     6          78      200           0
## 13  11/08/84     6          67      200           0
## 14   1/24/85     6          53      200           2
## 15   4/12/85     6          67      200           0
## 16   4/29/85     6          75      200           0
## 17   6/17/85     6          70      200           0
## 18 7/2903/85     6          81      200           0
## 19   8/27/85     6          76      200           0
## 20  10/03/85     6          79      200           0
## 21  10/30/85     6          75      200           2
## 22  11/26/85     6          76      200           0
## 23   1/12/86     6          58      200           1

Trion les données même si nous savons que ce n’est pas une bonne idée.

On commence par inspecter les données visuellement.

plot(data=data, Malfunction/Count ~ Temperature, ylim=c(0,1))

## Regression logistique

On assume que le seul parmètre qui influence le risque de défaillance est la température.

On peut donc faire une regression logistique pour estimer l’effet de la température sur le risque de défaillance.

logistic_reg = glm(data=data, Malfunction/Count ~ Temperature, weights=Count, family=binomial(link='logit'))
summary(logistic_reg)
## 
## Call:
## glm(formula = Malfunction/Count ~ Temperature, family = binomial(link = "logit"), 
##     data = data, weights = Count)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -0.95227  -0.78299  -0.54117  -0.04379   2.65152  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)  
## (Intercept)  5.08498    3.05247   1.666   0.0957 .
## Temperature -0.11560    0.04702  -2.458   0.0140 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 24.230  on 22  degrees of freedom
## Residual deviance: 18.086  on 21  degrees of freedom
## AIC: 35.647
## 
## Number of Fisher Scoring iterations: 5

The maximum likelyhood estimator of the intercept and of Temperature are thus \(\alpha = 5.0849\) and \(\beta = −0.1156\) and their standard errors are \(s_\alpha = 3.052\) and \(s_\beta = 0.04702\). The Residual deviance corresponds to the Goodness of fit G2 = 18.086 with 21 degrees of freedom. I have therefore managed to replicate the results of the Dalal et al. article.

Prediction de la probabilité de defaillance:

# shuttle=shuttle[shuttle$r!=0,]
tempv = seq(from=30, to=90, by = .5)
rmv <- predict(logistic_reg,list(Temperature=tempv),type="response")
plot(tempv,rmv,type="l",ylim=c(0,1))
points(data=data, Malfunction/Count ~ Temperature)

Figure similaire à la figure 4 de l’article.

Intervale de confiance de la prédiction

ggplot(data, aes(y=Malfunction/Count, x=Temperature)) + geom_point(alpha=.2, size = 2, color="blue") +
geom_smooth(method = "glm", method.args = list(family = "binomial"), fullrange=T) +
xlim(30,90) + ylim(0,1) + theme_bw()
## `geom_smooth()` using formula 'y ~ x'
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!

Message d’erreur de ggplot.

On donne les donnée brute à ggplot

data_flat=data.frame()
for(i in 1:nrow(data)) {
temperature = data[i,"Temperature"];
malfunction = data[i,"Malfunction"];
d = data.frame(Temperature=temperature,Malfunction=rep(0,times = data[i,"Count"]))
if(malfunction>0) {
d[1:malfunction, "Malfunction"]=1;
}
data_flat=rbind(data_flat,d)
}
dim(data_flat)
## [1] 138   2
str(data_flat)
## 'data.frame':    138 obs. of  2 variables:
##  $ Temperature: int  66 66 66 66 66 66 70 70 70 70 ...
##  $ Malfunction: num  0 0 0 0 0 0 1 0 0 0 ...
logistic_reg_flat = glm(data=data_flat, Malfunction ~ Temperature, family=binomial(link='logit'))
summary(logistic_reg)
## 
## Call:
## glm(formula = Malfunction/Count ~ Temperature, family = binomial(link = "logit"), 
##     data = data, weights = Count)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -0.95227  -0.78299  -0.54117  -0.04379   2.65152  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)  
## (Intercept)  5.08498    3.05247   1.666   0.0957 .
## Temperature -0.11560    0.04702  -2.458   0.0140 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 24.230  on 22  degrees of freedom
## Residual deviance: 18.086  on 21  degrees of freedom
## AIC: 35.647
## 
## Number of Fisher Scoring iterations: 5
ggplot(data=data_flat, aes(y=Malfunction, x=Temperature)) +
geom_smooth(method = "glm", method.args = list(family = "binomial"), fullrange=T) +
geom_point(data=data, aes(y=Malfunction/Count, x=Temperature),alpha=.2, size = 2, color="blue") +
geom_point(alpha=.5, size = .5) +
xlim(30,90) + ylim(0,1) + theme_bw()
## `geom_smooth()` using formula 'y ~ x'

pred = predict(logistic_reg_flat,list(Temperature=30),type="response",se.fit = T)
pred
## $fit
##        1 
## 0.834373 
## 
## $se.fit
##         1 
## 0.2293304 
## 
## $residual.scale
## [1] 1
pred_link = predict(logistic_reg_flat,list(Temperature=30),type="link",se.fit = T)
pred_link
## $fit
##        1 
## 1.616942 
## 
## $se.fit
## [1] 1.659473
## 
## $residual.scale
## [1] 1
logistic_reg$family$linkinv(pred_link$fit)
##        1 
## 0.834373
critval = 1.96
logistic_reg$family$linkinv(c(pred_link$fit-critval*pred_link$se.fit,
pred_link$fit+critval*pred_link$se.fit))
##         1         1 
## 0.1630612 0.9923814