Librairie ggplot2
library(ggplot2)
sessionInfo()
## R version 4.1.1 (2021-08-10)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 10 x64 (build 19041)
##
## Matrix products: default
##
## locale:
## [1] LC_COLLATE=French_France.1252 LC_CTYPE=French_France.1252
## [3] LC_MONETARY=French_France.1252 LC_NUMERIC=C
## [5] LC_TIME=French_France.1252
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] ggplot2_3.3.5
##
## loaded via a namespace (and not attached):
## [1] pillar_1.6.2 compiler_4.1.1 jquerylib_0.1.4 tools_4.1.1
## [5] digest_0.6.27 evaluate_0.14 lifecycle_1.0.0 tibble_3.1.4
## [9] gtable_0.3.0 pkgconfig_2.0.3 rlang_0.4.11 DBI_1.1.1
## [13] yaml_2.2.1 xfun_0.25 fastmap_1.1.0 withr_2.4.2
## [17] stringr_1.4.0 dplyr_1.0.7 knitr_1.34 generics_0.1.0
## [21] vctrs_0.3.8 grid_4.1.1 tidyselect_1.1.1 glue_1.4.2
## [25] R6_2.5.1 fansi_0.5.0 rmarkdown_2.11 purrr_0.3.4
## [29] magrittr_2.0.1 scales_1.1.1 ellipsis_0.3.2 htmltools_0.5.2
## [33] assertthat_0.2.1 colorspace_2.0-2 utf8_1.2.2 stringi_1.7.4
## [37] munsell_0.5.0 crayon_1.4.1
data = read.csv("https://app-learninglab.inria.fr/moocrr/gitlab/moocrr-session3/moocrr-reproducibility-study/raw/master/data/shuttle.csv?inline=false")
data
## Date Count Temperature Pressure Malfunction
## 1 4/12/81 6 66 50 0
## 2 11/12/81 6 70 50 1
## 3 3/22/82 6 69 50 0
## 4 11/11/82 6 68 50 0
## 5 4/04/83 6 67 50 0
## 6 6/18/82 6 72 50 0
## 7 8/30/83 6 73 100 0
## 8 11/28/83 6 70 100 0
## 9 2/03/84 6 57 200 1
## 10 4/06/84 6 63 200 1
## 11 8/30/84 6 70 200 1
## 12 10/05/84 6 78 200 0
## 13 11/08/84 6 67 200 0
## 14 1/24/85 6 53 200 2
## 15 4/12/85 6 67 200 0
## 16 4/29/85 6 75 200 0
## 17 6/17/85 6 70 200 0
## 18 7/2903/85 6 81 200 0
## 19 8/27/85 6 76 200 0
## 20 10/03/85 6 79 200 0
## 21 10/30/85 6 75 200 2
## 22 11/26/85 6 76 200 0
## 23 1/12/86 6 58 200 1
Trion les données même si nous savons que ce n’est pas une bonne idée.
On commence par inspecter les données visuellement.
plot(data=data, Malfunction/Count ~ Temperature, ylim=c(0,1))
## Regression logistique
On assume que le seul parmètre qui influence le risque de défaillance est la température.
On peut donc faire une regression logistique pour estimer l’effet de la température sur le risque de défaillance.
logistic_reg = glm(data=data, Malfunction/Count ~ Temperature, weights=Count, family=binomial(link='logit'))
summary(logistic_reg)
##
## Call:
## glm(formula = Malfunction/Count ~ Temperature, family = binomial(link = "logit"),
## data = data, weights = Count)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.95227 -0.78299 -0.54117 -0.04379 2.65152
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 5.08498 3.05247 1.666 0.0957 .
## Temperature -0.11560 0.04702 -2.458 0.0140 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 24.230 on 22 degrees of freedom
## Residual deviance: 18.086 on 21 degrees of freedom
## AIC: 35.647
##
## Number of Fisher Scoring iterations: 5
The maximum likelyhood estimator of the intercept and of Temperature are thus \(\alpha = 5.0849\) and \(\beta = −0.1156\) and their standard errors are \(s_\alpha = 3.052\) and \(s_\beta = 0.04702\). The Residual deviance corresponds to the Goodness of fit G2 = 18.086 with 21 degrees of freedom. I have therefore managed to replicate the results of the Dalal et al. article.
# shuttle=shuttle[shuttle$r!=0,]
tempv = seq(from=30, to=90, by = .5)
rmv <- predict(logistic_reg,list(Temperature=tempv),type="response")
plot(tempv,rmv,type="l",ylim=c(0,1))
points(data=data, Malfunction/Count ~ Temperature)
Figure similaire à la figure 4 de l’article.
ggplot(data, aes(y=Malfunction/Count, x=Temperature)) + geom_point(alpha=.2, size = 2, color="blue") +
geom_smooth(method = "glm", method.args = list(family = "binomial"), fullrange=T) +
xlim(30,90) + ylim(0,1) + theme_bw()
## `geom_smooth()` using formula 'y ~ x'
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
Message d’erreur de ggplot.
On donne les donnée brute à ggplot
data_flat=data.frame()
for(i in 1:nrow(data)) {
temperature = data[i,"Temperature"];
malfunction = data[i,"Malfunction"];
d = data.frame(Temperature=temperature,Malfunction=rep(0,times = data[i,"Count"]))
if(malfunction>0) {
d[1:malfunction, "Malfunction"]=1;
}
data_flat=rbind(data_flat,d)
}
dim(data_flat)
## [1] 138 2
str(data_flat)
## 'data.frame': 138 obs. of 2 variables:
## $ Temperature: int 66 66 66 66 66 66 70 70 70 70 ...
## $ Malfunction: num 0 0 0 0 0 0 1 0 0 0 ...
logistic_reg_flat = glm(data=data_flat, Malfunction ~ Temperature, family=binomial(link='logit'))
summary(logistic_reg)
##
## Call:
## glm(formula = Malfunction/Count ~ Temperature, family = binomial(link = "logit"),
## data = data, weights = Count)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.95227 -0.78299 -0.54117 -0.04379 2.65152
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 5.08498 3.05247 1.666 0.0957 .
## Temperature -0.11560 0.04702 -2.458 0.0140 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 24.230 on 22 degrees of freedom
## Residual deviance: 18.086 on 21 degrees of freedom
## AIC: 35.647
##
## Number of Fisher Scoring iterations: 5
ggplot(data=data_flat, aes(y=Malfunction, x=Temperature)) +
geom_smooth(method = "glm", method.args = list(family = "binomial"), fullrange=T) +
geom_point(data=data, aes(y=Malfunction/Count, x=Temperature),alpha=.2, size = 2, color="blue") +
geom_point(alpha=.5, size = .5) +
xlim(30,90) + ylim(0,1) + theme_bw()
## `geom_smooth()` using formula 'y ~ x'
pred = predict(logistic_reg_flat,list(Temperature=30),type="response",se.fit = T)
pred
## $fit
## 1
## 0.834373
##
## $se.fit
## 1
## 0.2293304
##
## $residual.scale
## [1] 1
pred_link = predict(logistic_reg_flat,list(Temperature=30),type="link",se.fit = T)
pred_link
## $fit
## 1
## 1.616942
##
## $se.fit
## [1] 1.659473
##
## $residual.scale
## [1] 1
logistic_reg$family$linkinv(pred_link$fit)
## 1
## 0.834373
critval = 1.96
logistic_reg$family$linkinv(c(pred_link$fit-critval*pred_link$se.fit,
pred_link$fit+critval*pred_link$se.fit))
## 1 1
## 0.1630612 0.9923814