diff --git a/module4/exo.html b/module4/exo.html new file mode 100644 index 0000000000000000000000000000000000000000..d6b4a5115092f25e9ce7854807a761e336a23ace --- /dev/null +++ b/module4/exo.html @@ -0,0 +1,596 @@ + + + + +
+ + + + + + + + + +Librairie ggplot2
library(ggplot2)
+sessionInfo()
+## R version 4.1.1 (2021-08-10)
+## Platform: x86_64-w64-mingw32/x64 (64-bit)
+## Running under: Windows 10 x64 (build 19041)
+##
+## Matrix products: default
+##
+## locale:
+## [1] LC_COLLATE=French_France.1252 LC_CTYPE=French_France.1252
+## [3] LC_MONETARY=French_France.1252 LC_NUMERIC=C
+## [5] LC_TIME=French_France.1252
+##
+## attached base packages:
+## [1] stats graphics grDevices utils datasets methods base
+##
+## other attached packages:
+## [1] ggplot2_3.3.5
+##
+## loaded via a namespace (and not attached):
+## [1] pillar_1.6.2 compiler_4.1.1 jquerylib_0.1.4 tools_4.1.1
+## [5] digest_0.6.27 evaluate_0.14 lifecycle_1.0.0 tibble_3.1.4
+## [9] gtable_0.3.0 pkgconfig_2.0.3 rlang_0.4.11 DBI_1.1.1
+## [13] yaml_2.2.1 xfun_0.25 fastmap_1.1.0 withr_2.4.2
+## [17] stringr_1.4.0 dplyr_1.0.7 knitr_1.34 generics_0.1.0
+## [21] vctrs_0.3.8 grid_4.1.1 tidyselect_1.1.1 glue_1.4.2
+## [25] R6_2.5.1 fansi_0.5.0 rmarkdown_2.11 purrr_0.3.4
+## [29] magrittr_2.0.1 scales_1.1.1 ellipsis_0.3.2 htmltools_0.5.2
+## [33] assertthat_0.2.1 colorspace_2.0-2 utf8_1.2.2 stringi_1.7.4
+## [37] munsell_0.5.0 crayon_1.4.1
+data = read.csv("https://app-learninglab.inria.fr/moocrr/gitlab/moocrr-session3/moocrr-reproducibility-study/raw/master/data/shuttle.csv?inline=false")
+data
+## Date Count Temperature Pressure Malfunction
+## 1 4/12/81 6 66 50 0
+## 2 11/12/81 6 70 50 1
+## 3 3/22/82 6 69 50 0
+## 4 11/11/82 6 68 50 0
+## 5 4/04/83 6 67 50 0
+## 6 6/18/82 6 72 50 0
+## 7 8/30/83 6 73 100 0
+## 8 11/28/83 6 70 100 0
+## 9 2/03/84 6 57 200 1
+## 10 4/06/84 6 63 200 1
+## 11 8/30/84 6 70 200 1
+## 12 10/05/84 6 78 200 0
+## 13 11/08/84 6 67 200 0
+## 14 1/24/85 6 53 200 2
+## 15 4/12/85 6 67 200 0
+## 16 4/29/85 6 75 200 0
+## 17 6/17/85 6 70 200 0
+## 18 7/2903/85 6 81 200 0
+## 19 8/27/85 6 76 200 0
+## 20 10/03/85 6 79 200 0
+## 21 10/30/85 6 75 200 2
+## 22 11/26/85 6 76 200 0
+## 23 1/12/86 6 58 200 1
+Trion les données même si nous savons que ce n’est pas une bonne idée.
+On commence par inspecter les données visuellement.
+plot(data=data, Malfunction/Count ~ Temperature, ylim=c(0,1))
+ ## Regression logistique
On assume que le seul parmètre qui influence le risque de défaillance est la température.
+On peut donc faire une regression logistique pour estimer l’effet de la température sur le risque de défaillance.
+logistic_reg = glm(data=data, Malfunction/Count ~ Temperature, weights=Count, family=binomial(link='logit'))
+summary(logistic_reg)
+##
+## Call:
+## glm(formula = Malfunction/Count ~ Temperature, family = binomial(link = "logit"),
+## data = data, weights = Count)
+##
+## Deviance Residuals:
+## Min 1Q Median 3Q Max
+## -0.95227 -0.78299 -0.54117 -0.04379 2.65152
+##
+## Coefficients:
+## Estimate Std. Error z value Pr(>|z|)
+## (Intercept) 5.08498 3.05247 1.666 0.0957 .
+## Temperature -0.11560 0.04702 -2.458 0.0140 *
+## ---
+## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
+##
+## (Dispersion parameter for binomial family taken to be 1)
+##
+## Null deviance: 24.230 on 22 degrees of freedom
+## Residual deviance: 18.086 on 21 degrees of freedom
+## AIC: 35.647
+##
+## Number of Fisher Scoring iterations: 5
+The maximum likelyhood estimator of the intercept and of Temperature are thus \(\alpha = 5.0849\) and \(\beta = −0.1156\) and their standard errors are \(s_\alpha = 3.052\) and \(s_\beta = 0.04702\). The Residual deviance corresponds to the Goodness of fit G2 = 18.086 with 21 degrees of freedom. I have therefore managed to replicate the results of the Dalal et al. article.
+# shuttle=shuttle[shuttle$r!=0,]
+tempv = seq(from=30, to=90, by = .5)
+rmv <- predict(logistic_reg,list(Temperature=tempv),type="response")
+plot(tempv,rmv,type="l",ylim=c(0,1))
+points(data=data, Malfunction/Count ~ Temperature)
+ Figure similaire à la figure 4 de l’article.
ggplot(data, aes(y=Malfunction/Count, x=Temperature)) + geom_point(alpha=.2, size = 2, color="blue") +
+geom_smooth(method = "glm", method.args = list(family = "binomial"), fullrange=T) +
+xlim(30,90) + ylim(0,1) + theme_bw()
+## `geom_smooth()` using formula 'y ~ x'
+## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
+Message d’erreur de ggplot.
+On donne les donnée brute à ggplot
+data_flat=data.frame()
+for(i in 1:nrow(data)) {
+temperature = data[i,"Temperature"];
+malfunction = data[i,"Malfunction"];
+d = data.frame(Temperature=temperature,Malfunction=rep(0,times = data[i,"Count"]))
+if(malfunction>0) {
+d[1:malfunction, "Malfunction"]=1;
+}
+data_flat=rbind(data_flat,d)
+}
+dim(data_flat)
+## [1] 138 2
+str(data_flat)
+## 'data.frame': 138 obs. of 2 variables:
+## $ Temperature: int 66 66 66 66 66 66 70 70 70 70 ...
+## $ Malfunction: num 0 0 0 0 0 0 1 0 0 0 ...
+logistic_reg_flat = glm(data=data_flat, Malfunction ~ Temperature, family=binomial(link='logit'))
+summary(logistic_reg)
+##
+## Call:
+## glm(formula = Malfunction/Count ~ Temperature, family = binomial(link = "logit"),
+## data = data, weights = Count)
+##
+## Deviance Residuals:
+## Min 1Q Median 3Q Max
+## -0.95227 -0.78299 -0.54117 -0.04379 2.65152
+##
+## Coefficients:
+## Estimate Std. Error z value Pr(>|z|)
+## (Intercept) 5.08498 3.05247 1.666 0.0957 .
+## Temperature -0.11560 0.04702 -2.458 0.0140 *
+## ---
+## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
+##
+## (Dispersion parameter for binomial family taken to be 1)
+##
+## Null deviance: 24.230 on 22 degrees of freedom
+## Residual deviance: 18.086 on 21 degrees of freedom
+## AIC: 35.647
+##
+## Number of Fisher Scoring iterations: 5
+ggplot(data=data_flat, aes(y=Malfunction, x=Temperature)) +
+geom_smooth(method = "glm", method.args = list(family = "binomial"), fullrange=T) +
+geom_point(data=data, aes(y=Malfunction/Count, x=Temperature),alpha=.2, size = 2, color="blue") +
+geom_point(alpha=.5, size = .5) +
+xlim(30,90) + ylim(0,1) + theme_bw()
+## `geom_smooth()` using formula 'y ~ x'
+pred = predict(logistic_reg_flat,list(Temperature=30),type="response",se.fit = T)
+pred
+## $fit
+## 1
+## 0.834373
+##
+## $se.fit
+## 1
+## 0.2293304
+##
+## $residual.scale
+## [1] 1
+pred_link = predict(logistic_reg_flat,list(Temperature=30),type="link",se.fit = T)
+pred_link
+## $fit
+## 1
+## 1.616942
+##
+## $se.fit
+## [1] 1.659473
+##
+## $residual.scale
+## [1] 1
+logistic_reg$family$linkinv(pred_link$fit)
+## 1
+## 0.834373
+critval = 1.96
+logistic_reg$family$linkinv(c(pred_link$fit-critval*pred_link$se.fit,
+pred_link$fit+critval*pred_link$se.fit))
+## 1 1
+## 0.1630612 0.9923814
+