From c2717bc84c7a8186bdb9fa880f48bcb975a011ab Mon Sep 17 00:00:00 2001 From: Pierre LACOSTE Date: Wed, 15 Dec 2021 04:11:21 +0100 Subject: [PATCH] Exo module 4 --- module4/exo.html | 596 +++++++++++++++++++++++++++++++++++++++++++++++ module4/exo.rmd | 125 ++++++++++ 2 files changed, 721 insertions(+) create mode 100644 module4/exo.html create mode 100644 module4/exo.rmd diff --git a/module4/exo.html b/module4/exo.html new file mode 100644 index 0000000..d6b4a51 --- /dev/null +++ b/module4/exo.html @@ -0,0 +1,596 @@ + + + + + + + + + + + + + + +Chalenger + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + + + + +
+

Information technique pour lesquelles l’analyse est executé:

+

Librairie ggplot2

+
library(ggplot2)
+sessionInfo()
+
## R version 4.1.1 (2021-08-10)
+## Platform: x86_64-w64-mingw32/x64 (64-bit)
+## Running under: Windows 10 x64 (build 19041)
+## 
+## Matrix products: default
+## 
+## locale:
+## [1] LC_COLLATE=French_France.1252  LC_CTYPE=French_France.1252   
+## [3] LC_MONETARY=French_France.1252 LC_NUMERIC=C                  
+## [5] LC_TIME=French_France.1252    
+## 
+## attached base packages:
+## [1] stats     graphics  grDevices utils     datasets  methods   base     
+## 
+## other attached packages:
+## [1] ggplot2_3.3.5
+## 
+## loaded via a namespace (and not attached):
+##  [1] pillar_1.6.2     compiler_4.1.1   jquerylib_0.1.4  tools_4.1.1     
+##  [5] digest_0.6.27    evaluate_0.14    lifecycle_1.0.0  tibble_3.1.4    
+##  [9] gtable_0.3.0     pkgconfig_2.0.3  rlang_0.4.11     DBI_1.1.1       
+## [13] yaml_2.2.1       xfun_0.25        fastmap_1.1.0    withr_2.4.2     
+## [17] stringr_1.4.0    dplyr_1.0.7      knitr_1.34       generics_0.1.0  
+## [21] vctrs_0.3.8      grid_4.1.1       tidyselect_1.1.1 glue_1.4.2      
+## [25] R6_2.5.1         fansi_0.5.0      rmarkdown_2.11   purrr_0.3.4     
+## [29] magrittr_2.0.1   scales_1.1.1     ellipsis_0.3.2   htmltools_0.5.2 
+## [33] assertthat_0.2.1 colorspace_2.0-2 utf8_1.2.2       stringi_1.7.4   
+## [37] munsell_0.5.0    crayon_1.4.1
+
+
+

Chargement et inspection des données

+
data = read.csv("https://app-learninglab.inria.fr/moocrr/gitlab/moocrr-session3/moocrr-reproducibility-study/raw/master/data/shuttle.csv?inline=false")
+data
+
##         Date Count Temperature Pressure Malfunction
+## 1    4/12/81     6          66       50           0
+## 2   11/12/81     6          70       50           1
+## 3    3/22/82     6          69       50           0
+## 4   11/11/82     6          68       50           0
+## 5    4/04/83     6          67       50           0
+## 6    6/18/82     6          72       50           0
+## 7    8/30/83     6          73      100           0
+## 8   11/28/83     6          70      100           0
+## 9    2/03/84     6          57      200           1
+## 10   4/06/84     6          63      200           1
+## 11   8/30/84     6          70      200           1
+## 12  10/05/84     6          78      200           0
+## 13  11/08/84     6          67      200           0
+## 14   1/24/85     6          53      200           2
+## 15   4/12/85     6          67      200           0
+## 16   4/29/85     6          75      200           0
+## 17   6/17/85     6          70      200           0
+## 18 7/2903/85     6          81      200           0
+## 19   8/27/85     6          76      200           0
+## 20  10/03/85     6          79      200           0
+## 21  10/30/85     6          75      200           2
+## 22  11/26/85     6          76      200           0
+## 23   1/12/86     6          58      200           1
+

Trion les données même si nous savons que ce n’est pas une bonne idée.

+

On commence par inspecter les données visuellement.

+
plot(data=data, Malfunction/Count ~ Temperature, ylim=c(0,1))
+

## Regression logistique

+

On assume que le seul parmètre qui influence le risque de défaillance est la température.

+

On peut donc faire une regression logistique pour estimer l’effet de la température sur le risque de défaillance.

+
logistic_reg = glm(data=data, Malfunction/Count ~ Temperature, weights=Count, family=binomial(link='logit'))
+summary(logistic_reg)
+
## 
+## Call:
+## glm(formula = Malfunction/Count ~ Temperature, family = binomial(link = "logit"), 
+##     data = data, weights = Count)
+## 
+## Deviance Residuals: 
+##      Min        1Q    Median        3Q       Max  
+## -0.95227  -0.78299  -0.54117  -0.04379   2.65152  
+## 
+## Coefficients:
+##             Estimate Std. Error z value Pr(>|z|)  
+## (Intercept)  5.08498    3.05247   1.666   0.0957 .
+## Temperature -0.11560    0.04702  -2.458   0.0140 *
+## ---
+## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
+## 
+## (Dispersion parameter for binomial family taken to be 1)
+## 
+##     Null deviance: 24.230  on 22  degrees of freedom
+## Residual deviance: 18.086  on 21  degrees of freedom
+## AIC: 35.647
+## 
+## Number of Fisher Scoring iterations: 5
+

The maximum likelyhood estimator of the intercept and of Temperature are thus \(\alpha = 5.0849\) and \(\beta = −0.1156\) and their standard errors are \(s_\alpha = 3.052\) and \(s_\beta = 0.04702\). The Residual deviance corresponds to the Goodness of fit G2 = 18.086 with 21 degrees of freedom. I have therefore managed to replicate the results of the Dalal et al. article.

+
+
+

Prediction de la probabilité de defaillance:

+
# shuttle=shuttle[shuttle$r!=0,]
+tempv = seq(from=30, to=90, by = .5)
+rmv <- predict(logistic_reg,list(Temperature=tempv),type="response")
+plot(tempv,rmv,type="l",ylim=c(0,1))
+points(data=data, Malfunction/Count ~ Temperature)
+

Figure similaire à la figure 4 de l’article.

+
+
+

Intervale de confiance de la prédiction

+
ggplot(data, aes(y=Malfunction/Count, x=Temperature)) + geom_point(alpha=.2, size = 2, color="blue") +
+geom_smooth(method = "glm", method.args = list(family = "binomial"), fullrange=T) +
+xlim(30,90) + ylim(0,1) + theme_bw()
+
## `geom_smooth()` using formula 'y ~ x'
+
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
+

+

Message d’erreur de ggplot.

+

On donne les donnée brute à ggplot

+
data_flat=data.frame()
+for(i in 1:nrow(data)) {
+temperature = data[i,"Temperature"];
+malfunction = data[i,"Malfunction"];
+d = data.frame(Temperature=temperature,Malfunction=rep(0,times = data[i,"Count"]))
+if(malfunction>0) {
+d[1:malfunction, "Malfunction"]=1;
+}
+data_flat=rbind(data_flat,d)
+}
+dim(data_flat)
+
## [1] 138   2
+
str(data_flat)
+
## 'data.frame':    138 obs. of  2 variables:
+##  $ Temperature: int  66 66 66 66 66 66 70 70 70 70 ...
+##  $ Malfunction: num  0 0 0 0 0 0 1 0 0 0 ...
+
logistic_reg_flat = glm(data=data_flat, Malfunction ~ Temperature, family=binomial(link='logit'))
+summary(logistic_reg)
+
## 
+## Call:
+## glm(formula = Malfunction/Count ~ Temperature, family = binomial(link = "logit"), 
+##     data = data, weights = Count)
+## 
+## Deviance Residuals: 
+##      Min        1Q    Median        3Q       Max  
+## -0.95227  -0.78299  -0.54117  -0.04379   2.65152  
+## 
+## Coefficients:
+##             Estimate Std. Error z value Pr(>|z|)  
+## (Intercept)  5.08498    3.05247   1.666   0.0957 .
+## Temperature -0.11560    0.04702  -2.458   0.0140 *
+## ---
+## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
+## 
+## (Dispersion parameter for binomial family taken to be 1)
+## 
+##     Null deviance: 24.230  on 22  degrees of freedom
+## Residual deviance: 18.086  on 21  degrees of freedom
+## AIC: 35.647
+## 
+## Number of Fisher Scoring iterations: 5
+
ggplot(data=data_flat, aes(y=Malfunction, x=Temperature)) +
+geom_smooth(method = "glm", method.args = list(family = "binomial"), fullrange=T) +
+geom_point(data=data, aes(y=Malfunction/Count, x=Temperature),alpha=.2, size = 2, color="blue") +
+geom_point(alpha=.5, size = .5) +
+xlim(30,90) + ylim(0,1) + theme_bw()
+
## `geom_smooth()` using formula 'y ~ x'
+

+
pred = predict(logistic_reg_flat,list(Temperature=30),type="response",se.fit = T)
+pred
+
## $fit
+##        1 
+## 0.834373 
+## 
+## $se.fit
+##         1 
+## 0.2293304 
+## 
+## $residual.scale
+## [1] 1
+
pred_link = predict(logistic_reg_flat,list(Temperature=30),type="link",se.fit = T)
+pred_link
+
## $fit
+##        1 
+## 1.616942 
+## 
+## $se.fit
+## [1] 1.659473
+## 
+## $residual.scale
+## [1] 1
+
logistic_reg$family$linkinv(pred_link$fit)
+
##        1 
+## 0.834373
+
critval = 1.96
+logistic_reg$family$linkinv(c(pred_link$fit-critval*pred_link$se.fit,
+pred_link$fit+critval*pred_link$se.fit))
+
##         1         1 
+## 0.1630612 0.9923814
+
+ + + + +
+ + + + + + + + + + + + + + + diff --git a/module4/exo.rmd b/module4/exo.rmd new file mode 100644 index 0000000..63e8823 --- /dev/null +++ b/module4/exo.rmd @@ -0,0 +1,125 @@ +--- +title: "Chalenger" +author: "Pierre Lacoste" +date: "14/12/2021" +output: html_document +--- + +```{r setup, include=FALSE} +knitr::opts_chunk$set(echo = TRUE) +``` + +## Information technique pour lesquelles l'analyse est executé: + +Librairie `ggplot2` + +```{r} +library(ggplot2) +sessionInfo() +``` + +## Chargement et inspection des données + +```{r} +data = read.csv("https://app-learninglab.inria.fr/moocrr/gitlab/moocrr-session3/moocrr-reproducibility-study/raw/master/data/shuttle.csv?inline=false") +data +``` + +Trion les données même si nous savons que ce n'est pas une bonne idée. + +On commence par inspecter les données visuellement. + +```{r} +plot(data=data, Malfunction/Count ~ Temperature, ylim=c(0,1)) +``` +## Regression logistique + +On assume que le seul parmètre qui influence le risque de défaillance est la température. + +On peut donc faire une regression logistique pour estimer l'effet de la température sur le risque de défaillance. + +```{r} +logistic_reg = glm(data=data, Malfunction/Count ~ Temperature, weights=Count, family=binomial(link='logit')) +summary(logistic_reg) + +``` + +The maximum likelyhood estimator of the intercept and of Temperature are thus $\alpha = 5.0849$ and $\beta = −0.1156$ and their standard errors are $s_\alpha = 3.052$ and $s_\beta = 0.04702$. The Residual deviance corresponds to the +Goodness of fit G2 = 18.086 with 21 degrees of freedom. I have therefore managed to replicate the +results of the Dalal et al. article. + +## Prediction de la probabilité de defaillance: + +```{r} +# shuttle=shuttle[shuttle$r!=0,] +tempv = seq(from=30, to=90, by = .5) +rmv <- predict(logistic_reg,list(Temperature=tempv),type="response") +plot(tempv,rmv,type="l",ylim=c(0,1)) +points(data=data, Malfunction/Count ~ Temperature) +``` +Figure similaire à la figure 4 de l'article. + +## Intervale de confiance de la prédiction + +```{r} +ggplot(data, aes(y=Malfunction/Count, x=Temperature)) + geom_point(alpha=.2, size = 2, color="blue") + +geom_smooth(method = "glm", method.args = list(family = "binomial"), fullrange=T) + +xlim(30,90) + ylim(0,1) + theme_bw() +``` + +Message d'erreur de ggplot. + +On donne les donnée brute à ggplot + +```{r} +data_flat=data.frame() +for(i in 1:nrow(data)) { +temperature = data[i,"Temperature"]; +malfunction = data[i,"Malfunction"]; +d = data.frame(Temperature=temperature,Malfunction=rep(0,times = data[i,"Count"])) +if(malfunction>0) { +d[1:malfunction, "Malfunction"]=1; +} +data_flat=rbind(data_flat,d) +} +dim(data_flat) +``` + +```{r} +str(data_flat) +``` +```{r} +logistic_reg_flat = glm(data=data_flat, Malfunction ~ Temperature, family=binomial(link='logit')) +summary(logistic_reg) +``` + + + +```{r} +ggplot(data=data_flat, aes(y=Malfunction, x=Temperature)) + +geom_smooth(method = "glm", method.args = list(family = "binomial"), fullrange=T) + +geom_point(data=data, aes(y=Malfunction/Count, x=Temperature),alpha=.2, size = 2, color="blue") + +geom_point(alpha=.5, size = .5) + +xlim(30,90) + ylim(0,1) + theme_bw() +``` + +```{r} +pred = predict(logistic_reg_flat,list(Temperature=30),type="response",se.fit = T) +pred +``` + +```{r} +pred_link = predict(logistic_reg_flat,list(Temperature=30),type="link",se.fit = T) +pred_link +``` + +```{r} +logistic_reg$family$linkinv(pred_link$fit) +``` + +```{r} +critval = 1.96 +logistic_reg$family$linkinv(c(pred_link$fit-critval*pred_link$se.fit, +pred_link$fit+critval*pred_link$se.fit)) +``` + -- 2.18.1