From c20f962a1bc431b23f2b4dcd913963afd11b3acc Mon Sep 17 00:00:00 2001 From: 2f72555a2613970ebb90637b2e23f177 <2f72555a2613970ebb90637b2e23f177@app-learninglab.inria.fr> Date: Tue, 3 Sep 2024 08:39:13 +0000 Subject: [PATCH] Upload exercice 4_1 done with RStudio --- module4/exercices_module4_1.Rmd | 120 ++++++++++++++++++++++++++++++++ 1 file changed, 120 insertions(+) create mode 100644 module4/exercices_module4_1.Rmd diff --git a/module4/exercices_module4_1.Rmd b/module4/exercices_module4_1.Rmd new file mode 100644 index 0000000..9eb9c6b --- /dev/null +++ b/module4/exercices_module4_1.Rmd @@ -0,0 +1,120 @@ +--- +title: "exercices_module4_1" +author: "Jerome Riera" +date: "2024-08-31" +output: + pdf_document: default + html_document: default +--- + +```{r setup, include=FALSE} +knitr::opts_chunk$set(echo = TRUE) +``` +## R version +```{r} +library(ggplot2) +sessionInfo() +``` +## Installed libraries + +```{r} +library(usethis) +library(devtools) +devtools :: session_info() +``` +## Loading data + +```{r} +data_full<-read.csv("/Users/jeromeriera/Library/Mobile Documents/com~apple~CloudDocs/Thè€se/MOOC RR/mooc rr/module2/exo5/shuttle.csv", header = TRUE) +``` + +### Data inspection + +```{r} +plot(data=data_full, Malfunction/Count ~Temperature, ylim=c(0,1)) +``` + +### Logistic regression + +```{r} +logistic_reg=glm(data=data_full, Malfunction/Count ~ Temperature, weights=Count, family = binomial(link = "logit")) +summary(logistic_reg) + +``` +**Results of Dalal *et al.* have successfully been replicated here.** + +### Predicitng failure probability + +```{r} +# shuttle=shuttle[shuttle$r!=0,] +tempv=seq(from=30, to=90, by=.5) +rmv <- predict(logistic_reg, list(Temperature=tempv), type='response') +plot(tempv, rmv, type = 'l', ylim=c(0,1)) +points(data=data_full, Malfunction/Count ~Temperature) +``` + +**Figure 4 od Dalal *et al* have successfully been replicated.** + +### Confidence of the prediction + +```{r} +ggplot(data_full, aes(y=Malfunction/Count, x=Temperature)) + geom_point(alpha=.2, size = 2, color='blue') + geom_smooth(method='glm', method.args = list(family='binomial'), fullrange=T) + xlim(30,90) + ylim(0,1) + theme_bw() +``` +Same warning as in the example. Let's provide the raw data to ggplot2 +```{r} +data_flat=data.frame() +for(i in 1:nrow(data_full)) { + temperature=data_full[i, 'Temperature']; + malfunction=data_full[i,'Malfunction']; + d = data.frame(Temperature=temperature, Malfunction=rep(0, times=data_full[i,'Count'])) + if(malfunction>0) { + d[1:malfunction, 'Malfunction']=1; + } + data_flat=rbind(data_flat,d) +} +dim(data_flat) +``` + +```{r} +str(data_flat) +``` + +computation of the regression +```{r} +logistic_reg_flat=glm(data = data_flat, Malfunction~Temperature, family=binomial(link = 'logit')) +summary(logistic_reg_flat) +``` +Residual deviance is different now. Let's plot it again + +```{r} + +ggplot(data = data_flat, aes(y=Malfunction, x=Temperature))+ + geom_smooth(method = 'glm', method.args=list(family='binomial'), fullrange = T)+ + geom_point(data=data_full, aes(y=Malfunction/Count, x=Temperature), alpha =.2, size=2, color='blue')+ + geom_point(alpha=.5, size=.5)+ + xlim(30,90)+ylim(0,1)+theme_bw() +``` +Comparison with the direct prediction +```{r} +pred=predict(logistic_reg_flat, list(Temperature=30), type = 'response', se.fit = T) +pred +``` + + +```{r} +pred_link=predict(logistic_reg_flat, list(Temperature=30), type = 'link', se.fit = T) +pred_link +``` +```{r} +logistic_reg$family$linkinv(pred_link$fit) +``` + +```{r} +critval=1.96 +logistic_reg$family$linkinv(c(pred_link$fit-critval*pred_link$se.fit, + pred_link$fit+critval*pred_link$se.fit)) +``` + +This confident interval correspond to what we can observe with ggplot2. + +**Very interesting exercise for me. It overwhelmed my statistical and R knowledge a bit, but I managed to learn a lot while trying to understand what has been done here ** \ No newline at end of file -- 2.18.1