From c20f962a1bc431b23f2b4dcd913963afd11b3acc Mon Sep 17 00:00:00 2001
From: 2f72555a2613970ebb90637b2e23f177
 <2f72555a2613970ebb90637b2e23f177@app-learninglab.inria.fr>
Date: Tue, 3 Sep 2024 08:39:13 +0000
Subject: [PATCH] Upload exercice 4_1 done with RStudio

---
 module4/exercices_module4_1.Rmd | 120 ++++++++++++++++++++++++++++++++
 1 file changed, 120 insertions(+)
 create mode 100644 module4/exercices_module4_1.Rmd

diff --git a/module4/exercices_module4_1.Rmd b/module4/exercices_module4_1.Rmd
new file mode 100644
index 0000000..9eb9c6b
--- /dev/null
+++ b/module4/exercices_module4_1.Rmd
@@ -0,0 +1,120 @@
+---
+title: "exercices_module4_1"
+author: "Jerome Riera"
+date: "2024-08-31"
+output:
+  pdf_document: default
+  html_document: default
+---
+
+```{r setup, include=FALSE}
+knitr::opts_chunk$set(echo = TRUE)
+```
+## R version
+```{r}
+library(ggplot2)
+sessionInfo()
+```
+## Installed libraries
+
+```{r}
+library(usethis)
+library(devtools)
+devtools :: session_info()
+```
+## Loading data
+
+```{r}
+data_full<-read.csv("/Users/jeromeriera/Library/Mobile Documents/com~apple~CloudDocs/Thèse/MOOC RR/mooc rr/module2/exo5/shuttle.csv", header = TRUE)
+```
+
+### Data inspection
+
+```{r}
+plot(data=data_full, Malfunction/Count ~Temperature, ylim=c(0,1))
+```
+
+### Logistic regression
+
+```{r}
+logistic_reg=glm(data=data_full, Malfunction/Count ~ Temperature, weights=Count, family = binomial(link = "logit"))
+summary(logistic_reg)
+              
+```
+**Results of Dalal *et al.* have successfully been replicated here.**
+
+### Predicitng failure probability
+
+```{r}
+# shuttle=shuttle[shuttle$r!=0,]
+tempv=seq(from=30, to=90, by=.5)
+rmv <- predict(logistic_reg, list(Temperature=tempv), type='response')
+plot(tempv, rmv, type = 'l', ylim=c(0,1))
+points(data=data_full, Malfunction/Count ~Temperature)
+```
+
+**Figure 4 od Dalal *et al* have successfully been replicated.**
+
+### Confidence of the prediction
+
+```{r}
+ggplot(data_full, aes(y=Malfunction/Count, x=Temperature)) + geom_point(alpha=.2, size = 2, color='blue') + geom_smooth(method='glm', method.args = list(family='binomial'), fullrange=T) + xlim(30,90) + ylim(0,1) + theme_bw()
+```
+Same warning as in the example. Let's provide the raw data to ggplot2
+```{r}
+data_flat=data.frame()
+for(i in 1:nrow(data_full)) {
+  temperature=data_full[i, 'Temperature'];
+  malfunction=data_full[i,'Malfunction'];
+  d = data.frame(Temperature=temperature, Malfunction=rep(0, times=data_full[i,'Count']))
+  if(malfunction>0) {
+    d[1:malfunction, 'Malfunction']=1;
+  }
+  data_flat=rbind(data_flat,d)
+}
+dim(data_flat)
+```
+
+```{r}
+str(data_flat)
+```
+
+computation of the regression
+```{r}
+logistic_reg_flat=glm(data = data_flat, Malfunction~Temperature, family=binomial(link = 'logit'))
+summary(logistic_reg_flat)
+```
+Residual deviance is different now. Let's plot it again
+
+```{r}
+
+ggplot(data = data_flat, aes(y=Malfunction, x=Temperature))+
+  geom_smooth(method = 'glm', method.args=list(family='binomial'), fullrange = T)+
+  geom_point(data=data_full, aes(y=Malfunction/Count, x=Temperature), alpha =.2, size=2, color='blue')+
+  geom_point(alpha=.5, size=.5)+
+  xlim(30,90)+ylim(0,1)+theme_bw()
+```
+Comparison with the direct prediction
+```{r}
+pred=predict(logistic_reg_flat, list(Temperature=30), type = 'response', se.fit = T)
+pred
+```
+
+
+```{r}
+pred_link=predict(logistic_reg_flat, list(Temperature=30), type = 'link', se.fit = T)
+pred_link
+```
+```{r}
+logistic_reg$family$linkinv(pred_link$fit)
+```
+
+```{r}
+critval=1.96
+logistic_reg$family$linkinv(c(pred_link$fit-critval*pred_link$se.fit,
+                              pred_link$fit+critval*pred_link$se.fit))
+```
+
+This confident interval correspond to what we can observe with ggplot2.
+
+**Very interesting exercise for me. It overwhelmed my statistical and R knowledge a bit,  but I managed to learn a lot while trying to understand what has been done here **
\ No newline at end of file
-- 
2.18.1