Environnement de travail

library(readr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
sessionInfo()
## R version 3.5.1 (2018-07-02)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 7 x64 (build 7601) Service Pack 1
## 
## Matrix products: default
## 
## locale:
## [1] LC_COLLATE=French_France.1252  LC_CTYPE=French_France.1252   
## [3] LC_MONETARY=French_France.1252 LC_NUMERIC=C                  
## [5] LC_TIME=French_France.1252    
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] ggplot2_3.0.0 dplyr_0.7.6   readr_1.1.1  
## 
## loaded via a namespace (and not attached):
##  [1] Rcpp_0.12.18     knitr_1.20       bindr_0.1.1      magrittr_1.5    
##  [5] hms_0.4.2        munsell_0.5.0    tidyselect_0.2.4 colorspace_1.3-2
##  [9] R6_2.2.2         rlang_0.2.2      plyr_1.8.4       stringr_1.3.1   
## [13] tools_3.5.1      grid_3.5.1       gtable_0.2.0     withr_2.1.2     
## [17] htmltools_0.3.6  lazyeval_0.2.1   yaml_2.2.0       rprojroot_1.3-2 
## [21] digest_0.6.16    assertthat_0.2.0 tibble_1.4.2     crayon_1.3.4    
## [25] bindrcpp_0.2.2   purrr_0.2.5      glue_1.3.0       evaluate_0.11   
## [29] rmarkdown_1.10   stringi_1.1.7    compiler_3.5.1   pillar_1.3.0    
## [33] scales_1.0.0     backports_1.1.2  pkgconfig_2.0.2

Data

df <- read_csv("data/shuttle.csv")
## Parsed with column specification:
## cols(
##   Date = col_character(),
##   Count = col_integer(),
##   Temperature = col_integer(),
##   Pressure = col_integer(),
##   Malfunction = col_integer()
## )
head(df)
## # A tibble: 6 x 5
##   Date     Count Temperature Pressure Malfunction
##   <chr>    <int>       <int>    <int>       <int>
## 1 4/12/81      6          66       50           0
## 2 11/12/81     6          70       50           1
## 3 3/22/82      6          69       50           0
## 4 11/11/82     6          68       50           0
## 5 4/04/83      6          67       50           0
## 6 6/18/82      6          72       50           0

Analyse primaire

summary(df)
##      Date               Count    Temperature       Pressure    
##  Length:23          Min.   :6   Min.   :53.00   Min.   : 50.0  
##  Class :character   1st Qu.:6   1st Qu.:67.00   1st Qu.: 75.0  
##  Mode  :character   Median :6   Median :70.00   Median :200.0  
##                     Mean   :6   Mean   :69.57   Mean   :152.2  
##                     3rd Qu.:6   3rd Qu.:75.00   3rd Qu.:200.0  
##                     Max.   :6   Max.   :81.00   Max.   :200.0  
##   Malfunction    
##  Min.   :0.0000  
##  1st Qu.:0.0000  
##  Median :0.0000  
##  Mean   :0.3913  
##  3rd Qu.:1.0000  
##  Max.   :2.0000
ggplot(df) + geom_point(aes(x=Temperature,y=Malfunction/Count))

ggplot(df) + geom_point(aes(x=Pressure,y=Malfunction/Count))

Régression logistique

logreg <- glm( Malfunction/Count ~ Temperature, data = df,family = binomial(link="logit"),weights=Count)
summary(logreg)
## 
## Call:
## glm(formula = Malfunction/Count ~ Temperature, family = binomial(link = "logit"), 
##     data = df, weights = Count)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -0.95227  -0.78299  -0.54117  -0.04379   2.65152  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)  
## (Intercept)  5.08498    3.05247   1.666   0.0957 .
## Temperature -0.11560    0.04702  -2.458   0.0140 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 24.230  on 22  degrees of freedom
## Residual deviance: 18.086  on 21  degrees of freedom
## AIC: 35.647
## 
## Number of Fisher Scoring iterations: 5

Prévision

temp <- data.frame(Temperature = seq(30,90,0.5))
pred <- predict(logreg,newdata= temp,type="response")
ggplot() + 
  geom_point(aes(x=df$Temperature,y=df$Malfunction/df$Count)) + 
  geom_line(aes(x=temp$Temperature,y=pred))  +
  xlab("Temperature") + 
  ylab("Malfunction/Count") + 
  theme_bw()

## Intervalle de prévision

pred_se <- predict(logreg,newdata= temp,type="link",se.fit=TRUE) #type=link est le défaut
ICinf <- logreg$family$linkinv(pred_se$fit-1.96*pred_se$se.fit)
ICsup <- logreg$family$linkinv(pred_se$fit+1.96*pred_se$se.fit)
ggplot() + 
  geom_point(aes(x=df$Temperature,y=df$Malfunction/df$Count)) + 
  geom_line(aes(x=temp$Temperature,y=pred)) +
  geom_line(aes(x=temp$Temperature,y=ICinf),col='red') +
  geom_line(aes(x=temp$Temperature,y=ICsup),col='red') +
  xlab("Temperature") + 
  ylab("Malfunction/Count") + 
  theme_bw()