Upload New File

f622f1ed · eb949c511572e0ec1c8c386b09da7ff8 · 3ee03c2a · f622f1ed
Commit f622f1ed authored Mar 19, 2020 by eb949c511572e0ec1c8c386b09da7ff8
Show whitespace changes
Inline Side-by-side

Showing with 387 additions and 0 deletions

cholera.org module3/exo3/cholera.org +387 -0

No files found.
--- a/module3/exo3/cholera.org
+++ b/module3/exo3/cholera.org
+#+TITLE: Étude de l'épidémie de choléra à Londres en 1854
+#+AUTHOR: Arnaud Legrand
+#+LANGUAGE: fr
+
+#+HTML_HEAD: <link rel="stylesheet" type="text/css" href="http://www.pirilampo.org/styles/readtheorg/css/htmlize.css"/>
+#+HTML_HEAD: <link rel="stylesheet" type="text/css" href="http://www.pirilampo.org/styles/readtheorg/css/readtheorg.css"/>
+#+HTML_HEAD: <script src="https://ajax.googleapis.com/ajax/libs/jquery/2.1.3/jquery.min.js"></script>
+#+HTML_HEAD: <script src="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.4/js/bootstrap.min.js"></script>
+#+HTML_HEAD: <script type="text/javascript" src="http://www.pirilampo.org/styles/lib/js/jquery.stickytableheaders.js"></script>
+#+HTML_HEAD: <script type="text/javascript" src="http://www.pirilampo.org/styles/readtheorg/js/readtheorg.js"></script>
+
+#+LATEX_HEADER: \usepackage{a4}
+#+LATEX_HEADER: \usepackage[french]{babel}
+
+#+PROPERTY: header-args  :session  :exports both  :eval never-export
+
+# FIXME! https://towardsdatascience.com/plotting-a-map-of-london-crime-data-using-r-8dcefef1c397
+
+* Récupération et pré-traitement des données
+Commençons par charger un certain nombre de bibliothèques qui nous
+seront utiles par la suite.
+#+begin_src R :results output :session *R* :exports both
+library(ggplot2)
+library(stringr)
+library(dplyr)
+library(tidyr)
+#+end_src
+
+#+RESULTS:
+#+begin_example
+
+Attachement du package : ‘dplyr’
+
+The following objects are masked from ‘package:stats’:
+
+    filter, lag
+
+The following objects are masked from ‘package:base’:
+
+    intersect, setdiff, setequal, union
+#+end_example
+
+Maintenant, téléchargeons les données sur l'épidémie de Choléra
+de 1854.
+
+#+begin_src R :results output :session *R* :exports both
+cholera_file = "./cholera.csv";
+url = "https://fusiontables.google.com/exporttable?query=select+*+from+1HsIb_r4gYYmIz8y_UE1h-X8yUtAYW2INy99BR_c&o=csv"
+if(!file.exists(cholera_file)) { download.file(url, cholera_file) }
+cholera = read.csv(cholera_file, strip.white=T)
+head(cholera)
+str(cholera)
+#+end_src
+
+#+RESULTS:
+#+begin_example
+essai de l'URL 'https://fusiontables.google.com/exporttable?query=select+*+from+1HsIb_r4gYYmIz8y_UE1h-X8yUtAYW2INy99BR_c&o=csv'
+downloaded 16 KB
+  count                                                      geometry
+1     3  <Point><coordinates>-0.13793,51.513418</coordinates></Point>
+2     2 <Point><coordinates>-0.137883,51.513361</coordinates></Point>
+3     1 <Point><coordinates>-0.137853,51.513317</coordinates></Point>
+4     1 <Point><coordinates>-0.137812,51.513262</coordinates></Point>
+5     4 <Point><coordinates>-0.137767,51.513204</coordinates></Point>
+6     2 <Point><coordinates>-0.137537,51.513184</coordinates></Point>
+'data.frame':	250 obs. of  2 variables:
+ $ count   : int  3 2 1 1 4 2 2 2 3 2 ...
+ $ geometry: Factor w/ 250 levels "<Point><coordinates>-0.132933,51.513258</coordinates></Point>",..: 202 199 197 195 193 185 213 206 217 214 ...
+#+end_example
+
+
+#+begin_src R :results output :session *R* :exports both
+pump_file = "./water_pump.csv";
+url = "https://fusiontables.google.com/exporttable?query=select+*+from+1Al4BQYgV0BzoyEaWmTcCRgLMkOJkiBo8QUQT_tU&o=csv"
+if(!file.exists(pump_file)) { download.file(url, pump_file) }
+pump = read.csv(pump_file, strip.white=T)
+head(pump)
+str(pump)
+#+end_src
+
+#+RESULTS:
+#+begin_example
+essai de l'URL 'https://fusiontables.google.com/exporttable?query=select+*+from+1Al4BQYgV0BzoyEaWmTcCRgLMkOJkiBo8QUQT_tU&o=csv'
+Content type 'application/x-download; charset=UTF-8' length 520 bytes
+==================================================
+downloaded 520 bytes
+                                                       geometry
+1 <Point><coordinates>-0.136668,51.513341</coordinates></Point>
+2 <Point><coordinates>-0.139586,51.513876</coordinates></Point>
+3 <Point><coordinates>-0.139671,51.514906</coordinates></Point>
+4  <Point><coordinates>-0.13163,51.512354</coordinates></Point>
+5 <Point><coordinates>-0.133594,51.512139</coordinates></Point>
+6 <Point><coordinates>-0.135919,51.511542</coordinates></Point>
+'data.frame':	8 obs. of  1 variable:
+ $ geometry: Factor w/ 8 levels "<Point><coordinates>-0.13163,51.512354</coordinates></Point>",..: 5 7 8 1 2 4 3 6
+#+end_example
+
+
+
+
+Ces coordonnées sous forme d'une chaîne de caractères ne sont pas bien
+pratiques. Extrayons les latitudes et longitudes.
+
+#+begin_src R :results output :session *R* :exports both
+cholera$geometry %>% 
+    str_replace_all("[</>A-Za-z]*","") %>%
+    str_split(",") %>% do.call("rbind",.) %>% as.numeric() -> cholera[,c("lon","lat")]
+cholera %>% select(-geometry) -> cholera
+# the do.call tricks was inspired by http://ggorjan.blogspot.com/2011/01/converting-strsplit-output-to-dataframe.html
+head(cholera)
+str(cholera)
+#+end_src
+
+#+RESULTS:
+#+begin_example
+  count       lon      lat
+1     3 -0.137930 51.51342
+2     2 -0.137883 51.51336
+3     1 -0.137853 51.51332
+4     1 -0.137812 51.51326
+5     4 -0.137767 51.51320
+6     2 -0.137537 51.51318
+'data.frame':	250 obs. of  3 variables:
+ $ count: int  3 2 1 1 4 2 2 2 3 2 ...
+ $ lon  : num  -0.138 -0.138 -0.138 -0.138 -0.138 ...
+ $ lat  : num  51.5 51.5 51.5 51.5 51.5 ...
+#+end_example
+
+Parfait! Faisons la même chose pour les données sur les pompes.
+
+#+begin_src R :results output :session *R* :exports both
+pump$geometry %>% 
+    str_replace_all("[</>A-Za-z]*","") %>%
+    str_split(",") %>% do.call("rbind",.) %>% as.numeric() -> pump[,c("lon","lat")]
+pump %>% select(-geometry) -> pump
+# the do.call tricks was inspired by http://ggorjan.blogspot.com/2011/01/converting-strsplit-output-to-dataframe.html
+head(pump)
+str(pump)
+#+end_src
+
+#+RESULTS:
+#+begin_example
+        lon      lat
+1 -0.136668 51.51334
+2 -0.139586 51.51388
+3 -0.139671 51.51491
+4 -0.131630 51.51235
+5 -0.133594 51.51214
+6 -0.135919 51.51154
+'data.frame':	8 obs. of  2 variables:
+ $ lon: num  -0.137 -0.14 -0.14 -0.132 -0.134 ...
+ $ lat: num  51.5 51.5 51.5 51.5 51.5 ...
+#+end_example
+
+D'autre part, plus tard, certains algorithmes (estimation de densité,
+/clustering/) que je vais utiliser gèrent mal le fait qu'une entrée
+regroupe plusieurs décès. Je vais donc me préparer une version
+"aplatie" de ~cholera~. Pour ça, je m'inspire de l'entrée suivante sur
+[[https://stackoverflow.com/questions/46841463/expand-a-data-frame-to-have-as-many-rows-as-range-of-two-columns-in-original-row][stackoverflow]].
+
+#+begin_src R :results output :session *R* :exports both
+cholera %>% 
+   rowwise() %>% 
+   do(data.frame(lon= .$lon, lat= .$lat, value = 1:.$count)) %>%
+   as.data.frame() -> cholera_flat
+head(cholera_flat )
+#+end_src
+
+#+RESULTS:
+:         lon      lat value
+: 1 -0.137930 51.51342     1
+: 2 -0.137930 51.51342     2
+: 3 -0.137930 51.51342     3
+: 4 -0.137883 51.51336     1
+: 5 -0.137883 51.51336     2
+: 6 -0.137853 51.51332     1
+
+* Visualisations
+** Une visualisation toute simple
+À partir de là, il est facile de représenter les données précédentes:
+#+begin_src R :results output graphics :file "map1.png" :exports both :width 600 :height 600 :session *R* 
+ggplot(cholera,aes(x=lon,y=lat)) + 
+    geom_point(aes(size=count),color="red3") +
+    geom_point(data=pump, color="blue", size=4) + 
+    theme_bw() + coord_fixed()
+#+end_src
+
+#+RESULTS:
+[[file:map1.png]]
+
+** Avec une carte de Londres (actuelle)
+ Mais pour avoir une vrai carte, il vaut mieux utiliser le bon outil:
+ [[https://journal.r-project.org/archive/2013-1/kahle-wickham.pdf][ggmap]]. J'ai donc bien lu cette documentation pour améliorer les
+ choses. Je vais centrer ma carte sur la médiane des coordonnées
+ précédentes.
+
+#+begin_src R :results output :session *R* :exports both
+library(ggmap)
+mapImageData <- get_map(location = c(lon = median(cholera$lon), lat = median(cholera$lat)),
+    color = "color",
+    source = "google",
+    maptype = "toner",
+    zoom = 16)
+
+LondonMap = ggmap(mapImageData,
+    extent = "device",
+    ylab = "Latitude",
+    xlab = "Longitude") 
+#+end_src
+
+#+RESULTS:
+#+begin_example
+Google Maps API Terms of Service: http://developers.google.com/maps/terms.
+Please cite ggmap if you use it: see citation('ggmap') for details.
+maptype = "toner" is only available with source = "stamen".
+resetting to source = "stamen"...
+Map from URL : http://maps.googleapis.com/maps/api/staticmap?center=51.513379,-0.136288&zoom=16&size=640x640&scale=2&maptype=terrain&sensor=false
+Map from URL : http://tile.stamen.com/toner/16/32741/21789.png
+Map from URL : http://tile.stamen.com/toner/16/32742/21789.png
+Map from URL : http://tile.stamen.com/toner/16/32743/21789.png
+Map from URL : http://tile.stamen.com/toner/16/32744/21789.png
+Map from URL : http://tile.stamen.com/toner/16/32741/21790.png
+Map from URL : http://tile.stamen.com/toner/16/32742/21790.png
+Map from URL : http://tile.stamen.com/toner/16/32743/21790.png
+Map from URL : http://tile.stamen.com/toner/16/32744/21790.png
+Map from URL : http://tile.stamen.com/toner/16/32741/21791.png
+Map from URL : http://tile.stamen.com/toner/16/32742/21791.png
+Map from URL : http://tile.stamen.com/toner/16/32743/21791.png
+Map from URL : http://tile.stamen.com/toner/16/32744/21791.png
+Warning message:
+`panel.margin` is deprecated. Please use `panel.spacing` property instead
+#+end_example
+
+Allons-y. Comme avant, en rouge les cas de Choléra et en bleu les
+sources d'eau.
+#+begin_src R :results output graphics :file :file "map2.png" :exports both :width 800 :height 800 :session *R* 
+LondonMap + 
+    geom_point(data=cholera,aes(x=lon,y=lat, size=count),color="red3") +
+    geom_point(data=pump, color="blue",size=4) + theme_bw()
+#+end_src
+
+#+RESULTS:
+[[file:map2.png]]
+** Clustering
+L'histoire, dit qu'une des dames qui a contracté le Choléra faisait un
+peu office d'/outlier/ car elle n'habitait pas à proximité de la pompe
+infectée. Peut-être est-ce le petit point le plus en haut à droite ?
+Peut-être qu'un algorithme de clustering aurait été capable d'aider à
+le repérer ? Essayons de faire ça bien avec une bibliothèque faisant
+de l'estimation par maximum de vraisemblance de mixtures de
+gaussiennes. Pour ça, je m'inspire des [[https://www.statmethods.net/advstats/cluster.html][commandes données ici]].
+
+#+begin_src R :results output :session *R* :exports both
+library(mclust)
+fit <- Mclust(as.matrix(cholera_flat[,c("lon","lat")]),verbose=F)
+#+end_src
+
+#+RESULTS:
+:     __  ___________    __  _____________
+:    /  |/  / ____/ /   / / / / ___/_  __/
+:   / /|_/ / /   / /   / / / /\__ \ / /   
+:  / /  / / /___/ /___/ /_/ /___/ // /    
+: /_/  /_/\____/_____/\____//____//_/    version 5.4.1
+: Type 'citation("mclust")' for citing this R package in publications.
+
+#+begin_src R :results output graphics :file "clustering_BIC.png" :exports both :width 600 :height 400 :session *R* 
+plot(fit,what = c("BIC"))
+#+end_src
+
+#+RESULTS:
+[[file:clustering_BIC.png]]
+
+Ouh là, je ne trouve pas que ça soit bon signe, un BIC qui monte comme
+ça sans maximum bien clair.
+
+#+begin_src R :results output :session *R* :exports both
+summary(fit)
+#+end_src
+
+#+RESULTS:
+#+begin_example
+---------------------------------------------------- 
+Gaussian finite mixture model fitted by EM algorithm 
+---------------------------------------------------- 
+
+Mclust VEV (ellipsoidal, equal shape) model with 7 components: 
+
+ log.likelihood   n df      BIC      ICL
+       5523.087 489 35 10829.44 10693.22
+
+Clustering table:
+  1   2   3   4   5   6   7 
+121 119  26  73  83  23  44
+#+end_example
+
+Regardons à quoi ressemble la classification la plus vraisemblable.
+#+begin_src R :results output graphics :file "clustering.png" :exports both :width 600 :height 400 :session *R* 
+plot(fit,what = c("classification"))
+#+end_src
+
+#+RESULTS:
+[[file:clustering.png]]
+
+Bof, c'est un peu n'importe quoi mais bon, essayons quand même
+d'utiliser cette information sur la carte précédente:
+#+begin_src R :results output graphics :file "map3.png" :exports both :width 800 :height 800 :session *R* 
+cholera_flat$cluster = as.factor(fit$classification)
+
+cluster = as.data.frame(t(fit$parameters$mean))
+cluster$id = 1:nrow(cluster)
+
+cholera_clustered = cholera_flat %>% group_by(lon,lat,cluster) %>% summarize(count=n())
+
+outlier = cholera[cholera$lat == max(cholera$lat),]
+
+LondonMap + 
+    geom_point(data=cholera_clustered,
+               aes(x=lon, y=lat, size=count, color=cluster)) +
+    stat_ellipse(data=cholera_flat,aes(x=lon, y=lat, fill=cluster, group=cluster),
+                 type = "norm",geom = "polygon", alpha = .3) +
+    geom_label(data = cluster, aes(x=lon, y=lat, label=id)) + 
+    geom_label(data = outlier, aes(x=lon, y=lat+0.00012), fill="red", label="outlier",alpha=.3) + 
+    scale_colour_brewer(palette="Set1") + 
+    scale_fill_brewer(palette="Set1") + theme_bw()
+#+end_src
+
+#+RESULTS:
+[[file:map3.png]]
+
+Bon, c'est "joli". Mon outlier est effectivement le seul à ne pas être
+dans les ellipses qui rassemblent 95% de la densité des
+gaussiennes. De là à dire que c'est satisfaisant...
+
+** Estimation de densité
+Essayons plutôt d'estimer la densité de déces.
+#+begin_src R :results output graphics :file "map4.png" :exports both :width 800 :height 800 :session *R* 
+LondonMap + 
+    stat_density2d(data=cholera_flat,
+                   aes(x = lon, y = lat, fill = ..level.., alpha = ..level..),
+                   size = 2, bins = 4, 
+                   geom = "polygon",
+                   fill="red"
+                   ) +
+    geom_point(data=cholera,aes(x=lon,y=lat, size=count),color="red3") +
+    geom_point(data=pump, color="blue",size=4) + 
+    geom_label(data = outlier, aes(x=lon, y=lat+0.00012), fill="red", label="outlier",alpha=.3) + 
+    theme_bw()
+#+end_src
+
+#+RESULTS:
+[[file:map4.png]]
+
+** Et en travaillant sur les foyers plutôt que sur les décès ?
+En réappliquant l'algorithme de clustering sur les données "non
+applaties", on travaille sur l'estimation des foyées plutôt que sur
+l'estimation de décès, ce qui peut avoir du sens. On trouve alors
+uniquement une évolution du BIC bien plus franche avec deux
+clusters. Je vais donc refaire la carte précédente en estimant le
+centre des foyers.
+
+#+begin_src R :results output graphics :file "map5.png" :exports both :width 800 :height 800 :session *R* 
+LondonMap + 
+    stat_density2d(data=cholera,
+                   aes(x = lon, y = lat, fill = ..level.., alpha = ..level..),
+                   size = 2, bins = 4, 
+                   geom = "polygon",
+                   fill="red"
+                   ) +
+    stat_ellipse(data=cholera,aes(x=lon, y=lat),
+                 type = "norm",geom = "polygon", alpha = .3) +
+    geom_point(data=cholera,aes(x=lon,y=lat, size=count),color="red3") +
+    geom_point(data=pump, color="blue",size=4) + 
+    geom_label(data = outlier, aes(x=lon, y=lat+0.00012), fill="red", label="outlier",alpha=.3) + 
+    theme_bw()
+#+end_src
+
+#+RESULTS:
+[[file:map5.png]]
+
+Ah effectivement, là, c'est impressionant. Le centre des foyers
+coïncide quasi parfaitement avec l'emplacement de la source d'eau
+infectée. Notre /outlier/ en haut à droite ressort effectivement
+bien. Bon, en sachant ce qu'il fallait trouver et à force de
+tripatouiller, c'est pas absurde que je finisse par arriver à donner
+l'impression que "ça marche". À l'ère du Big Data, c'est amusant mais
+on comprend quand même les limites de ces approches "élaborées" mais
+qui ignorent l'histoire qui est derrière ces données