#+TITLE: The London cholera epidemic of 1854 #+AUTHOR: LT #+LANGUAGE: en #+HTML_HEAD: #+HTML_HEAD: #+HTML_HEAD: #+HTML_HEAD: #+HTML_HEAD: #+HTML_HEAD: #+LATEX_HEADER: \usepackage{a4} #+LATEX_HEADER: \usepackage[french]{babel} #+PROPERTY: header-args :session :exports both :eval never-export # FIXME! https://towardsdatascience.com/plotting-a-map-of-london-crime-data-using-r-8dcefef1c397 * Data recovery and pre-processing Let's start by loading a number of libraries which we will be useful later. #+begin_src R :results output :session *R* :exports both library(ggplot2) library(stringr) library(dplyr) library(tidyr) #+end_src #+RESULTS: #+begin_example Package attachment : ‘dplyr’ The following objects are masked from ‘package:stats’: filter, lag The following objects are masked from ‘package:base’: intersect, setdiff, setequal, union #+end_example Now download the data on the Cholera epidemic of 1854. #+begin_src R :results output :session *R* :exports both cholera_file = "./cholera.csv"; url = "https://fusiontables.google.com/exporttable?query=select+*+from+1HsIb_r4gYYmIz8y_UE1h-X8yUtAYW2INy99BR_c&o=csv" if(!file.exists(cholera_file)) { download.file(url, cholera_file) } cholera = read.csv(cholera_file, strip.white=T) head(cholera) str(cholera) #+end_src #+RESULTS: #+begin_example essai de l'URL 'https://fusiontables.google.com/exporttable?query=select+*+from+1HsIb_r4gYYmIz8y_UE1h-X8yUtAYW2INy99BR_c&o=csv' downloaded 16 KB count geometry 1 3 -0.13793,51.513418 2 2 -0.137883,51.513361 3 1 -0.137853,51.513317 4 1 -0.137812,51.513262 5 4 -0.137767,51.513204 6 2 -0.137537,51.513184 'data.frame': 250 obs. of 2 variables: $ count : int 3 2 1 1 4 2 2 2 3 2 ... $ geometry: Factor w/ 250 levels "-0.132933,51.513258",..: 202 199 197 195 193 185 213 206 217 214 ... #+end_example #+begin_src R :results output :session *R* :exports both pump_file = "./water_pump.csv"; url = "https://fusiontables.google.com/exporttable?query=select+*+from+1Al4BQYgV0BzoyEaWmTcCRgLMkOJkiBo8QUQT_tU&o=csv" if(!file.exists(pump_file)) { download.file(url, pump_file) } pump = read.csv(pump_file, strip.white=T) head(pump) str(pump) #+end_src #+RESULTS: #+begin_example essai de l'URL 'https://fusiontables.google.com/exporttable?query=select+*+from+1Al4BQYgV0BzoyEaWmTcCRgLMkOJkiBo8QUQT_tU&o=csv' Content type 'application/x-download; charset=UTF-8' length 520 bytes ================================================== downloaded 520 bytes geometry 1 -0.136668,51.513341 2 -0.139586,51.513876 3 -0.139671,51.514906 4 -0.13163,51.512354 5 -0.133594,51.512139 6 -0.135919,51.511542 'data.frame': 8 obs. of 1 variable: $ geometry: Factor w/ 8 levels "-0.13163,51.512354",..: 5 7 8 1 2 4 3 6 #+end_example These coordinates in the form of a character string are not very practical. Let's extract the latitudes and longitudes. #+begin_src R :results output :session *R* :exports both cholera$geometry %>% str_replace_all("[A-Za-z]*","") %>% str_split(",") %>% do.call("rbind",.) %>% as.numeric() -> cholera[,c("lon","lat")] cholera %>% select(-geometry) -> cholera # the do.call tricks was inspired by http://ggorjan.blogspot.com/2011/01/converting-strsplit-output-to-dataframe.html head(cholera) str(cholera) #+end_src #+RESULTS: #+begin_example count lon lat 1 3 -0.137930 51.51342 2 2 -0.137883 51.51336 3 1 -0.137853 51.51332 4 1 -0.137812 51.51326 5 4 -0.137767 51.51320 6 2 -0.137537 51.51318 'data.frame': 250 obs. of 3 variables: $ count: int 3 2 1 1 4 2 2 2 3 2 ... $ lon : num -0.138 -0.138 -0.138 -0.138 -0.138 ... $ lat : num 51.5 51.5 51.5 51.5 51.5 ... #+end_example #+begin_src R :results output :session *R* :exports both pump$geometry %>% str_replace_all("[A-Za-z]*","") %>% str_split(",") %>% do.call("rbind",.) %>% as.numeric() -> pump[,c("lon","lat")] pump %>% select(-geometry) -> pump # the do.call tricks was inspired by http://ggorjan.blogspot.com/2011/01/converting-strsplit-output-to-dataframe.html head(pump) str(pump) #+end_src #+RESULTS: #+begin_example lon lat 1 -0.136668 51.51334 2 -0.139586 51.51388 3 -0.139671 51.51491 4 -0.131630 51.51235 5 -0.133594 51.51214 6 -0.135919 51.51154 'data.frame': 8 obs. of 2 variables: $ lon: num -0.137 -0.14 -0.14 -0.132 -0.134 ... $ lat: num 51.5 51.5 51.5 51.5 51.5 ... #+end_example On the other hand, later, some algorithms (density estimation, / clustering /) that I will use do not manage well the fact that an entry includes several deaths. So I'm going to prepare a "flattened" version of ~ cholera ~. For this, I am inspired by the following entry on [[https://stackoverflow.com/questions/46841463/expand-a-data-frame-to-have-as-many-rows-as-range-of-two-columns-in-original-row[[stackoverflow ]]. #+begin_src R :results output :session *R* :exports both cholera %>% rowwise() %>% do(data.frame(lon= .$lon, lat= .$lat, value = 1:.$count)) %>% as.data.frame() -> cholera_flat head(cholera_flat ) #+end_src #+RESULTS: : lon lat value : 1 -0.137930 51.51342 1 : 2 -0.137930 51.51342 2 : 3 -0.137930 51.51342 3 : 4 -0.137883 51.51336 1 : 5 -0.137883 51.51336 2 : 6 -0.137853 51.51332 1 * Visualisations From there, it's easy to represent the previous data: #+begin_src R :results output graphics :file "map1.png" :exports both :width 600 :height 600 :session *R* ggplot(cholera,aes(x=lon,y=lat)) + geom_point(aes(size=count),color="red3") + geom_point(data=pump, color="blue", size=4) + theme_bw() + coord_fixed() #+end_src #+RESULTS: [[file:map1.png]] ** With a map of London (current) But to have a real map, it is better to use the right tool: [[https://journal.r-project.org/archive/2013-1/kahle-wickham.pdf[[ggmap]]. So I read this documentation well to improve things. I will center my map on the median of the previous coordinates. #+begin_src R :results output :session *R* :exports both library(ggmap) mapImageData <- get_map(location = c(lon = median(cholera$lon), lat = median(cholera$lat)), color = "color", source = "google", maptype = "toner", zoom = 16) LondonMap = ggmap(mapImageData, extent = "device", ylab = "Latitude", xlab = "Longitude") #+end_src #+RESULTS: #+begin_example Google Maps API Terms of Service: http://developers.google.com/maps/terms. Please cite ggmap if you use it: see citation('ggmap') for details. maptype = "toner" is only available with source = "stamen". resetting to source = "stamen"... Map from URL : http://maps.googleapis.com/maps/api/staticmap?center=51.513379,-0.136288&zoom=16&size=640x640&scale=2&maptype=terrain&sensor=false Map from URL : http://tile.stamen.com/toner/16/32741/21789.png Map from URL : http://tile.stamen.com/toner/16/32742/21789.png Map from URL : http://tile.stamen.com/toner/16/32743/21789.png Map from URL : http://tile.stamen.com/toner/16/32744/21789.png Map from URL : http://tile.stamen.com/toner/16/32741/21790.png Map from URL : http://tile.stamen.com/toner/16/32742/21790.png Map from URL : http://tile.stamen.com/toner/16/32743/21790.png Map from URL : http://tile.stamen.com/toner/16/32744/21790.png Map from URL : http://tile.stamen.com/toner/16/32741/21791.png Map from URL : http://tile.stamen.com/toner/16/32742/21791.png Map from URL : http://tile.stamen.com/toner/16/32743/21791.png Map from URL : http://tile.stamen.com/toner/16/32744/21791.png Warning message: `panel.margin` is deprecated. Please use `panel.spacing` property instead #+end_example Let’s go. As before, in red the cases of Cholera and in blue the sources of water. #+begin_src R :results output graphics :file :file "map2.png" :exports both :width 800 :height 800 :session *R* LondonMap + geom_point(data=cholera,aes(x=lon,y=lat, size=count),color="red3") + geom_point(data=pump, color="blue",size=4) + theme_bw() #+end_src #+RESULTS: [[file:map2.png]] ** Clustering L'histoire, dit qu'une des dames qui a contracté le Choléra faisait un peu office d'/outlier/ car elle n'habitait pas à proximité de la pompe infectée. Peut-être est-ce le petit point le plus en haut à droite ? Peut-être qu'un algorithme de clustering aurait été capable d'aider à le repérer ? Essayons de faire ça bien avec une bibliothèque faisant de l'estimation par maximum de vraisemblance de mixtures de gaussiennes. Pour ça, je m'inspire des [[https://www.statmethods.net/advstats/cluster.html][commandes données ici]]. #+begin_src R :results output :session *R* :exports both library(mclust) fit <- Mclust(as.matrix(cholera_flat[,c("lon","lat")]),verbose=F) #+end_src #+RESULTS: : __ ___________ __ _____________ : / |/ / ____/ / / / / / ___/_ __/ : / /|_/ / / / / / / / /\__ \ / / : / / / / /___/ /___/ /_/ /___/ // / : /_/ /_/\____/_____/\____//____//_/ version 5.4.1 : Type 'citation("mclust")' for citing this R package in publications. #+begin_src R :results output graphics :file "clustering_BIC.png" :exports both :width 600 :height 400 :session *R* plot(fit,what = c("BIC")) #+end_src #+RESULTS: [[file:clustering_BIC.png]] Ouh là, je ne trouve pas que ça soit bon signe, un BIC qui monte comme ça sans maximum bien clair. #+begin_src R :results output :session *R* :exports both summary(fit) #+end_src #+RESULTS: #+begin_example ---------------------------------------------------- Gaussian finite mixture model fitted by EM algorithm ---------------------------------------------------- Mclust VEV (ellipsoidal, equal shape) model with 7 components: log.likelihood n df BIC ICL 5523.087 489 35 10829.44 10693.22 Clustering table: 1 2 3 4 5 6 7 121 119 26 73 83 23 44 #+end_example Let's take a look at what the most likely classification looks like. #+begin_src R :results output graphics :file "clustering.png" :exports both :width 600 :height 400 :session *R* plot(fit,what = c("classification")) #+end_src #+RESULTS: [[file:clustering.png]] Bof, it's a little anything but hey, let's try anyway to use this information on the previous card: #+begin_src R :results output graphics :file "map3.png" :exports both :width 800 :height 800 :session *R* cholera_flat$cluster = as.factor(fit$classification) cluster = as.data.frame(t(fit$parameters$mean)) cluster$id = 1:nrow(cluster) cholera_clustered = cholera_flat %>% group_by(lon,lat,cluster) %>% summarize(count=n()) outlier = cholera[cholera$lat == max(cholera$lat),] LondonMap + geom_point(data=cholera_clustered, aes(x=lon, y=lat, size=count, color=cluster)) + stat_ellipse(data=cholera_flat,aes(x=lon, y=lat, fill=cluster, group=cluster), type = "norm",geom = "polygon", alpha = .3) + geom_label(data = cluster, aes(x=lon, y=lat, label=id)) + geom_label(data = outlier, aes(x=lon, y=lat+0.00012), fill="red", label="outlier",alpha=.3) + scale_colour_brewer(palette="Set1") + scale_fill_brewer(palette="Set1") + theme_bw() #+end_src #+RESULTS: [[file:map3.png]] ** Estimation the density #+begin_src R :results output graphics :file "map4.png" :exports both :width 800 :height 800 :session *R* LondonMap + stat_density2d(data=cholera_flat, aes(x = lon, y = lat, fill = ..level.., alpha = ..level..), size = 2, bins = 4, geom = "polygon", fill="red" ) + geom_point(data=cholera,aes(x=lon,y=lat, size=count),color="red3") + geom_point(data=pump, color="blue",size=4) + geom_label(data = outlier, aes(x=lon, y=lat+0.00012), fill="red", label="outlier",alpha=.3) + theme_bw() #+end_src #+RESULTS: [[file:map4.png]] #+begin_src R :results output graphics :file "map5.png" :exports both :width 800 :height 800 :session *R* LondonMap + stat_density2d(data=cholera, aes(x = lon, y = lat, fill = ..level.., alpha = ..level..), size = 2, bins = 4, geom = "polygon", fill="red" ) + stat_ellipse(data=cholera,aes(x=lon, y=lat), type = "norm",geom = "polygon", alpha = .3) + geom_point(data=cholera,aes(x=lon,y=lat, size=count),color="red3") + geom_point(data=pump, color="blue",size=4) + geom_label(data = outlier, aes(x=lon, y=lat+0.00012), fill="red", label="outlier",alpha=.3) + theme_bw() #+end_src #+RESULTS: [[file:map5.png]]