#+TITLE: The London cholera epidemic of 1854
#+AUTHOR: LT
#+LANGUAGE: en
#+HTML_HEAD:
#+HTML_HEAD:
#+HTML_HEAD:
#+HTML_HEAD:
#+HTML_HEAD:
#+HTML_HEAD:
#+LATEX_HEADER: \usepackage{a4}
#+LATEX_HEADER: \usepackage[french]{babel}
#+PROPERTY: header-args :session :exports both :eval never-export
# FIXME! https://towardsdatascience.com/plotting-a-map-of-london-crime-data-using-r-8dcefef1c397
* Data recovery and pre-processing
Let's start by loading a number of libraries which we
will be useful later.
#+begin_src R :results output :session *R* :exports both
library(ggplot2)
library(stringr)
library(dplyr)
library(tidyr)
#+end_src
#+RESULTS:
#+begin_example
Package attachment : ‘dplyr’
The following objects are masked from ‘package:stats’:
filter, lag
The following objects are masked from ‘package:base’:
intersect, setdiff, setequal, union
#+end_example
Now download the data on the Cholera epidemic of 1854.
#+begin_src R :results output :session *R* :exports both
cholera_file = "./cholera.csv";
url = "https://fusiontables.google.com/exporttable?query=select+*+from+1HsIb_r4gYYmIz8y_UE1h-X8yUtAYW2INy99BR_c&o=csv"
if(!file.exists(cholera_file)) { download.file(url, cholera_file) }
cholera = read.csv(cholera_file, strip.white=T)
head(cholera)
str(cholera)
#+end_src
#+RESULTS:
#+begin_example
essai de l'URL 'https://fusiontables.google.com/exporttable?query=select+*+from+1HsIb_r4gYYmIz8y_UE1h-X8yUtAYW2INy99BR_c&o=csv'
downloaded 16 KB
count geometry
1 3 -0.13793,51.513418
2 2 -0.137883,51.513361
3 1 -0.137853,51.513317
4 1 -0.137812,51.513262
5 4 -0.137767,51.513204
6 2 -0.137537,51.513184
'data.frame': 250 obs. of 2 variables:
$ count : int 3 2 1 1 4 2 2 2 3 2 ...
$ geometry: Factor w/ 250 levels "-0.132933,51.513258",..: 202 199 197 195 193 185 213 206 217 214 ...
#+end_example
#+begin_src R :results output :session *R* :exports both
pump_file = "./water_pump.csv";
url = "https://fusiontables.google.com/exporttable?query=select+*+from+1Al4BQYgV0BzoyEaWmTcCRgLMkOJkiBo8QUQT_tU&o=csv"
if(!file.exists(pump_file)) { download.file(url, pump_file) }
pump = read.csv(pump_file, strip.white=T)
head(pump)
str(pump)
#+end_src
#+RESULTS:
#+begin_example
essai de l'URL 'https://fusiontables.google.com/exporttable?query=select+*+from+1Al4BQYgV0BzoyEaWmTcCRgLMkOJkiBo8QUQT_tU&o=csv'
Content type 'application/x-download; charset=UTF-8' length 520 bytes
==================================================
downloaded 520 bytes
geometry
1 -0.136668,51.513341
2 -0.139586,51.513876
3 -0.139671,51.514906
4 -0.13163,51.512354
5 -0.133594,51.512139
6 -0.135919,51.511542
'data.frame': 8 obs. of 1 variable:
$ geometry: Factor w/ 8 levels "-0.13163,51.512354",..: 5 7 8 1 2 4 3 6
#+end_example
These coordinates in the form of a character string are not very practical. Let's extract the latitudes and longitudes.
#+begin_src R :results output :session *R* :exports both
cholera$geometry %>%
str_replace_all("[>A-Za-z]*","") %>%
str_split(",") %>% do.call("rbind",.) %>% as.numeric() -> cholera[,c("lon","lat")]
cholera %>% select(-geometry) -> cholera
# the do.call tricks was inspired by http://ggorjan.blogspot.com/2011/01/converting-strsplit-output-to-dataframe.html
head(cholera)
str(cholera)
#+end_src
#+RESULTS:
#+begin_example
count lon lat
1 3 -0.137930 51.51342
2 2 -0.137883 51.51336
3 1 -0.137853 51.51332
4 1 -0.137812 51.51326
5 4 -0.137767 51.51320
6 2 -0.137537 51.51318
'data.frame': 250 obs. of 3 variables:
$ count: int 3 2 1 1 4 2 2 2 3 2 ...
$ lon : num -0.138 -0.138 -0.138 -0.138 -0.138 ...
$ lat : num 51.5 51.5 51.5 51.5 51.5 ...
#+end_example
#+begin_src R :results output :session *R* :exports both
pump$geometry %>%
str_replace_all("[>A-Za-z]*","") %>%
str_split(",") %>% do.call("rbind",.) %>% as.numeric() -> pump[,c("lon","lat")]
pump %>% select(-geometry) -> pump
# the do.call tricks was inspired by http://ggorjan.blogspot.com/2011/01/converting-strsplit-output-to-dataframe.html
head(pump)
str(pump)
#+end_src
#+RESULTS:
#+begin_example
lon lat
1 -0.136668 51.51334
2 -0.139586 51.51388
3 -0.139671 51.51491
4 -0.131630 51.51235
5 -0.133594 51.51214
6 -0.135919 51.51154
'data.frame': 8 obs. of 2 variables:
$ lon: num -0.137 -0.14 -0.14 -0.132 -0.134 ...
$ lat: num 51.5 51.5 51.5 51.5 51.5 ...
#+end_example
On the other hand, later, some algorithms (density estimation, / clustering /) that I will use do not manage well the fact that an entry includes several deaths. So I'm going to prepare a "flattened" version of ~ cholera ~. For this, I am inspired by the following entry on
[[https://stackoverflow.com/questions/46841463/expand-a-data-frame-to-have-as-many-rows-as-range-of-two-columns-in-original-row[[stackoverflow ]].
#+begin_src R :results output :session *R* :exports both
cholera %>%
rowwise() %>%
do(data.frame(lon= .$lon, lat= .$lat, value = 1:.$count)) %>%
as.data.frame() -> cholera_flat
head(cholera_flat )
#+end_src
#+RESULTS:
: lon lat value
: 1 -0.137930 51.51342 1
: 2 -0.137930 51.51342 2
: 3 -0.137930 51.51342 3
: 4 -0.137883 51.51336 1
: 5 -0.137883 51.51336 2
: 6 -0.137853 51.51332 1
* Visualisations
From there, it's easy to represent the previous data:
#+begin_src R :results output graphics :file "map1.png" :exports both :width 600 :height 600 :session *R*
ggplot(cholera,aes(x=lon,y=lat)) +
geom_point(aes(size=count),color="red3") +
geom_point(data=pump, color="blue", size=4) +
theme_bw() + coord_fixed()
#+end_src
#+RESULTS:
[[file:map1.png]]
** With a map of London (current)
But to have a real map, it is better to use the right tool: [[https://journal.r-project.org/archive/2013-1/kahle-wickham.pdf[[ggmap]]. So I read this documentation well to improve things. I will center my map on the median of the previous coordinates.
#+begin_src R :results output :session *R* :exports both
library(ggmap)
mapImageData <- get_map(location = c(lon = median(cholera$lon), lat = median(cholera$lat)),
color = "color",
source = "google",
maptype = "toner",
zoom = 16)
LondonMap = ggmap(mapImageData,
extent = "device",
ylab = "Latitude",
xlab = "Longitude")
#+end_src
#+RESULTS:
#+begin_example
Google Maps API Terms of Service: http://developers.google.com/maps/terms.
Please cite ggmap if you use it: see citation('ggmap') for details.
maptype = "toner" is only available with source = "stamen".
resetting to source = "stamen"...
Map from URL : http://maps.googleapis.com/maps/api/staticmap?center=51.513379,-0.136288&zoom=16&size=640x640&scale=2&maptype=terrain&sensor=false
Map from URL : http://tile.stamen.com/toner/16/32741/21789.png
Map from URL : http://tile.stamen.com/toner/16/32742/21789.png
Map from URL : http://tile.stamen.com/toner/16/32743/21789.png
Map from URL : http://tile.stamen.com/toner/16/32744/21789.png
Map from URL : http://tile.stamen.com/toner/16/32741/21790.png
Map from URL : http://tile.stamen.com/toner/16/32742/21790.png
Map from URL : http://tile.stamen.com/toner/16/32743/21790.png
Map from URL : http://tile.stamen.com/toner/16/32744/21790.png
Map from URL : http://tile.stamen.com/toner/16/32741/21791.png
Map from URL : http://tile.stamen.com/toner/16/32742/21791.png
Map from URL : http://tile.stamen.com/toner/16/32743/21791.png
Map from URL : http://tile.stamen.com/toner/16/32744/21791.png
Warning message:
`panel.margin` is deprecated. Please use `panel.spacing` property instead
#+end_example
Let’s go. As before, in red the cases of Cholera and in blue the
sources of water.
#+begin_src R :results output graphics :file :file "map2.png" :exports both :width 800 :height 800 :session *R*
LondonMap +
geom_point(data=cholera,aes(x=lon,y=lat, size=count),color="red3") +
geom_point(data=pump, color="blue",size=4) + theme_bw()
#+end_src
#+RESULTS:
[[file:map2.png]]
** Clustering
L'histoire, dit qu'une des dames qui a contracté le Choléra faisait un
peu office d'/outlier/ car elle n'habitait pas à proximité de la pompe
infectée. Peut-être est-ce le petit point le plus en haut à droite ?
Peut-être qu'un algorithme de clustering aurait été capable d'aider à
le repérer ? Essayons de faire ça bien avec une bibliothèque faisant
de l'estimation par maximum de vraisemblance de mixtures de
gaussiennes. Pour ça, je m'inspire des [[https://www.statmethods.net/advstats/cluster.html][commandes données ici]].
#+begin_src R :results output :session *R* :exports both
library(mclust)
fit <- Mclust(as.matrix(cholera_flat[,c("lon","lat")]),verbose=F)
#+end_src
#+RESULTS:
: __ ___________ __ _____________
: / |/ / ____/ / / / / / ___/_ __/
: / /|_/ / / / / / / / /\__ \ / /
: / / / / /___/ /___/ /_/ /___/ // /
: /_/ /_/\____/_____/\____//____//_/ version 5.4.1
: Type 'citation("mclust")' for citing this R package in publications.
#+begin_src R :results output graphics :file "clustering_BIC.png" :exports both :width 600 :height 400 :session *R*
plot(fit,what = c("BIC"))
#+end_src
#+RESULTS:
[[file:clustering_BIC.png]]
Ouh là, je ne trouve pas que ça soit bon signe, un BIC qui monte comme
ça sans maximum bien clair.
#+begin_src R :results output :session *R* :exports both
summary(fit)
#+end_src
#+RESULTS:
#+begin_example
----------------------------------------------------
Gaussian finite mixture model fitted by EM algorithm
----------------------------------------------------
Mclust VEV (ellipsoidal, equal shape) model with 7 components:
log.likelihood n df BIC ICL
5523.087 489 35 10829.44 10693.22
Clustering table:
1 2 3 4 5 6 7
121 119 26 73 83 23 44
#+end_example
Let's take a look at what the most likely classification looks like.
#+begin_src R :results output graphics :file "clustering.png" :exports both :width 600 :height 400 :session *R*
plot(fit,what = c("classification"))
#+end_src
#+RESULTS:
[[file:clustering.png]]
Bof, it's a little anything but hey, let's try anyway
to use this information on the previous card:
#+begin_src R :results output graphics :file "map3.png" :exports both :width 800 :height 800 :session *R*
cholera_flat$cluster = as.factor(fit$classification)
cluster = as.data.frame(t(fit$parameters$mean))
cluster$id = 1:nrow(cluster)
cholera_clustered = cholera_flat %>% group_by(lon,lat,cluster) %>% summarize(count=n())
outlier = cholera[cholera$lat == max(cholera$lat),]
LondonMap +
geom_point(data=cholera_clustered,
aes(x=lon, y=lat, size=count, color=cluster)) +
stat_ellipse(data=cholera_flat,aes(x=lon, y=lat, fill=cluster, group=cluster),
type = "norm",geom = "polygon", alpha = .3) +
geom_label(data = cluster, aes(x=lon, y=lat, label=id)) +
geom_label(data = outlier, aes(x=lon, y=lat+0.00012), fill="red", label="outlier",alpha=.3) +
scale_colour_brewer(palette="Set1") +
scale_fill_brewer(palette="Set1") + theme_bw()
#+end_src
#+RESULTS:
[[file:map3.png]]
** Estimation the density
#+begin_src R :results output graphics :file "map4.png" :exports both :width 800 :height 800 :session *R*
LondonMap +
stat_density2d(data=cholera_flat,
aes(x = lon, y = lat, fill = ..level.., alpha = ..level..),
size = 2, bins = 4,
geom = "polygon",
fill="red"
) +
geom_point(data=cholera,aes(x=lon,y=lat, size=count),color="red3") +
geom_point(data=pump, color="blue",size=4) +
geom_label(data = outlier, aes(x=lon, y=lat+0.00012), fill="red", label="outlier",alpha=.3) +
theme_bw()
#+end_src
#+RESULTS:
[[file:map4.png]]
#+begin_src R :results output graphics :file "map5.png" :exports both :width 800 :height 800 :session *R*
LondonMap +
stat_density2d(data=cholera,
aes(x = lon, y = lat, fill = ..level.., alpha = ..level..),
size = 2, bins = 4,
geom = "polygon",
fill="red"
) +
stat_ellipse(data=cholera,aes(x=lon, y=lat),
type = "norm",geom = "polygon", alpha = .3) +
geom_point(data=cholera,aes(x=lon,y=lat, size=count),color="red3") +
geom_point(data=pump, color="blue",size=4) +
geom_label(data = outlier, aes(x=lon, y=lat+0.00012), fill="red", label="outlier",alpha=.3) +
theme_bw()
#+end_src
#+RESULTS:
[[file:map5.png]]