diff --git a/module3/exo3/exercice_python_en.org b/module3/exo3/exercice_python_en.org
index 5782f493934678ba782fb65634a4d86e5f3adefc..c89163955773823627381ea227caf206c24c0d90 100644
--- a/module3/exo3/exercice_python_en.org
+++ b/module3/exo3/exercice_python_en.org
@@ -1,8 +1,6 @@
-#+TITLE: Your title
-#+AUTHOR: Your name
-#+DATE: Today's date
+#+TITLE: The London cholera epidemic of 1854
+#+AUTHOR: LT
#+LANGUAGE: en
-# #+PROPERTY: header-args :eval never-export
#+HTML_HEAD:
#+HTML_HEAD:
@@ -11,84 +9,353 @@
#+HTML_HEAD:
#+HTML_HEAD:
-* Some explanations
+#+LATEX_HEADER: \usepackage{a4}
+#+LATEX_HEADER: \usepackage[french]{babel}
-This is an org-mode document with code examples in R. Once opened in
-Emacs, this document can easily be exported to HTML, PDF, and Office
-formats. For more information on org-mode, see
-https://orgmode.org/guide/.
+#+PROPERTY: header-args :session :exports both :eval never-export
-When you type the shortcut =C-c C-e h o=, this document will be
-exported as HTML. All the code in it will be re-executed, and the
-results will be retrieved and included into the exported document. If
-you do not want to re-execute all code each time, you can delete the #
-and the space before ~#+PROPERTY:~ in the header of this document.
+# FIXME! https://towardsdatascience.com/plotting-a-map-of-london-crime-data-using-r-8dcefef1c397
-Like we showed in the video, Python code is included as follows (and
-is exxecuted by typing ~C-c C-c~):
+* Data recovery and pre-processing
+Let's start by loading a number of libraries which we
+will be useful later.
+#+begin_src R :results output :session *R* :exports both
+library(ggplot2)
+library(stringr)
+library(dplyr)
+library(tidyr)
+#+end_src
+
+#+RESULTS:
+#+begin_example
+
+Package attachment : ‘dplyr’
+
+The following objects are masked from ‘package:stats’:
+
+ filter, lag
+
+The following objects are masked from ‘package:base’:
+
+ intersect, setdiff, setequal, union
+#+end_example
+
+Now download the data on the Cholera epidemic of 1854.
-#+begin_src python :results output :exports both
-print("Hello world!")
+#+begin_src R :results output :session *R* :exports both
+cholera_file = "./cholera.csv";
+url = "https://fusiontables.google.com/exporttable?query=select+*+from+1HsIb_r4gYYmIz8y_UE1h-X8yUtAYW2INy99BR_c&o=csv"
+if(!file.exists(cholera_file)) { download.file(url, cholera_file) }
+cholera = read.csv(cholera_file, strip.white=T)
+head(cholera)
+str(cholera)
#+end_src
#+RESULTS:
-: Hello world!
+#+begin_example
+essai de l'URL 'https://fusiontables.google.com/exporttable?query=select+*+from+1HsIb_r4gYYmIz8y_UE1h-X8yUtAYW2INy99BR_c&o=csv'
+downloaded 16 KB
+ count geometry
+1 3 -0.13793,51.513418
+2 2 -0.137883,51.513361
+3 1 -0.137853,51.513317
+4 1 -0.137812,51.513262
+5 4 -0.137767,51.513204
+6 2 -0.137537,51.513184
+'data.frame': 250 obs. of 2 variables:
+ $ count : int 3 2 1 1 4 2 2 2 3 2 ...
+ $ geometry: Factor w/ 250 levels "-0.132933,51.513258",..: 202 199 197 195 193 185 213 206 217 214 ...
+#+end_example
-And now the same but in an Python session. With a session, Python's
-state, i.e. the values of all the variables, remains persistent from
-one code block to the next. The code is still executed using ~C-c
-C-c~.
-#+begin_src python :results output :session :exports both
-import numpy
-x=numpy.linspace(-15,15)
-print(x)
+#+begin_src R :results output :session *R* :exports both
+pump_file = "./water_pump.csv";
+url = "https://fusiontables.google.com/exporttable?query=select+*+from+1Al4BQYgV0BzoyEaWmTcCRgLMkOJkiBo8QUQT_tU&o=csv"
+if(!file.exists(pump_file)) { download.file(url, pump_file) }
+pump = read.csv(pump_file, strip.white=T)
+head(pump)
+str(pump)
#+end_src
#+RESULTS:
#+begin_example
-[-15. -14.3877551 -13.7755102 -13.16326531 -12.55102041
- -11.93877551 -11.32653061 -10.71428571 -10.10204082 -9.48979592
- -8.87755102 -8.26530612 -7.65306122 -7.04081633 -6.42857143
- -5.81632653 -5.20408163 -4.59183673 -3.97959184 -3.36734694
- -2.75510204 -2.14285714 -1.53061224 -0.91836735 -0.30612245
- 0.30612245 0.91836735 1.53061224 2.14285714 2.75510204
- 3.36734694 3.97959184 4.59183673 5.20408163 5.81632653
- 6.42857143 7.04081633 7.65306122 8.26530612 8.87755102
- 9.48979592 10.10204082 10.71428571 11.32653061 11.93877551
- 12.55102041 13.16326531 13.7755102 14.3877551 15. ]
+essai de l'URL 'https://fusiontables.google.com/exporttable?query=select+*+from+1Al4BQYgV0BzoyEaWmTcCRgLMkOJkiBo8QUQT_tU&o=csv'
+Content type 'application/x-download; charset=UTF-8' length 520 bytes
+==================================================
+downloaded 520 bytes
+ geometry
+1 -0.136668,51.513341
+2 -0.139586,51.513876
+3 -0.139671,51.514906
+4 -0.13163,51.512354
+5 -0.133594,51.512139
+6 -0.135919,51.511542
+'data.frame': 8 obs. of 1 variable:
+ $ geometry: Factor w/ 8 levels "-0.13163,51.512354",..: 5 7 8 1 2 4 3 6
#+end_example
-Finally, an example for graphical output:
-#+begin_src python :results output file :session :var matplot_lib_filename="./cosxsx.png" :exports results
-import matplotlib.pyplot as plt
-plt.figure(figsize=(10,5))
-plt.plot(x,numpy.cos(x)/x)
-plt.tight_layout()
-plt.savefig(matplot_lib_filename)
-print(matplot_lib_filename)
+
+These coordinates in the form of a character string are not very practical. Let's extract the latitudes and longitudes.
+
+#+begin_src R :results output :session *R* :exports both
+cholera$geometry %>%
+ str_replace_all("[>A-Za-z]*","") %>%
+ str_split(",") %>% do.call("rbind",.) %>% as.numeric() -> cholera[,c("lon","lat")]
+cholera %>% select(-geometry) -> cholera
+# the do.call tricks was inspired by http://ggorjan.blogspot.com/2011/01/converting-strsplit-output-to-dataframe.html
+head(cholera)
+str(cholera)
#+end_src
#+RESULTS:
-[[file:./cosxsx.png]]
+#+begin_example
+ count lon lat
+1 3 -0.137930 51.51342
+2 2 -0.137883 51.51336
+3 1 -0.137853 51.51332
+4 1 -0.137812 51.51326
+5 4 -0.137767 51.51320
+6 2 -0.137537 51.51318
+'data.frame': 250 obs. of 3 variables:
+ $ count: int 3 2 1 1 4 2 2 2 3 2 ...
+ $ lon : num -0.138 -0.138 -0.138 -0.138 -0.138 ...
+ $ lat : num 51.5 51.5 51.5 51.5 51.5 ...
+#+end_example
-Note the parameter ~:exports results~, which indicates that the code
-will not appear in the exported document. We recommend that in the
-context of this MOOC, you always leave this parameter setting as
-~:exports both~, because we want your analyses to be perfectly
-transparent and reproducible.
-Watch out: the figure generated by the code block is /not/ stored in
-the org document. It's a plain file, here named ~cosxsx.png~. You have
-to commit it explicitly if you want your analysis to be legible and
-understandable on GitLab.
+#+begin_src R :results output :session *R* :exports both
+pump$geometry %>%
+ str_replace_all("[>A-Za-z]*","") %>%
+ str_split(",") %>% do.call("rbind",.) %>% as.numeric() -> pump[,c("lon","lat")]
+pump %>% select(-geometry) -> pump
+# the do.call tricks was inspired by http://ggorjan.blogspot.com/2011/01/converting-strsplit-output-to-dataframe.html
+head(pump)
+str(pump)
+#+end_src
+
+#+RESULTS:
+#+begin_example
+ lon lat
+1 -0.136668 51.51334
+2 -0.139586 51.51388
+3 -0.139671 51.51491
+4 -0.131630 51.51235
+5 -0.133594 51.51214
+6 -0.135919 51.51154
+'data.frame': 8 obs. of 2 variables:
+ $ lon: num -0.137 -0.14 -0.14 -0.132 -0.134 ...
+ $ lat: num 51.5 51.5 51.5 51.5 51.5 ...
+#+end_example
+
+On the other hand, later, some algorithms (density estimation, / clustering /) that I will use do not manage well the fact that an entry includes several deaths. So I'm going to prepare a "flattened" version of ~ cholera ~. For this, I am inspired by the following entry on
+[[https://stackoverflow.com/questions/46841463/expand-a-data-frame-to-have-as-many-rows-as-range-of-two-columns-in-original-row[[stackoverflow ]].
+
+#+begin_src R :results output :session *R* :exports both
+cholera %>%
+ rowwise() %>%
+ do(data.frame(lon= .$lon, lat= .$lat, value = 1:.$count)) %>%
+ as.data.frame() -> cholera_flat
+head(cholera_flat )
+#+end_src
+
+#+RESULTS:
+: lon lat value
+: 1 -0.137930 51.51342 1
+: 2 -0.137930 51.51342 2
+: 3 -0.137930 51.51342 3
+: 4 -0.137883 51.51336 1
+: 5 -0.137883 51.51336 2
+: 6 -0.137853 51.51332 1
+
+* Visualisations
+From there, it's easy to represent the previous data:
+#+begin_src R :results output graphics :file "map1.png" :exports both :width 600 :height 600 :session *R*
+ggplot(cholera,aes(x=lon,y=lat)) +
+ geom_point(aes(size=count),color="red3") +
+ geom_point(data=pump, color="blue", size=4) +
+ theme_bw() + coord_fixed()
+#+end_src
+
+#+RESULTS:
+[[file:map1.png]]
+
+** With a map of London (current)
+But to have a real map, it is better to use the right tool: [[https://journal.r-project.org/archive/2013-1/kahle-wickham.pdf[[ggmap]]. So I read this documentation well to improve things. I will center my map on the median of the previous coordinates.
+
+#+begin_src R :results output :session *R* :exports both
+library(ggmap)
+mapImageData <- get_map(location = c(lon = median(cholera$lon), lat = median(cholera$lat)),
+ color = "color",
+ source = "google",
+ maptype = "toner",
+ zoom = 16)
+
+LondonMap = ggmap(mapImageData,
+ extent = "device",
+ ylab = "Latitude",
+ xlab = "Longitude")
+#+end_src
+
+#+RESULTS:
+#+begin_example
+Google Maps API Terms of Service: http://developers.google.com/maps/terms.
+Please cite ggmap if you use it: see citation('ggmap') for details.
+maptype = "toner" is only available with source = "stamen".
+resetting to source = "stamen"...
+Map from URL : http://maps.googleapis.com/maps/api/staticmap?center=51.513379,-0.136288&zoom=16&size=640x640&scale=2&maptype=terrain&sensor=false
+Map from URL : http://tile.stamen.com/toner/16/32741/21789.png
+Map from URL : http://tile.stamen.com/toner/16/32742/21789.png
+Map from URL : http://tile.stamen.com/toner/16/32743/21789.png
+Map from URL : http://tile.stamen.com/toner/16/32744/21789.png
+Map from URL : http://tile.stamen.com/toner/16/32741/21790.png
+Map from URL : http://tile.stamen.com/toner/16/32742/21790.png
+Map from URL : http://tile.stamen.com/toner/16/32743/21790.png
+Map from URL : http://tile.stamen.com/toner/16/32744/21790.png
+Map from URL : http://tile.stamen.com/toner/16/32741/21791.png
+Map from URL : http://tile.stamen.com/toner/16/32742/21791.png
+Map from URL : http://tile.stamen.com/toner/16/32743/21791.png
+Map from URL : http://tile.stamen.com/toner/16/32744/21791.png
+Warning message:
+`panel.margin` is deprecated. Please use `panel.spacing` property instead
+#+end_example
+
+Let’s go. As before, in red the cases of Cholera and in blue the
+sources of water.
+#+begin_src R :results output graphics :file :file "map2.png" :exports both :width 800 :height 800 :session *R*
+LondonMap +
+ geom_point(data=cholera,aes(x=lon,y=lat, size=count),color="red3") +
+ geom_point(data=pump, color="blue",size=4) + theme_bw()
+#+end_src
+
+#+RESULTS:
+[[file:map2.png]]
+** Clustering
+L'histoire, dit qu'une des dames qui a contracté le Choléra faisait un
+peu office d'/outlier/ car elle n'habitait pas à proximité de la pompe
+infectée. Peut-être est-ce le petit point le plus en haut à droite ?
+Peut-être qu'un algorithme de clustering aurait été capable d'aider à
+le repérer ? Essayons de faire ça bien avec une bibliothèque faisant
+de l'estimation par maximum de vraisemblance de mixtures de
+gaussiennes. Pour ça, je m'inspire des [[https://www.statmethods.net/advstats/cluster.html][commandes données ici]].
+
+#+begin_src R :results output :session *R* :exports both
+library(mclust)
+fit <- Mclust(as.matrix(cholera_flat[,c("lon","lat")]),verbose=F)
+#+end_src
+
+#+RESULTS:
+: __ ___________ __ _____________
+: / |/ / ____/ / / / / / ___/_ __/
+: / /|_/ / / / / / / / /\__ \ / /
+: / / / / /___/ /___/ /_/ /___/ // /
+: /_/ /_/\____/_____/\____//____//_/ version 5.4.1
+: Type 'citation("mclust")' for citing this R package in publications.
+
+#+begin_src R :results output graphics :file "clustering_BIC.png" :exports both :width 600 :height 400 :session *R*
+plot(fit,what = c("BIC"))
+#+end_src
+
+#+RESULTS:
+[[file:clustering_BIC.png]]
+
+Ouh là, je ne trouve pas que ça soit bon signe, un BIC qui monte comme
+ça sans maximum bien clair.
+
+#+begin_src R :results output :session *R* :exports both
+summary(fit)
+#+end_src
+
+#+RESULTS:
+#+begin_example
+----------------------------------------------------
+Gaussian finite mixture model fitted by EM algorithm
+----------------------------------------------------
+
+Mclust VEV (ellipsoidal, equal shape) model with 7 components:
+
+ log.likelihood n df BIC ICL
+ 5523.087 489 35 10829.44 10693.22
+
+Clustering table:
+ 1 2 3 4 5 6 7
+121 119 26 73 83 23 44
+#+end_example
+
+Let's take a look at what the most likely classification looks like.
+#+begin_src R :results output graphics :file "clustering.png" :exports both :width 600 :height 400 :session *R*
+plot(fit,what = c("classification"))
+#+end_src
+
+#+RESULTS:
+[[file:clustering.png]]
+
+Bof, it's a little anything but hey, let's try anyway
+to use this information on the previous card:
+#+begin_src R :results output graphics :file "map3.png" :exports both :width 800 :height 800 :session *R*
+cholera_flat$cluster = as.factor(fit$classification)
+
+cluster = as.data.frame(t(fit$parameters$mean))
+cluster$id = 1:nrow(cluster)
+
+cholera_clustered = cholera_flat %>% group_by(lon,lat,cluster) %>% summarize(count=n())
+
+outlier = cholera[cholera$lat == max(cholera$lat),]
+
+LondonMap +
+ geom_point(data=cholera_clustered,
+ aes(x=lon, y=lat, size=count, color=cluster)) +
+ stat_ellipse(data=cholera_flat,aes(x=lon, y=lat, fill=cluster, group=cluster),
+ type = "norm",geom = "polygon", alpha = .3) +
+ geom_label(data = cluster, aes(x=lon, y=lat, label=id)) +
+ geom_label(data = outlier, aes(x=lon, y=lat+0.00012), fill="red", label="outlier",alpha=.3) +
+ scale_colour_brewer(palette="Set1") +
+ scale_fill_brewer(palette="Set1") + theme_bw()
+#+end_src
+
+#+RESULTS:
+[[file:map3.png]]
+
+
+
+** Estimation the density
+#+begin_src R :results output graphics :file "map4.png" :exports both :width 800 :height 800 :session *R*
+LondonMap +
+ stat_density2d(data=cholera_flat,
+ aes(x = lon, y = lat, fill = ..level.., alpha = ..level..),
+ size = 2, bins = 4,
+ geom = "polygon",
+ fill="red"
+ ) +
+ geom_point(data=cholera,aes(x=lon,y=lat, size=count),color="red3") +
+ geom_point(data=pump, color="blue",size=4) +
+ geom_label(data = outlier, aes(x=lon, y=lat+0.00012), fill="red", label="outlier",alpha=.3) +
+ theme_bw()
+#+end_src
+
+#+RESULTS:
+[[file:map4.png]]
+
+
+
+#+begin_src R :results output graphics :file "map5.png" :exports both :width 800 :height 800 :session *R*
+LondonMap +
+ stat_density2d(data=cholera,
+ aes(x = lon, y = lat, fill = ..level.., alpha = ..level..),
+ size = 2, bins = 4,
+ geom = "polygon",
+ fill="red"
+ ) +
+ stat_ellipse(data=cholera,aes(x=lon, y=lat),
+ type = "norm",geom = "polygon", alpha = .3) +
+ geom_point(data=cholera,aes(x=lon,y=lat, size=count),color="red3") +
+ geom_point(data=pump, color="blue",size=4) +
+ geom_label(data = outlier, aes(x=lon, y=lat+0.00012), fill="red", label="outlier",alpha=.3) +
+ theme_bw()
+#+end_src
+
+#+RESULTS:
+[[file:map5.png]]
-Finally, don't forget that we provide in the resource section of this
-MOOC a configuration with a few keyboard shortcuts that allow you to
-quickly create code blocks in Python by typing ~