diff --git a/module3/exo3/exercice_python_en.org b/module3/exo3/exercice_python_en.org index 5782f493934678ba782fb65634a4d86e5f3adefc..c89163955773823627381ea227caf206c24c0d90 100644 --- a/module3/exo3/exercice_python_en.org +++ b/module3/exo3/exercice_python_en.org @@ -1,8 +1,6 @@ -#+TITLE: Your title -#+AUTHOR: Your name -#+DATE: Today's date +#+TITLE: The London cholera epidemic of 1854 +#+AUTHOR: LT #+LANGUAGE: en -# #+PROPERTY: header-args :eval never-export #+HTML_HEAD: #+HTML_HEAD: @@ -11,84 +9,353 @@ #+HTML_HEAD: #+HTML_HEAD: -* Some explanations +#+LATEX_HEADER: \usepackage{a4} +#+LATEX_HEADER: \usepackage[french]{babel} -This is an org-mode document with code examples in R. Once opened in -Emacs, this document can easily be exported to HTML, PDF, and Office -formats. For more information on org-mode, see -https://orgmode.org/guide/. +#+PROPERTY: header-args :session :exports both :eval never-export -When you type the shortcut =C-c C-e h o=, this document will be -exported as HTML. All the code in it will be re-executed, and the -results will be retrieved and included into the exported document. If -you do not want to re-execute all code each time, you can delete the # -and the space before ~#+PROPERTY:~ in the header of this document. +# FIXME! https://towardsdatascience.com/plotting-a-map-of-london-crime-data-using-r-8dcefef1c397 -Like we showed in the video, Python code is included as follows (and -is exxecuted by typing ~C-c C-c~): +* Data recovery and pre-processing +Let's start by loading a number of libraries which we +will be useful later. +#+begin_src R :results output :session *R* :exports both +library(ggplot2) +library(stringr) +library(dplyr) +library(tidyr) +#+end_src + +#+RESULTS: +#+begin_example + +Package attachment : ‘dplyr’ + +The following objects are masked from ‘package:stats’: + + filter, lag + +The following objects are masked from ‘package:base’: + + intersect, setdiff, setequal, union +#+end_example + +Now download the data on the Cholera epidemic of 1854. -#+begin_src python :results output :exports both -print("Hello world!") +#+begin_src R :results output :session *R* :exports both +cholera_file = "./cholera.csv"; +url = "https://fusiontables.google.com/exporttable?query=select+*+from+1HsIb_r4gYYmIz8y_UE1h-X8yUtAYW2INy99BR_c&o=csv" +if(!file.exists(cholera_file)) { download.file(url, cholera_file) } +cholera = read.csv(cholera_file, strip.white=T) +head(cholera) +str(cholera) #+end_src #+RESULTS: -: Hello world! +#+begin_example +essai de l'URL 'https://fusiontables.google.com/exporttable?query=select+*+from+1HsIb_r4gYYmIz8y_UE1h-X8yUtAYW2INy99BR_c&o=csv' +downloaded 16 KB + count geometry +1 3 -0.13793,51.513418 +2 2 -0.137883,51.513361 +3 1 -0.137853,51.513317 +4 1 -0.137812,51.513262 +5 4 -0.137767,51.513204 +6 2 -0.137537,51.513184 +'data.frame': 250 obs. of 2 variables: + $ count : int 3 2 1 1 4 2 2 2 3 2 ... + $ geometry: Factor w/ 250 levels "-0.132933,51.513258",..: 202 199 197 195 193 185 213 206 217 214 ... +#+end_example -And now the same but in an Python session. With a session, Python's -state, i.e. the values of all the variables, remains persistent from -one code block to the next. The code is still executed using ~C-c -C-c~. -#+begin_src python :results output :session :exports both -import numpy -x=numpy.linspace(-15,15) -print(x) +#+begin_src R :results output :session *R* :exports both +pump_file = "./water_pump.csv"; +url = "https://fusiontables.google.com/exporttable?query=select+*+from+1Al4BQYgV0BzoyEaWmTcCRgLMkOJkiBo8QUQT_tU&o=csv" +if(!file.exists(pump_file)) { download.file(url, pump_file) } +pump = read.csv(pump_file, strip.white=T) +head(pump) +str(pump) #+end_src #+RESULTS: #+begin_example -[-15. -14.3877551 -13.7755102 -13.16326531 -12.55102041 - -11.93877551 -11.32653061 -10.71428571 -10.10204082 -9.48979592 - -8.87755102 -8.26530612 -7.65306122 -7.04081633 -6.42857143 - -5.81632653 -5.20408163 -4.59183673 -3.97959184 -3.36734694 - -2.75510204 -2.14285714 -1.53061224 -0.91836735 -0.30612245 - 0.30612245 0.91836735 1.53061224 2.14285714 2.75510204 - 3.36734694 3.97959184 4.59183673 5.20408163 5.81632653 - 6.42857143 7.04081633 7.65306122 8.26530612 8.87755102 - 9.48979592 10.10204082 10.71428571 11.32653061 11.93877551 - 12.55102041 13.16326531 13.7755102 14.3877551 15. ] +essai de l'URL 'https://fusiontables.google.com/exporttable?query=select+*+from+1Al4BQYgV0BzoyEaWmTcCRgLMkOJkiBo8QUQT_tU&o=csv' +Content type 'application/x-download; charset=UTF-8' length 520 bytes +================================================== +downloaded 520 bytes + geometry +1 -0.136668,51.513341 +2 -0.139586,51.513876 +3 -0.139671,51.514906 +4 -0.13163,51.512354 +5 -0.133594,51.512139 +6 -0.135919,51.511542 +'data.frame': 8 obs. of 1 variable: + $ geometry: Factor w/ 8 levels "-0.13163,51.512354",..: 5 7 8 1 2 4 3 6 #+end_example -Finally, an example for graphical output: -#+begin_src python :results output file :session :var matplot_lib_filename="./cosxsx.png" :exports results -import matplotlib.pyplot as plt -plt.figure(figsize=(10,5)) -plt.plot(x,numpy.cos(x)/x) -plt.tight_layout() -plt.savefig(matplot_lib_filename) -print(matplot_lib_filename) + +These coordinates in the form of a character string are not very practical. Let's extract the latitudes and longitudes. + +#+begin_src R :results output :session *R* :exports both +cholera$geometry %>% + str_replace_all("[A-Za-z]*","") %>% + str_split(",") %>% do.call("rbind",.) %>% as.numeric() -> cholera[,c("lon","lat")] +cholera %>% select(-geometry) -> cholera +# the do.call tricks was inspired by http://ggorjan.blogspot.com/2011/01/converting-strsplit-output-to-dataframe.html +head(cholera) +str(cholera) #+end_src #+RESULTS: -[[file:./cosxsx.png]] +#+begin_example + count lon lat +1 3 -0.137930 51.51342 +2 2 -0.137883 51.51336 +3 1 -0.137853 51.51332 +4 1 -0.137812 51.51326 +5 4 -0.137767 51.51320 +6 2 -0.137537 51.51318 +'data.frame': 250 obs. of 3 variables: + $ count: int 3 2 1 1 4 2 2 2 3 2 ... + $ lon : num -0.138 -0.138 -0.138 -0.138 -0.138 ... + $ lat : num 51.5 51.5 51.5 51.5 51.5 ... +#+end_example -Note the parameter ~:exports results~, which indicates that the code -will not appear in the exported document. We recommend that in the -context of this MOOC, you always leave this parameter setting as -~:exports both~, because we want your analyses to be perfectly -transparent and reproducible. -Watch out: the figure generated by the code block is /not/ stored in -the org document. It's a plain file, here named ~cosxsx.png~. You have -to commit it explicitly if you want your analysis to be legible and -understandable on GitLab. +#+begin_src R :results output :session *R* :exports both +pump$geometry %>% + str_replace_all("[A-Za-z]*","") %>% + str_split(",") %>% do.call("rbind",.) %>% as.numeric() -> pump[,c("lon","lat")] +pump %>% select(-geometry) -> pump +# the do.call tricks was inspired by http://ggorjan.blogspot.com/2011/01/converting-strsplit-output-to-dataframe.html +head(pump) +str(pump) +#+end_src + +#+RESULTS: +#+begin_example + lon lat +1 -0.136668 51.51334 +2 -0.139586 51.51388 +3 -0.139671 51.51491 +4 -0.131630 51.51235 +5 -0.133594 51.51214 +6 -0.135919 51.51154 +'data.frame': 8 obs. of 2 variables: + $ lon: num -0.137 -0.14 -0.14 -0.132 -0.134 ... + $ lat: num 51.5 51.5 51.5 51.5 51.5 ... +#+end_example + +On the other hand, later, some algorithms (density estimation, / clustering /) that I will use do not manage well the fact that an entry includes several deaths. So I'm going to prepare a "flattened" version of ~ cholera ~. For this, I am inspired by the following entry on +[[https://stackoverflow.com/questions/46841463/expand-a-data-frame-to-have-as-many-rows-as-range-of-two-columns-in-original-row[[stackoverflow ]]. + +#+begin_src R :results output :session *R* :exports both +cholera %>% + rowwise() %>% + do(data.frame(lon= .$lon, lat= .$lat, value = 1:.$count)) %>% + as.data.frame() -> cholera_flat +head(cholera_flat ) +#+end_src + +#+RESULTS: +: lon lat value +: 1 -0.137930 51.51342 1 +: 2 -0.137930 51.51342 2 +: 3 -0.137930 51.51342 3 +: 4 -0.137883 51.51336 1 +: 5 -0.137883 51.51336 2 +: 6 -0.137853 51.51332 1 + +* Visualisations +From there, it's easy to represent the previous data: +#+begin_src R :results output graphics :file "map1.png" :exports both :width 600 :height 600 :session *R* +ggplot(cholera,aes(x=lon,y=lat)) + + geom_point(aes(size=count),color="red3") + + geom_point(data=pump, color="blue", size=4) + + theme_bw() + coord_fixed() +#+end_src + +#+RESULTS: +[[file:map1.png]] + +** With a map of London (current) +But to have a real map, it is better to use the right tool: [[https://journal.r-project.org/archive/2013-1/kahle-wickham.pdf[[ggmap]]. So I read this documentation well to improve things. I will center my map on the median of the previous coordinates. + +#+begin_src R :results output :session *R* :exports both +library(ggmap) +mapImageData <- get_map(location = c(lon = median(cholera$lon), lat = median(cholera$lat)), + color = "color", + source = "google", + maptype = "toner", + zoom = 16) + +LondonMap = ggmap(mapImageData, + extent = "device", + ylab = "Latitude", + xlab = "Longitude") +#+end_src + +#+RESULTS: +#+begin_example +Google Maps API Terms of Service: http://developers.google.com/maps/terms. +Please cite ggmap if you use it: see citation('ggmap') for details. +maptype = "toner" is only available with source = "stamen". +resetting to source = "stamen"... +Map from URL : http://maps.googleapis.com/maps/api/staticmap?center=51.513379,-0.136288&zoom=16&size=640x640&scale=2&maptype=terrain&sensor=false +Map from URL : http://tile.stamen.com/toner/16/32741/21789.png +Map from URL : http://tile.stamen.com/toner/16/32742/21789.png +Map from URL : http://tile.stamen.com/toner/16/32743/21789.png +Map from URL : http://tile.stamen.com/toner/16/32744/21789.png +Map from URL : http://tile.stamen.com/toner/16/32741/21790.png +Map from URL : http://tile.stamen.com/toner/16/32742/21790.png +Map from URL : http://tile.stamen.com/toner/16/32743/21790.png +Map from URL : http://tile.stamen.com/toner/16/32744/21790.png +Map from URL : http://tile.stamen.com/toner/16/32741/21791.png +Map from URL : http://tile.stamen.com/toner/16/32742/21791.png +Map from URL : http://tile.stamen.com/toner/16/32743/21791.png +Map from URL : http://tile.stamen.com/toner/16/32744/21791.png +Warning message: +`panel.margin` is deprecated. Please use `panel.spacing` property instead +#+end_example + +Let’s go. As before, in red the cases of Cholera and in blue the +sources of water. +#+begin_src R :results output graphics :file :file "map2.png" :exports both :width 800 :height 800 :session *R* +LondonMap + + geom_point(data=cholera,aes(x=lon,y=lat, size=count),color="red3") + + geom_point(data=pump, color="blue",size=4) + theme_bw() +#+end_src + +#+RESULTS: +[[file:map2.png]] +** Clustering +L'histoire, dit qu'une des dames qui a contracté le Choléra faisait un +peu office d'/outlier/ car elle n'habitait pas à proximité de la pompe +infectée. Peut-être est-ce le petit point le plus en haut à droite ? +Peut-être qu'un algorithme de clustering aurait été capable d'aider à +le repérer ? Essayons de faire ça bien avec une bibliothèque faisant +de l'estimation par maximum de vraisemblance de mixtures de +gaussiennes. Pour ça, je m'inspire des [[https://www.statmethods.net/advstats/cluster.html][commandes données ici]]. + +#+begin_src R :results output :session *R* :exports both +library(mclust) +fit <- Mclust(as.matrix(cholera_flat[,c("lon","lat")]),verbose=F) +#+end_src + +#+RESULTS: +: __ ___________ __ _____________ +: / |/ / ____/ / / / / / ___/_ __/ +: / /|_/ / / / / / / / /\__ \ / / +: / / / / /___/ /___/ /_/ /___/ // / +: /_/ /_/\____/_____/\____//____//_/ version 5.4.1 +: Type 'citation("mclust")' for citing this R package in publications. + +#+begin_src R :results output graphics :file "clustering_BIC.png" :exports both :width 600 :height 400 :session *R* +plot(fit,what = c("BIC")) +#+end_src + +#+RESULTS: +[[file:clustering_BIC.png]] + +Ouh là, je ne trouve pas que ça soit bon signe, un BIC qui monte comme +ça sans maximum bien clair. + +#+begin_src R :results output :session *R* :exports both +summary(fit) +#+end_src + +#+RESULTS: +#+begin_example +---------------------------------------------------- +Gaussian finite mixture model fitted by EM algorithm +---------------------------------------------------- + +Mclust VEV (ellipsoidal, equal shape) model with 7 components: + + log.likelihood n df BIC ICL + 5523.087 489 35 10829.44 10693.22 + +Clustering table: + 1 2 3 4 5 6 7 +121 119 26 73 83 23 44 +#+end_example + +Let's take a look at what the most likely classification looks like. +#+begin_src R :results output graphics :file "clustering.png" :exports both :width 600 :height 400 :session *R* +plot(fit,what = c("classification")) +#+end_src + +#+RESULTS: +[[file:clustering.png]] + +Bof, it's a little anything but hey, let's try anyway +to use this information on the previous card: +#+begin_src R :results output graphics :file "map3.png" :exports both :width 800 :height 800 :session *R* +cholera_flat$cluster = as.factor(fit$classification) + +cluster = as.data.frame(t(fit$parameters$mean)) +cluster$id = 1:nrow(cluster) + +cholera_clustered = cholera_flat %>% group_by(lon,lat,cluster) %>% summarize(count=n()) + +outlier = cholera[cholera$lat == max(cholera$lat),] + +LondonMap + + geom_point(data=cholera_clustered, + aes(x=lon, y=lat, size=count, color=cluster)) + + stat_ellipse(data=cholera_flat,aes(x=lon, y=lat, fill=cluster, group=cluster), + type = "norm",geom = "polygon", alpha = .3) + + geom_label(data = cluster, aes(x=lon, y=lat, label=id)) + + geom_label(data = outlier, aes(x=lon, y=lat+0.00012), fill="red", label="outlier",alpha=.3) + + scale_colour_brewer(palette="Set1") + + scale_fill_brewer(palette="Set1") + theme_bw() +#+end_src + +#+RESULTS: +[[file:map3.png]] + + + +** Estimation the density +#+begin_src R :results output graphics :file "map4.png" :exports both :width 800 :height 800 :session *R* +LondonMap + + stat_density2d(data=cholera_flat, + aes(x = lon, y = lat, fill = ..level.., alpha = ..level..), + size = 2, bins = 4, + geom = "polygon", + fill="red" + ) + + geom_point(data=cholera,aes(x=lon,y=lat, size=count),color="red3") + + geom_point(data=pump, color="blue",size=4) + + geom_label(data = outlier, aes(x=lon, y=lat+0.00012), fill="red", label="outlier",alpha=.3) + + theme_bw() +#+end_src + +#+RESULTS: +[[file:map4.png]] + + + +#+begin_src R :results output graphics :file "map5.png" :exports both :width 800 :height 800 :session *R* +LondonMap + + stat_density2d(data=cholera, + aes(x = lon, y = lat, fill = ..level.., alpha = ..level..), + size = 2, bins = 4, + geom = "polygon", + fill="red" + ) + + stat_ellipse(data=cholera,aes(x=lon, y=lat), + type = "norm",geom = "polygon", alpha = .3) + + geom_point(data=cholera,aes(x=lon,y=lat, size=count),color="red3") + + geom_point(data=pump, color="blue",size=4) + + geom_label(data = outlier, aes(x=lon, y=lat+0.00012), fill="red", label="outlier",alpha=.3) + + theme_bw() +#+end_src + +#+RESULTS: +[[file:map5.png]] -Finally, don't forget that we provide in the resource section of this -MOOC a configuration with a few keyboard shortcuts that allow you to -quickly create code blocks in Python by typing ~