diff --git a/module3/exo2/exercice_python_en.org b/module3/exo2/exercice_python_en.org index 5782f493934678ba782fb65634a4d86e5f3adefc..88c2aa5fbf3be2f6a73c4453c3f701479e137d09 100644 --- a/module3/exo2/exercice_python_en.org +++ b/module3/exo2/exercice_python_en.org @@ -1,8 +1,7 @@ -#+TITLE: Your title -#+AUTHOR: Your name -#+DATE: Today's date +#+TITLE: Analysis of the incidence of chickenpox +#+AUTHOR: Thomas Rushton +#+DATE: 2024-04-24 #+LANGUAGE: en -# #+PROPERTY: header-args :eval never-export #+HTML_HEAD: #+HTML_HEAD: @@ -11,84 +10,185 @@ #+HTML_HEAD: #+HTML_HEAD: -* Some explanations +#+PROPERTY: header-args :session -This is an org-mode document with code examples in R. Once opened in -Emacs, this document can easily be exported to HTML, PDF, and Office -formats. For more information on org-mode, see -https://orgmode.org/guide/. +#+OPTIONS: ^:{} -When you type the shortcut =C-c C-e h o=, this document will be -exported as HTML. All the code in it will be re-executed, and the -results will be retrieved and included into the exported document. If -you do not want to re-execute all code each time, you can delete the # -and the space before ~#+PROPERTY:~ in the header of this document. +* Data acquisition -Like we showed in the video, Python code is included as follows (and -is exxecuted by typing ~C-c C-c~): +Let's start by getting the data on chickenpox incidence from [[https://www.sentiweb.fr/][Réseau Sentinelles]]. +#+NAME: data-url +https://www.sentiweb.fr/datasets/incidence-PAY-7.csv + +#+begin_src python :results output :var data_url=data-url +data_file = "chickenpox.csv" + +import os +import urllib.request +if not os.path.exists(data_file): + urllib.request.urlretrieve(data_url, data_file) +#+end_src + +#+RESULTS: + +** Format the data + +Discard the first line (which is a comment) and split the remaining +lines into columns. 
+ +#+begin_src python :results silent :exports both +data = open(data_file, 'rb').read() +lines = data.decode('latin-1').strip().split('\n') +data_lines = lines[1:] +table = [line.split(',') for line in data_lines] +# Could use the csv library instead... +#+end_src + +Let's take a gander at the data: + +#+begin_src python :results value :exports both +table[:5] +#+end_src + +#+RESULTS: +| week | indicator | inc | inc_low | inc_up | inc100 | inc100_low | inc100_up | geo_insee | geo_name | +| 202416 | 7 | 19330 | 13879 | 24781 | 29 | 21 | 37 | FR | France | +| 202415 | 7 | 24807 | 17183 | 32431 | 37 | 26 | 48 | FR | France | +| 202414 | 7 | 16181 | 12544 | 19818 | 24 | 19 | 29 | FR | France | +| 202413 | 7 | 18322 | 14206 | 22438 | 27 | 21 | 33 | FR | France | + +Alright, looks good 👍 + +** Sanitise the data + +But there may be problems with the data. Let's check for the obvious +case of missing/empty data: #+begin_src python :results output :exports both -print("Hello world!") +valid_table = [] +for row in table: + missing = any([column == '' for column in row]) + if missing: + print(row) + else: + valid_table.append(row) +#+end_src + +#+RESULTS: + +Nothing was printed, so no row has missing values. Crude, but it does the job. + +** Extract required columns + +For a temporal analysis, we just need the =week= and =inc= columns: + +#+begin_src python :results silent :exports both +week = [row[0] for row in valid_table] +assert week[0] == 'week' +del week[0] +inc = [row[2] for row in valid_table] +assert inc[0] == 'inc' +del inc[0] +data = list(zip(week, inc)) +#+end_src + +Let's check what we have, using ~None~ to tell Org where to put +separators in the resulting table (which contains the first five and +last five weeks' data): + +#+begin_src python :results value :exports both +[('week', 'inc'), None] + data[:5] + [None] + data[-5:] #+end_src #+RESULTS: -: Hello world! 
+| week | inc | +|--------+-------| +| 202416 | 19330 | +| 202415 | 24807 | +| 202414 | 16181 | +| 202413 | 18322 | +| 202412 | 12818 | +|--------+-------| +| 199101 | 15565 | +| 199052 | 19375 | +| 199051 | 19080 | +| 199050 | 11079 | +| 199049 | 1143 | + +** Convert dates + +Weeks are given as ISO 8601 year and week number (YYYYWW), so let's +parse those. The file lists the most recent week first, so let's +also sort the records chronologically. + +#+begin_src python :results silent :exports both +import datetime +converted_data = [(datetime.datetime.strptime(year_and_week + ":1", '%G%V:%u').date(), + int(inc)) + for year_and_week, inc in data] +converted_data.sort(key = lambda record: record[0]) +#+end_src -And now the same but in an Python session. With a session, Python's -state, i.e. the values of all the variables, remains persistent from -one code block to the next. The code is still executed using ~C-c -C-c~. +Let's check again: -#+begin_src python :results output :session :exports both -import numpy -x=numpy.linspace(-15,15) -print(x) +#+begin_src python :results value :exports both +data_as_str = [(str(date), str(inc)) for date, inc in converted_data] +[('date', 'inc'), None] + data_as_str[:5] + [None] + data_as_str[-5:] #+end_src #+RESULTS: -#+begin_example -[-15. -14.3877551 -13.7755102 -13.16326531 -12.55102041 - -11.93877551 -11.32653061 -10.71428571 -10.10204082 -9.48979592 - -8.87755102 -8.26530612 -7.65306122 -7.04081633 -6.42857143 - -5.81632653 -5.20408163 -4.59183673 -3.97959184 -3.36734694 - -2.75510204 -2.14285714 -1.53061224 -0.91836735 -0.30612245 - 0.30612245 0.91836735 1.53061224 2.14285714 2.75510204 - 3.36734694 3.97959184 4.59183673 5.20408163 5.81632653 - 6.42857143 7.04081633 7.65306122 8.26530612 8.87755102 - 9.48979592 10.10204082 10.71428571 11.32653061 11.93877551 - 12.55102041 13.16326531 13.7755102 14.3877551 15. 
] -#+end_example - -Finally, an example for graphical output: -#+begin_src python :results output file :session :var matplot_lib_filename="./cosxsx.png" :exports results +| date | inc | +|------------+-------| +| 1990-12-03 | 1143 | +| 1990-12-10 | 11079 | +| 1990-12-17 | 19080 | +| 1990-12-24 | 19375 | +| 1990-12-31 | 15565 | +|------------+-------| +| 2024-03-18 | 12818 | +| 2024-03-25 | 18322 | +| 2024-04-01 | 16181 | +| 2024-04-08 | 24807 | +| 2024-04-15 | 19330 | + +** Visual inspection + +So, now we can take a look at incidence over time. (The 'flu notebook +switches to R here, but we're going to stick with Python.) + +#+begin_src python :results output file :var filename="./incidence.png" :exports both import matplotlib.pyplot as plt -plt.figure(figsize=(10,5)) -plt.plot(x,numpy.cos(x)/x) +plt.clf() + +date,incidence = zip(*converted_data) + +plt.plot(date,incidence) plt.tight_layout() +plt.savefig(filename) +print(filename) +#+end_src + +#+RESULTS: +[[file:./incidence.png]] + +And we can zoom in on a period of, say, five years: -plt.savefig(matplot_lib_filename) -print(matplot_lib_filename) +#+begin_src python :results output file :var filename="./incidence-zoom.png" :exports both +plt.clf() + +start = 10 +years = 5 +date,incidence = zip(*converted_data[52*start:52*(start+years)]) + +plt.plot(date,incidence) +plt.tight_layout() +plt.savefig(filename) +print(filename) #+end_src #+RESULTS: -[[file:./cosxsx.png]] - -Note the parameter ~:exports results~, which indicates that the code -will not appear in the exported document. We recommend that in the -context of this MOOC, you always leave this parameter setting as -~:exports both~, because we want your analyses to be perfectly -transparent and reproducible. - -Watch out: the figure generated by the code block is /not/ stored in -the org document. It's a plain file, here named ~cosxsx.png~. You have -to commit it explicitly if you want your analysis to be legible and -understandable on GitLab. 
- -Finally, don't forget that we provide in the resource section of this -MOOC a configuration with a few keyboard shortcuts that allow you to -quickly create code blocks in Python by typing ~