From 7cafd23970b09b49a4396dca0a717bf6b80b25bd Mon Sep 17 00:00:00 2001
From: "Anton Y."
Date: Thu, 10 Jul 2025 14:38:06 +0300
Subject: [PATCH] exercise 1 mod 3

---
 .../exo1/influenza-like-illness-analysis.org | 27 +++++++++++++++++--
 1 file changed, 25 insertions(+), 2 deletions(-)

diff --git a/module3/exo1/influenza-like-illness-analysis.org b/module3/exo1/influenza-like-illness-analysis.org
index 6c8b47a..508c163 100644
--- a/module3/exo1/influenza-like-illness-analysis.org
+++ b/module3/exo1/influenza-like-illness-analysis.org
@@ -65,10 +65,21 @@ The [[https://en.wikipedia.org/wiki/ISO_8601][ISO-8601]] format is popular in Eu
 ** Download
 After downloading the raw data, we extract the part we are interested in. We first split the file into lines, of which we discard the first one that contains a comment. We then split the remaining lines into columns.

+#+BEGIN_SRC python :results output :var data_url=data-url
+data_file = "syndrome-grippal.csv"
+import os
+import urllib.request
+if not os.path.exists(data_file):
+    urllib.request.urlretrieve(data_url, data_file)
+#+END_SRC
+
+#+RESULTS:
+
+
 #+BEGIN_SRC python :results silent :var data_url=data-url
-from urllib.request import urlopen
+#from urllib.request import urlopen

-data = urlopen(data_url).read()
+data = open(data_file, 'rb').read()
 lines = data.decode('latin-1').strip().split('\n')
 data_lines = lines[1:]
 table = [line.split(',') for line in data_lines]
@@ -79,6 +90,13 @@ Let's have a look at what we have so far:
 table[:5]
 #+END_SRC

+#+RESULTS:
+| week   | indicator |   inc | inc_low | inc_up | inc100 | inc100_low | inc100_up | geo_insee | geo_name |
+| 202527 |         3 | 24517 |   19166 |  29868 |     37 |         29 |        45 | FR        | France   |
+| 202526 |         3 | 22152 |   17561 |  26743 |     33 |         26 |        40 | FR        | France   |
+| 202525 |         3 | 23323 |   18546 |  28100 |     35 |         28 |        42 | FR        | France   |
+| 202524 |         3 | 23154 |   18577 |  27731 |     35 |         28 |        42 | FR        | France   |
+
 ** Checking for missing data
 Unfortunately there are many ways to indicate the absence of a data value in a dataset. Here we check for a common one: empty fields. For completeness, we should also look for non-numerical data in numerical columns. We don't do this here, but checks in later processing steps would catch such anomalies.

@@ -93,6 +111,9 @@ for row in table:
         valid_table.append(row)
 #+END_SRC

+#+RESULTS:
+: ['198919', '3', '-', '', '', '-', '', '', 'FR', 'France']
+
 ** Extraction of the required columns
 There are only two columns that we will need for our analysis: the first (~"week"~) and the third (~"inc"~). We check the names in the header to be sure we pick the right data. We make a new table containing just the two columns required, without the header.
 #+BEGIN_SRC python :results silent
@@ -110,6 +131,8 @@ Let's look at the first and last lines. We insert ~None~ to indicate to org-mode
 [('week', 'inc'), None] + data[:5] + [None] + data[-5:]
 #+END_SRC

+#+RESULTS:
+
 ** Verification
 It is always prudent to verify if the data looks credible. A simple fact we can check for is that weeks are given as six-digit integers (four for the year, two for the week), and that the incidence values are positive integers.
 #+BEGIN_SRC python :results output
-- 
2.18.1