From 7cafd23970b09b49a4396dca0a717bf6b80b25bd Mon Sep 17 00:00:00 2001
From: "Anton Y." <skycluster@gmail.com>
Date: Thu, 10 Jul 2025 14:38:06 +0300
Subject: [PATCH] module3/exo1: cache the dataset locally and record block results

---
 .../exo1/influenza-like-illness-analysis.org  | 27 +++++++++++++++++--
 1 file changed, 25 insertions(+), 2 deletions(-)

diff --git a/module3/exo1/influenza-like-illness-analysis.org b/module3/exo1/influenza-like-illness-analysis.org
index 6c8b47a..508c163 100644
--- a/module3/exo1/influenza-like-illness-analysis.org
+++ b/module3/exo1/influenza-like-illness-analysis.org
@@ -65,10 +65,21 @@ The [[https://en.wikipedia.org/wiki/ISO_8601][ISO-8601]] format is popular in Eu
 ** Download
 After downloading the raw data, we extract the part we are interested in. We first split the file into lines, of which we discard the first one that contains a comment. We then split the remaining lines into columns.
 
+#+BEGIN_SRC python :results output :var data_url=data-url
+import os
+import urllib.request
+data_file = "syndrome-grippal.csv"  # local copy of the raw data, read by the next block
+if not os.path.exists(data_file):  # download only when no local copy exists yet
+    urllib.request.urlretrieve(data_url, data_file)
+#+END_SRC
+
+#+RESULTS:
+
+
 #+BEGIN_SRC python :results silent :var data_url=data-url
-from urllib.request import urlopen
+from pathlib import Path
 
-data = urlopen(data_url).read()
+data = Path(data_file).read_bytes()  # read the local copy; read_bytes() closes the file itself
 lines = data.decode('latin-1').strip().split('\n')
 data_lines = lines[1:]
 table = [line.split(',') for line in data_lines]
@@ -79,6 +90,13 @@ Let's have a look at what we have so far:
 table[:5]
 #+END_SRC
 
+#+RESULTS:
+|   week | indicator |   inc | inc_low | inc_up | inc100 | inc100_low | inc100_up | geo_insee | geo_name |
+| 202527 |         3 | 24517 |   19166 |  29868 |     37 |         29 |        45 | FR        | France   |
+| 202526 |         3 | 22152 |   17561 |  26743 |     33 |         26 |        40 | FR        | France   |
+| 202525 |         3 | 23323 |   18546 |  28100 |     35 |         28 |        42 | FR        | France   |
+| 202524 |         3 | 23154 |   18577 |  27731 |     35 |         28 |        42 | FR        | France   |
+
 ** Checking for missing data
 Unfortunately there are many ways to indicate the absence of a data value in a dataset. Here we check for a common one: empty fields. For completeness, we should also look for non-numerical data in numerical columns. We don't do this here, but checks in later processing steps would catch such anomalies.
 
@@ -93,6 +111,9 @@ for row in table:
         valid_table.append(row)
 #+END_SRC
 
+#+RESULTS:
+: ['198919', '3', '-', '', '', '-', '', '', 'FR', 'France']
+
 ** Extraction of the required columns
 There are only two columns that we will need for our analysis: the first (~"week"~) and the third (~"inc"~). We check the names in the header to be sure we pick the right data. We make a new table containing just the two columns required, without the header.
 #+BEGIN_SRC python :results silent
@@ -110,6 +131,8 @@ Let's look at the first and last lines. We insert ~None~ to indicate to org-mode
 [('week', 'inc'), None] + data[:5] + [None] + data[-5:]
 #+END_SRC
 
+#+RESULTS:
+
 ** Verification
 It is always prudent to verify if the data looks credible. A simple fact we can check for is that weeks are given as six-digit integers (four for the year, two for the week), and that the incidence values are positive integers.
 #+BEGIN_SRC python :results output
-- 
2.18.1