#est populaire en Europe, mais peu utilisée aux Etats-Unis. Ceci explique peut-être que peu de logiciels savent gérer ce format. Le langage Python le fait depuis la version 3.6. Nous utilisons donc ce langage pour la préparation de nos données, ce qui a l'avantage de ne nécessiter aucune bibliothèque supplémentaire. (Note: nous expliquerons dans le module 4 pourquoi il est avantageux pour la reproductibilité de se limiter à un minimum de bibliothèques.)

** Téléchargement

#+begin_src python :results output :session :exports both :var data_url=data-url
# data_url is supplied by Org through the :var header argument above.
data_file = 'incidence-PAY-7.csv'

import os
import urllib.request
# Download only when no local copy exists.
if not os.path.exists(data_file):
    urllib.request.urlretrieve(data_url, data_file)
#+end_src

#+RESULTS:

#+begin_src python :results silent :session
# Read the raw bytes and close the file before decoding.
with open(data_file, 'rb') as f:
    data = f.read()

lines = data.decode('latin-1').strip().split('\n')
# Skip the first line of the file; the column headers are on the second line.
data_lines = lines[1:]
table = [line.split(',') for line in data_lines]
#+end_src

#+begin_src python :results value :session
table[:5]
#+end_src

#+RESULTS:
| week   | indicator |    inc | inc_low | inc_up | inc100 | inc100_low | inc100_up | geo_insee | geo_name |
| 202403 |         3 | 177465 |  164064 | 190866 |    266 |        246 |       286 | FR        | France   |
| 202402 |         3 | 130259 |  120192 | 140326 |    195 |        180 |       210 | FR        | France   |
| 202401 |         3 | 120769 |  109452 | 132086 |    181 |        164 |       198 | FR        | France   |
| 202352 |         3 | 115446 |  103738 | 127154 |    174 |        156 |       192 | FR        | France   |

#+begin_src python :results output :session
# Keep only rows without empty fields; print the rejected rows for inspection.
valid_table = []
for row in table:
    missing = any(column == '' for column in row)
    if missing:
        print(row)
    else:
        valid_table.append(row)
#+end_src

#+RESULTS:
: ['198919', '3', '-', '', '', '-', '', '', 'FR', 'France']

#+BEGIN_SRC python :results silent :session
# Extract columns 0 (week) and 2 (inc). The first row of valid_table is the
# header: check it, then drop it.
week = [row[0] for row in valid_table]
assert week[0] == 'week'
del week[0]
inc = [row[2] for row in valid_table]
assert inc[0] == 'inc'
del inc[0]
data = list(zip(week, inc))
#+END_SRC

#+BEGIN_SRC python :results value :session
[('week', 'inc'), None] + data[:5] + [None] + data[-5:]
#+END_SRC

#+RESULTS:
|   week |    inc |
|--------+--------|
| 202403 | 177465 |
| 202402 | 130259 |
| 202401 | 120769 |
| 202352 | 115446 |
| 202351 | 148755 |
|--------+--------|
| 198448 |  78620 |
| 198447 |  72029 |
| 198446 |  87330 |
| 198445 | 135223 |
| 198444 |  68422 |

#+BEGIN_SRC python :results output :session
# Sanity check: weeks must be six digits, incidence values must be digits only.
for week, inc in data:
    if len(week) != 6 or not week.isdigit():
        print("Valeur suspecte dans la colonne 'week': ", (week, inc))
    if not inc.isdigit():
        print("Valeur suspecte dans la colonne 'inc': ", (week, inc))
#+END_SRC

#+RESULTS:

#+BEGIN_SRC python :results silent :session
import datetime
# '%G%V:%u' parses ISO year, ISO week number and ISO weekday; appending ":1"
# selects the Monday of each week.
converted_data = [(datetime.datetime.strptime(year_and_week + ":1" , '%G%V:%u').date(),
                   int(inc))
                  for year_and_week, inc in data]
# Sort chronologically (the file lists the most recent week first).
converted_data.sort(key = lambda record: record[0])
#+END_SRC

#+BEGIN_SRC python :results value :session
str_data = [(str(date), str(inc)) for date, inc in converted_data]
[('date', 'inc'), None] + str_data[:5] + [None] + str_data[-5:]
#+END_SRC

#+RESULTS:
|       date |    inc |
|------------+--------|
| 1984-10-29 |  68422 |
| 1984-11-05 | 135223 |
| 1984-11-12 |  87330 |
| 1984-11-19 |  72029 |
| 1984-11-26 |  78620 |
|------------+--------|
| 2023-12-18 | 148755 |
| 2023-12-25 | 115446 |
| 2024-01-01 | 120769 |
| 2024-01-08 | 130259 |
| 2024-01-15 | 177465 |

#+BEGIN_SRC python :results output :session
# Report every pair of consecutive dates that is not exactly one week apart.
dates = [date for date, _ in converted_data]
for date1, date2 in zip(dates[:-1], dates[1:]):
    if date2-date1 != datetime.timedelta(weeks=1):
        print(f"Il y a {date2-date1} entre {date1} et {date2}")
#+END_SRC

#+RESULTS:
: Il y a 14 days, 0:00:00 entre 1989-05-01 et 1989-05-15

#+NAME: data-for-R
#+BEGIN_SRC python :results silent :session
[('date', 'inc'), None] + [(str(date), inc) for date, inc in converted_data]
#+END_SRC

#+BEGIN_SRC R :results output :var data=data-for-R :session
# The dates arrive from the Python block as strings; convert them to Date.
data$date <- as.Date(data$date)
summary(data)
#+END_SRC

#+RESULTS:
:       date                 inc
:  Min.   :1984-10-29   Min.   :      0
:  1st Qu.:1994-08-23   1st Qu.:   5329
:  Median :2004-06-10   Median :  16690
:  Mean   :2004-06-09   Mean   :  59929
:  3rd Qu.:2014-03-29   3rd Qu.:  50927
:  Max.   :2024-01-15   Max.   :1001824

#+BEGIN_SRC R :results output graphics :file inc-plot.png :session
plot(data, type="l", xlab="Date", ylab="inc")
#+END_SRC

#+RESULTS:

#+BEGIN_SRC R :results output graphics :file inc-plot-zoom.png :session
plot(tail(data, 200), type="l", xlab="Date", ylab="Incidence hebdomadaire")
#+END_SRC

#+RESULTS:

#+BEGIN_SRC R :results silent :session
# Sum the weekly incidence over the 12 months ending on August 1st of `annee`
# (dates strictly after August 1st of the previous year, up to and including
# August 1st of `annee`). Missing values are ignored.
pic_annuel = function(annee) {
      debut = paste0(annee-1,"-08-01")
      fin = paste0(annee,"-08-01")
      semaines = data$date > debut & data$date <= fin
      sum(data$inc[semaines], na.rm=TRUE)
      }
#+END_SRC

#+BEGIN_SRC R :results silent :session
# NOTE(review): the data run from 1984-10-29 to 2024-01-15 (see the summary
# output), so the 1985 and 2024 periods are only partially covered; the low
# total reported for 2024 is probably an artifact. Consider narrowing the range.
annees <- 1985:2024
#+END_SRC

#+BEGIN_SRC R :results value :session
inc_annuelle = data.frame(annee = annees,
                          incidence = sapply(annees, pic_annuel))
head(inc_annuelle)
#+END_SRC

#+RESULTS:
| 1985 | 5779196 |
| 1986 | 5100540 |
| 1987 | 2861556 |
| 1988 | 2766142 |
| 1989 | 5460155 |
| 1990 | 5233987 |

#+BEGIN_SRC R :results output graphics :file annual-inc-plot.png :session
plot(inc_annuelle, type="p", xlab="Année", ylab="Incidence annuelle")
#+END_SRC

#+RESULTS:

#+BEGIN_SRC R :results output :session
# Years with the lowest annual incidence first.
head(inc_annuelle[order(inc_annuelle$incidence),])
#+END_SRC

#+RESULTS:
:    annee incidence
: 37  2021    748872
: 30  2014   1602814
: 7   1991   1660832
: 11  1995   1837329
: 40  2024   1972910
: 36  2020   2011122