#est populaire en Europe, mais peu utilisée aux États-Unis. Ceci explique peut-être que peu de logiciels savent gérer ce format. Le langage Python le fait depuis la version 3.6. Nous utilisons donc ce langage pour la préparation de nos données, ce qui a l'avantage de ne nécessiter aucune bibliothèque supplémentaire. (Note: nous expliquerons dans le module 4 pourquoi il est avantageux pour la reproductibilité de se limiter à un minimum de bibliothèques.)

** Téléchargement

   #+begin_src python :results output :session :exports both :var data_url=data-url
import os
import urllib.request

# Name of the local copy of the dataset.
data_file = 'incidence-PAY-7.csv'

# Fetch the data only when no local copy exists yet, so that re-running the
# document keeps working on the file that was downloaded the first time.
local_copy_exists = os.path.exists(data_file)
if not local_copy_exists:
    urllib.request.urlretrieve(data_url, data_file)
   #+end_src
   
   #+RESULTS:

   #+begin_src python :results silent :session :var data_url=data-url

# Read the raw bytes inside a context manager so that the file handle is
# closed as soon as the content has been read, rather than being left open.
with open(data_file, 'rb') as csv_file:
    data = csv_file.read()
# Decode as Latin-1, trim surrounding whitespace and cut into lines.
lines = data.decode('latin-1').strip().split('\n')
# The first line of the file is skipped; the column-header row that follows
# it stays in the table (it is checked and removed in a later step).
data_lines = lines[1:]
# One list of string fields per line.
table = [line.split(',') for line in data_lines]
   #+end_src

   #+begin_src python :results value :session
# Preview: the first five rows of the parsed table (header row included).
table[0:5]
   #+end_src

   #+RESULTS:
   |   week | indicator |    inc | inc_low |  inc_up | inc100 | inc100_low | inc100_up | geo_insee | geo_name |
   | 202403 |         3 | 177465 | 164064 | 190866 |    266 |       246 |      286 | FR       | France  |
   | 202402 |         3 | 130259 | 120192 | 140326 |    195 |       180 |      210 | FR       | France  |
   | 202401 |         3 | 120769 | 109452 | 132086 |    181 |       164 |      198 | FR       | France  |
   | 202352 |         3 | 115446 | 103738 | 127154 |    174 |       156 |      192 | FR       | France  |


   #+begin_src python :results output :session 
# Keep only the rows in which every field is filled in. A row with at least
# one empty field is printed for inspection and left out of valid_table.
valid_table = []
for row in table:
    if '' in row:
        print(row)
        continue
    valid_table.append(row)

   #+end_src

   #+RESULTS:
   : ['198919', '3', '-', '', '', '-', '', '', 'FR', 'France']

#+BEGIN_SRC python :results silent :session
# Extract the first column. Its leading entry must be the column title
# 'week'; this is verified and the title is then removed.
week = [row[0] for row in valid_table]
assert week[0] == 'week'
del week[0]
# Same treatment for the third column, whose title must be 'inc'.
# (The closing quote of 'inc' was missing, which made the block a SyntaxError.)
inc = [row[2] for row in valid_table]
assert inc[0] == 'inc'
del inc[0]
# Pair each week identifier with its incidence value.
data = list(zip(week, inc))
#+END_SRC

#+BEGIN_SRC python :results value :session
# Preview table: title row, first five records and last five records.
# Each None entry shows up as a horizontal rule in the resulting Org table.
[('week', 'inc'), None, *data[:5], None, *data[-5:]]
#+END_SRC

#+RESULTS:
|   week |    inc |
|--------+--------|
| 202403 | 177465 |
| 202402 | 130259 |
| 202401 | 120769 |
| 202352 | 115446 |
| 202351 | 148755 |
|--------+--------|
| 198448 |  78620 |
| 198447 |  72029 |
| 198446 |  87330 |
| 198445 | 135223 |
| 198444 |  68422 |

#+BEGIN_SRC python :results output :session
# Sanity check of both columns: a week identifier must consist of exactly six
# digits and an incidence of digits only. Every suspicious record is printed.
for week, inc in data:
    if not (len(week) == 6 and week.isdigit()):
        print("Valeur suspecte dans la colonne 'week': ", (week, inc))
    if not inc.isdigit():
        print("Valeur suspecte dans la colonne 'inc': ", (week, inc))
#+END_SRC

#+RESULTS:

#+BEGIN_SRC python :results silent :session
import datetime
# Turn each (week identifier, incidence) pair of strings into a
# (date, integer) pair. ":1" is appended so that the ISO year/week identifier
# is parsed as the Monday (ISO weekday 1) of that week. The records are then
# put in chronological order.
converted_data = sorted(
    ((datetime.datetime.strptime(year_and_week + ":1", '%G%V:%u').date(), int(inc))
     for year_and_week, inc in data),
    key=lambda record: record[0])
#+END_SRC

#+BEGIN_SRC python :results value :session
# Stringify both fields of every record for display purposes.
str_data = [tuple(map(str, record)) for record in converted_data]
# Preview table: title row, five earliest and five latest records, with None
# entries showing up as horizontal rules in the resulting Org table.
[('date', 'inc'), None, *str_data[:5], None, *str_data[-5:]]
#+END_SRC

#+RESULTS:
|       date |    inc |
|------------+--------|
| 1984-10-29 |  68422 |
| 1984-11-05 | 135223 |
| 1984-11-12 |  87330 |
| 1984-11-19 |  72029 |
| 1984-11-26 |  78620 |
|------------+--------|
| 2023-12-18 | 148755 |
| 2023-12-25 | 115446 |
| 2024-01-01 | 120769 |
| 2024-01-08 | 130259 |
| 2024-01-15 | 177465 |

#+BEGIN_SRC python :results output :session
# Two consecutive dates of the chronologically sorted records should be
# exactly one week apart; every pair for which this does not hold is reported.
dates = [record[0] for record in converted_data]
one_week = datetime.timedelta(weeks=1)
for date1, date2 in zip(dates, dates[1:]):
    if date2-date1 == one_week:
        continue
    print(f"Il y a {date2-date1} entre {date1} et {date2}")
#+END_SRC

#+RESULTS:
: Il y a 14 days, 0:00:00 entre 1989-05-01 et 1989-05-15

#+NAME: data-for-R
#+BEGIN_SRC python :results silent :session
# Table handed over to the R blocks: title row, rule (None), then one row per
# record with the date as a string and the incidence as an integer.
[('date', 'inc'), None, *((str(date), inc) for date, inc in converted_data)]
#+END_SRC

#+BEGIN_SRC R :results output :var data=data-for-R :session
# Convert the date column to Date objects, then print a summary of both columns.
data[["date"]] <- as.Date(data[["date"]])
summary(data)
#+END_SRC

#+RESULTS:
:       date                 inc         
:  Min.   :1984-10-29   Min.   :      0  
:  1st Qu.:1994-08-23   1st Qu.:   5329  
:  Median :2004-06-10   Median :  16690  
:  Mean   :2004-06-09   Mean   :  59929  
:  3rd Qu.:2014-03-29   3rd Qu.:  50927  
:  Max.   :2024-01-15   Max.   :1001824


#+BEGIN_SRC R :results output graphics :file inc-plot.png :session
# Line plot of the whole time series.
plot(data, xlab = "Date", ylab = "inc", type = "l")
#+END_SRC

#+RESULTS:

#+BEGIN_SRC R :results output graphics :file inc-plot-zoom.png :session
# Line plot restricted to the last 200 rows of the data frame.
derniers_points <- tail(data, 200)
plot(derniers_points, xlab = "Date", ylab = "Incidence hebdomadaire", type = "l")
#+END_SRC

#+RESULTS:

#+BEGIN_SRC R :results silent :session
# Total incidence over the one-year window that ends on August 1st of `annee`.
#
# Reads the global data frame `data` (columns `date` and `inc`).
# The window runs from August 1st of the previous year (excluded) to
# August 1st of `annee` (included). Missing incidence values are ignored.
pic_annuel <- function(annee) {
  debut <- as.Date(paste0(annee - 1, "-08-01"))
  fin <- as.Date(paste0(annee, "-08-01"))
  dans_la_fenetre <- data$date > debut & data$date <= fin
  sum(data$inc[dans_la_fenetre], na.rm = TRUE)
}
#+END_SRC

#+BEGIN_SRC R :results silent :session
# Years for which the yearly total is computed; each year stands for the
# window ending on August 1st of that year.
# NOTE(review): the summary printed earlier gives 2024-01-15 as the latest
# date, so the 2024 window appears to be only partially covered by the data
# -- confirm whether the range should stop at 2023.
annees <- 1985:2024
#+END_SRC

#+BEGIN_SRC R :results value :session
# One row per year: the year and the total returned by pic_annuel for it.
incidence_par_annee <- sapply(annees, pic_annuel)
inc_annuelle <- data.frame(annee = annees, incidence = incidence_par_annee)
head(inc_annuelle)
#+END_SRC

#+RESULTS:
| 1985 | 5779196 |
| 1986 | 5100540 |
| 1987 | 2861556 |
| 1988 | 2766142 |
| 1989 | 5460155 |
| 1990 | 5233987 |

#+BEGIN_SRC R :results output graphics :file annual-inc-plot.png :session
# Scatter plot of the yearly totals.
plot(inc_annuelle, xlab = "Année", ylab = "Incidence annuelle", type = "p")
#+END_SRC

#+RESULTS:

#+BEGIN_SRC R :results output :session
# Show the first rows of the yearly table sorted by increasing incidence,
# i.e. the years with the lowest totals.
ordre_croissant <- order(inc_annuelle$incidence)
head(inc_annuelle[ordre_croissant, ])
#+END_SRC

#+RESULTS:
:    annee incidence
: 37  2021    748872
: 30  2014   1602814
: 7   1991   1660832
: 11  1995   1837329
: 40  2024   1972910
: 36  2020   2011122