From da7f5fa74120534f34932bdcbeebcc8baa1e13ea Mon Sep 17 00:00:00 2001 From: Tommy Rushton Date: Mon, 15 Apr 2024 17:30:38 +0200 Subject: [PATCH] More on module3 --- .gitignore | 1 + .../exo1/influenza-like-illness-analysis.org | 87 +------------------ 2 files changed, 2 insertions(+), 86 deletions(-) create mode 100644 .gitignore diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..9bd9b01 --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +*.~undo-tree~ diff --git a/module3/exo1/influenza-like-illness-analysis.org b/module3/exo1/influenza-like-illness-analysis.org index f6e1336..e06804d 100644 --- a/module3/exo1/influenza-like-illness-analysis.org +++ b/module3/exo1/influenza-like-illness-analysis.org @@ -77,7 +77,7 @@ replacing our file by a corrupted version. Therefore we download the data only if no local copy exists. #+BEGIN_SRC python :results output :var data_url=data-url -data_file = "syndrome.grippal.csv" +data_file = "syndrome-grippal.csv" import os import urllib.request @@ -98,20 +98,11 @@ data_lines = lines[1:] table = [line.split(',') for line in data_lines] #+END_SRC -#+RESULTS: - Let's have a look at what we have so far: #+BEGIN_SRC python :results value :exports both table[:5] #+END_SRC -#+RESULTS: -| week | indicator | inc | inc_low | inc_up | inc100 | inc100_low | inc100_up | geo_insee | geo_name | -| 202414 | 3 | 35194 | 28705 | 41683 | 53 | 43 | 63 | FR | France | -| 202413 | 3 | 35038 | 29535 | 40541 | 53 | 45 | 61 | FR | France | -| 202412 | 3 | 40639 | 34582 | 46696 | 61 | 52 | 70 | FR | France | -| 202411 | 3 | 50268 | 43331 | 57205 | 75 | 65 | 85 | FR | France | - ** Checking for missing data Unfortunately there are many ways to indicate the absence of a data value in a dataset. Here we check for a common one: empty fields. For completeness, we should also look for non-numerical data in numerical columns. We don't do this here, but checks in later processing steps would catch such anomalies. @@ -126,9 +117,6 @@ for row in table: valid_table.append(row) #+END_SRC -#+RESULTS: -: ['198919', '3', '-', '', '', '-', '', '', 'FR', 'France'] - ** Extraction of the required columns There are only two columns that we will need for our analysis: the first (~"week"~) and the third (~"inc"~). We check the names in the header to be sure we pick the right data. We make a new table containing just the two columns required, without the header. #+BEGIN_SRC python :results silent :exports both @@ -146,21 +134,6 @@ Let's look at the first and last lines. We insert ~None~ to indicate to org-mode [('week', 'inc'), None] + data[:5] + [None] + data[-5:] #+END_SRC -#+RESULTS: -| week | inc | -|--------+--------| -| 202414 | 35194 | -| 202413 | 35038 | -| 202412 | 40639 | -| 202411 | 50268 | -| 202410 | 60107 | -|--------+--------| -| 198448 | 78620 | -| 198447 | 72029 | -| 198446 | 87330 | -| 198445 | 135223 | -| 198444 | 68422 | - ** Verification It is always prudent to verify if the data looks credible. A simple fact we can check for is that weeks are given as six-digit integers (four for the year, two for the week), and that the incidence values are positive integers. #+BEGIN_SRC python :results output :exports both @@ -171,8 +144,6 @@ for week, inc in data: print("Suspicious value in column 'inc': ", (week, inc)) #+END_SRC -#+RESULTS: - No problem - fine! ** Date conversion @@ -192,21 +163,6 @@ str_data = [(str(date), str(inc)) for date, inc in converted_data] [('date', 'inc'), None] + str_data[:5] + [None] + str_data[-5:] #+END_SRC -#+RESULTS: -| date | inc | -|------------+--------| -| 1984-10-29 | 68422 | -| 1984-11-05 | 135223 | -| 1984-11-12 | 87330 | -| 1984-11-19 | 72029 | -| 1984-11-26 | 78620 | -|------------+--------| -| 2024-03-04 | 60107 | -| 2024-03-11 | 50268 | -| 2024-03-18 | 40639 | -| 2024-03-25 | 35038 | -| 2024-04-01 | 35194 | - ** Date verification We do one more verification: our dates must be separated by exactly one week, except around the missing data point. #+BEGIN_SRC python :results output :exports both @@ -216,9 +172,6 @@ for date1, date2 in zip(dates[:-1], dates[1:]): print(f"The difference between {date1} and {date2} is {date2-date1}") #+END_SRC -#+RESULTS: -: The difference between 1989-05-01 and 1989-05-15 is 14 days, 0:00:00 - ** Transfer Python -> R We switch to R for data inspection and analysis, because the code is more concise in R and requires no additional libraries. @@ -234,32 +187,17 @@ data$date <- as.Date(data$date) summary(data) #+END_SRC -#+RESULTS: -: date inc -: Min. :1984-10-29 Min. : 0 -: 1st Qu.:1994-09-12 1st Qu.: 5364 -: Median :2004-07-19 Median : 16901 -: Mean :2004-07-18 Mean : 60163 -: 3rd Qu.:2014-05-26 3rd Qu.: 52140 -: Max. :2024-04-01 Max. :1001824 - ** Inspection Finally we can look at a plot of our data! #+BEGIN_SRC R :results file graphics :file inc-plot.png plot(data, type="l", xlab="Date", ylab="Weekly incidence") #+END_SRC -#+RESULTS: -[[file:inc-plot.png]] - A zoom on the last few years makes the peaks in winter stand out more clearly. #+BEGIN_SRC R :results file graphics :file inc-plot-zoom.png :exports both plot(tail(data, 200), type="l", xlab="Date", ylab="Weekly incidence") #+END_SRC -#+RESULTS: -[[file:inc-plot-zoom.png]] - * Study of the annual incidence ** Computation of the annual incidence @@ -287,42 +225,19 @@ annnual_inc = data.frame(year = years, head(annnual_inc) #+END_SRC -#+RESULTS: -| 1986 | 5100540 | -| 1987 | 2861556 | -| 1988 | 2766142 | -| 1989 | 5460155 | -| 1990 | 5233987 | -| 1991 | 1660832 | - ** Inspection A plot of the annual incidence: #+BEGIN_SRC R :results graphics file :file annual-inc-plot.png :exports both plot(annnual_inc, type="p", xlab="Année", ylab="Annual incidence") #+END_SRC -#+RESULTS: -[[file:annual-inc-plot.png]] - ** Identification of the strongest epidemics A list sorted by decreasing annual incidence makes it easy to find the most important ones: #+BEGIN_SRC R :results output :exports both head(annnual_inc[order(-annnual_inc$incidence),]) #+END_SRC -#+RESULTS: -: year incidence -: 4 1989 5460155 -: 5 1990 5233987 -: 1 1986 5100540 -: 28 2013 4182265 -: 25 2010 4085126 -: 14 1999 3897443 - Finally, a histogram clearly shows the few very strong epidemics, which affect about 10% of the French population, but are rare: there were three of them in the course of 35 years. The typical epidemic affects only half as many people. #+BEGIN_SRC R :results output graphics file :file annual-inc-hist.png :exports both hist(annnual_inc$incidence, breaks=10, xlab="Annual incidence", ylab="Number of observations", main="") #+END_SRC - -#+RESULTS: -[[file:annual-inc-hist.png]] -- 2.18.1