Commit 7f022725 authored by Konrad Hinsen

Influenza incidence by region via snakemake

parent 4f56d125
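
This commit sets up a Snakemake workflow that computes influenza incidence per region: the Snakefile below downloads the raw Sentiweb dataset, splits it by region (a checkpoint, because the set of regions is only known after the split), preprocesses and plots each region's data, computes annual incidences, and collects each region's peak year into a single summary file. The supporting scripts are reproduced after the Snakefile.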
# Snakefile

rule all:
    input:
        "data/peak-year-all-regions.txt"

rule download:
    output:
        "data/weekly-incidence-all-regions.csv"
    shell:
        "wget -O {output} http://www.sentiweb.fr/datasets/incidence-RDD-3.csv"

checkpoint split_by_region:
    input:
        "data/weekly-incidence-all-regions.csv"
    output:
        directory("data/by-region")
    script:
        "scripts/split-by-region.py"

rule preprocess:
    input:
        "data/by-region/weekly-incidence-{region}.csv"
    output:
        data="data/preprocessed-weekly-incidence-{region}.csv",
        errorlog="data/errors-from-preprocessing-{region}.txt"
    script:
        "scripts/preprocess.py"

rule plot:
    input:
        "data/preprocessed-weekly-incidence-{region}.csv"
    output:
        "data/weekly-incidence-plot-{region}.png",
        "data/weekly-incidence-plot-last-years-{region}.png"
    script:
        "scripts/incidence-plots.R"

rule annual_incidence:
    input:
        "data/preprocessed-weekly-incidence-{region}.csv"
    output:
        "data/annual-incidence-{region}.csv"
    script:
        "scripts/annual-incidence.R"

def annual_incidence_files(wildcards):
    directory = checkpoints.split_by_region.get().output[0]
    pattern = os.path.join(directory, "weekly-incidence-{region}.csv")
    print("Pattern:", pattern)
    return expand("data/annual-incidence-{region}.csv",
                  region=glob_wildcards(pattern).region)

rule peak_years:
    input:
        annual_incidence_files
    output:
        "data/peak-year-all-regions.txt"
    script:
        "scripts/peak-years.py"
# Read in the annual incidence data
data = read.csv(snakemake@input[[1]])

# Plot the histogram
png(filename=snakemake@output[[1]])
hist(data$incidence,
     breaks=10,
     xlab="Annual incidence",
     ylab="Number of observations",
     main="")
dev.off()
# scripts/annual-incidence.R

# Read in the data and convert the dates
data = read.csv(snakemake@input[[1]])
names(data) <- c("date", "incidence")
data$date <- as.Date(data$date)

# A function that extracts the peak for year N. A flu season spans
# the turn of the year, so the season peaking in year N is taken to
# run from August 1st of year N-1 to August 1st of year N.
yearly_peak = function(year) {
    start = paste0(year-1, "-08-01")
    end = paste0(year, "-08-01")
    records = data$date > start & data$date <= end
    sum(data$incidence[records])
}

# The years for which we have the full peak
years <- 1986:2018

# Make a new data frame for the annual incidences
annual_data = data.frame(year = years,
                         incidence = sapply(years, yearly_peak))

# Write the output file
write.csv(annual_data,
          file=snakemake@output[[1]],
          row.names=FALSE)
# scripts/incidence-plots.R

# Read in the data and convert the dates
data = read.csv(snakemake@input[[1]])
data$week_starting <- as.Date(data$week_starting)

# Plot the complete incidence dataset
png(filename=snakemake@output[[1]])
plot(data, type="l", xlab="Date", ylab="Weekly incidence")
dev.off()

# Zoom in on the last four years (52 weekly records per year)
png(filename=snakemake@output[[2]])
plot(tail(data, 4*52), type="l", xlab="Date", ylab="Weekly incidence")
dev.off()
# scripts/peak-years.py

# Libraries used by this script:
import csv  # for reading CSV files
import os   # for path manipulations

with open(snakemake.output[0], 'w') as result_file:
    for filename in snakemake.input:
        # Recover the region name from the file name, e.g.
        # "data/annual-incidence-BRETAGNE.csv" -> "BRETAGNE" (example name)
        region = '-'.join(os.path.splitext(filename)[0].split('-')[2:])
        with open(filename, 'r') as csv_file:
            csv_reader = csv.reader(csv_file)
            next(csv_reader)  # skip the header row
            # Track the year with the highest annual incidence
            peak_year = None
            peak_incidence = 0
            for year, incidence in csv_reader:
                incidence = int(incidence)
                if incidence > peak_incidence:
                    peak_incidence = incidence
                    peak_year = year
        result_file.write(region)
        result_file.write(', ')
        result_file.write(peak_year)
        result_file.write('\n')
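
As a side note, the running-maximum loop above could be condensed into a single max() call over the data rows. A hypothetical variant, assuming the same file layout (a header row followed by year,incidence records):

import csv

def peak_year(filename):
    # filename is one of the per-region annual-incidence files
    with open(filename) as csv_file:
        rows = list(csv.reader(csv_file))[1:]  # drop the header row
    # The row with the largest incidence (second column) gives the peak year
    year, _ = max(rows, key=lambda row: int(row[1]))
    return year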
# scripts/preprocess.py

# Libraries used by this script:
import datetime  # for date conversion
import csv       # for writing output to a CSV file

# Read the CSV file into memory
data = open(snakemake.input[0], 'r').read()

# Remove white space at both ends,
# and split into lines.
lines = data.strip() \
            .split('\n')

# Split each line into columns
table = [line.split(',') for line in lines]

# Remove records with missing data and write
# the removed records to a separate file for inspection.
with open(snakemake.output.errorlog, "w") as errorlog:
    valid_table = []
    for row in table:
        missing = any([column == '' for column in row])
        if missing:
            errorlog.write("Missing data in record\n")
            errorlog.write(str(row))
            errorlog.write("\n")
        else:
            valid_table.append(row)

# Extract the two relevant columns, "week" and "inc"
week = [row[0] for row in valid_table]
assert week[0] == 'week'
del week[0]
inc = [row[2] for row in valid_table]
assert inc[0] == 'inc'
del inc[0]
data = list(zip(week, inc))

# Check for obviously out-of-range values
with open(snakemake.output.errorlog, "a") as errorlog:
    for week, inc in data:
        if len(week) != 6 or not week.isdigit():
            errorlog.write(f"Suspect value in column 'week': {week}\n")
        if not inc.isdigit():
            errorlog.write(f"Suspect value in column 'inc': {inc}\n")

# Convert year/week to the date of the corresponding Monday,
# then sort by increasing date
converted_data = \
    [(datetime.datetime.strptime(year_and_week + ":1", '%G%V:%u').date(), inc)
     for year_and_week, inc in data]
converted_data.sort(key = lambda record: record[0])
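# Aside on the format string above: %G is the ISO 8601 year, %V the ISO
# week number, and %u the ISO weekday (1 = Monday). For example,
#     datetime.datetime.strptime("202001:1", "%G%V:%u").date()
# yields datetime.date(2019, 12, 30): the Monday of ISO week 1 of 2020
# falls in calendar year 2019, which is why %G rather than %Y is
# required here.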
# Check that consecutive dates are seven days apart
with open(snakemake.output.errorlog, "a") as errorlog:
    dates = [date for date, _ in converted_data]
    for date1, date2 in zip(dates[:-1], dates[1:]):
        if date2 - date1 != datetime.timedelta(weeks=1):
            errorlog.write(f"{date2-date1} between {date1} and {date2}\n")

# Write data to a CSV file with two columns:
# 1. the date of the Monday of each week, in ISO format
# 2. the incidence estimate for that week
with open(snakemake.output.data, "w") as csvfile:
    csv_writer = csv.writer(csvfile)
    csv_writer.writerow(["week_starting", "incidence"])
    for row in converted_data:
        csv_writer.writerow(row)
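
For illustration: a raw record with week "201943" and incidence "28977" (both values hypothetical) would come out of this script as the CSV row 2019-10-21,28977, since the Monday of ISO week 43 of 2019 is October 21.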
# scripts/split-by-region.py

import os

# Read the CSV file into memory
data = open(snakemake.input[0], 'rb').read()

# Decode the Latin-1 character set,
# remove white space at both ends,
# and split into lines.
lines = data.decode('latin-1') \
            .strip() \
            .split('\n')

# Separate the header from the data table
comment = lines[0]
header = lines[1]
table = [line.split(',') for line in lines[2:]]

# Find all the regions mentioned in the table
# (the region name is the last column of each record)
regions = set(record[-1] for record in table)

# Create the output directory
directory = snakemake.output[0]
if not os.path.exists(directory):
    os.makedirs(directory)

# Write a CSV file for each region
for region in regions:
    # Some region names contain spaces, which are awkward in file names
    region_name = '-'.join(region.split(' '))
    filename = os.path.join(directory,
                            'weekly-incidence-' + region_name + '.csv')
    with open(filename, 'w') as output_file:
        output_file.write(header)
        output_file.write('\n')
        for record in table:
            # Write only the records for the right region
            if record[-1] == region:
                output_file.write(','.join(record))
                output_file.write('\n')
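
Once this checkpoint has run, data/by-region contains one weekly-incidence-<REGION>.csv file per region present in the dataset. These file names are exactly what glob_wildcards matches in the Snakefile, so the preprocess, plot, and annual_incidence rules are instantiated once per region, and peak-years.py finally aggregates one line per region into data/peak-year-all-regions.txt.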