diff --git a/.gitignore b/.gitignore index e6ffe3a48ab1bae6a2bc1da089924c554c5f03ae..e96156cc253bdcf79d2d5e724bce235c2b50d9f9 100644 --- a/.gitignore +++ b/.gitignore @@ -9,4 +9,6 @@ *.tex _minted* svg-inkscape* -*-svg.pdf \ No newline at end of file +*-svg.pdf +.Rhistory +.snakemake diff --git a/Makefile b/Makefile index 69dfd0b969cfbab7474bb9e73cf01403255880bd..ec285bd9430f877a62f37486c5a9112a797e8551 100644 --- a/Makefile +++ b/Makefile @@ -1,5 +1,5 @@ ressources-md: for i in module1/ressources module2/ressources module2/slides module3/ressources \ - module4/ressources module5/ressources; do \ + module4/ressources module5/ressources module6/ressources; do \ make -C $$i ressources-md; \ done diff --git a/module6/ressources/incidence_syndrome_grippal_snakemake/Snakefile b/module6/ressources/incidence_syndrome_grippal_snakemake/Snakefile new file mode 100644 index 0000000000000000000000000000000000000000..59feb8a3f4abbec69dcd3b50bd5aab6bedac9211 --- /dev/null +++ b/module6/ressources/incidence_syndrome_grippal_snakemake/Snakefile @@ -0,0 +1,39 @@ +rule download: + output: + "data/weekly-incidence.csv" + shell: + "wget -O {output} http://www.sentiweb.fr/datasets/incidence-PAY-3.csv" + +rule preprocess: + input: + "data/weekly-incidence.csv" + output: + data="data/preprocessed-weekly-incidence.csv", + errorlog="data/errors-from-preprocessing.csv" + script: + "scripts/preprocess.py" + +rule plot: + input: + "data/preprocessed-weekly-incidence.csv" + output: + "data/weekly-incidence-plot.png", + "data/weekly-incidence-plot-last-years.png" + script: + "scripts/incidence-plots.R" + +rule annual_incidence: + input: + "data/preprocessed-weekly-incidence.csv" + output: + "data/annual-incidence.csv" + script: + "scripts/annual-incidence.R" + +rule histogram: + input: + "data/annual-incidence.csv" + output: + "data/annual-incidence-histogram.png" + script: + "scripts/annual-incidence-histogram.R" diff --git a/module6/ressources/incidence_syndrome_grippal_snakemake/scripts/annual-incidence-histogram.R b/module6/ressources/incidence_syndrome_grippal_snakemake/scripts/annual-incidence-histogram.R new file mode 100644 index 0000000000000000000000000000000000000000..0871acf531ea6643fbf0a01d5940176e6ee0a640 --- /dev/null +++ b/module6/ressources/incidence_syndrome_grippal_snakemake/scripts/annual-incidence-histogram.R @@ -0,0 +1,11 @@ +# Read in the data and convert the dates +data = read.csv(snakemake@input[[1]]) + +# Plot the histogram +png(filename=snakemake@output[[1]]) +hist(data$incidence, + breaks=10, + xlab="Annual incidence", + ylab="Number of observations", + main="") +dev.off() diff --git a/module6/ressources/incidence_syndrome_grippal_snakemake/scripts/annual-incidence.R b/module6/ressources/incidence_syndrome_grippal_snakemake/scripts/annual-incidence.R new file mode 100644 index 0000000000000000000000000000000000000000..aa2b2c8030d39bc5483917b84837a55edaeba13f --- /dev/null +++ b/module6/ressources/incidence_syndrome_grippal_snakemake/scripts/annual-incidence.R @@ -0,0 +1,23 @@ +# Read in the data and convert the dates +data = read.csv(snakemake@input[[1]]) +names(data) <- c("date", "incidence") +data$date <- as.Date(data$date) + +# A function that extracts the peak for year N +yearly_peak = function(year) { + start = paste0(year-1,"-08-01") + end = paste0(year,"-08-01") + records = data$date > start & data$date <= end + sum(data$incidence[records]) + } + +# The years for which we have the full peak +years <- 1986:2018 + +# Make a new data frame for the annual incidences +annual_data = data.frame(year = years, + incidence = sapply(years, yearly_peak)) +# write output file +write.csv(annual_data, + file=snakemake@output[[1]], + row.names=FALSE) diff --git a/module6/ressources/incidence_syndrome_grippal_snakemake/scripts/incidence-plots.R b/module6/ressources/incidence_syndrome_grippal_snakemake/scripts/incidence-plots.R new file mode 100644 index 0000000000000000000000000000000000000000..2a5f514c339c5b77b6ebbf4353371e8971444f92 --- /dev/null +++ b/module6/ressources/incidence_syndrome_grippal_snakemake/scripts/incidence-plots.R @@ -0,0 +1,14 @@ +# Read in the data and convert the dates +data = read.csv(snakemake@input[[1]]) +names(data) <- c("date", "incidence") +data$date <- as.Date(data$date) + +# Plot the complete incidence dataset +png(filename=snakemake@output[[1]]) +plot(data, type="l", xlab="Date", ylab="Weekly incidence") +dev.off() + +# Zoom on the last four years +png(filename=snakemake@output[[2]]) +plot(tail(data, 4*52), type="l", xlab="Date", ylab="Weekly incidence") +dev.off() diff --git a/module6/ressources/incidence_syndrome_grippal_snakemake/scripts/preprocess.py b/module6/ressources/incidence_syndrome_grippal_snakemake/scripts/preprocess.py new file mode 100644 index 0000000000000000000000000000000000000000..056b006083b98a06b97cbe66c58d34f14cb1e0ca --- /dev/null +++ b/module6/ressources/incidence_syndrome_grippal_snakemake/scripts/preprocess.py @@ -0,0 +1,68 @@ +# Libraries used by this script: +import datetime # for date conversion +import csv # for writing output to a CSV file + +# Read the CSV file into memory +data = open(snakemake.input[0], 'rb').read() +# Decode the Latin-1 character set, +# remove white space at both ends, +# and split into lines. +lines = data.decode('latin-1') \ + .strip() \ + .split('\n') +# Discard the first line, which contains a comment +data_lines = lines[1:] +# Split each line into columns +table = [line.split(',') for line in data_lines] + +# Remove records with missing data and write +# the removed records to a separate file for inspection. +with open(snakemake.output.errorlog, "w") as errorlog: + valid_table = [] + for row in table: + missing = any([column == '' for column in row]) + if missing: + errorlog.write("Missing data in record\n") + errorlog.write(str(row)) + errorlog.write("\n") + else: + valid_table.append(row) + +# Extract the two relevant columns, "week" and "inc" +week = [row[0] for row in valid_table] +assert week[0] == 'week' +del week[0] +inc = [row[2] for row in valid_table] +assert inc[0] == 'inc' +del inc[0] +data = list(zip(week, inc)) + +# Check for obviously out-of-range values +with open(snakemake.output.errorlog, "a") as errorlog: + for week, inc in data: + if len(week) != 6 or not week.isdigit(): + errorlog.write("Suspect value in column 'week': {week}\n") + if not inc.isdigit(): + errorlog.write("Suspect value in column 'inc': {inc}\n") + +# Convert year/week by date of the corresponding Monday, +# then sort by increasing date +converted_data = \ + [(datetime.datetime.strptime(year_and_week + ":1" , '%G%V:%u').date(), inc) + for year_and_week, inc in data] +converted_data.sort(key = lambda record: record[0]) + +# Check that consecutive dates are seven days apart +with open(snakemake.output.errorlog, "a") as errorlog: + dates = [date for date, _ in converted_data] + for date1, date2 in zip(dates[:-1], dates[1:]): + if date2-date1 != datetime.timedelta(weeks=1): + errorlog.write(f"{date2-date1} between {date1} and {date2}\n") + +# Write data to a CSV file with two columns: +# 1. the date of the Monday of each week, in ISO format +# 2. the incidence estimate for that week +with open(snakemake.output.data, "w") as csvfile: + csv_writer = csv.writer(csvfile) + for row in converted_data: + csv_writer.writerow(row) diff --git a/module6/ressources/snakemake_tutorial_fr.org b/module6/ressources/snakemake_tutorial_fr.org new file mode 100644 index 0000000000000000000000000000000000000000..bbe23731eabb16ca9e3457c776b88523dd7d16b8 --- /dev/null +++ b/module6/ressources/snakemake_tutorial_fr.org @@ -0,0 +1,13 @@ +# -*- mode: org -*- +#+TITLE: Gérer un workflow avec snakemake +#+DATE: August, 2019 +#+STARTUP: overview indent +#+OPTIONS: num:nil toc:t +#+PROPERTY: header-args :eval never-export + +* Installer snakemake +TODO +* L'analyse de l'incidence du syndrome grippal revisitée +Nous allons reprendre l'exemple du module 3, l'analyse de l'incidence du syndrome grippal. +** 1ère étape: le téléchargement des données +