diff --git a/module6/ressources/incidence_syndrome_grippal_par_region_snakemake/Snakefile b/module6/ressources/incidence_syndrome_grippal_par_region_snakemake/Snakefile new file mode 100644 index 0000000000000000000000000000000000000000..bc0ccc3593b324a790835400c64aa21c5e7dd8d9 --- /dev/null +++ b/module6/ressources/incidence_syndrome_grippal_par_region_snakemake/Snakefile @@ -0,0 +1,58 @@ +rule all: + input: + "data/peak-year-all-regions.txt" + +rule download: + output: + "data/weekly-incidence-all-regions.csv" + shell: + "wget -O {output} http://www.sentiweb.fr/datasets/incidence-RDD-3.csv" + +checkpoint split_by_region: + input: + "data/weekly-incidence-all-regions.csv" + output: + directory("data/by-region") + script: + "scripts/split-by-region.py" + +rule preprocess: + input: + "data/by-region/weekly-incidence-{region}.csv" + output: + data="data/preprocessed-weekly-incidence-{region}.csv", + errorlog="data/errors-from-preprocessing-{region}.txt" + script: + "scripts/preprocess.py" + +rule plot: + input: + "data/preprocessed-weekly-incidence-{region}.csv" + output: + "data/weekly-incidence-plot-{region}.png", + "data/weekly-incidence-plot-last-years-{region}.png" + script: + "scripts/incidence-plots.R" + +rule annual_incidence: + input: + "data/preprocessed-weekly-incidence-{region}.csv" + output: + "data/annual-incidence-{region}.csv" + script: + "scripts/annual-incidence.R" + +def annual_incidence_files(wildcards): + directory = checkpoints.split_by_region.get().output[0] + pattern = os.path.join(directory, "weekly-incidence-{region}.csv") + print("Pattern:", pattern) + return expand("data/annual-incidence-{region}.csv", + region=glob_wildcards(pattern).region) + +rule peak_years: + input: + annual_incidence_files + output: + "data/peak-year-all-regions.txt" + script: + "scripts/peak-years.py" diff --git a/module6/ressources/incidence_syndrome_grippal_par_region_snakemake/scripts/annual-incidence-histogram.R 
b/module6/ressources/incidence_syndrome_grippal_par_region_snakemake/scripts/annual-incidence-histogram.R new file mode 100644 index 0000000000000000000000000000000000000000..0871acf531ea6643fbf0a01d5940176e6ee0a640 --- /dev/null +++ b/module6/ressources/incidence_syndrome_grippal_par_region_snakemake/scripts/annual-incidence-histogram.R @@ -0,0 +1,11 @@ +# Read in the annual incidence data +data = read.csv(snakemake@input[[1]]) + +# Plot the histogram +png(filename=snakemake@output[[1]]) +hist(data$incidence, + breaks=10, + xlab="Annual incidence", + ylab="Number of observations", + main="") +dev.off() diff --git a/module6/ressources/incidence_syndrome_grippal_par_region_snakemake/scripts/annual-incidence.R b/module6/ressources/incidence_syndrome_grippal_par_region_snakemake/scripts/annual-incidence.R new file mode 100644 index 0000000000000000000000000000000000000000..aa2b2c8030d39bc5483917b84837a55edaeba13f --- /dev/null +++ b/module6/ressources/incidence_syndrome_grippal_par_region_snakemake/scripts/annual-incidence.R @@ -0,0 +1,23 @@ +# Read in the data and convert the dates +data = read.csv(snakemake@input[[1]]) +names(data) <- c("date", "incidence") +data$date <- as.Date(data$date) + +# A function that extracts the peak for year N +yearly_peak = function(year) { + start = paste0(year-1,"-08-01") + end = paste0(year,"-08-01") + records = data$date > start & data$date <= end + sum(data$incidence[records]) + } + +# The years for which we have the full peak +years <- 1986:2018 + +# Make a new data frame for the annual incidences +annual_data = data.frame(year = years, + incidence = sapply(years, yearly_peak)) +# write output file +write.csv(annual_data, + file=snakemake@output[[1]], + row.names=FALSE) diff --git a/module6/ressources/incidence_syndrome_grippal_par_region_snakemake/scripts/incidence-plots.R b/module6/ressources/incidence_syndrome_grippal_par_region_snakemake/scripts/incidence-plots.R new file mode 100644 index 
0000000000000000000000000000000000000000..abdd13686af13e6b54ada64651d66de4c05c190d --- /dev/null +++ b/module6/ressources/incidence_syndrome_grippal_par_region_snakemake/scripts/incidence-plots.R @@ -0,0 +1,13 @@ +# Read in the data and convert the dates +data = read.csv(snakemake@input[[1]]) +data$week_starting <- as.Date(data$week_starting) + +# Plot the complete incidence dataset +png(filename=snakemake@output[[1]]) +plot(data, type="l", xlab="Date", ylab="Weekly incidence") +dev.off() + +# Zoom on the last four years +png(filename=snakemake@output[[2]]) +plot(tail(data, 4*52), type="l", xlab="Date", ylab="Weekly incidence") +dev.off() diff --git a/module6/ressources/incidence_syndrome_grippal_par_region_snakemake/scripts/peak-years.py b/module6/ressources/incidence_syndrome_grippal_par_region_snakemake/scripts/peak-years.py new file mode 100644 index 0000000000000000000000000000000000000000..6a997b34e7b19fa97adafda7fd364f529cfc2431 --- /dev/null +++ b/module6/ressources/incidence_syndrome_grippal_par_region_snakemake/scripts/peak-years.py @@ -0,0 +1,21 @@ +# Libraries used by this script: +import csv # for reading CSV files +import os # for path manipulations + +with open(snakemake.output[0], 'w') as result_file: + for filename in snakemake.input: + region = '-'.join(os.path.splitext(filename)[0].split('-')[2:]) + with open(filename, 'r') as csv_file: + csv_reader = csv.reader(csv_file) + csv_reader.__next__() + peak_year = None + peak_incidence = 0 + for year, incidence in csv_reader: + incidence = int(incidence) + if incidence > peak_incidence: + peak_incidence = incidence + peak_year = year + result_file.write(region) + result_file.write(', ') + result_file.write(peak_year) + result_file.write('\n') diff --git a/module6/ressources/incidence_syndrome_grippal_par_region_snakemake/scripts/preprocess.py b/module6/ressources/incidence_syndrome_grippal_par_region_snakemake/scripts/preprocess.py new file mode 100644 index 
0000000000000000000000000000000000000000..d25ef2cb7ef8b9351f612710be7ba9c519ba06bc --- /dev/null +++ b/module6/ressources/incidence_syndrome_grippal_par_region_snakemake/scripts/preprocess.py @@ -0,0 +1,65 @@ +# Libraries used by this script: +import datetime # for date conversion +import csv # for writing output to a CSV file + +# Read the CSV file into memory +data = open(snakemake.input[0], 'r').read() +# Remove white space at both ends, +# and split into lines. +lines = data.strip() \ + .split('\n') +# Split each line into columns +table = [line.split(',') for line in lines] + +# Remove records with missing data and write +# the removed records to a separate file for inspection. +with open(snakemake.output.errorlog, "w") as errorlog: + valid_table = [] + for row in table: + missing = any([column == '' for column in row]) + if missing: + errorlog.write("Missing data in record\n") + errorlog.write(str(row)) + errorlog.write("\n") + else: + valid_table.append(row) + +# Extract the two relevant columns, "week" and "inc" +week = [row[0] for row in valid_table] +assert week[0] == 'week' +del week[0] +inc = [row[2] for row in valid_table] +assert inc[0] == 'inc' +del inc[0] +data = list(zip(week, inc)) + +# Check for obviously out-of-range values +with open(snakemake.output.errorlog, "a") as errorlog: + for week, inc in data: + if len(week) != 6 or not week.isdigit(): + errorlog.write(f"Suspect value in column 'week': {week}\n") + if not inc.isdigit(): + errorlog.write(f"Suspect value in column 'inc': {inc}\n") + +# Convert year/week by date of the corresponding Monday, +# then sort by increasing date +converted_data = \ + [(datetime.datetime.strptime(year_and_week + ":1" , '%G%V:%u').date(), inc) + for year_and_week, inc in data] +converted_data.sort(key = lambda record: record[0]) + +# Check that consecutive dates are seven days apart +with open(snakemake.output.errorlog, "a") as errorlog: + dates = [date for date, _ in converted_data] + for date1, date2 in 
zip(dates[:-1], dates[1:]): + if date2-date1 != datetime.timedelta(weeks=1): + errorlog.write(f"{date2-date1} between {date1} and {date2}\n") + +# Write data to a CSV file with two columns: +# 1. the date of the Monday of each week, in ISO format +# 2. the incidence estimate for that week +with open(snakemake.output.data, "w") as csvfile: + csv_writer = csv.writer(csvfile) + csv_writer.writerow(["week_starting", "incidence"]) + for row in converted_data: + csv_writer.writerow(row) diff --git a/module6/ressources/incidence_syndrome_grippal_par_region_snakemake/scripts/split-by-region.py b/module6/ressources/incidence_syndrome_grippal_par_region_snakemake/scripts/split-by-region.py new file mode 100644 index 0000000000000000000000000000000000000000..097aec73567f58bf19bec2c5d9dd74bb04f13c3a --- /dev/null +++ b/module6/ressources/incidence_syndrome_grippal_par_region_snakemake/scripts/split-by-region.py @@ -0,0 +1,37 @@ +import os + +# Read the CSV file into memory +data = open(snakemake.input[0], 'rb').read() +# Decode the Latin-1 character set, +# remove white space at both ends, +# and split into lines. 
+lines = data.decode('latin-1') \ + .strip() \ + .split('\n') + +# Separate header from data table +comment = lines[0] +header = lines[1] +table = [line.split(',') for line in lines[2:]] + +# Find all the regions mentioned in the table +regions = set(record[-1] for record in table) + +# Create the output directory +directory = snakemake.output[0] +if not os.path.exists(directory): + os.makedirs(directory) + +# Write CSV files for each region +for region in regions: + # Some region names contain spaces which are awkward in filenames + region_name = '-'.join(region.split(' ')) + filename = os.path.join(directory, 'weekly-incidence-' + region_name + '.csv') + with open(filename, 'w') as output_file: + output_file.write(header) + output_file.write('\n') + for record in table: + # Write only the records for the right region + if record[-1] == region: + output_file.write(','.join(record)) + output_file.write('\n')