Commit 7f022725 authored by Konrad Hinsen

Influenza incidence by region via snakemake

parent 4f56d125
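
This commit sets up a Snakemake workflow that computes influenza incidence per region: the Snakefile below downloads the raw Sentiweb dataset, splits it by region (a checkpoint, because the set of regions is only known after the split), preprocesses and plots each region's data, computes annual incidences, and collects each region's peak year into a single summary file. The supporting scripts are reproduced after the Snakefile.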
# Snakefile

rule all:
    input:
        "data/peak-year-all-regions.txt"

rule download:
    output:
        "data/weekly-incidence-all-regions.csv"
    shell:
        "wget -O {output} http://www.sentiweb.fr/datasets/incidence-RDD-3.csv"

checkpoint split_by_region:
    input:
        "data/weekly-incidence-all-regions.csv"
    output:
        directory("data/by-region")
    script:
        "scripts/split-by-region.py"

rule preprocess:
    input:
        "data/by-region/weekly-incidence-{region}.csv"
    output:
        data="data/preprocessed-weekly-incidence-{region}.csv",
        errorlog="data/errors-from-preprocessing-{region}.txt"
    script:
        "scripts/preprocess.py"

rule plot:
    input:
        "data/preprocessed-weekly-incidence-{region}.csv"
    output:
        "data/weekly-incidence-plot-{region}.png",
        "data/weekly-incidence-plot-last-years-{region}.png"
    script:
        "scripts/incidence-plots.R"

rule annual_incidence:
    input:
        "data/preprocessed-weekly-incidence-{region}.csv"
    output:
        "data/annual-incidence-{region}.csv"
    script:
        "scripts/annual-incidence.R"

def annual_incidence_files(wildcards):
    directory = checkpoints.split_by_region.get().output[0]
    pattern = os.path.join(directory, "weekly-incidence-{region}.csv")
    print("Pattern:", pattern)
    return expand("data/annual-incidence-{region}.csv",
                  region=glob_wildcards(pattern).region)

rule peak_years:
    input:
        annual_incidence_files
    output:
        "data/peak-year-all-regions.txt"
    script:
        "scripts/peak-years.py"
# Read in the annual incidence data
data = read.csv(snakemake@input[[1]])

# Plot the histogram
png(filename=snakemake@output[[1]])
hist(data$incidence,
     breaks=10,
     xlab="Annual incidence",
     ylab="Number of observations",
     main="")
dev.off()
# scripts/annual-incidence.R

# Read in the data and convert the dates
data = read.csv(snakemake@input[[1]])
names(data) <- c("date", "incidence")
data$date <- as.Date(data$date)

# A function that extracts the peak for year N. A flu season spans
# the turn of the year, so the season peaking in year N is taken to
# run from August 1st of year N-1 to August 1st of year N.
yearly_peak = function(year) {
    start = paste0(year-1, "-08-01")
    end = paste0(year, "-08-01")
    records = data$date > start & data$date <= end
    sum(data$incidence[records])
}

# The years for which we have the full peak
years <- 1986:2018

# Make a new data frame for the annual incidences
annual_data = data.frame(year = years,
                         incidence = sapply(years, yearly_peak))

# Write the output file
write.csv(annual_data,
          file=snakemake@output[[1]],
          row.names=FALSE)
# scripts/incidence-plots.R

# Read in the data and convert the dates
data = read.csv(snakemake@input[[1]])
data$week_starting <- as.Date(data$week_starting)

# Plot the complete incidence dataset
png(filename=snakemake@output[[1]])
plot(data, type="l", xlab="Date", ylab="Weekly incidence")
dev.off()

# Zoom in on the last four years (52 weekly records per year)
png(filename=snakemake@output[[2]])
plot(tail(data, 4*52), type="l", xlab="Date", ylab="Weekly incidence")
dev.off()
# scripts/peak-years.py

# Libraries used by this script:
import csv  # for reading CSV files
import os   # for path manipulations

with open(snakemake.output[0], 'w') as result_file:
    for filename in snakemake.input:
        # Recover the region name from the file name, e.g.
        # "data/annual-incidence-BRETAGNE.csv" -> "BRETAGNE" (example name)
        region = '-'.join(os.path.splitext(filename)[0].split('-')[2:])
        with open(filename, 'r') as csv_file:
            csv_reader = csv.reader(csv_file)
            next(csv_reader)  # skip the header row
            # Track the year with the highest annual incidence
            peak_year = None
            peak_incidence = 0
            for year, incidence in csv_reader:
                incidence = int(incidence)
                if incidence > peak_incidence:
                    peak_incidence = incidence
                    peak_year = year
        result_file.write(region)
        result_file.write(', ')
        result_file.write(peak_year)
        result_file.write('\n')
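
As a side note, the running-maximum loop above could be condensed into a single max() call over the data rows. A hypothetical variant, assuming the same file layout (a header row followed by year,incidence records):

import csv

def peak_year(filename):
    # filename is one of the per-region annual-incidence files
    with open(filename) as csv_file:
        rows = list(csv.reader(csv_file))[1:]  # drop the header row
    # The row with the largest incidence (second column) gives the peak year
    year, _ = max(rows, key=lambda row: int(row[1]))
    return year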
# scripts/preprocess.py

# Libraries used by this script:
import datetime  # for date conversion
import csv       # for writing output to a CSV file

# Read the CSV file into memory
data = open(snakemake.input[0], 'r').read()

# Remove white space at both ends,
# and split into lines.
lines = data.strip() \
            .split('\n')

# Split each line into columns
table = [line.split(',') for line in lines]

# Remove records with missing data and write
# the removed records to a separate file for inspection.
with open(snakemake.output.errorlog, "w") as errorlog:
    valid_table = []
    for row in table:
        missing = any([column == '' for column in row])
        if missing:
            errorlog.write("Missing data in record\n")
            errorlog.write(str(row))
            errorlog.write("\n")
        else:
            valid_table.append(row)

# Extract the two relevant columns, "week" and "inc"
week = [row[0] for row in valid_table]
assert week[0] == 'week'
del week[0]
inc = [row[2] for row in valid_table]
assert inc[0] == 'inc'
del inc[0]
data = list(zip(week, inc))

# Check for obviously out-of-range values
with open(snakemake.output.errorlog, "a") as errorlog:
    for week, inc in data:
        if len(week) != 6 or not week.isdigit():
            errorlog.write(f"Suspect value in column 'week': {week}\n")
        if not inc.isdigit():
            errorlog.write(f"Suspect value in column 'inc': {inc}\n")

# Convert year/week to the date of the corresponding Monday,
# then sort by increasing date
converted_data = \
    [(datetime.datetime.strptime(year_and_week + ":1", '%G%V:%u').date(), inc)
     for year_and_week, inc in data]
converted_data.sort(key = lambda record: record[0])
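# Aside on the format string above: %G is the ISO 8601 year, %V the ISO
# week number, and %u the ISO weekday (1 = Monday). For example,
#     datetime.datetime.strptime("202001:1", "%G%V:%u").date()
# yields datetime.date(2019, 12, 30): the Monday of ISO week 1 of 2020
# falls in calendar year 2019, which is why %G rather than %Y is
# required here.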
# Check that consecutive dates are seven days apart
with open(snakemake.output.errorlog, "a") as errorlog:
    dates = [date for date, _ in converted_data]
    for date1, date2 in zip(dates[:-1], dates[1:]):
        if date2 - date1 != datetime.timedelta(weeks=1):
            errorlog.write(f"{date2-date1} between {date1} and {date2}\n")

# Write data to a CSV file with two columns:
# 1. the date of the Monday of each week, in ISO format
# 2. the incidence estimate for that week
with open(snakemake.output.data, "w") as csvfile:
    csv_writer = csv.writer(csvfile)
    csv_writer.writerow(["week_starting", "incidence"])
    for row in converted_data:
        csv_writer.writerow(row)
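
For illustration: a raw record with week "201943" and incidence "28977" (both values hypothetical) would come out of this script as the CSV row 2019-10-21,28977, since the Monday of ISO week 43 of 2019 is October 21.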
# scripts/split-by-region.py

import os

# Read the CSV file into memory
data = open(snakemake.input[0], 'rb').read()

# Decode the Latin-1 character set,
# remove white space at both ends,
# and split into lines.
lines = data.decode('latin-1') \
            .strip() \
            .split('\n')

# Separate the header from the data table
comment = lines[0]
header = lines[1]
table = [line.split(',') for line in lines[2:]]

# Find all the regions mentioned in the table
# (the region name is the last column of each record)
regions = set(record[-1] for record in table)

# Create the output directory
directory = snakemake.output[0]
if not os.path.exists(directory):
    os.makedirs(directory)

# Write a CSV file for each region
for region in regions:
    # Some region names contain spaces, which are awkward in file names
    region_name = '-'.join(region.split(' '))
    filename = os.path.join(directory,
                            'weekly-incidence-' + region_name + '.csv')
    with open(filename, 'w') as output_file:
        output_file.write(header)
        output_file.write('\n')
        for record in table:
            # Write only the records for the right region
            if record[-1] == region:
                output_file.write(','.join(record))
                output_file.write('\n')
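
Once this checkpoint has run, data/by-region contains one weekly-incidence-<REGION>.csv file per region present in the dataset. These file names are exactly what glob_wildcards matches in the Snakefile, so the preprocess, plot, and annual_incidence rules are instantiated once per region, and peak-years.py finally aggregates one line per region into data/peak-year-all-regions.txt.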