You need to sign in or sign up before continuing.
Commit 1bf2ebfc authored by Konrad Hinsen's avatar Konrad Hinsen

Supprimer les fichiers obsolètes

parent b8a7cbf2
# Fetch the raw weekly incidence dataset with wget and store it
# under data/ as the starting point of the workflow.
rule download:
    output:
        "data/weekly-incidence.csv"
    shell:
        "wget -O {output} http://www.sentiweb.fr/datasets/incidence-PAY-3.csv"
# Run the Python preprocessing script on the downloaded file.
# The two outputs are named because the script addresses them as
# snakemake.output.data and snakemake.output.errorlog.
rule preprocess:
    input:
        "data/weekly-incidence.csv"
    output:
        data="data/preprocessed-weekly-incidence.csv",
        errorlog="data/errors-from-preprocessing.txt"
    script:
        "scripts/preprocess.py"
# Draw two PNG plots from the preprocessed weekly data.
# The order of the outputs matters: the R script refers to them
# by position (first and second output).
rule plot:
    input:
        "data/preprocessed-weekly-incidence.csv"
    output:
        "data/weekly-incidence-plot.png",
        "data/weekly-incidence-plot-last-years.png"
    script:
        "scripts/incidence-plots.R"
# Aggregate the preprocessed weekly data into one incidence value
# per year, written as a CSV file.
rule annual_incidence:
    input:
        "data/preprocessed-weekly-incidence.csv"
    output:
        "data/annual-incidence.csv"
    script:
        "scripts/annual-incidence.R"
# Plot a histogram of the annual incidence values as a PNG file.
rule histogram:
    input:
        "data/annual-incidence.csv"
    output:
        "data/annual-incidence-histogram.png"
    script:
        "scripts/annual-incidence-histogram.R"
# Aggregate target: requesting it builds the raw download, the
# preprocessed table, both weekly plots, the annual table and the
# histogram.
# NOTE(review): Snakemake picks the first rule of the file as the
# default target; as seen here `all` is not first, so it presumably
# has to be requested explicitly (`snakemake all`) -- confirm.
rule all:
    input:
        "data/weekly-incidence.csv",
        "data/preprocessed-weekly-incidence.csv",
        "data/weekly-incidence-plot.png",
        "data/weekly-incidence-plot-last-years.png",
        "data/annual-incidence.csv",
        "data/annual-incidence-histogram.png"
# Load the annual incidence table passed in as the first input
annual <- read.csv(snakemake@input[[1]])

# Draw the histogram of the "incidence" column into the PNG file
# given as the first output, then close the graphics device
png(filename = snakemake@output[[1]])
hist(
  annual$incidence,
  breaks = 10,
  xlab = "Annual incidence",
  ylab = "Number of observations",
  main = ""
)
dev.off()
# Load the weekly table, give its two columns short names,
# and parse the first column as dates
weekly <- read.csv(snakemake@input[[1]])
names(weekly) <- c("date", "incidence")
weekly$date <- as.Date(weekly$date)

# Incidence attributed to year N: the sum of all weekly values dated
# after August 1st of year N-1, up to and including August 1st of
# year N. (Despite its name, this returns a sum, not a maximum.)
yearly_peak <- function(year) {
  period_start <- paste0(year - 1, "-08-01")
  period_end <- paste0(year, "-08-01")
  in_period <- weekly$date > period_start & weekly$date <= period_end
  sum(weekly$incidence[in_period])
}

# Years to report; per the original author, those for which the
# data contain the full peak
years <- 1986:2018

# One row per year: the year and its summed incidence
incidence_by_year <- sapply(years, yearly_peak)
annual_data <- data.frame(year = years, incidence = incidence_by_year)

# Save the annual table as CSV, without a row-name column
write.csv(annual_data, file = snakemake@output[[1]], row.names = FALSE)
# Load the preprocessed weekly table and parse the week column as dates
weekly <- read.csv(snakemake@input[[1]])
weekly$week_starting <- as.Date(weekly$week_starting)

# First output: line plot of the entire series
png(filename = snakemake@output[[1]])
plot(weekly, type = "l", xlab = "Date", ylab = "Weekly incidence")
dev.off()

# Second output: line plot of the last 4 * 52 rows only,
# i.e. roughly the four most recent years
png(filename = snakemake@output[[2]])
recent <- tail(weekly, 4 * 52)
plot(recent, type = "l", xlab = "Date", ylab = "Weekly incidence")
dev.off()
# Libraries used by this script:
import datetime # for date conversion
import csv # for writing output to a CSV file
# Read the CSV file into memory as raw bytes. The context manager
# makes sure the file handle is closed as soon as the read is done
# instead of relying on garbage collection.
with open(snakemake.input[0], 'rb') as raw_file:
    data = raw_file.read()

# Decode the Latin-1 character set,
# remove white space at both ends,
# and split into lines.
lines = data.decode('latin-1') \
            .strip() \
            .split('\n')

# Discard the first line, which contains a comment
data_lines = lines[1:]

# Split each line into columns; "table" is a list of lists of
# strings, with the header row still in first position
table = [line.split(',') for line in data_lines]
# Keep only the records in which no field is empty. Records with at
# least one empty field are not dropped silently: each of them is
# written to the error log (which is created/truncated here) so that
# it can be inspected afterwards.
with open(snakemake.output.errorlog, "w") as errorlog:
    valid_table = []
    for row in table:
        if '' not in row:
            valid_table.append(row)
            continue
        errorlog.write("Missing data in record\n")
        errorlog.write(str(row))
        errorlog.write("\n")
# Pull out the two columns of interest: "week" (index 0) and
# "inc" (index 2). The first entry of each column is its header,
# which is verified and then dropped.
week = [record[0] for record in valid_table]
assert week[0] == 'week'
week.pop(0)

inc = [record[2] for record in valid_table]
assert inc[0] == 'inc'
inc.pop(0)

# Pair every week label with its incidence value
data = [pair for pair in zip(week, inc)]
# Check for obviously out-of-range values and append a message to
# the error log for each one. Suspect records are only reported,
# not removed.
# The messages are f-strings so that the offending value itself is
# logged (without the "f" prefix the log would contain the literal
# text "{week}" / "{inc}").
with open(snakemake.output.errorlog, "a") as errorlog:
    for week, inc in data:
        # A week label must be exactly six digits
        if len(week) != 6 or not week.isdigit():
            errorlog.write(f"Suspect value in column 'week': {week}\n")
        # An incidence must consist of digits only
        if not inc.isdigit():
            errorlog.write(f"Suspect value in column 'inc': {inc}\n")
def _monday_of(year_and_week):
    """Return the date of day 1 (Monday) of the given ISO year/week label."""
    return datetime.datetime.strptime(year_and_week + ":1" , '%G%V:%u').date()

# Replace each year/week label by the date of the corresponding
# Monday, and order the records by increasing date
converted_data = sorted(
    ((_monday_of(year_and_week), inc) for year_and_week, inc in data),
    key=lambda record: record[0])
# Verify that each date is exactly one week after the previous one;
# any other gap is reported in the error log.
one_week = datetime.timedelta(weeks=1)
with open(snakemake.output.errorlog, "a") as errorlog:
    dates = [record[0] for record in converted_data]
    # zip stops at the shorter sequence, pairing each date with its successor
    for date1, date2 in zip(dates, dates[1:]):
        if date2-date1 == one_week:
            continue
        errorlog.write(f"{date2-date1} between {date1} and {date2}\n")
# Write data to a CSV file with two columns:
# 1. the date of the Monday of each week, in ISO format
# 2. the incidence estimate for that week
# The file is opened with newline='' as the csv module requires;
# otherwise platforms that translate newlines on write would turn
# the writer's "\r\n" line endings into "\r\r\n".
with open(snakemake.output.data, "w", newline='') as csvfile:
    csv_writer = csv.writer(csvfile)
    csv_writer.writerow(["week_starting", "incidence"])
    for row in converted_data:
        csv_writer.writerow(row)
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment