Project: Learning Lab / mooc-rr-ressources

Commit 7f022725, authored Sep 19, 2019 by Konrad Hinsen
Parent commit: 4f56d125

Commit message: Incidence de grippe par région via snakemake (flu incidence by region via Snakemake)
Changes: 7 files changed, 228 additions and 0 deletions (+228, -0)

All files are new, under module6/ressources/incidence_syndrome_grippal_par_region_snakemake/:

- Snakefile (+58, -0)
- scripts/annual-incidence-histogram.R (+11, -0)
- scripts/annual-incidence.R (+23, -0)
- scripts/incidence-plots.R (+13, -0)
- scripts/peak-years.py (+21, -0)
- scripts/preprocess.py (+65, -0)
- scripts/split-by-region.py (+37, -0)
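Taken together, these files define a Snakemake workflow: download the raw Sentiweb data, split it by region, preprocess each region's weekly incidence, plot it, aggregate it into annual incidences, and collect the peak year of every region into a single file. The only non-obvious part of the Snakefile below is its annual_incidence_files input function: because split_by_region is declared as a checkpoint, the set of regions is not known until the downloaded file has actually been split, so the inputs of rule peak_years are computed at run time from the contents of data/by-region/. As a rough plain-Python sketch of what glob_wildcards plus expand effectively compute there (the region names below are hypothetical, not taken from the dataset):

import re

# Hypothetical listing of data/by-region/ after the checkpoint has run:
files = ["data/by-region/weekly-incidence-BRETAGNE.csv",
         "data/by-region/weekly-incidence-ILE-DE-FRANCE.csv"]

# glob_wildcards(pattern).region extracts the {region} part of each
# filename matching the pattern ...
regions = [re.fullmatch(r"data/by-region/weekly-incidence-(.+)\.csv", f).group(1)
           for f in files]

# ... and expand() substitutes each region into the annual-incidence
# pattern, producing the input list of rule peak_years:
print([f"data/annual-incidence-{r}.csv" for r in regions])
# ['data/annual-incidence-BRETAGNE.csv', 'data/annual-incidence-ILE-DE-FRANCE.csv']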
module6/ressources/incidence_syndrome_grippal_par_region_snakemake/Snakefile (new file, mode 100644)
rule all:
    input:
        "data/peak-year-all-regions.txt"

rule download:
    output:
        "data/weekly-incidence-all-regions.csv"
    shell:
        "wget -O {output} http://www.sentiweb.fr/datasets/incidence-RDD-3.csv"

checkpoint split_by_region:
    input:
        "data/weekly-incidence-all-regions.csv"
    output:
        directory("data/by-region")
    script:
        "scripts/split-by-region.py"

rule preprocess:
    input:
        "data/by-region/weekly-incidence-{region}.csv"
    output:
        data="data/preprocessed-weekly-incidence-{region}.csv",
        errorlog="data/errors-from-preprocessing-{region}.txt"
    script:
        "scripts/preprocess.py"

rule plot:
    input:
        "data/preprocessed-weekly-incidence-{region}.csv"
    output:
        "data/weekly-incidence-plot-{region}.png",
        "data/weekly-incidence-plot-last-years-{region}.png"
    script:
        "scripts/incidence-plots.R"

rule annual_incidence:
    input:
        "data/preprocessed-weekly-incidence-{region}.csv"
    output:
        "data/annual-incidence-{region}.csv"
    script:
        "scripts/annual-incidence.R"

def annual_incidence_files(wildcards):
    directory = checkpoints.split_by_region.get().output[0]
    pattern = os.path.join(directory, "weekly-incidence-{region}.csv")
    print("Pattern:", pattern)
    return expand("data/annual-incidence-{region}.csv",
                  region=glob_wildcards(pattern).region)

rule peak_years:
    input:
        annual_incidence_files
    output:
        "data/peak-year-all-regions.txt"
    script:
        "scripts/peak-years.py"
module6/ressources/incidence_syndrome_grippal_par_region_snakemake/scripts/annual-incidence-histogram.R (new file, mode 100644)
# Read in the data
data = read.csv(snakemake@input[[1]])

# Plot the histogram
png(filename = snakemake@output[[1]])
hist(data$incidence,
     breaks = 10,
     xlab = "Annual incidence",
     ylab = "Number of observations",
     main = "")
dev.off()
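Note that no rule in the Snakefile above references annual-incidence-histogram.R; the script simply reads an annual-incidence table and writes a histogram as a PNG. If one wanted to wire it into the workflow, a rule along the following lines might work (this rule and its output filename are hypothetical, not part of the commit; the input pattern mirrors the output of rule annual_incidence):

rule annual_incidence_histogram:
    input:
        "data/annual-incidence-{region}.csv"
    output:
        "data/annual-incidence-histogram-{region}.png"
    script:
        "scripts/annual-incidence-histogram.R"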
module6/ressources/incidence_syndrome_grippal_par_region_snakemake/scripts/annual-incidence.R (new file, mode 100644)
# Read in the data and convert the dates
data = read.csv(snakemake@input[[1]])
names(data) <- c("date", "incidence")
data$date <- as.Date(data$date)

# A function that extracts the peak for year N
yearly_peak = function(year) {
    start = paste0(year - 1, "-08-01")
    end = paste0(year, "-08-01")
    records = data$date > start & data$date <= end
    sum(data$incidence[records])
}

# The years for which we have the full peak
years <- 1986:2018

# Make a new data frame for the annual incidences
annual_data = data.frame(year = years,
                         incidence = sapply(years, yearly_peak))

# Write output file
write.csv(annual_data,
          file = snakemake@output[[1]],
          row.names = FALSE)
module6/ressources/incidence_syndrome_grippal_par_region_snakemake/scripts/incidence-plots.R (new file, mode 100644)
# Read in the data and convert the dates
data = read.csv(snakemake@input[[1]])
data$week_starting <- as.Date(data$week_starting)

# Plot the complete incidence dataset
png(filename = snakemake@output[[1]])
plot(data, type = "l", xlab = "Date", ylab = "Weekly incidence")
dev.off()

# Zoom on the last four years
png(filename = snakemake@output[[2]])
plot(tail(data, 4 * 52), type = "l", xlab = "Date", ylab = "Weekly incidence")
dev.off()
module6/ressources/incidence_syndrome_grippal_par_region_snakemake/scripts/peak-years.py (new file, mode 100644)
# Libraries used by this script:
import csv  # for reading CSV files
import os   # for path manipulations

with open(snakemake.output[0], 'w') as result_file:
    for filename in snakemake.input:
        region = '-'.join(os.path.splitext(filename)[0].split('-')[2:])
        with open(filename, 'r') as csv_file:
            csv_reader = csv.reader(csv_file)
            csv_reader.__next__()
            peak_year = None
            peak_incidence = 0
            for year, incidence in csv_reader:
                incidence = int(incidence)
                if incidence > peak_incidence:
                    peak_incidence = incidence
                    peak_year = year
        result_file.write(region)
        result_file.write(', ')
        result_file.write(peak_year)
        result_file.write('\n')
module6/ressources/incidence_syndrome_grippal_par_region_snakemake/scripts/preprocess.py (new file, mode 100644)
# Libraries used by this script:
import datetime  # for date conversion
import csv       # for writing output to a CSV file

# Read the CSV file into memory
data = open(snakemake.input[0], 'r').read()

# Remove white space at both ends,
# and split into lines.
lines = data.strip() \
            .split('\n')

# Split each line into columns
table = [line.split(',') for line in lines]

# Remove records with missing data and write
# the removed records to a separate file for inspection.
with open(snakemake.output.errorlog, "w") as errorlog:
    valid_table = []
    for row in table:
        missing = any([column == '' for column in row])
        if missing:
            errorlog.write("Missing data in record\n")
            errorlog.write(str(row))
            errorlog.write("\n")
        else:
            valid_table.append(row)

# Extract the two relevant columns, "week" and "inc"
week = [row[0] for row in valid_table]
assert week[0] == 'week'
del week[0]
inc = [row[2] for row in valid_table]
assert inc[0] == 'inc'
del inc[0]
data = list(zip(week, inc))

# Check for obviously out-of-range values
with open(snakemake.output.errorlog, "a") as errorlog:
    for week, inc in data:
        if len(week) != 6 or not week.isdigit():
            errorlog.write(f"Suspect value in column 'week': {week}\n")
        if not inc.isdigit():
            errorlog.write(f"Suspect value in column 'inc': {inc}\n")

# Convert year/week to the date of the corresponding Monday,
# then sort by increasing date
converted_data = \
    [(datetime.datetime.strptime(year_and_week + ":1", '%G%V:%u').date(), inc)
     for year_and_week, inc in data]
converted_data.sort(key=lambda record: record[0])

# Check that consecutive dates are seven days apart
with open(snakemake.output.errorlog, "a") as errorlog:
    dates = [date for date, _ in converted_data]
    for date1, date2 in zip(dates[:-1], dates[1:]):
        if date2 - date1 != datetime.timedelta(weeks=1):
            errorlog.write(f"{date2-date1} between {date1} and {date2}\n")

# Write data to a CSV file with two columns:
# 1. the date of the Monday of each week, in ISO format
# 2. the incidence estimate for that week
with open(snakemake.output.data, "w") as csvfile:
    csv_writer = csv.writer(csvfile)
    csv_writer.writerow(["week_starting", "incidence"])
    for row in converted_data:
        csv_writer.writerow(row)
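The 'week' column is expected to be a six-digit ISO year-plus-week number (hence the len(week) != 6 check). Appending ':1' and parsing with '%G%V:%u' (ISO year, ISO week number, ISO weekday, supported by strptime since Python 3.6) yields the date of that week's Monday. A small check with a hypothetical value:

import datetime

year_and_week = "201740"  # hypothetical six-digit value: ISO year 2017, week 40
monday = datetime.datetime.strptime(year_and_week + ":1", '%G%V:%u').date()
print(monday)             # 2017-10-02, the Monday of ISO week 40 of 2017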
module6/ressources/incidence_syndrome_grippal_par_region_snakemake/scripts/split-by-region.py (new file, mode 100644)
import os

# Read the CSV file into memory
data = open(snakemake.input[0], 'rb').read()

# Decode the Latin-1 character set,
# remove white space at both ends,
# and split into lines.
lines = data.decode('latin-1') \
            .strip() \
            .split('\n')

# Separate header from data table
comment = lines[0]
header = lines[1]
table = [line.split(',') for line in lines[2:]]

# Find all the regions mentioned in the table
regions = set(record[-1] for record in table)

# Create the output directory
directory = snakemake.output[0]
if not os.path.exists(directory):
    os.makedirs(directory)

# Write CSV files for each region
for region in regions:
    # Some region names contain spaces, which are awkward in filenames
    region_name = '-'.join(region.split(' '))
    filename = os.path.join(directory, 'weekly-incidence-' + region_name + '.csv')
    with open(filename, 'w') as output_file:
        output_file.write(header)
        output_file.write('\n')
        for record in table:
            # Write only the records for the right region
            if record[-1] == region:
                output_file.write(','.join(record))
                output_file.write('\n')
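Since some region names contain spaces, the script replaces them with dashes before building the per-region filenames; downstream scripts such as peak-years.py therefore see the dashed form rather than the original name. A brief illustration with a hypothetical region name, not taken from the dataset:

region = "HAUTS DE FRANCE"             # hypothetical region name containing spaces
region_name = '-'.join(region.split(' '))
filename = 'weekly-incidence-' + region_name + '.csv'
print(filename)                        # weekly-incidence-HAUTS-DE-FRANCE.csv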