#+TITLE: Analysis of the incidence of chickenpox
#+AUTHOR: Jamal KHAN
#+DATE: 2020-09-16
#+LANGUAGE: en
# #+PROPERTY: header-args :session *python* :exports both
#+HTML_HEAD:
#+HTML_HEAD:
#+HTML_HEAD:
#+HTML_HEAD:
#+HTML_HEAD:
#+HTML_HEAD:
* Data download
#+NAME: data-url
https://www.sentiweb.fr/datasets/incidence-PAY-7.csv
#+BEGIN_SRC python :session *python* :results output :var data_url=data-url
data_file = 'chickenpox_incidence.csv'
import datetime
from urllib.request import urlretrieve
import os
if not os.path.exists(data_file):
urlretrieve(data_url, data_file)
print(f'Data is retrieved at {datetime.datetime.utcnow()} UTC')
#+END_SRC
#+RESULTS:
:
: Data is retrieved at 2020-09-16 22:07:31.650075 UTC
Now we extract the interesting part of the data. from the format of the file The week is column 0, incidence is column 4. We took Monday as the first day of the week so '%W' code in python/pandas.
#+BEGIN_SRC python :session *python* :results outputs :export both
import pandas as pd
data = pd.read_csv(data_file, skiprows=2, header=None)
data = data.loc[:, [0, 2]].rename(columns={0:'Datetime', 2:'Incidence'})
data.Datetime = pd.to_datetime(data.Datetime*10+1, format='%Y%W%w')
data = data.sort_values(by='Datetime')
data = data.set_index('Datetime')
data.describe()
#+END_SRC
#+RESULTS:
: Incidence
: count 1554.000000
: mean 12647.119691
: std 6657.542827
: min 161.000000
: 25% 7326.750000
: 50% 12627.000000
: 75% 17155.000000
: max 36298.000000
Now check for missing data. Pandas automatically handles the missing data, so we will check for na value in the dataframe.
#+BEGIN_SRC python :session *python* :results outputs :export both
data.is_na()
#+END_SRC
#+RESULTS:
Looks ok. Now time to plot a timeseries of the chicken pox incidence.
#+BEGIN_SRC python :session *python* :results output file :var ts_plot="chickepox_timeseries.png" :export file
import matplotlib.pyplot as plt
fig, ax = plt.subplots(figsize=(6, 3))
data.plot(ax=ax)
plt.savefig(ts_plot)
print(ts_plot)
#+END_SRC
#+RESULTS:
[[file:chickepox_timeseries.png]]
Additionally, the data starts at the beginning of 1991 and ends at the beginning of the 2020. The monthly evolution is not clear from the long timeseries. Plotting a shorter version.
#+BEGIN_SRC python :session *python* :results output file :var ts_plot="chickepox_timeseries_short.png" :export file
import matplotlib.pyplot as plt
fig, ax = plt.subplots(figsize=(6, 3))
data['1991-01-01':'1994-01-01'].plot(ax=ax)
plt.savefig(ts_plot)
print(ts_plot)
#+END_SRC
#+RESULTS:
[[file:chickepox_timeseries_short.png]]
It appears that the dip is in november. So I need to group the yearly data starting from November.
#+BEGIN_SRC python :session *python* :results output file :var ts_plot="chickepox_timeseries_yearly.png" :export both
data_yearly = data.groupby(data.index.shift(8, freq='m').year).sum()
data_yearly = data_yearly.iloc[1:-2]
import matplotlib.pyplot as plt
fig, ax = plt.subplots(figsize=(6, 3))
data_yearly.plot(ax=ax)
plt.savefig(ts_plot)
print(data_yearly.sort_values(by='Incidence'))
print(ts_plot)
#+END_SRC
#+RESULTS:
Plot of the aggregated values.
#+BEGIN_SRC python :session *python* :results output file :var ts_plot="chickepox_timeseries_yearly.png" :export both
import matplotlib.pyplot as plt
fig, ax = plt.subplots(figsize=(6, 3))
data_yearly.plot(ax=ax)
plt.savefig(ts_plot)
print(ts_plot)
#+END_SRC
#+RESULTS:
[[file:chickepox_timeseries_yearly.png]]