#+TITLE: Analysis of the incidence of chickenpox #+AUTHOR: Jamal KHAN #+DATE: 2020-09-16 #+LANGUAGE: en # #+PROPERTY: header-args :session *python* :exports both #+HTML_HEAD: #+HTML_HEAD: #+HTML_HEAD: #+HTML_HEAD: #+HTML_HEAD: #+HTML_HEAD: * Data download #+NAME: data-url https://www.sentiweb.fr/datasets/incidence-PAY-7.csv #+BEGIN_SRC python :session *python* :results output :var data_url=data-url data_file = 'chickenpox_incidence.csv' import datetime from urllib.request import urlretrieve import os if not os.path.exists(data_file): urlretrieve(data_url, data_file) print(f'Data is retrieved at {datetime.datetime.utcnow()} UTC') #+END_SRC #+RESULTS: : : Data is retrieved at 2020-09-16 22:07:31.650075 UTC Now we extract the interesting part of the data. from the format of the file The week is column 0, incidence is column 4. We took Monday as the first day of the week so '%W' code in python/pandas. #+BEGIN_SRC python :session *python* :results outputs :export both import pandas as pd data = pd.read_csv(data_file, skiprows=2, header=None) data = data.loc[:, [0, 2]].rename(columns={0:'Datetime', 2:'Incidence'}) data.Datetime = pd.to_datetime(data.Datetime*10+1, format='%Y%W%w') data = data.sort_values(by='Datetime') data = data.set_index('Datetime') data.describe() #+END_SRC #+RESULTS: : Incidence : count 1554.000000 : mean 12647.119691 : std 6657.542827 : min 161.000000 : 25% 7326.750000 : 50% 12627.000000 : 75% 17155.000000 : max 36298.000000 Now check for missing data. Pandas automatically handles the missing data, so we will check for na value in the dataframe. #+BEGIN_SRC python :session *python* :results outputs :export both data.is_na() #+END_SRC #+RESULTS: Looks ok. Now time to plot a timeseries of the chicken pox incidence. #+BEGIN_SRC python :session *python* :results output file :var ts_plot="chickepox_timeseries.png" :export file import matplotlib.pyplot as plt fig, ax = plt.subplots(figsize=(6, 3)) data.plot(ax=ax) plt.savefig(ts_plot) print(ts_plot) #+END_SRC #+RESULTS: [[file:chickepox_timeseries.png]] Additionally, the data starts at the beginning of 1991 and ends at the beginning of the 2020. The monthly evolution is not clear from the long timeseries. Plotting a shorter version. #+BEGIN_SRC python :session *python* :results output file :var ts_plot="chickepox_timeseries_short.png" :export file import matplotlib.pyplot as plt fig, ax = plt.subplots(figsize=(6, 3)) data['1991-01-01':'1994-01-01'].plot(ax=ax) plt.savefig(ts_plot) print(ts_plot) #+END_SRC #+RESULTS: [[file:chickepox_timeseries_short.png]] It appears that the dip is in november. So I need to group the yearly data starting from November. #+BEGIN_SRC python :session *python* :results output file :var ts_plot="chickepox_timeseries_yearly.png" :export both data_yearly = data.groupby(data.index.shift(8, freq='m').year).sum() data_yearly = data_yearly.iloc[1:-2] import matplotlib.pyplot as plt fig, ax = plt.subplots(figsize=(6, 3)) data_yearly.plot(ax=ax) plt.savefig(ts_plot) print(data_yearly.sort_values(by='Incidence')) print(ts_plot) #+END_SRC #+RESULTS: Plot of the aggregated values. #+BEGIN_SRC python :session *python* :results output file :var ts_plot="chickepox_timeseries_yearly.png" :export both import matplotlib.pyplot as plt fig, ax = plt.subplots(figsize=(6, 3)) data_yearly.plot(ax=ax) plt.savefig(ts_plot) print(ts_plot) #+END_SRC #+RESULTS: [[file:chickepox_timeseries_yearly.png]]