{ "cells": [ { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [], "source": [ "%matplotlib inline\n", "import matplotlib.pyplot as plt\n", "import pandas as pd\n", "import isoweek\n", "import os\n", "import urllib.request" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "To ensure that we always have an available copy of the data, we will download it and keep a local version. If we already have a local version, we won't download the data again." ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "data_url = \"http://www.sentiweb.fr/datasets/incidence-PAY-7.csv\"\n", "data_file = \"chickenpox.csv\"\n", "if not os.path.exists(data_file):\n", " urllib.request.urlretrieve(data_url, data_file)" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", 
" \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
weekindicatorincinc_lowinc_upinc100inc100_lowinc100_upgeo_inseegeo_name
02023197937760901266414919FRFrance
1202318710671729114051161121FRFrance
22023177918461621220614919FRFrance
3202316711387801414760171222FRFrance
4202315714040761320467211131FRFrance
52023147152471103219462231729FRFrance
6202313713322970016944201525FRFrance
7202312710374721813530161121FRFrance
820231174919288069587410FRFrance
920231074854273169777410FRFrance
10202309770044548946011715FRFrance
112023087817553161103412816FRFrance
12202307765953782940810614FRFrance
132023067959560171317314919FRFrance
1420230576237390785679513FRFrance
1520230476299397386259612FRFrance
1620230376063379883289612FRFrance
172023027657630601009210515FRFrance
182023017815354701083612816FRFrance
1920225275171271776258412FRFrance
2020225176226382286309513FRFrance
212022507659031001008010515FRFrance
2220224975095321269788511FRFrance
2320224874985304369278511FRFrance
2420224776087373384419513FRFrance
252022467303313924674537FRFrance
262022457382717205934639FRFrance
272022447427122316311639FRFrance
2820224375863330284249513FRFrance
292022427377019505590639FRFrance
.................................
16631991267176081130423912312042FRFrance
16641991257161691070021638281838FRFrance
16651991247161711007122271281739FRFrance
1666199123711947767116223211329FRFrance
1667199122715452995320951271737FRFrance
1668199121714903897520831261636FRFrance
16691991207190531274225364342345FRFrance
16701991197167391124622232291939FRFrance
16711991187213851388228888382551FRFrance
1672199117713462887718047241632FRFrance
16731991167148571006819646261834FRFrance
1674199115713975978118169251832FRFrance
1675199114712265768416846221430FRFrance
167619911379567604113093171123FRFrance
1677199112710864733114397191325FRFrance
16781991117155741118419964271935FRFrance
16791991107166431137221914292038FRFrance
1680199109713741878018702241533FRFrance
1681199108713289881317765231531FRFrance
1682199107712337807716597221529FRFrance
1683199106710877701314741191226FRFrance
1684199105710442654414340181125FRFrance
16851991047791345631126314820FRFrance
16861991037153871048420290271836FRFrance
16871991027162771104621508292038FRFrance
16881991017155651027120859271836FRFrance
16891990527193751329525455342345FRFrance
16901990517190801380724353342543FRFrance
1691199050711079666015498201228FRFrance
16921990497114302610205FRFrance
\n", "

1693 rows × 10 columns

\n", "
" ], "text/plain": [ " week indicator inc inc_low inc_up inc100 inc100_low \\\n", "0 202319 7 9377 6090 12664 14 9 \n", "1 202318 7 10671 7291 14051 16 11 \n", "2 202317 7 9184 6162 12206 14 9 \n", "3 202316 7 11387 8014 14760 17 12 \n", "4 202315 7 14040 7613 20467 21 11 \n", "5 202314 7 15247 11032 19462 23 17 \n", "6 202313 7 13322 9700 16944 20 15 \n", "7 202312 7 10374 7218 13530 16 11 \n", "8 202311 7 4919 2880 6958 7 4 \n", "9 202310 7 4854 2731 6977 7 4 \n", "10 202309 7 7004 4548 9460 11 7 \n", "11 202308 7 8175 5316 11034 12 8 \n", "12 202307 7 6595 3782 9408 10 6 \n", "13 202306 7 9595 6017 13173 14 9 \n", "14 202305 7 6237 3907 8567 9 5 \n", "15 202304 7 6299 3973 8625 9 6 \n", "16 202303 7 6063 3798 8328 9 6 \n", "17 202302 7 6576 3060 10092 10 5 \n", "18 202301 7 8153 5470 10836 12 8 \n", "19 202252 7 5171 2717 7625 8 4 \n", "20 202251 7 6226 3822 8630 9 5 \n", "21 202250 7 6590 3100 10080 10 5 \n", "22 202249 7 5095 3212 6978 8 5 \n", "23 202248 7 4985 3043 6927 8 5 \n", "24 202247 7 6087 3733 8441 9 5 \n", "25 202246 7 3033 1392 4674 5 3 \n", "26 202245 7 3827 1720 5934 6 3 \n", "27 202244 7 4271 2231 6311 6 3 \n", "28 202243 7 5863 3302 8424 9 5 \n", "29 202242 7 3770 1950 5590 6 3 \n", "... ... ... ... ... ... ... ... 
\n", "1663 199126 7 17608 11304 23912 31 20 \n", "1664 199125 7 16169 10700 21638 28 18 \n", "1665 199124 7 16171 10071 22271 28 17 \n", "1666 199123 7 11947 7671 16223 21 13 \n", "1667 199122 7 15452 9953 20951 27 17 \n", "1668 199121 7 14903 8975 20831 26 16 \n", "1669 199120 7 19053 12742 25364 34 23 \n", "1670 199119 7 16739 11246 22232 29 19 \n", "1671 199118 7 21385 13882 28888 38 25 \n", "1672 199117 7 13462 8877 18047 24 16 \n", "1673 199116 7 14857 10068 19646 26 18 \n", "1674 199115 7 13975 9781 18169 25 18 \n", "1675 199114 7 12265 7684 16846 22 14 \n", "1676 199113 7 9567 6041 13093 17 11 \n", "1677 199112 7 10864 7331 14397 19 13 \n", "1678 199111 7 15574 11184 19964 27 19 \n", "1679 199110 7 16643 11372 21914 29 20 \n", "1680 199109 7 13741 8780 18702 24 15 \n", "1681 199108 7 13289 8813 17765 23 15 \n", "1682 199107 7 12337 8077 16597 22 15 \n", "1683 199106 7 10877 7013 14741 19 12 \n", "1684 199105 7 10442 6544 14340 18 11 \n", "1685 199104 7 7913 4563 11263 14 8 \n", "1686 199103 7 15387 10484 20290 27 18 \n", "1687 199102 7 16277 11046 21508 29 20 \n", "1688 199101 7 15565 10271 20859 27 18 \n", "1689 199052 7 19375 13295 25455 34 23 \n", "1690 199051 7 19080 13807 24353 34 25 \n", "1691 199050 7 11079 6660 15498 20 12 \n", "1692 199049 7 1143 0 2610 2 0 \n", "\n", " inc100_up geo_insee geo_name \n", "0 19 FR France \n", "1 21 FR France \n", "2 19 FR France \n", "3 22 FR France \n", "4 31 FR France \n", "5 29 FR France \n", "6 25 FR France \n", "7 21 FR France \n", "8 10 FR France \n", "9 10 FR France \n", "10 15 FR France \n", "11 16 FR France \n", "12 14 FR France \n", "13 19 FR France \n", "14 13 FR France \n", "15 12 FR France \n", "16 12 FR France \n", "17 15 FR France \n", "18 16 FR France \n", "19 12 FR France \n", "20 13 FR France \n", "21 15 FR France \n", "22 11 FR France \n", "23 11 FR France \n", "24 13 FR France \n", "25 7 FR France \n", "26 9 FR France \n", "27 9 FR France \n", "28 13 FR France \n", "29 9 FR France \n", "... ... ... 
... \n", "1663 42 FR France \n", "1664 38 FR France \n", "1665 39 FR France \n", "1666 29 FR France \n", "1667 37 FR France \n", "1668 36 FR France \n", "1669 45 FR France \n", "1670 39 FR France \n", "1671 51 FR France \n", "1672 32 FR France \n", "1673 34 FR France \n", "1674 32 FR France \n", "1675 30 FR France \n", "1676 23 FR France \n", "1677 25 FR France \n", "1678 35 FR France \n", "1679 38 FR France \n", "1680 33 FR France \n", "1681 31 FR France \n", "1682 29 FR France \n", "1683 26 FR France \n", "1684 25 FR France \n", "1685 20 FR France \n", "1686 36 FR France \n", "1687 38 FR France \n", "1688 36 FR France \n", "1689 45 FR France \n", "1690 43 FR France \n", "1691 28 FR France \n", "1692 5 FR France \n", "\n", "[1693 rows x 10 columns]" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "raw_data = pd.read_csv(data_file, skiprows=1)\n", "raw_data" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Are there missing data points? No" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
weekindicatorincinc_lowinc_upinc100inc100_lowinc100_upgeo_inseegeo_name
\n", "
" ], "text/plain": [ "Empty DataFrame\n", "Columns: [week, indicator, inc, inc_low, inc_up, inc100, inc100_low, inc100_up, geo_insee, geo_name]\n", "Index: []" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "raw_data[raw_data.isnull().any(axis=1)]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Our dataset uses an uncommon encoding; the week number is attached\n", "to the year number, leaving the impression of a six-digit integer.\n", "That is how Pandas interprets it.\n", "\n", "A second problem is that Pandas does not know about week numbers.\n", "It needs to be given the dates of the beginning and end of the week.\n", "We use the library `isoweek` for that.\n", "\n", "Since the conversion is a bit lengthy, we write a small Python \n", "function for doing it. Then we apply it to all points in our dataset. \n", "The results go into a new column 'period'." ] }, { "cell_type": "code", "execution_count": 40, "metadata": {}, "outputs": [], "source": [ "def convert_week(year_and_week_int):\n", " year_and_week_str = str(year_and_week_int)\n", " year = int(year_and_week_str[:4])\n", " week = int(year_and_week_str[4:])\n", " w = isoweek.Week(year, week)\n", " return pd.Period(w.day(0), 'W')\n", "\n", "data['period'] = [convert_week(yw) for yw in data['week']]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "There are two more small changes to make.\n", "\n", "First, we define the observation periods as the new index of\n", "our dataset. That turns it into a time series, which will be\n", "convenient later on.\n", "\n", "Second, we sort the points chronologically." ] }, { "cell_type": "code", "execution_count": 57, "metadata": {}, "outputs": [], "source": [ "sorted_data = data.set_index('period').sort_index()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We check the consistency of the data. 
Between the end of a period and\n", "the beginning of the next one, the difference should be zero, or very small.\n", "We tolerate an error of one second.\n", "\n", "This is OK." ] }, { "cell_type": "code", "execution_count": 60, "metadata": {}, "outputs": [], "source": [ "periods = sorted_data.index\n", "for p1, p2 in zip(periods[:-1], periods[1:]):\n", " delta = p2.to_timestamp() - p1.end_time\n", " if delta > pd.Timedelta('1s'):\n", " print(p1, p2)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "A first look at the data!" ] }, { "cell_type": "code", "execution_count": 61, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 61, "metadata": {}, "output_type": "execute_result" }, { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "sorted_data['inc'].plot()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "A zoom on the last few years shows more clearly that the peaks are situated in winter." ] }, { "cell_type": "code", "execution_count": 62, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 62, "metadata": {}, "output_type": "execute_result" }, { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "sorted_data['inc'][-200:].plot()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Study of the annual incidence" ] }, { "cell_type": "markdown", "metadata": {}, "source": [] }, { "cell_type": "code", "execution_count": 88, "metadata": {}, "outputs": [], "source": [ "first_december_week = [pd.Period(pd.Timestamp(y, 9, 1), 'W')\n", " for y in range(1991,\n", " sorted_data.index[-1].year)]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We check that our periods contain between 51 and 52 weeks, as a safeguard against potential mistakes in our code." ] }, { "cell_type": "code", "execution_count": 90, "metadata": {}, "outputs": [], "source": [ "year = []\n", "yearly_incidence = []\n", "for week1, week2 in zip(first_december_week[:-1],\n", " first_december_week[1:]):\n", " one_year = sorted_data['inc'][week1:week2-1]\n", " assert abs(len(one_year)-52) < 2\n", " yearly_incidence.append(one_year.sum())\n", " year.append(week2.year)\n", "yearly_incidence = pd.Series(data=yearly_incidence, index=year)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Annual incidences" ] }, { "cell_type": "code", "execution_count": 91, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 91, "metadata": {}, "output_type": "execute_result" }, { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "yearly_incidence.plot(style='*')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Sorted by value" ] }, { "cell_type": "code", "execution_count": 92, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "2020 221186\n", "2021 376290\n", "2002 516689\n", "2018 542312\n", "2017 551041\n", "1996 564901\n", "2019 584066\n", "2015 604382\n", "2000 617597\n", "2001 619041\n", "2012 624573\n", "2005 628464\n", "2006 632833\n", "2022 641397\n", "2011 642368\n", "1993 643387\n", "1995 652478\n", "1994 661409\n", "1998 677775\n", "1997 683434\n", "2014 685769\n", "2013 698332\n", "2007 717352\n", "2008 749478\n", "1999 756456\n", "2003 758363\n", "2004 777388\n", "2016 782114\n", "2010 829911\n", "1992 832939\n", "2009 842373\n", "dtype: int64" ] }, "execution_count": 92, "metadata": {}, "output_type": "execute_result" } ], "source": [ "yearly_incidence.sort_values()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.4" } }, "nbformat": 4, "nbformat_minor": 2 }