{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": { "hideCode": false, "hidePrompt": false }, "outputs": [], "source": [ "%matplotlib inline\n", "import numpy as np\n", "import pandas as pd\n", "import matplotlib.pyplot as plt\n", "import os\n", "import urllib.request" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "hideCode": false, "hidePrompt": false }, "outputs": [], "source": [ "data_url = \"https://www.sentiweb.fr/datasets/incidence-PAY-3.csv\"\n", "data_file = \"syndrome_grippal.csv\"\n", "if os.path.exists(data_file):\n", " pass\n", "else:\n", " urllib.request.urlretrieve(data_url, data_file)" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "hideCode": false, "hidePrompt": false }, "outputs": [], "source": [ "data = pd.read_csv(data_file, skiprows=1)" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "hideCode": false, "hidePrompt": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
weekindicatorincinc_lowinc_upinc100inc100_lowinc100_upgeo_inseegeo_name
020220437258565106.080064.010998.0120.0FRFrance
120220337461367144.082082.0113102.0124.0FRFrance
220220235592049511.062329.08474.094.0FRFrance
320220135762950699.064559.08777.097.0FRFrance
420215235434947029.061669.08271.093.0FRFrance
\n", "
" ], "text/plain": [ " week indicator inc inc_low inc_up inc100 inc100_low inc100_up \\\n", "0 202204 3 72585 65106.0 80064.0 109 98.0 120.0 \n", "1 202203 3 74613 67144.0 82082.0 113 102.0 124.0 \n", "2 202202 3 55920 49511.0 62329.0 84 74.0 94.0 \n", "3 202201 3 57629 50699.0 64559.0 87 77.0 97.0 \n", "4 202152 3 54349 47029.0 61669.0 82 71.0 93.0 \n", "\n", " geo_insee geo_name \n", "0 FR France \n", "1 FR France \n", "2 FR France \n", "3 FR France \n", "4 FR France " ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data.head()" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "hideCode": false, "hidePrompt": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
weekindicatorincinc_lowinc_upinc100inc100_lowinc100_upgeo_inseegeo_name
193919844837862060634.096606.0143110.0176.0FRFrance
194019844737202954274.089784.013199.0163.0FRFrance
194119844638733067686.0106974.0159123.0195.0FRFrance
19421984453135223101414.0169032.0246184.0308.0FRFrance
194319844436842220056.0116788.012537.0213.0FRFrance
\n", "
" ], "text/plain": [ " week indicator inc inc_low inc_up inc100 inc100_low \\\n", "1939 198448 3 78620 60634.0 96606.0 143 110.0 \n", "1940 198447 3 72029 54274.0 89784.0 131 99.0 \n", "1941 198446 3 87330 67686.0 106974.0 159 123.0 \n", "1942 198445 3 135223 101414.0 169032.0 246 184.0 \n", "1943 198444 3 68422 20056.0 116788.0 125 37.0 \n", "\n", " inc100_up geo_insee geo_name \n", "1939 176.0 FR France \n", "1940 163.0 FR France \n", "1941 195.0 FR France \n", "1942 308.0 FR France \n", "1943 213.0 FR France " ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data.tail()" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "hideCode": false, "hidePrompt": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "week int64\n", "indicator int64\n", "inc int64\n", "inc_low float64\n", "inc_up float64\n", "inc100 int64\n", "inc100_low float64\n", "inc100_up float64\n", "geo_insee object\n", "geo_name object\n", "dtype: object\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
weekindicatorincinc_lowinc_upinc100inc100_lowinc100_up
count1944.0000001944.01.944000e+031943.0000001.943000e+031944.0000001943.0000001943.000000
mean200321.9645063.06.006592e+0452573.9372106.767021e+04100.75051488.210499113.476068
std1075.7075100.01.135715e+05107879.9679121.194434e+05193.441495183.805111203.392822
min198444.0000003.00.000000e+000.0000000.000000e+000.0000000.0000000.000000
25%199407.7500003.05.170750e+032048.5000008.432500e+039.0000003.00000014.000000
50%200324.5000003.01.618500e+0410915.0000002.161300e+0427.00000018.00000036.000000
75%201240.2500003.04.819375e+0438644.5000005.730750e+0482.00000066.00000097.500000
max202204.0000003.01.001824e+06974799.0000001.028849e+061793.0000001745.0000001841.000000
\n", "
" ], "text/plain": [ " week indicator inc inc_low inc_up \\\n", "count 1944.000000 1944.0 1.944000e+03 1943.000000 1.943000e+03 \n", "mean 200321.964506 3.0 6.006592e+04 52573.937210 6.767021e+04 \n", "std 1075.707510 0.0 1.135715e+05 107879.967912 1.194434e+05 \n", "min 198444.000000 3.0 0.000000e+00 0.000000 0.000000e+00 \n", "25% 199407.750000 3.0 5.170750e+03 2048.500000 8.432500e+03 \n", "50% 200324.500000 3.0 1.618500e+04 10915.000000 2.161300e+04 \n", "75% 201240.250000 3.0 4.819375e+04 38644.500000 5.730750e+04 \n", "max 202204.000000 3.0 1.001824e+06 974799.000000 1.028849e+06 \n", "\n", " inc100 inc100_low inc100_up \n", "count 1944.000000 1943.000000 1943.000000 \n", "mean 100.750514 88.210499 113.476068 \n", "std 193.441495 183.805111 203.392822 \n", "min 0.000000 0.000000 0.000000 \n", "25% 9.000000 3.000000 14.000000 \n", "50% 27.000000 18.000000 36.000000 \n", "75% 82.000000 66.000000 97.500000 \n", "max 1793.000000 1745.000000 1841.000000 " ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "print(data.dtypes)\n", "data.describe()" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "hideCode": false, "hidePrompt": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
weekindicatorincinc_lowinc_upinc100inc100_lowinc100_upgeo_inseegeo_name
170719891930NaNNaN0NaNNaNFRFrance
\n", "
" ], "text/plain": [ " week indicator inc inc_low inc_up inc100 inc100_low inc100_up \\\n", "1707 198919 3 0 NaN NaN 0 NaN NaN \n", "\n", " geo_insee geo_name \n", "1707 FR France " ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "missing_data = data.loc[data.isna().any(axis=1), :]\n", "missing_data" ] }, { "cell_type": "markdown", "metadata": { "hideCode": false, "hidePrompt": false }, "source": [ "In the original datafile: data is simply missing for week \"198919\", hence the NaN obtained.\n", "We decide to continue the analysis without this line which, hopefully, should have little impact on the analysis." ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "hideCode": false, "hidePrompt": false }, "outputs": [], "source": [ "data = data.drop(index=missing_data.index)" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "hideCode": false, "hidePrompt": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
weekindicatorincinc_lowinc_upinc100inc100_lowinc100_up
count1943.0000001943.01.943000e+031943.0000001.943000e+031943.0000001943.0000001943.000000
mean200322.6865673.06.009684e+0452573.9372106.767021e+04100.80236788.210499113.476068
std1075.5131020.01.135925e+05107879.9679121.194434e+05193.477779183.805111203.392822
min198444.0000003.00.000000e+000.0000000.000000e+000.0000000.0000000.000000
25%199408.5000003.05.177000e+032048.5000008.432500e+039.0000003.00000014.000000
50%200325.0000003.01.618800e+0410915.0000002.161300e+0427.00000018.00000036.000000
75%201240.5000003.04.821350e+0438644.5000005.730750e+0482.00000066.00000097.500000
max202204.0000003.01.001824e+06974799.0000001.028849e+061793.0000001745.0000001841.000000
\n", "
" ], "text/plain": [ " week indicator inc inc_low inc_up \\\n", "count 1943.000000 1943.0 1.943000e+03 1943.000000 1.943000e+03 \n", "mean 200322.686567 3.0 6.009684e+04 52573.937210 6.767021e+04 \n", "std 1075.513102 0.0 1.135925e+05 107879.967912 1.194434e+05 \n", "min 198444.000000 3.0 0.000000e+00 0.000000 0.000000e+00 \n", "25% 199408.500000 3.0 5.177000e+03 2048.500000 8.432500e+03 \n", "50% 200325.000000 3.0 1.618800e+04 10915.000000 2.161300e+04 \n", "75% 201240.500000 3.0 4.821350e+04 38644.500000 5.730750e+04 \n", "max 202204.000000 3.0 1.001824e+06 974799.000000 1.028849e+06 \n", "\n", " inc100 inc100_low inc100_up \n", "count 1943.000000 1943.000000 1943.000000 \n", "mean 100.802367 88.210499 113.476068 \n", "std 193.477779 183.805111 203.392822 \n", "min 0.000000 0.000000 0.000000 \n", "25% 9.000000 3.000000 14.000000 \n", "50% 27.000000 18.000000 36.000000 \n", "75% 82.000000 66.000000 97.500000 \n", "max 1793.000000 1745.000000 1841.000000 " ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data.describe()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "**TODO:** Change week format" ] } ], "metadata": { "celltoolbar": "Hide code", "hide_code_all_hidden": false, "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.4" } }, "nbformat": 4, "nbformat_minor": 2 }