module3/exo1/analyse-syndrome-grippal.ipynb

parent c77fe086
...@@ -13,18 +13,22 @@ ...@@ -13,18 +13,22 @@
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"# Activer l'affichage des graphiques dans le notebook (jupyter) en ligne.\n",
"%matplotlib inline\n", "%matplotlib inline\n",
"import matplotlib.pyplot as plt\n", "\n",
"import pandas as pd\n", "# Importer les bibliothèques nécessaires\n",
"import isoweek" "import matplotlib.pyplot as plt # Pour la création de graphiques\n",
"import pandas as pd # Pour la manipulation des données\n",
"import isoweek # Pour gérer les semaines ISO"
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 24, "execution_count": 41,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"# URL où les données d'incidence du syndrome grippal sont téléchargées\n",
"data_url = \"http://www.sentiweb.fr/datasets/incidence-PAY-3.csv\"" "data_url = \"http://www.sentiweb.fr/datasets/incidence-PAY-3.csv\""
] ]
}, },
...@@ -37,15 +41,20 @@ ...@@ -37,15 +41,20 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 25, "execution_count": 42,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"# Nom du fichier local où les données seront stockées\n",
"data_file = \"syndrome-grippal.csv\"\n", "data_file = \"syndrome-grippal.csv\"\n",
"\n", "\n",
"# Vérifier si le fichier local existe, et s'il n'existe pas, le télécharger depuis l'URL\n",
"import os\n", "import os\n",
"import urllib.request\n", "import urllib.request\n",
"\n",
"# Vérifier si le fichier local n'existe pas\n",
"if not os.path.exists(data_file):\n", "if not os.path.exists(data_file):\n",
" # Télécharger les données depuis l'URL et les enregistrer dans le fichier local\n",
" urllib.request.urlretrieve(data_url, data_file)" " urllib.request.urlretrieve(data_url, data_file)"
] ]
}, },
...@@ -73,7 +82,7 @@ ...@@ -73,7 +82,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 26, "execution_count": 43,
"metadata": { "metadata": {
"scrolled": true "scrolled": true
}, },
...@@ -1040,13 +1049,16 @@ ...@@ -1040,13 +1049,16 @@
"[2031 rows x 10 columns]" "[2031 rows x 10 columns]"
] ]
}, },
"execution_count": 26, "execution_count": 43,
"metadata": {}, "metadata": {},
"output_type": "execute_result" "output_type": "execute_result"
} }
], ],
"source": [ "source": [
"# Lire les données depuis le fichier local CSV en sautant la première ligne (commentaire)\n",
"raw_data = pd.read_csv(data_file, skiprows=1)\n", "raw_data = pd.read_csv(data_file, skiprows=1)\n",
"\n",
"# Afficher les données brutes\n",
"raw_data" "raw_data"
] ]
}, },
...@@ -1059,7 +1071,7 @@ ...@@ -1059,7 +1071,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 27, "execution_count": 44,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
...@@ -1121,12 +1133,13 @@ ...@@ -1121,12 +1133,13 @@
"1794 FR France " "1794 FR France "
] ]
}, },
"execution_count": 27, "execution_count": 44,
"metadata": {}, "metadata": {},
"output_type": "execute_result" "output_type": "execute_result"
} }
], ],
"source": [ "source": [
"# Sélectionner les lignes contenant au moins une valeur manquante (NaN)\n",
"raw_data[raw_data.isnull().any(axis=1)]" "raw_data[raw_data.isnull().any(axis=1)]"
] ]
}, },
...@@ -1139,7 +1152,7 @@ ...@@ -1139,7 +1152,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 28, "execution_count": 45,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
...@@ -2104,13 +2117,16 @@ ...@@ -2104,13 +2117,16 @@
"[2030 rows x 10 columns]" "[2030 rows x 10 columns]"
] ]
}, },
"execution_count": 28, "execution_count": 45,
"metadata": {}, "metadata": {},
"output_type": "execute_result" "output_type": "execute_result"
} }
], ],
"source": [ "source": [
"# Supprimer les lignes contenant des valeurs manquantes (NaN) à partir des données brutes\n",
"data = raw_data.dropna().copy()\n", "data = raw_data.dropna().copy()\n",
"\n",
"# Afficher les données nettoyées (sans valeurs manquantes) et en créer une copie\n",
"data" "data"
] ]
}, },
...@@ -2134,17 +2150,28 @@ ...@@ -2134,17 +2150,28 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 29, "execution_count": 46,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"# Définition d'une fonction pour convertir l'année et la semaine en période\n",
"def convert_week(year_and_week_int):\n", "def convert_week(year_and_week_int):\n",
" # Convertir l'entier en une chaîne de caractères\n",
" year_and_week_str = str(year_and_week_int)\n", " year_and_week_str = str(year_and_week_int)\n",
" \n",
" # Extraire l'année (les 4 premiers caractères de la chaîne)\n",
" year = int(year_and_week_str[:4])\n", " year = int(year_and_week_str[:4])\n",
" \n",
" # Extraire le numéro de semaine (le reste de la chaîne)\n",
" week = int(year_and_week_str[4:])\n", " week = int(year_and_week_str[4:])\n",
" \n",
" # Créer un objet isoweek.Week avec l'année et la semaine\n",
" w = isoweek.Week(year, week)\n", " w = isoweek.Week(year, week)\n",
" \n",
" # Convertir l'objet isoweek.Week en une période pandas\n",
" return pd.Period(w.day(0), 'W')\n", " return pd.Period(w.day(0), 'W')\n",
"\n", "\n",
"# Appliquer la fonction convert_week à la colonne 'week' et créer une nouvelle colonne 'period'\n",
"data['period'] = [convert_week(yw) for yw in data['week']]" "data['period'] = [convert_week(yw) for yw in data['week']]"
] ]
}, },
...@@ -2164,10 +2191,11 @@ ...@@ -2164,10 +2191,11 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 30, "execution_count": 47,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"# Définir la colonne 'period' comme index du DataFrame et trier le DataFrame par cet index\n",
"sorted_data = data.set_index('period').sort_index()" "sorted_data = data.set_index('period').sort_index()"
] ]
}, },
...@@ -2189,7 +2217,7 @@ ...@@ -2189,7 +2217,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 32, "execution_count": 48,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
...@@ -2201,10 +2229,17 @@ ...@@ -2201,10 +2229,17 @@
} }
], ],
"source": [ "source": [
"# Obtenir l'index (colonne 'period') du DataFrame trié\n",
"periods = sorted_data.index\n", "periods = sorted_data.index\n",
"\n",
"# Parcourir les périodes consécutives et vérifier la différence temporelle entre elles\n",
"for p1, p2 in zip(periods[:-1], periods[1:]):\n", "for p1, p2 in zip(periods[:-1], periods[1:]):\n",
" # Calculer la différence temporelle entre la fin de la période p1 et le début de la période p2\n",
" delta = p2.to_timestamp() - p1.end_time\n", " delta = p2.to_timestamp() - p1.end_time\n",
" \n",
" # Vérifier si la différence temporelle est supérieure à 1 seconde\n",
" if delta > pd.Timedelta('1s'):\n", " if delta > pd.Timedelta('1s'):\n",
" # Afficher les paires de périodes consécutives qui ont une différence temporelle inattendue\n",
" print(p1, p2)" " print(p1, p2)"
] ]
}, },
...@@ -2217,7 +2252,7 @@ ...@@ -2217,7 +2252,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 33, "execution_count": 49,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
...@@ -2231,21 +2266,21 @@ ...@@ -2231,21 +2266,21 @@
"source": [ "source": [
"# Convertir la colonne 'inc' en type numérique (float)\n", "# Convertir la colonne 'inc' en type numérique (float)\n",
"sorted_data['inc'] = pd.to_numeric(sorted_data['inc'], errors='coerce')\n", "sorted_data['inc'] = pd.to_numeric(sorted_data['inc'], errors='coerce')\n",
"print(sorted_data['inc'].dtypes)\n" "print(sorted_data['inc'].dtypes)"
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 34, "execution_count": 50,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
"data": { "data": {
"text/plain": [ "text/plain": [
"<matplotlib.axes._subplots.AxesSubplot at 0x7f2fc7e208d0>" "<matplotlib.axes._subplots.AxesSubplot at 0x7f2fc7b8bf28>"
] ]
}, },
"execution_count": 34, "execution_count": 50,
"metadata": {}, "metadata": {},
"output_type": "execute_result" "output_type": "execute_result"
}, },
...@@ -2263,6 +2298,7 @@ ...@@ -2263,6 +2298,7 @@
} }
], ],
"source": [ "source": [
"# Tracer un graphique de la colonne 'inc' du DataFrame trié\n",
"sorted_data['inc'].plot()" "sorted_data['inc'].plot()"
] ]
}, },
...@@ -2275,16 +2311,16 @@ ...@@ -2275,16 +2311,16 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 35, "execution_count": 51,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
"data": { "data": {
"text/plain": [ "text/plain": [
"<matplotlib.axes._subplots.AxesSubplot at 0x7f2fc7d585c0>" "<matplotlib.axes._subplots.AxesSubplot at 0x7f2fc79da780>"
] ]
}, },
"execution_count": 35, "execution_count": 51,
"metadata": {}, "metadata": {},
"output_type": "execute_result" "output_type": "execute_result"
}, },
...@@ -2302,6 +2338,7 @@ ...@@ -2302,6 +2338,7 @@
} }
], ],
"source": [ "source": [
"# Tracer un graphique des 200 dernières entrées de la colonne 'inc' du DataFrame trié\n",
"sorted_data['inc'][-200:].plot()" "sorted_data['inc'][-200:].plot()"
] ]
}, },
...@@ -2335,10 +2372,11 @@ ...@@ -2335,10 +2372,11 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 36, "execution_count": 52,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"# Créer une liste des premières semaines d'août pour chaque année entre 1985 et la dernière année de l'index de sorted_data\n",
"first_august_week = [pd.Period(pd.Timestamp(y, 8, 1), 'W')\n", "first_august_week = [pd.Period(pd.Timestamp(y, 8, 1), 'W')\n",
" for y in range(1985,\n", " for y in range(1985,\n",
" sorted_data.index[-1].year)]" " sorted_data.index[-1].year)]"
...@@ -2355,19 +2393,31 @@ ...@@ -2355,19 +2393,31 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 37, "execution_count": 53,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"year = []\n", "# Initialisation des listes pour stocker les données annuelles\n",
"yearly_incidence = []\n", "year = [] # Liste des années\n",
"yearly_incidence = [] # Liste des incidences annuelles\n",
"\n",
"# Parcourir les paires d'intervalles annuels définies par first_august_week\n",
"for week1, week2 in zip(first_august_week[:-1],\n", "for week1, week2 in zip(first_august_week[:-1],\n",
" first_august_week[1:]):\n", " first_august_week[1:]):\n",
" # Extraire les données d'incidence pour une année donnée\n",
" one_year = sorted_data['inc'][week1:week2-1]\n", " one_year = sorted_data['inc'][week1:week2-1]\n",
" assert abs(len(one_year)-52) < 2\n", " \n",
" # Vérifier que chaque année a environ 52 semaines d'incidence\n",
" assert abs(len(one_year) - 52) < 2\n",
" \n",
" # Ajouter la somme des incidences de l'année à la liste yearly_incidence\n",
" yearly_incidence.append(one_year.sum())\n", " yearly_incidence.append(one_year.sum())\n",
" \n",
" # Ajouter l'année correspondante à la liste year\n",
" year.append(week2.year)\n", " year.append(week2.year)\n",
"yearly_incidence = pd.Series(data=yearly_incidence, index=year)" "\n",
"# Créer une série pandas avec les données annuelles et les années comme index\n",
"yearly_incidence = pd.Series(data=yearly_incidence, index=year)\n"
] ]
}, },
{ {
...@@ -2406,6 +2456,7 @@ ...@@ -2406,6 +2456,7 @@
} }
], ],
"source": [ "source": [
"# Tracer un graphique de dispersion des données d'incidence annuelle avec un style en étoile\n",
"yearly_incidence.plot(style='*')" "yearly_incidence.plot(style='*')"
] ]
}, },
...@@ -2418,7 +2469,7 @@ ...@@ -2418,7 +2469,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 39, "execution_count": 54,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
...@@ -2464,12 +2515,13 @@ ...@@ -2464,12 +2515,13 @@
"dtype: int64" "dtype: int64"
] ]
}, },
"execution_count": 39, "execution_count": 54,
"metadata": {}, "metadata": {},
"output_type": "execute_result" "output_type": "execute_result"
} }
], ],
"source": [ "source": [
"# Trier les données d'incidence annuelle par ordre croissant\n",
"yearly_incidence.sort_values()" "yearly_incidence.sort_values()"
] ]
}, },
...@@ -2483,16 +2535,16 @@ ...@@ -2483,16 +2535,16 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 40, "execution_count": 55,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
"data": { "data": {
"text/plain": [ "text/plain": [
"<matplotlib.axes._subplots.AxesSubplot at 0x7f2fc7c42588>" "<matplotlib.axes._subplots.AxesSubplot at 0x7f2fc7b7b5f8>"
] ]
}, },
"execution_count": 40, "execution_count": 55,
"metadata": {}, "metadata": {},
"output_type": "execute_result" "output_type": "execute_result"
}, },
...@@ -2510,8 +2562,16 @@ ...@@ -2510,8 +2562,16 @@
} }
], ],
"source": [ "source": [
"# Tracer un histogramme des données d'incidence annuelle avec une rotation de l'axe des x de 20 degrés\n",
"yearly_incidence.hist(xrot=20)" "yearly_incidence.hist(xrot=20)"
] ]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
} }
], ],
"metadata": { "metadata": {
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment