recherche de valeurs manquantes

4cd8fdef · d0e6f90fc91537e71d96befd9f7f0f02 · f8e615cc · 4cd8fdef · 4cd8fdef
Commit 4cd8fdef authored Jan 03, 2025 by d0e6f90fc91537e71d96befd9f7f0f02
Expand all Hide whitespace changes
Inline Side-by-side

Showing with 1969 additions and 6 deletions

exercice.ipynb module3/exo2/exercice.ipynb +190 -6

varicelle.csv module3/exo2/varicelle.csv +1779 -0

No files found.
--- a/module3/exo2/exercice.ipynb
+++ b/module3/exo2/exercice.ipynb
@@ -30,7 +30,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
@@ -39,18 +39,202 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 5,
   "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>week</th>\n",
+       "      <th>indicator</th>\n",
+       "      <th>inc</th>\n",
+       "      <th>inc_low</th>\n",
+       "      <th>inc_up</th>\n",
+       "      <th>inc100</th>\n",
+       "      <th>inc100_low</th>\n",
+       "      <th>inc100_up</th>\n",
+       "      <th>geo_insee</th>\n",
+       "      <th>geo_name</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>202452</td>\n",
+       "      <td>7</td>\n",
+       "      <td>4952</td>\n",
+       "      <td>1940</td>\n",
+       "      <td>7964</td>\n",
+       "      <td>7</td>\n",
+       "      <td>2</td>\n",
+       "      <td>12</td>\n",
+       "      <td>FR</td>\n",
+       "      <td>France</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>202451</td>\n",
+       "      <td>7</td>\n",
+       "      <td>4705</td>\n",
+       "      <td>2265</td>\n",
+       "      <td>7145</td>\n",
+       "      <td>7</td>\n",
+       "      <td>3</td>\n",
+       "      <td>11</td>\n",
+       "      <td>FR</td>\n",
+       "      <td>France</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>202450</td>\n",
+       "      <td>7</td>\n",
+       "      <td>7363</td>\n",
+       "      <td>4438</td>\n",
+       "      <td>10288</td>\n",
+       "      <td>11</td>\n",
+       "      <td>7</td>\n",
+       "      <td>15</td>\n",
+       "      <td>FR</td>\n",
+       "      <td>France</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>202449</td>\n",
+       "      <td>7</td>\n",
+       "      <td>6077</td>\n",
+       "      <td>3631</td>\n",
+       "      <td>8523</td>\n",
+       "      <td>9</td>\n",
+       "      <td>5</td>\n",
+       "      <td>13</td>\n",
+       "      <td>FR</td>\n",
+       "      <td>France</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>202448</td>\n",
+       "      <td>7</td>\n",
+       "      <td>4189</td>\n",
+       "      <td>1454</td>\n",
+       "      <td>6924</td>\n",
+       "      <td>6</td>\n",
+       "      <td>2</td>\n",
+       "      <td>10</td>\n",
+       "      <td>FR</td>\n",
+       "      <td>France</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "     week  indicator   inc  inc_low  inc_up  inc100  inc100_low  inc100_up  \\\n",
+       "0  202452          7  4952     1940    7964       7           2         12   \n",
+       "1  202451          7  4705     2265    7145       7           3         11   \n",
+       "2  202450          7  7363     4438   10288      11           7         15   \n",
+       "3  202449          7  6077     3631    8523       9           5         13   \n",
+       "4  202448          7  4189     1454    6924       6           2         10   \n",
+       "\n",
+       "  geo_insee geo_name  \n",
+       "0        FR   France  \n",
+       "1        FR   France  \n",
+       "2        FR   France  \n",
+       "3        FR   France  \n",
+       "4        FR   France  "
+      ]
+     },
+     "execution_count": 5,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
   "source": [
    "try:\n",
-    "    raw_data = pd.read_csv('./incidence_gripale.csv',index=True)\n",
-    "except:\n",
-    "    raw_data = pd.read_csv(data_url, skiprows=1)\n",
-    "    raw_data.to_csv('./incidence_gripale.csv')\n",
+    "    # to_csv() below stores the row index as the first column, so read it back as the index.\n",
+    "    # (read_csv has no `index` keyword: passing one raises TypeError and the cache is never used.)\n",
+    "    raw_data = pd.read_csv('./varicelle.csv', index_col=0)\n",
+    "except FileNotFoundError:\n",
+    "    # No local copy yet: download the data once and cache it next to the notebook.\n",
+    "    raw_data = pd.read_csv(url, skiprows=1)\n",
+    "    raw_data.to_csv('./varicelle.csv')\n",
    "raw_data.head()"
   ]
  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Pour la signification des colonnes, il faut vérifier le schéma csv, [ici](https://ns.sentiweb.fr/incidence/csv-schema-v1.json).\n",
+    "À retenir : les bornes `inc_low`/`inc_up` et `inc100_low`/`inc100_up` délimitent un intervalle de confiance à 95 %.\n",
+    "La colonne \"inc100\" représente les incidences pour 100 000 habitants."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "On vérifie s'il y a des données manquantes : visiblement non."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "0"
+      ]
+     },
+     "execution_count": 9,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Number of rows that have a missing value in at least one column.\n",
+    "raw_data.isna().any(axis='columns').sum()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Pré-traitement des données\n",
+    "La colonne `week` encode l'année et le numéro de semaine dans un seul entier (par exemple 202452) : il faut la convertir en période pandas pour pouvoir l'exploiter."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def convert_week(ynw_int):\n",
+    "    '''Convert a year-and-week integer into a weekly pandas Period.\n",
+    "\n",
+    "    The first four digits of `ynw_int` are read as the year and the\n",
+    "    remaining digits as the week number (e.g. 202452 -> year 2024, week 52).\n",
+    "    Returns the weekly `pd.Period` containing day 0 of that `isoweek.Week`.'''\n",
+    "    digits = str(ynw_int)\n",
+    "    year, week_number = int(digits[:4]), int(digits[4:])\n",
+    "    first_day = isoweek.Week(year, week_number).day(0)\n",
+    "    return pd.Period(first_day, 'W')"
+   ]
+  },
  {
   "cell_type": "code",
   "execution_count": null,

--- a/module3/exo2/varicelle.csv
+++ b/module3/exo2/varicelle.csv