Lab: Data Extraction

parent 4ff0b6f9
@@ -46,16 +46,25 @@
    "## Downloading the data"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
    "We look at two datasets hosted online. The first step is to download them locally."
]
},
{
"cell_type": "code",
"execution_count": 38,
"execution_count": 42,
"metadata": {},
"outputs": [],
"source": [
    "# Local file names\n",
"filenames = [\n",
" \"liglab2.log\", \n",
" \"stackoverflow.log\",\n",
"]\n",
    "# URLs where the files are hosted\n",
"urls = [\n",
" \"http://mescal.imag.fr/membres/arnaud.legrand/teaching/2014/RICM4_EP_ping/liglab2.log.gz\",\n",
" \"http://mescal.imag.fr/membres/arnaud.legrand/teaching/2014/RICM4_EP_ping/stackoverflow.log.gz\",\n",
@@ -78,7 +87,7 @@
},
{
"cell_type": "code",
"execution_count": 41,
"execution_count": 44,
"metadata": {},
"outputs": [
{
@@ -95,9 +104,11 @@
"\n",
    "def download_archive(filename, url):\n",
    "    if not exists(filename):\n",
    "        # Use the requests module to fetch the data online\n",
    "        archive = requests.get(url)\n",
    "        # The file is a .gz archive; extract it with the gzip module\n",
    "        content = gzip.decompress(archive.content)\n",
    "        \n",
    "        with open(filename, 'wb') as f:\n",
    "            f.write(content)\n",
    "        print(f\"Downloading {url} and extracting to (unknown).\")\n",
    "    else:\n",
@@ -108,150 +119,551 @@
    "    download_archive(filename, url)\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
    "## Reading the data\n",
    "We now extract the `ping` data into a `pandas` DataFrame.\n",
    "\n",
    "The format is simple enough that this can be done using only Python's basic string functions.\n",
    "\n",
    "Each line has the following form:\n",
    "```\n",
    "[1421761682.052172] 665 bytes from lig-publig.imag.fr (129.88.11.7): icmp_seq=1 ttl=60 time=22.5 ms\n",
    "```\n",
    "We extract only the fields of interest:\n",
    "\n",
    " * the measurement date (in seconds since January 1, 1970), from the 2nd to the 18th character\n",
    " * the message size (in bytes), which is followed by the substring `\" bytes\"`\n",
    " * the response time (in milliseconds), which is preceded by `\"time=\"` and followed by `\" ms\"`"
]
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": 59,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Help on Response in module requests.models object:\n",
"\n",
"class Response(builtins.object)\n",
" | The :class:`Response <Response>` object, which contains a\n",
" | server's response to an HTTP request.\n",
" | \n",
" | Methods defined here:\n",
" | \n",
" | __bool__(self)\n",
" | Returns True if :attr:`status_code` is less than 400.\n",
" | \n",
" | This attribute checks if the status code of the response is between\n",
" | 400 and 600 to see if there was a client error or a server error. If\n",
" | the status code, is between 200 and 400, this will return True. This\n",
" | is **not** a check to see if the response code is ``200 OK``.\n",
" | \n",
" | __enter__(self)\n",
" | \n",
" | __exit__(self, *args)\n",
" | \n",
" | __getstate__(self)\n",
" | \n",
" | __init__(self)\n",
" | Initialize self. See help(type(self)) for accurate signature.\n",
" | \n",
" | __iter__(self)\n",
" | Allows you to use a response as an iterator.\n",
" | \n",
" | __nonzero__(self)\n",
" | Returns True if :attr:`status_code` is less than 400.\n",
" | \n",
" | This attribute checks if the status code of the response is between\n",
" | 400 and 600 to see if there was a client error or a server error. If\n",
" | the status code, is between 200 and 400, this will return True. This\n",
" | is **not** a check to see if the response code is ``200 OK``.\n",
" | \n",
" | __repr__(self)\n",
" | Return repr(self).\n",
" | \n",
" | __setstate__(self, state)\n",
" | \n",
" | close(self)\n",
" | Releases the connection back to the pool. Once this method has been\n",
" | called the underlying ``raw`` object must not be accessed again.\n",
" | \n",
" | *Note: Should not normally need to be called explicitly.*\n",
" | \n",
" | iter_content(self, chunk_size=1, decode_unicode=False)\n",
" | Iterates over the response data. When stream=True is set on the\n",
" | request, this avoids reading the content at once into memory for\n",
" | large responses. The chunk size is the number of bytes it should\n",
" | read into memory. This is not necessarily the length of each item\n",
" | returned as decoding can take place.\n",
" | \n",
" | chunk_size must be of type int or None. A value of None will\n",
" | function differently depending on the value of `stream`.\n",
" | stream=True will read data as it arrives in whatever size the\n",
" | chunks are received. If stream=False, data is returned as\n",
" | a single chunk.\n",
" | \n",
" | If decode_unicode is True, content will be decoded using the best\n",
" | available encoding based on the response.\n",
" | \n",
" | iter_lines(self, chunk_size=512, decode_unicode=False, delimiter=None)\n",
" | Iterates over the response data, one line at a time. When\n",
" | stream=True is set on the request, this avoids reading the\n",
" | content at once into memory for large responses.\n",
" | \n",
" | .. note:: This method is not reentrant safe.\n",
" | \n",
" | json(self, **kwargs)\n",
" | Returns the json-encoded content of a response, if any.\n",
" | \n",
" | :param \\*\\*kwargs: Optional arguments that ``json.loads`` takes.\n",
" | :raises ValueError: If the response body does not contain valid json.\n",
" | \n",
" | raise_for_status(self)\n",
" | Raises stored :class:`HTTPError`, if one occurred.\n",
" | \n",
" | ----------------------------------------------------------------------\n",
" | Data descriptors defined here:\n",
" | \n",
" | __dict__\n",
" | dictionary for instance variables (if defined)\n",
" | \n",
" | __weakref__\n",
" | list of weak references to the object (if defined)\n",
" | \n",
" | apparent_encoding\n",
" | The apparent encoding, provided by the chardet library.\n",
" | \n",
" | content\n",
" | Content of the response, in bytes.\n",
" | \n",
" | is_permanent_redirect\n",
" | True if this Response one of the permanent versions of redirect.\n",
" | \n",
" | is_redirect\n",
" | True if this Response is a well-formed HTTP redirect that could have\n",
" | been processed automatically (by :meth:`Session.resolve_redirects`).\n",
" | \n",
" | links\n",
" | Returns the parsed header links of the response, if any.\n",
" | \n",
" | next\n",
" | Returns a PreparedRequest for the next request in a redirect chain, if there is one.\n",
" | \n",
" | ok\n",
" | Returns True if :attr:`status_code` is less than 400, False if not.\n",
" | \n",
" | This attribute checks if the status code of the response is between\n",
" | 400 and 600 to see if there was a client error or a server error. If\n",
" | the status code is between 200 and 400, this will return True. This\n",
" | is **not** a check to see if the response code is ``200 OK``.\n",
" | \n",
" | text\n",
" | Content of the response, in unicode.\n",
" | \n",
" | If Response.encoding is None, encoding will be guessed using\n",
" | ``chardet``.\n",
" | \n",
" | The encoding of the response content is determined based solely on HTTP\n",
" | headers, following RFC 2616 to the letter. If you can take advantage of\n",
" | non-HTTP knowledge to make a better guess at the encoding, you should\n",
" | set ``r.encoding`` appropriately before accessing this property.\n",
" | \n",
" | ----------------------------------------------------------------------\n",
" | Data and other attributes defined here:\n",
" | \n",
" | __attrs__ = ['_content', 'status_code', 'headers', 'url', 'history', '...\n",
"\n"
    "44036 lines read successfully (0.85% failures)\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>msgsize</th>\n",
" <th>time</th>\n",
" <th>timestamp</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>665</td>\n",
" <td>22.50</td>\n",
" <td>2015-01-20 13:48:02.052172</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1373</td>\n",
" <td>21.20</td>\n",
" <td>2015-01-20 13:48:02.277315</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>262</td>\n",
" <td>21.20</td>\n",
" <td>2015-01-20 13:48:02.502054</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1107</td>\n",
" <td>23.30</td>\n",
" <td>2015-01-20 13:48:02.729257</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1128</td>\n",
" <td>1.41</td>\n",
" <td>2015-01-20 13:48:02.934648</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>489</td>\n",
" <td>21.90</td>\n",
" <td>2015-01-20 13:48:03.160397</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>1759</td>\n",
" <td>78.70</td>\n",
" <td>2015-01-20 13:48:03.443055</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>1146</td>\n",
" <td>25.10</td>\n",
" <td>2015-01-20 13:48:03.672157</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>884</td>\n",
" <td>24.00</td>\n",
" <td>2015-01-20 13:48:03.899933</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>1422</td>\n",
" <td>19.50</td>\n",
" <td>2015-01-20 13:48:04.122687</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>1180</td>\n",
" <td>18.00</td>\n",
" <td>2015-01-20 13:48:04.344135</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11</th>\n",
" <td>999</td>\n",
" <td>18.80</td>\n",
" <td>2015-01-20 13:48:04.566271</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12</th>\n",
" <td>1020</td>\n",
" <td>24.30</td>\n",
" <td>2015-01-20 13:48:04.998504</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13</th>\n",
" <td>71</td>\n",
" <td>3.45</td>\n",
" <td>2015-01-20 13:48:05.205172</td>\n",
" </tr>\n",
" <tr>\n",
" <th>14</th>\n",
" <td>34</td>\n",
" <td>5.85</td>\n",
" <td>2015-01-20 13:48:05.414106</td>\n",
" </tr>\n",
" <tr>\n",
" <th>15</th>\n",
" <td>1843</td>\n",
" <td>2.31</td>\n",
" <td>2015-01-20 13:48:05.620117</td>\n",
" </tr>\n",
" <tr>\n",
" <th>16</th>\n",
" <td>407</td>\n",
" <td>1.14</td>\n",
" <td>2015-01-20 13:48:05.824949</td>\n",
" </tr>\n",
" <tr>\n",
" <th>17</th>\n",
" <td>356</td>\n",
" <td>1.10</td>\n",
" <td>2015-01-20 13:48:06.029177</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18</th>\n",
" <td>1511</td>\n",
" <td>2.18</td>\n",
" <td>2015-01-20 13:48:06.234464</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19</th>\n",
" <td>587</td>\n",
" <td>1.27</td>\n",
" <td>2015-01-20 13:48:06.438772</td>\n",
" </tr>\n",
" <tr>\n",
" <th>20</th>\n",
" <td>809</td>\n",
" <td>1.33</td>\n",
" <td>2015-01-20 13:48:06.643208</td>\n",
" </tr>\n",
" <tr>\n",
" <th>21</th>\n",
" <td>1364</td>\n",
" <td>1.51</td>\n",
" <td>2015-01-20 13:48:06.848323</td>\n",
" </tr>\n",
" <tr>\n",
" <th>22</th>\n",
" <td>1153</td>\n",
" <td>1.44</td>\n",
" <td>2015-01-20 13:48:07.053400</td>\n",
" </tr>\n",
" <tr>\n",
" <th>23</th>\n",
" <td>853</td>\n",
" <td>1.30</td>\n",
" <td>2015-01-20 13:48:07.257704</td>\n",
" </tr>\n",
" <tr>\n",
" <th>24</th>\n",
" <td>1510</td>\n",
" <td>2.17</td>\n",
" <td>2015-01-20 13:48:07.463275</td>\n",
" </tr>\n",
" <tr>\n",
" <th>25</th>\n",
" <td>123</td>\n",
" <td>1.21</td>\n",
" <td>2015-01-20 13:48:07.668423</td>\n",
" </tr>\n",
" <tr>\n",
" <th>26</th>\n",
" <td>1966</td>\n",
" <td>2.20</td>\n",
" <td>2015-01-20 13:48:07.874230</td>\n",
" </tr>\n",
" <tr>\n",
" <th>27</th>\n",
" <td>933</td>\n",
" <td>1.34</td>\n",
" <td>2015-01-20 13:48:08.078667</td>\n",
" </tr>\n",
" <tr>\n",
" <th>28</th>\n",
" <td>922</td>\n",
" <td>1.42</td>\n",
" <td>2015-01-20 13:48:08.283655</td>\n",
" </tr>\n",
" <tr>\n",
" <th>29</th>\n",
" <td>24</td>\n",
" <td>1.12</td>\n",
" <td>2015-01-20 13:48:08.488688</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>44006</th>\n",
" <td>1772</td>\n",
" <td>28.80</td>\n",
" <td>2015-01-20 16:26:20.743715</td>\n",
" </tr>\n",
" <tr>\n",
" <th>44007</th>\n",
" <td>41</td>\n",
" <td>1.14</td>\n",
" <td>2015-01-20 16:26:20.949053</td>\n",
" </tr>\n",
" <tr>\n",
" <th>44008</th>\n",
" <td>1944</td>\n",
" <td>2.32</td>\n",
" <td>2015-01-20 16:26:21.155685</td>\n",
" </tr>\n",
" <tr>\n",
" <th>44009</th>\n",
" <td>400</td>\n",
" <td>1.98</td>\n",
" <td>2015-01-20 16:26:21.362095</td>\n",
" </tr>\n",
" <tr>\n",
" <th>44010</th>\n",
" <td>226</td>\n",
" <td>3.01</td>\n",
" <td>2015-01-20 16:26:21.569409</td>\n",
" </tr>\n",
" <tr>\n",
" <th>44011</th>\n",
" <td>466</td>\n",
" <td>7.45</td>\n",
" <td>2015-01-20 16:26:21.780805</td>\n",
" </tr>\n",
" <tr>\n",
" <th>44012</th>\n",
" <td>350</td>\n",
" <td>13.50</td>\n",
" <td>2015-01-20 16:26:21.998869</td>\n",
" </tr>\n",
" <tr>\n",
" <th>44013</th>\n",
" <td>1829</td>\n",
" <td>45.90</td>\n",
" <td>2015-01-20 16:26:22.248969</td>\n",
" </tr>\n",
" <tr>\n",
" <th>44014</th>\n",
" <td>1954</td>\n",
" <td>58.50</td>\n",
" <td>2015-01-20 16:26:22.512386</td>\n",
" </tr>\n",
" <tr>\n",
" <th>44015</th>\n",
" <td>1074</td>\n",
" <td>1.45</td>\n",
" <td>2015-01-20 16:26:22.717961</td>\n",
" </tr>\n",
" <tr>\n",
" <th>44016</th>\n",
" <td>46</td>\n",
" <td>1.11</td>\n",
" <td>2015-01-20 16:26:22.923292</td>\n",
" </tr>\n",
" <tr>\n",
" <th>44017</th>\n",
" <td>1844</td>\n",
" <td>2.26</td>\n",
" <td>2015-01-20 16:26:23.129965</td>\n",
" </tr>\n",
" <tr>\n",
" <th>44018</th>\n",
" <td>645</td>\n",
" <td>1.24</td>\n",
" <td>2015-01-20 16:26:23.335449</td>\n",
" </tr>\n",
" <tr>\n",
" <th>44019</th>\n",
" <td>444</td>\n",
" <td>1.25</td>\n",
" <td>2015-01-20 16:26:23.540901</td>\n",
" </tr>\n",
" <tr>\n",
" <th>44020</th>\n",
" <td>1940</td>\n",
" <td>2.46</td>\n",
" <td>2015-01-20 16:26:23.747983</td>\n",
" </tr>\n",
" <tr>\n",
" <th>44021</th>\n",
" <td>1411</td>\n",
" <td>1.47</td>\n",
" <td>2015-01-20 16:26:23.954099</td>\n",
" </tr>\n",
" <tr>\n",
" <th>44022</th>\n",
" <td>49</td>\n",
" <td>1.21</td>\n",
" <td>2015-01-20 16:26:24.159879</td>\n",
" </tr>\n",
" <tr>\n",
" <th>44023</th>\n",
" <td>420</td>\n",
" <td>1.55</td>\n",
" <td>2015-01-20 16:26:24.365815</td>\n",
" </tr>\n",
" <tr>\n",
" <th>44024</th>\n",
" <td>227</td>\n",
" <td>1.22</td>\n",
" <td>2015-01-20 16:26:24.571516</td>\n",
" </tr>\n",
" <tr>\n",
" <th>44025</th>\n",
" <td>947</td>\n",
" <td>1.34</td>\n",
" <td>2015-01-20 16:26:24.777325</td>\n",
" </tr>\n",
" <tr>\n",
" <th>44026</th>\n",
" <td>1960</td>\n",
" <td>2.43</td>\n",
" <td>2015-01-20 16:26:24.983905</td>\n",
" </tr>\n",
" <tr>\n",
" <th>44027</th>\n",
" <td>531</td>\n",
" <td>1.19</td>\n",
" <td>2015-01-20 16:26:25.188976</td>\n",
" </tr>\n",
" <tr>\n",
" <th>44028</th>\n",
" <td>374</td>\n",
" <td>1.14</td>\n",
" <td>2015-01-20 16:26:25.394275</td>\n",
" </tr>\n",
" <tr>\n",
" <th>44029</th>\n",
" <td>1503</td>\n",
" <td>2.19</td>\n",
" <td>2015-01-20 16:26:25.600745</td>\n",
" </tr>\n",
" <tr>\n",
" <th>44030</th>\n",
" <td>572</td>\n",
" <td>1.29</td>\n",
" <td>2015-01-20 16:26:25.805877</td>\n",
" </tr>\n",
" <tr>\n",
" <th>44031</th>\n",
" <td>1338</td>\n",
" <td>1.47</td>\n",
" <td>2015-01-20 16:26:26.011910</td>\n",
" </tr>\n",
" <tr>\n",
" <th>44032</th>\n",
" <td>1515</td>\n",
" <td>7.02</td>\n",
" <td>2015-01-20 16:26:26.222729</td>\n",
" </tr>\n",
" <tr>\n",
" <th>44033</th>\n",
" <td>1875</td>\n",
" <td>2.33</td>\n",
" <td>2015-01-20 16:26:26.429007</td>\n",
" </tr>\n",
" <tr>\n",
" <th>44034</th>\n",
" <td>1006</td>\n",
" <td>1.61</td>\n",
" <td>2015-01-20 16:26:26.634747</td>\n",
" </tr>\n",
" <tr>\n",
" <th>44035</th>\n",
" <td>1273</td>\n",
" <td>1.35</td>\n",
" <td>2015-01-20 16:26:26.840222</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>44036 rows × 3 columns</p>\n",
"</div>"
],
"text/plain": [
" msgsize time timestamp\n",
"0 665 22.50 2015-01-20 13:48:02.052172\n",
"1 1373 21.20 2015-01-20 13:48:02.277315\n",
"2 262 21.20 2015-01-20 13:48:02.502054\n",
"3 1107 23.30 2015-01-20 13:48:02.729257\n",
"4 1128 1.41 2015-01-20 13:48:02.934648\n",
"5 489 21.90 2015-01-20 13:48:03.160397\n",
"6 1759 78.70 2015-01-20 13:48:03.443055\n",
"7 1146 25.10 2015-01-20 13:48:03.672157\n",
"8 884 24.00 2015-01-20 13:48:03.899933\n",
"9 1422 19.50 2015-01-20 13:48:04.122687\n",
"10 1180 18.00 2015-01-20 13:48:04.344135\n",
"11 999 18.80 2015-01-20 13:48:04.566271\n",
"12 1020 24.30 2015-01-20 13:48:04.998504\n",
"13 71 3.45 2015-01-20 13:48:05.205172\n",
"14 34 5.85 2015-01-20 13:48:05.414106\n",
"15 1843 2.31 2015-01-20 13:48:05.620117\n",
"16 407 1.14 2015-01-20 13:48:05.824949\n",
"17 356 1.10 2015-01-20 13:48:06.029177\n",
"18 1511 2.18 2015-01-20 13:48:06.234464\n",
"19 587 1.27 2015-01-20 13:48:06.438772\n",
"20 809 1.33 2015-01-20 13:48:06.643208\n",
"21 1364 1.51 2015-01-20 13:48:06.848323\n",
"22 1153 1.44 2015-01-20 13:48:07.053400\n",
"23 853 1.30 2015-01-20 13:48:07.257704\n",
"24 1510 2.17 2015-01-20 13:48:07.463275\n",
"25 123 1.21 2015-01-20 13:48:07.668423\n",
"26 1966 2.20 2015-01-20 13:48:07.874230\n",
"27 933 1.34 2015-01-20 13:48:08.078667\n",
"28 922 1.42 2015-01-20 13:48:08.283655\n",
"29 24 1.12 2015-01-20 13:48:08.488688\n",
"... ... ... ...\n",
"44006 1772 28.80 2015-01-20 16:26:20.743715\n",
"44007 41 1.14 2015-01-20 16:26:20.949053\n",
"44008 1944 2.32 2015-01-20 16:26:21.155685\n",
"44009 400 1.98 2015-01-20 16:26:21.362095\n",
"44010 226 3.01 2015-01-20 16:26:21.569409\n",
"44011 466 7.45 2015-01-20 16:26:21.780805\n",
"44012 350 13.50 2015-01-20 16:26:21.998869\n",
"44013 1829 45.90 2015-01-20 16:26:22.248969\n",
"44014 1954 58.50 2015-01-20 16:26:22.512386\n",
"44015 1074 1.45 2015-01-20 16:26:22.717961\n",
"44016 46 1.11 2015-01-20 16:26:22.923292\n",
"44017 1844 2.26 2015-01-20 16:26:23.129965\n",
"44018 645 1.24 2015-01-20 16:26:23.335449\n",
"44019 444 1.25 2015-01-20 16:26:23.540901\n",
"44020 1940 2.46 2015-01-20 16:26:23.747983\n",
"44021 1411 1.47 2015-01-20 16:26:23.954099\n",
"44022 49 1.21 2015-01-20 16:26:24.159879\n",
"44023 420 1.55 2015-01-20 16:26:24.365815\n",
"44024 227 1.22 2015-01-20 16:26:24.571516\n",
"44025 947 1.34 2015-01-20 16:26:24.777325\n",
"44026 1960 2.43 2015-01-20 16:26:24.983905\n",
"44027 531 1.19 2015-01-20 16:26:25.188976\n",
"44028 374 1.14 2015-01-20 16:26:25.394275\n",
"44029 1503 2.19 2015-01-20 16:26:25.600745\n",
"44030 572 1.29 2015-01-20 16:26:25.805877\n",
"44031 1338 1.47 2015-01-20 16:26:26.011910\n",
"44032 1515 7.02 2015-01-20 16:26:26.222729\n",
"44033 1875 2.33 2015-01-20 16:26:26.429007\n",
"44034 1006 1.61 2015-01-20 16:26:26.634747\n",
"44035 1273 1.35 2015-01-20 16:26:26.840222\n",
"\n",
"[44036 rows x 3 columns]"
]
},
"execution_count": 59,
"metadata": {},
"output_type": "execute_result"
}
],
"source": []
"source": [
    "def extract_data(filename):\n",
    "    timestamps = []\n",
    "    msgsizes = []\n",
    "    times = []\n",
    "    err_count = 0\n",
    "    success_count = 0\n",
    "    \n",
    "    with open(filename, 'r') as file:\n",
    "        for line in file:\n",
    "            try:\n",
    "                # Measurement date (timestamp):\n",
    "                # characters 1 through 17 inclusive (numbered from 0)\n",
    "                ts_str = line[1:18]\n",
    "                ts_float = float(ts_str)\n",
    "                # Convert to a pandas timestamp\n",
    "                ts = pd.Timestamp(ts_float, unit='s')\n",
    "\n",
    "                # Message size\n",
    "                ms_str = line[20:line.index(\" bytes\")]\n",
    "                ms_int = int(ms_str)\n",
    "\n",
    "                # Response time\n",
    "                time_str = line[line.index('time=')+5:line.rindex(\" ms\")]\n",
    "                time_float = float(time_str)\n",
    "                \n",
    "                # Once all values are found, append them to the lists\n",
    "                timestamps.append(ts)\n",
    "                msgsizes.append(ms_int)\n",
    "                times.append(time_float)\n",
    "                success_count += 1\n",
    "                \n",
    "            except ValueError:\n",
    "                # If any value is missing, skip the line\n",
    "                err_count += 1\n",
    "    \n",
    "    total_count = success_count + err_count\n",
    "    print(f\"{success_count} lines read successfully ({100*err_count/total_count:.2f}% failures)\")\n",
    "    return pd.DataFrame({\"timestamp\": timestamps, \"msgsize\": msgsizes, \"time\": times})\n",
    "\n",
    "\n",
    "extract_data(filenames[0])"
]
},
{
"cell_type": "code",
......