From 3e881100e16ee0ed63bd31cf2d4bb84fee5c9ef9 Mon Sep 17 00:00:00 2001 From: 8ae4836869d9dfa2662a12d59ff25279 <8ae4836869d9dfa2662a12d59ff25279@app-learninglab.inria.fr> Date: Thu, 4 Nov 2021 16:45:49 +0000 Subject: [PATCH] =?UTF-8?q?TP=20:=20Extraction=20des=20donn=C3=A9es?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- module3/exo3/exercice.ipynb | 684 +++++++++++++++++++++++++++++------- 1 file changed, 548 insertions(+), 136 deletions(-) diff --git a/module3/exo3/exercice.ipynb b/module3/exo3/exercice.ipynb index eac369b..4813cd0 100644 --- a/module3/exo3/exercice.ipynb +++ b/module3/exo3/exercice.ipynb @@ -46,16 +46,25 @@ "## Téléchargement des données" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "On s'intéresse à deux jeux de données hébergés en ligne. La première étape est de les télécharger en local." + ] + }, { "cell_type": "code", - "execution_count": 38, + "execution_count": 42, "metadata": {}, "outputs": [], "source": [ + "# Noms des fichiers en local\n", "filenames = [\n", " \"liglab2.log\", \n", " \"stackoverflow.log\",\n", "]\n", + "# Adresse où les fichiers sont hébergés\n", "urls = [\n", " \"http://mescal.imag.fr/membres/arnaud.legrand/teaching/2014/RICM4_EP_ping/liglab2.log.gz\",\n", " \"http://mescal.imag.fr/membres/arnaud.legrand/teaching/2014/RICM4_EP_ping/stackoverflow.log.gz\",\n", @@ -78,7 +87,7 @@ }, { "cell_type": "code", - "execution_count": 41, + "execution_count": 44, "metadata": {}, "outputs": [ { @@ -95,9 +104,11 @@ "\n", "def download_archive(filename, url):\n", " if not exists(filename):\n", - " # Le fichier est une archive .gz\n", + " # On utilise le module requests pour récupérer les données en ligne\n", " archive = requests.get(url)\n", + " # Le fichier est une archive .gz, on l'extrait avec le module gzip\n", " content = gzip.decompress(archive.content)\n", + " \n", " open(filename,'wb').write(content)\n", " print(f\"Téléchargement de {url} et extraction 
vers {filename}.\")\n", " else:\n", @@ -108,150 +119,551 @@ " download_archive(filename, url)\n" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Lecture des données\n", + "On extrait maintenant les données de l'outil `ping` sous forme d'un tableau `pandas`.\n", + "\n", + "Le format étant relativement simple, il est possible de le faire en utilisant uniquement les fonctions de base des chaînes de caractères de Python.\n", + "\n", + "Chaque ligne a la forme suivante :\n", + "```\n", + "[1421761682.052172] 665 bytes from lig-publig.imag.fr (129.88.11.7): icmp_seq=1 ttl=60 time=22.5 ms\n", + "```\n", + "On extrait uniquement les données qui nous intéressent :\n", + "\n", + " * la date de mesure (en secondes depuis le 1er janvier 1970) du 2e au 18e caractère\n", + " * la taille du message (en octets), qui est suivie de la sous-chaîne `\" bytes\"`\n", + " * la durée de réponse (en millisecondes), qui est précédée de `\"time=\"` et suivie de `\" ms\"`" + ] + }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 59, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Help on Response in module requests.models object:\n", - "\n", - "class Response(builtins.object)\n", - " | The :class:`Response ` object, which contains a\n", - " | server's response to an HTTP request.\n", - " | \n", - " | Methods defined here:\n", - " | \n", - " | __bool__(self)\n", - " | Returns True if :attr:`status_code` is less than 400.\n", - " | \n", - " | This attribute checks if the status code of the response is between\n", - " | 400 and 600 to see if there was a client error or a server error. If\n", - " | the status code, is between 200 and 400, this will return True. This\n", - " | is **not** a check to see if the response code is ``200 OK``.\n", - " | \n", - " | __enter__(self)\n", - " | \n", - " | __exit__(self, *args)\n", - " | \n", - " | __getstate__(self)\n", - " | \n", - " | __init__(self)\n", - " | Initialize self. 
See help(type(self)) for accurate signature.\n", - " | \n", - " | __iter__(self)\n", - " | Allows you to use a response as an iterator.\n", - " | \n", - " | __nonzero__(self)\n", - " | Returns True if :attr:`status_code` is less than 400.\n", - " | \n", - " | This attribute checks if the status code of the response is between\n", - " | 400 and 600 to see if there was a client error or a server error. If\n", - " | the status code, is between 200 and 400, this will return True. This\n", - " | is **not** a check to see if the response code is ``200 OK``.\n", - " | \n", - " | __repr__(self)\n", - " | Return repr(self).\n", - " | \n", - " | __setstate__(self, state)\n", - " | \n", - " | close(self)\n", - " | Releases the connection back to the pool. Once this method has been\n", - " | called the underlying ``raw`` object must not be accessed again.\n", - " | \n", - " | *Note: Should not normally need to be called explicitly.*\n", - " | \n", - " | iter_content(self, chunk_size=1, decode_unicode=False)\n", - " | Iterates over the response data. When stream=True is set on the\n", - " | request, this avoids reading the content at once into memory for\n", - " | large responses. The chunk size is the number of bytes it should\n", - " | read into memory. This is not necessarily the length of each item\n", - " | returned as decoding can take place.\n", - " | \n", - " | chunk_size must be of type int or None. A value of None will\n", - " | function differently depending on the value of `stream`.\n", - " | stream=True will read data as it arrives in whatever size the\n", - " | chunks are received. If stream=False, data is returned as\n", - " | a single chunk.\n", - " | \n", - " | If decode_unicode is True, content will be decoded using the best\n", - " | available encoding based on the response.\n", - " | \n", - " | iter_lines(self, chunk_size=512, decode_unicode=False, delimiter=None)\n", - " | Iterates over the response data, one line at a time. 
When\n", - " | stream=True is set on the request, this avoids reading the\n", - " | content at once into memory for large responses.\n", - " | \n", - " | .. note:: This method is not reentrant safe.\n", - " | \n", - " | json(self, **kwargs)\n", - " | Returns the json-encoded content of a response, if any.\n", - " | \n", - " | :param \\*\\*kwargs: Optional arguments that ``json.loads`` takes.\n", - " | :raises ValueError: If the response body does not contain valid json.\n", - " | \n", - " | raise_for_status(self)\n", - " | Raises stored :class:`HTTPError`, if one occurred.\n", - " | \n", - " | ----------------------------------------------------------------------\n", - " | Data descriptors defined here:\n", - " | \n", - " | __dict__\n", - " | dictionary for instance variables (if defined)\n", - " | \n", - " | __weakref__\n", - " | list of weak references to the object (if defined)\n", - " | \n", - " | apparent_encoding\n", - " | The apparent encoding, provided by the chardet library.\n", - " | \n", - " | content\n", - " | Content of the response, in bytes.\n", - " | \n", - " | is_permanent_redirect\n", - " | True if this Response one of the permanent versions of redirect.\n", - " | \n", - " | is_redirect\n", - " | True if this Response is a well-formed HTTP redirect that could have\n", - " | been processed automatically (by :meth:`Session.resolve_redirects`).\n", - " | \n", - " | links\n", - " | Returns the parsed header links of the response, if any.\n", - " | \n", - " | next\n", - " | Returns a PreparedRequest for the next request in a redirect chain, if there is one.\n", - " | \n", - " | ok\n", - " | Returns True if :attr:`status_code` is less than 400, False if not.\n", - " | \n", - " | This attribute checks if the status code of the response is between\n", - " | 400 and 600 to see if there was a client error or a server error. If\n", - " | the status code is between 200 and 400, this will return True. 
This\n", - " | is **not** a check to see if the response code is ``200 OK``.\n", - " | \n", - " | text\n", - " | Content of the response, in unicode.\n", - " | \n", - " | If Response.encoding is None, encoding will be guessed using\n", - " | ``chardet``.\n", - " | \n", - " | The encoding of the response content is determined based solely on HTTP\n", - " | headers, following RFC 2616 to the letter. If you can take advantage of\n", - " | non-HTTP knowledge to make a better guess at the encoding, you should\n", - " | set ``r.encoding`` appropriately before accessing this property.\n", - " | \n", - " | ----------------------------------------------------------------------\n", - " | Data and other attributes defined here:\n", - " | \n", - " | __attrs__ = ['_content', 'status_code', 'headers', 'url', 'history', '...\n", - "\n" + "44036 lignes lues avec succès, (0.85% d'échecs)\n" ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
msgsizetimetimestamp
066522.502015-01-20 13:48:02.052172
1137321.202015-01-20 13:48:02.277315
226221.202015-01-20 13:48:02.502054
3110723.302015-01-20 13:48:02.729257
411281.412015-01-20 13:48:02.934648
548921.902015-01-20 13:48:03.160397
6175978.702015-01-20 13:48:03.443055
7114625.102015-01-20 13:48:03.672157
888424.002015-01-20 13:48:03.899933
9142219.502015-01-20 13:48:04.122687
10118018.002015-01-20 13:48:04.344135
1199918.802015-01-20 13:48:04.566271
12102024.302015-01-20 13:48:04.998504
13713.452015-01-20 13:48:05.205172
14345.852015-01-20 13:48:05.414106
1518432.312015-01-20 13:48:05.620117
164071.142015-01-20 13:48:05.824949
173561.102015-01-20 13:48:06.029177
1815112.182015-01-20 13:48:06.234464
195871.272015-01-20 13:48:06.438772
208091.332015-01-20 13:48:06.643208
2113641.512015-01-20 13:48:06.848323
2211531.442015-01-20 13:48:07.053400
238531.302015-01-20 13:48:07.257704
2415102.172015-01-20 13:48:07.463275
251231.212015-01-20 13:48:07.668423
2619662.202015-01-20 13:48:07.874230
279331.342015-01-20 13:48:08.078667
289221.422015-01-20 13:48:08.283655
29241.122015-01-20 13:48:08.488688
............
44006177228.802015-01-20 16:26:20.743715
44007411.142015-01-20 16:26:20.949053
4400819442.322015-01-20 16:26:21.155685
440094001.982015-01-20 16:26:21.362095
440102263.012015-01-20 16:26:21.569409
440114667.452015-01-20 16:26:21.780805
4401235013.502015-01-20 16:26:21.998869
44013182945.902015-01-20 16:26:22.248969
44014195458.502015-01-20 16:26:22.512386
4401510741.452015-01-20 16:26:22.717961
44016461.112015-01-20 16:26:22.923292
4401718442.262015-01-20 16:26:23.129965
440186451.242015-01-20 16:26:23.335449
440194441.252015-01-20 16:26:23.540901
4402019402.462015-01-20 16:26:23.747983
4402114111.472015-01-20 16:26:23.954099
44022491.212015-01-20 16:26:24.159879
440234201.552015-01-20 16:26:24.365815
440242271.222015-01-20 16:26:24.571516
440259471.342015-01-20 16:26:24.777325
4402619602.432015-01-20 16:26:24.983905
440275311.192015-01-20 16:26:25.188976
440283741.142015-01-20 16:26:25.394275
4402915032.192015-01-20 16:26:25.600745
440305721.292015-01-20 16:26:25.805877
4403113381.472015-01-20 16:26:26.011910
4403215157.022015-01-20 16:26:26.222729
4403318752.332015-01-20 16:26:26.429007
4403410061.612015-01-20 16:26:26.634747
4403512731.352015-01-20 16:26:26.840222
\n", + "

44036 rows × 3 columns

\n", + "
" + ], + "text/plain": [ + " msgsize time timestamp\n", + "0 665 22.50 2015-01-20 13:48:02.052172\n", + "1 1373 21.20 2015-01-20 13:48:02.277315\n", + "2 262 21.20 2015-01-20 13:48:02.502054\n", + "3 1107 23.30 2015-01-20 13:48:02.729257\n", + "4 1128 1.41 2015-01-20 13:48:02.934648\n", + "5 489 21.90 2015-01-20 13:48:03.160397\n", + "6 1759 78.70 2015-01-20 13:48:03.443055\n", + "7 1146 25.10 2015-01-20 13:48:03.672157\n", + "8 884 24.00 2015-01-20 13:48:03.899933\n", + "9 1422 19.50 2015-01-20 13:48:04.122687\n", + "10 1180 18.00 2015-01-20 13:48:04.344135\n", + "11 999 18.80 2015-01-20 13:48:04.566271\n", + "12 1020 24.30 2015-01-20 13:48:04.998504\n", + "13 71 3.45 2015-01-20 13:48:05.205172\n", + "14 34 5.85 2015-01-20 13:48:05.414106\n", + "15 1843 2.31 2015-01-20 13:48:05.620117\n", + "16 407 1.14 2015-01-20 13:48:05.824949\n", + "17 356 1.10 2015-01-20 13:48:06.029177\n", + "18 1511 2.18 2015-01-20 13:48:06.234464\n", + "19 587 1.27 2015-01-20 13:48:06.438772\n", + "20 809 1.33 2015-01-20 13:48:06.643208\n", + "21 1364 1.51 2015-01-20 13:48:06.848323\n", + "22 1153 1.44 2015-01-20 13:48:07.053400\n", + "23 853 1.30 2015-01-20 13:48:07.257704\n", + "24 1510 2.17 2015-01-20 13:48:07.463275\n", + "25 123 1.21 2015-01-20 13:48:07.668423\n", + "26 1966 2.20 2015-01-20 13:48:07.874230\n", + "27 933 1.34 2015-01-20 13:48:08.078667\n", + "28 922 1.42 2015-01-20 13:48:08.283655\n", + "29 24 1.12 2015-01-20 13:48:08.488688\n", + "... ... ... 
def _parse_ping_line(line):
    """Extract (timestamp, message size, response time) from one `ping` log line.

    Expected shape of a line::

        [1421761682.052172] 665 bytes from host (ip): icmp_seq=1 ttl=60 time=22.5 ms

    Returns:
        A tuple ``(pd.Timestamp, int, float)``: measurement date, message size
        in bytes, response time in milliseconds.

    Raises:
        ValueError: when one of the three fields is missing or malformed
            (raised by ``str.index``/``str.rindex`` or by ``int``/``float``).
    """
    # Timestamp: the text between the leading "[" and the first "]".
    # Locating the bracket instead of slicing a fixed 17-character window
    # keeps the parser working when the timestamp width differs.
    closing = line.index("]")
    # NOTE(review): the timestamp goes through a float of seconds; whether the
    # microseconds survive exactly depends on pandas' conversion -- confirm if
    # sub-microsecond accuracy matters.
    timestamp = pd.Timestamp(float(line[1:closing]), unit='s')

    # Message size: the integer located right before " bytes".
    # int() ignores the blank that separates it from the closing bracket.
    msgsize = int(line[closing + 1:line.index(" bytes", closing)])

    # Response time: the number between "time=" and the trailing " ms".
    duration = float(line[line.index('time=') + 5:line.rindex(" ms")])

    return timestamp, msgsize, duration


def extract_data(filename):
    """Read a `ping` log file and return its measurements as a DataFrame.

    Lines from which the three values cannot all be extracted (for instance
    when ``time=`` is missing) are skipped and counted as failures. A one-line
    summary with the number of parsed lines and the failure rate is printed.

    Args:
        filename: path of the (already decompressed) text log file.

    Returns:
        A ``pandas.DataFrame`` with the columns ``timestamp`` (measurement
        date), ``msgsize`` (bytes) and ``time`` (milliseconds), one row per
        successfully parsed line. The frame is empty when nothing could be
        parsed.

    Example:
        ``extract_data(filenames[0])``
    """
    timestamps = []
    msgsizes = []
    times = []
    err_count = 0

    with open(filename, 'r') as file:
        # Iterate lazily over the file: no need to hold every line in memory.
        for line in file:
            try:
                ts, ms_int, time_float = _parse_ping_line(line)
            except ValueError:
                # One of the values is missing: forget this line.
                err_count += 1
                continue
            # Only append once all three values were found, so the three
            # lists always keep the same length.
            timestamps.append(ts)
            msgsizes.append(ms_int)
            times.append(time_float)

    success_count = len(timestamps)
    total_count = success_count + err_count
    # An empty file has no failures; avoid dividing by zero.
    failure_pct = 100 * err_count / total_count if total_count else 0.0
    print(f"{success_count} lignes lues avec succès ({failure_pct:.2f}% d'échecs)")
    return pd.DataFrame({"timestamp": timestamps, "msgsize": msgsizes, "time": times})