From b918ae3d357ae5f5b444baa16f56e941ffd65d96 Mon Sep 17 00:00:00 2001 From: Samuel MEYNARD Date: Sun, 7 Jun 2020 13:17:42 +0200 Subject: [PATCH] Ajout etude stackoverflow --- module3/exo3/exercice_python_fr.org | 446 ++++++++++++++++++++++++++-- 1 file changed, 429 insertions(+), 17 deletions(-) diff --git a/module3/exo3/exercice_python_fr.org b/module3/exo3/exercice_python_fr.org index 3343e8c..a4a48fb 100644 --- a/module3/exo3/exercice_python_fr.org +++ b/module3/exo3/exercice_python_fr.org @@ -36,7 +36,8 @@ _Votre mission si vous l'acceptez :_ 5. Répétez les étapes précédentes avec le second jeu de données (stackoverflow) 6. Déposer dans FUN votre résultat -* Récupération du 1^er jeu de donnée +* Liglab2 +** Récupération du 1^er jeu de donnée ** Téléchargement #+BEGIN_SRC python :session :file step1.txt :results file from urllib.request import urlretrieve @@ -75,20 +76,19 @@ cleantable[:4] | [1421761682.502054] | 262 | bytes | from | lig-publig.imag.fr | (129.88.11.7): | icmp_seq=1 | ttl=60 | time=21.2 | ms | | [1421761682.729257] | 1107 | bytes | from | lig-publig.imag.fr | (129.88.11.7): | icmp_seq=1 | ttl=60 | time=23.3 | ms | -#+BEGIN_SRC python :session :results output replace +#+BEGIN_SRC python :session :results replace from datetime import datetime date = [datetime.utcfromtimestamp(float(row[0][1:-1])) for row in cleantable] -donnee = [int(row[1]) for row in cleantable] +S = [int(row[1]) for row in cleantable] source = [str(row[4]) for row in cleantable] ip = [str(row[5][1:-1]) for row in cleantable] -ltime = [float(row[8].split('=')[1]) for row in cleantable] - +T = [float(row[8].split('=')[1]) for row in cleantable] dataset = list(zip(date,donnee, ltime)) -print(dataset[:4]) +T[:10] #+END_SRC #+RESULTS: -: [(datetime.datetime(2015, 1, 20, 13, 48, 2, 52172), 665, 22.5), (datetime.datetime(2015, 1, 20, 13, 48, 2, 277315), 1373, 21.2), (datetime.datetime(2015, 1, 20, 13, 48, 2, 502054), 262, 21.2), (datetime.datetime(2015, 1, 20, 13, 48, 2, 729257), 
1107, 23.3)] +| 22.5 | 21.2 | 21.2 | 23.3 | 1.41 | 21.9 | 78.7 | 25.1 | 24.0 | 19.5 | #+BEGIN_SRC python :session :results silent :file test.png import matplotlib @@ -98,20 +98,275 @@ fig, ax = plt.subplots(figsize=(12, 12)) # Add x-axis and y-axis ax.scatter(date, - ltime, + T, color='purple') # Set title and labels for axes ax.set(xlabel="Date", - ylabel="Latence", - title="Evolution de la latence dans le temps") + ylabel="Temps de transmission", + title="Evolution de la temps de transmission dans le temps") + +plt.savefig('evol_temps_transmission_dans_le_temps.png') +#+END_SRC +#+RESULTS: +: None + +Il ne semble pas y avoir d'impact au cours du temps +** Evolution du temps de transmission en fonction de la taille +#+BEGIN_SRC python :session :results silent :file test2.png +import matplotlib +import matplotlib.pyplot as plt +# Create figure and plot space +fig, ax = plt.subplots(figsize=(12, 12)) + +# Add x-axis and y-axis +ax.scatter(S, + T, + color='purple') + +# Set title and labels for axes +ax.set(xlabel="Taille des donnee", + ylabel="Temps de transmission", + title="Evolution du temps de transmission en fonction de la taille des données") + +plt.savefig('evol_temps_transmission_en_fonction_de_la_taille.png') + +#+END_SRC +Ici, on voit l'impact de la MTU (certainement à 1500) sur le temps de transmission + +** Différenciation par rapport à la taille +*** Inférieur à la MTU +#+BEGIN_SRC python :session +table_l1500 = [row for row in cleantable if int(row[1]) <= 1485] +date_l1500 = [datetime.utcfromtimestamp(float(row[0][1:-1])) for row in table_l1500] +S_l1500 = [int(row[1]) for row in table_l1500] +T_l1500 = [float(row[8].split('=')[1]) for row in table_l1500] +dataset_l1500 = list(zip(date_l1500,S_l1500, T_l1500)) +dataset_l1500[:10] +#+END_SRC + +#+RESULTS: +| datetime.datetime | (2015 1 20 13 48 2 52172) | 665 | 22.5 | +| datetime.datetime | (2015 1 20 13 48 2 277315) | 1373 | 21.2 | +| datetime.datetime | (2015 1 20 13 48 2 502054) | 262 | 21.2 | +| 
datetime.datetime | (2015 1 20 13 48 2 729257) | 1107 | 23.3 | +| datetime.datetime | (2015 1 20 13 48 2 934648) | 1128 | 1.41 | +| datetime.datetime | (2015 1 20 13 48 3 160397) | 489 | 21.9 | +| datetime.datetime | (2015 1 20 13 48 3 672157) | 1146 | 25.1 | +| datetime.datetime | (2015 1 20 13 48 3 899933) | 884 | 24.0 | +| datetime.datetime | (2015 1 20 13 48 4 122687) | 1422 | 19.5 | +| datetime.datetime | (2015 1 20 13 48 4 344135) | 1180 | 18.0 | + +#+BEGIN_SRC python :session :results silent :file test2.png +import matplotlib +import matplotlib.pyplot as plt +# Create figure and plot space +fig, ax = plt.subplots(figsize=(12, 12)) + +# Add x-axis and y-axis +ax.scatter(S_l1500, + T_l1500, + color='purple') + +# Set title and labels for axes +ax.set(xlabel="Taille des donnee", + ylabel="Temps de transmission", + title="Evolution du temps de transmission en fonction de la taille des données") + +plt.savefig('l1500_evol_T-f(S).png') + +#+END_SRC +*** Supérieur à la MTU +Calcul d'un tableau avec les données supérieures à la MTU +#+BEGIN_SRC python :session +table_g1500 = [row for row in cleantable if int(row[1]) > 1485]  # strict ">" so a 1485-byte packet is not also in the "<= 1485" subset +date_g1500 = [datetime.utcfromtimestamp(float(row[0][1:-1])) for row in table_g1500] +S_g1500 = [int(row[1]) for row in table_g1500] +T_g1500 = [float(row[8].split('=')[1]) for row in table_g1500] +dataset_g1500 = list(zip(date_g1500,S_g1500, T_g1500)) +dataset_g1500[:10] +#+END_SRC + +#+RESULTS: +| datetime.datetime | (2015 1 20 13 48 3 443055) | 1759 | 78.7 | +| datetime.datetime | (2015 1 20 13 48 5 620117) | 1843 | 2.31 | +| datetime.datetime | (2015 1 20 13 48 6 234464) | 1511 | 2.18 | +| datetime.datetime | (2015 1 20 13 48 7 463275) | 1510 | 2.17 | +| datetime.datetime | (2015 1 20 13 48 7 874230) | 1966 | 2.2 | +| datetime.datetime | (2015 1 20 13 48 8 694652) | 1518 | 2.19 | +| datetime.datetime | (2015 1 20 13 48 10 335289) | 1732 | 2.29 | +| datetime.datetime | (2015 1 20 13 48 10 950126) | 1500 | 2.14 | +| datetime.datetime | (2015 1 
20 13 48 11 359824) | 1520 | 2.1 | +| datetime.datetime | (2015 1 20 13 48 11 974735) | 1509 | 2.23 | + +#+BEGIN_SRC python :session :results silent :file test2.png +import matplotlib +import matplotlib.pyplot as plt +# Create figure and plot space +fig, ax = plt.subplots(figsize=(12, 12)) + +# Add x-axis and y-axis +ax.scatter(S_g1500, + T_g1500, + color='purple') + +# Set title and labels for axes +ax.set(xlabel="Taille des donnee", + ylabel="Temps de transmission", + title="Evolution du temps de transmission en fonction de la taille des données") + +plt.savefig('g1500_evol_T-f(S).png') + +#+END_SRC +** Régression linéaire +** Cas inférieur à la MTU +#+BEGIN_SRC python :session :results replace +import pandas as pd +import matplotlib.pyplot as plt +from matplotlib import dates +import numpy as np +from scipy import stats +import seaborn as sns +import statsmodels.api as sm +from sklearn import linear_model + +@plt.FuncFormatter +def fake_dates(x, pos): + """ Custom formater to turn floats into e.g., 2016-05-08""" + return dates.num2date(x).strftime('%Y-%m-%d') +sns.set(color_codes=True) +df = pd.DataFrame({ +'date': pd.to_datetime(date_l1500), +'datenum': dates.date2num(date_l1500), +'T': T_l1500, +'S': S_l1500}) +fig, ax = plt.subplots() +sns.regplot(x="datenum", y="T", color='purple', data=df, ax=ax) +# here's the magic: +ax.xaxis.set_major_formatter(fake_dates) +# legible labels +ax.tick_params(labelrotation=30) +fig.savefig('l1500_reglineaireT-f(S).png') +#+END_SRC + +#+RESULTS: +: None +#+BEGIN_SRC python :session +np.array(S_l1500).reshape(1, -1)[:9] +#+END_SRC + +#+RESULTS: +| 1759 | 1843 | 1511 | ... 
| 1503 | 1515 | 1875 | + +#+BEGIN_SRC python :session +from sklearn.linear_model import LinearRegression +from sklearn.metrics import mean_squared_error +S_tt = [[value] for value in S_l1500] +my_s = np.array(S_tt) +my_t = np.array(T_l1500) +#my_s = np.array([[1], [2], [3]]) +lmodel = LinearRegression() +lmodel.fit(my_s, my_t) +f"Les coeff sont L = {lmodel.intercept_} et C = { 1 / lmodel.coef_}" + +#+END_SRC + +#+RESULTS: +: Les coeff sont L = 3.257592785874401 et C = [2761.3155395] + +** Cas supérieur à la MTU +#+BEGIN_SRC python :session +from sklearn.linear_model import LinearRegression +from sklearn.metrics import mean_squared_error +S_tt = [[value] for value in S_g1500] +my_s = np.array(S_tt) +my_t = np.array(T_g1500) +#my_s = np.array([[1], [2], [3]]) +lmodel = LinearRegression() +lmodel.fit(my_s, my_t) +f"Les coeff sont L = {lmodel.intercept_} et C = { 1 / lmodel.coef_}" -plt.savefig('test.png') +#+END_SRC + +#+RESULTS: +: Les coeff sont L = 5.867233082184833 et C = [441.71908009] +* Stackoverflow +** Récupération du jeu de donnée +*** Téléchargement +#+BEGIN_SRC python :session :file step1.txt :results replace +from urllib.request import urlretrieve +from os import path +stacko_file = "stackoverflow.log" +stacko_filegz = stacko_file + ".gz" +url = "http://mescal.imag.fr/membres/arnaud.legrand/teaching/2014/RICM4_EP_ping/stackoverflow.log.gz" +if not path.exists(stacko_filegz):  # test the archive that is actually written, so it is downloaded only once + urlretrieve(url, stacko_filegz) #+END_SRC #+RESULTS: : None -* Evolution du temps de transmission à travers le temps +*** Lecture du fichier +#+BEGIN_SRC python :session :results output +import gzip +f = gzip.open(stacko_filegz) +data = f.read().decode('latin-1').strip().splitlines() +f.close() +#+END_SRC + +#+RESULTS: + +#+BEGIN_SRC python :session :results replace +table = [row.split(' ') for row in data] +cleantable = [] +for row in table: + if len(row) == 10: + cleantable.append(row) +cleantable[:4] +#+END_SRC + +#+RESULTS: +| [1421771203.082701] | 1257 | bytes | from | 
stackoverflow.com | (198.252.206.140): | icmp_seq=1 | ttl=50 | time=120 | ms | +| [1421771203.408254] | 454 | bytes | from | stackoverflow.com | (198.252.206.140): | icmp_seq=1 | ttl=50 | time=120 | ms | +| [1421771203.739730] | 775 | bytes | from | stackoverflow.com | (198.252.206.140): | icmp_seq=1 | ttl=50 | time=126 | ms | +| [1421771204.056630] | 1334 | bytes | from | stackoverflow.com | (198.252.206.140): | icmp_seq=1 | ttl=50 | time=112 | ms | + +#+BEGIN_SRC python :session :results replace +from datetime import datetime +date = [datetime.utcfromtimestamp(float(row[0][1:-1])) for row in cleantable] +S = [int(row[1]) for row in cleantable] +source = [str(row[4]) for row in cleantable] +ip = [str(row[5][1:-1]) for row in cleantable] +T = [float(row[8].split('=')[1]) for row in cleantable] +dataset = list(zip(date, S, T))  # zip the S/T lists built just above; donnee/ltime are not defined in this block +T[:10] +#+END_SRC + +#+RESULTS: +| 120.0 | 120.0 | 126.0 | 112.0 | 111.0 | 111.0 | 112.0 | 111.0 | 111.0 | 111.0 | + +#+BEGIN_SRC python :session :results silent :file test.png +import matplotlib +import matplotlib.pyplot as plt +# Create figure and plot space +fig, ax = plt.subplots(figsize=(12, 12)) + +# Add x-axis and y-axis +ax.scatter(date, + T, + color='purple') + +# Set title and labels for axes +ax.set(xlabel="Date", + ylabel="Temps de transmission", + title="Evolution de la temps de transmission dans le temps") + +plt.savefig('stacko_evol_temps_transmission_dans_le_temps.png') +#+END_SRC +#+RESULTS: +: None + +Il ne semble pas avoir d'impact au travers le temps +** Evolution du temps de transmission à travers le temps #+BEGIN_SRC python :session :results silent :file test2.png import matplotlib import matplotlib.pyplot as plt @@ -119,16 +374,173 @@ import matplotlib.pyplot as plt fig, ax = plt.subplots(figsize=(12, 12)) # Add x-axis and y-axis -ax.scatter(donnee, - ltime, +ax.scatter(S, + T, color='purple') # Set title and labels for axes ax.set(xlabel="Taille des donnee", - ylabel="Latence", - title="Evolution de la latence 
dans le temps") + ylabel="Temps de transmission", + title="Evolution du temps de transmission en fonction de la taille des données") -plt.savefig('test2.png') +plt.savefig('stacko_evol_temps_transmission_en_fonction_de_la_taille.png') #+END_SRC Ici, on voit l'impact de la MTU ici certainement à 1500 sur le temps de transport + +*** Differenciation par rapport à la taille +**** Inférieur à la MTU +#+BEGIN_SRC python :session +table_l1500 = [row for row in cleantable if int(row[1]) <= 1485] +date_l1500 = [datetime.utcfromtimestamp(float(row[0][1:-1])) for row in table_l1500] +S_l1500 = [int(row[1]) for row in table_l1500] +T_l1500 = [float(row[8].split('=')[1]) for row in table_l1500] +dataset_l1500 = list(zip(date_l1500,S_l1500, T_l1500)) +dataset_l1500[:10] +#+END_SRC + +#+RESULTS: +| datetime.datetime | (2015 1 20 16 26 43 82701) | 1257 | 120.0 | +| datetime.datetime | (2015 1 20 16 26 43 408254) | 454 | 120.0 | +| datetime.datetime | (2015 1 20 16 26 43 739730) | 775 | 126.0 | +| datetime.datetime | (2015 1 20 16 26 44 56630) | 1334 | 112.0 | +| datetime.datetime | (2015 1 20 16 26 44 372224) | 83 | 111.0 | +| datetime.datetime | (2015 1 20 16 26 44 688367) | 694 | 111.0 | +| datetime.datetime | (2015 1 20 16 26 45 321112) | 632 | 111.0 | +| datetime.datetime | (2015 1 20 16 26 45 637464) | 405 | 111.0 | +| datetime.datetime | (2015 1 20 16 26 45 953472) | 1419 | 111.0 | +| datetime.datetime | (2015 1 20 16 26 46 269163) | 329 | 111.0 | + +#+BEGIN_SRC python :session :results silent :file test2.png +import matplotlib +import matplotlib.pyplot as plt +# Create figure and plot space +fig, ax = plt.subplots(figsize=(12, 12)) + +# Add x-axis and y-axis +ax.scatter(S_l1500, + T_l1500, + color='purple') + +# Set title and labels for axes +ax.set(xlabel="Taille des donnee", + ylabel="Temps de transmission", + title="Evolution du temps de transmission en fonction de la taille des données") + +plt.savefig('stacko_l1500_evol_T-f(S).png') + +#+END_SRC +**** Supérieur à la 
MTU +Calcul d'un tableau avec les données supérieures à la MTU +#+BEGIN_SRC python :session +table_g1500 = [row for row in cleantable if int(row[1]) > 1485]  # strict ">" so a 1485-byte packet is not also in the "<= 1485" subset +date_g1500 = [datetime.utcfromtimestamp(float(row[0][1:-1])) for row in table_g1500] +S_g1500 = [int(row[1]) for row in table_g1500] +T_g1500 = [float(row[8].split('=')[1]) for row in table_g1500] +dataset_g1500 = list(zip(date_g1500,S_g1500, T_g1500)) +dataset_g1500[:10] +#+END_SRC + +#+RESULTS: +| datetime.datetime | (2015 1 20 16 26 45 5514) | 1577 | 112.0 | +| datetime.datetime | (2015 1 20 16 26 46 901972) | 1714 | 112.0 | +| datetime.datetime | (2015 1 20 16 26 47 851148) | 1598 | 112.0 | +| datetime.datetime | (2015 1 20 16 26 52 272504) | 1619 | 112.0 | +| datetime.datetime | (2015 1 20 16 26 55 749652) | 1655 | 112.0 | +| datetime.datetime | (2015 1 20 16 26 56 66885) | 1556 | 112.0 | +| datetime.datetime | (2015 1 20 16 26 57 648057) | 1839 | 112.0 | +| datetime.datetime | (2015 1 20 16 26 58 280820) | 1572 | 112.0 | +| datetime.datetime | (2015 1 20 16 27 1 133246) | 1491 | 120.0 | +| datetime.datetime | (2015 1 20 16 27 1 765499) | 1978 | 112.0 | + +#+BEGIN_SRC python :session :results silent :file test2.png +import matplotlib +import matplotlib.pyplot as plt +# Create figure and plot space +fig, ax = plt.subplots(figsize=(12, 12)) + +# Add x-axis and y-axis +ax.scatter(S_g1500, + T_g1500, + color='purple') + +# Set title and labels for axes +ax.set(xlabel="Taille des donnee", + ylabel="Temps de transmission", + title="Evolution du temps de transmission en fonction de la taille des données") + +plt.savefig('stacko_g1500_evol_T-f(S).png') + +#+END_SRC +** Régression linéaire +*** Cas inférieur à la MTU +#+BEGIN_SRC python :session :results replace +import pandas as pd +import matplotlib.pyplot as plt +from matplotlib import dates +import numpy as np +from scipy import stats +import seaborn as sns +import statsmodels.api as sm +from sklearn import linear_model + +@plt.FuncFormatter +def fake_dates(x, 
pos): + """ Custom formater to turn floats into e.g., 2016-05-08""" + return dates.num2date(x).strftime('%Y-%m-%d') +sns.set(color_codes=True) +df = pd.DataFrame({ +'date': pd.to_datetime(date_l1500), +'datenum': dates.date2num(date_l1500), +'T': T_l1500, +'S': S_l1500}) +fig, ax = plt.subplots() +sns.regplot(x="datenum", y="T", color='purple', data=df, ax=ax) +# here's the magic: +ax.xaxis.set_major_formatter(fake_dates) +# legible labels +ax.tick_params(labelrotation=30) +fig.savefig('stacko_l1500_reglineaireT-f(S).png') +#+END_SRC + +#+RESULTS: +: None +#+BEGIN_SRC python :session +np.array(S_l1500).reshape(1, -1)[:9] +#+END_SRC + +#+RESULTS: +| 1759 | 1843 | 1511 | ... | 1503 | 1515 | 1875 | + +#+BEGIN_SRC python :session +from sklearn.linear_model import LinearRegression +from sklearn.metrics import mean_squared_error +S_tt = [[value] for value in S_l1500] +my_s = np.array(S_tt) +my_t = np.array(T_l1500) +#my_s = np.array([[1], [2], [3]]) +lmodel = LinearRegression() +lmodel.fit(my_s, my_t) +f"Les coeff sont L = {lmodel.intercept_} et C = { 1 / lmodel.coef_}" + +#+END_SRC + +#+RESULTS: +: Les coeff sont L = 3.257592785874401 et C = [2761.3155395] + +*** Cas supérieur à la MTU +#+BEGIN_SRC python :session +from sklearn.linear_model import LinearRegression +from sklearn.metrics import mean_squared_error +S_tt = [[value] for value in S_g1500] +my_s = np.array(S_tt) +my_t = np.array(T_g1500) +#my_s = np.array([[1], [2], [3]]) +lmodel = LinearRegression() +lmodel.fit(my_s, my_t) +f"Les coeff sont L = {lmodel.intercept_} et C = { 1 / lmodel.coef_}" + +#+END_SRC + +#+RESULTS: +: Les coeff sont L = 5.867233082184833 et C = [441.71908009] -- 2.18.1