#+TITLE: Module 3 exo 3
#+AUTHOR: Fabien
#+DATE: La date du jour
#+LANGUAGE: fr
# #+PROPERTY: header-args :exports both :session

#+HTML_HEAD:
#+HTML_HEAD:
#+HTML_HEAD:
#+HTML_HEAD:
#+HTML_HEAD:
#+HTML_HEAD:

* Préface

Le but est ici de reproduire des graphes semblables à ceux du South China Morning Post (SCMP), sur la page The Coronavirus Pandemic et qui montrent pour différents pays le nombre cumulé (c’est-à-dire le nombre total de cas depuis le début de l’épidémie) de personnes atteintes de la maladie à coronavirus 2019.

Les données que nous utiliserons dans un premier temps sont compilées par le Johns Hopkins University Center for Systems Science and Engineering (JHU CSSE) et sont mises à disposition sur GitHub. Nous utiliserons plus particulièrement les données =time_series_covid19_confirmed_global.csv= (des suites chronologiques au format csv) disponibles à l’adresse: https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv

* Jeu de données
** Téléchargement du jeu de données

Premièrement, nous allons télécharger les données en local pour permettre à nos analyses d'être effectuées, même si l'URL venait à être obsolète. Ensuite nous vérifions que les données ont bien été téléchargées en local sous le nom: SARS-COV-GLOB.csv avec l'aide du package /pandas/ dans python.

#+begin_src python :results output :session :exports both
import os
import urllib.request

import pandas as pd

data_url = "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv"
data_file = "SARS-COV-GLOB.csv"

# Download only when no local copy exists: an existing local file is never
# overwritten, so the analysis stays reproducible even if the URL disappears.
if not os.path.exists(data_file):
    urllib.request.urlretrieve(data_url, data_file)

# Every later block works from this local copy.
df = pd.read_csv(data_file)
#+end_src

#+RESULTS:

** Visualisation des données

Nous pouvons regarder à quoi ressemble le jeu de données.

#+begin_src python :results output :session :exports both
# Print the whole frame (pandas truncates the display by itself).
# The former expression df['Country/Region' == 'China'] compared two string
# literals, i.e. evaluated df[False], which raises KeyError.
print(df)
#+end_src

#+RESULTS:
#+begin_example
Province/State Country/Region Lat Long 1/22/20 1/23/20 1/24/20 1/25/20 1/26/20 1/27/20 ... 2/28/23 3/1/23 3/2/23 3/3/23 3/4/23 3/5/23 3/6/23 3/7/23 3/8/23 3/9/23
0 NaN Afghanistan 33.939110 67.709953 0 0 0 0 0 0 ...
209322 209340 209358 209362 209369 209390 209406 209436 209451 209451
1 NaN Albania 41.153300 20.168300 0 0 0 0 0 0 ... 334391 334408 334408 334427 334427 334427 334427 334427 334443 334457
2 NaN Algeria 28.033900 1.659600 0 0 0 0 0 0 ... 271441 271448 271463 271469 271469 271477 271477 271490 271494 271496
3 NaN Andorra 42.506300 1.521800 0 0 0 0 0 0 ... 47866 47875 47875 47875 47875 47875 47875 47875 47890 47890
4 NaN Angola -11.202700 17.873900 0 0 0 0 0 0 ... 105255 105277 105277 105277 105277 105277 105277 105277 105288 105288
.. ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
284 NaN West Bank and Gaza 31.952200 35.233200 0 0 0 0 0 0 ... 703228 703228 703228 703228 703228 703228 703228 703228 703228 703228
285 NaN Winter Olympics 2022 39.904200 116.407400 0 0 0 0 0 0 ... 535 535 535 535 535 535 535 535 535 535
286 NaN Yemen 15.552727 48.516388 0 0 0 0 0 0 ... 11945 11945 11945 11945 11945 11945 11945 11945 11945 11945
287 NaN Zambia -13.133897 27.849332 0 0 0 0 0 0 ... 343012 343012 343079 343079 343079 343135 343135 343135 343135 343135
288 NaN Zimbabwe -19.015438 29.154857 0 0 0 0 0 0 ... 263921 264127 264127 264127 264127 264127 264127 264127 264276 264276
[289 rows x 1147 columns]
#+end_example

Nous avons dans le jeu de données *brutes* les colonnes suivantes:
1. La province / l'état
2. Le pays / la région
3. La latitude
4. La longitude
5. les données temporelles avec le nombre de cas par jour

* Transformation des données
** Exclusion de certains pays

Nous allons regarder l'évolution du nombre de cas cumulés au cours du temps pour les pays suivants : la Belgique (Belgium), la Chine - toutes les provinces sauf Hong-Kong (China), Hong Kong (China, Hong-Kong), la France métropolitaine (France), l’Allemagne (Germany), l’Iran (Iran), l’Italie (Italy), le Japon (Japan), la Corée du Sud (Korea, South), la Hollande sans les colonies (Netherlands), le Portugal (Portugal), l’Espagne (Spain), le Royaume-Uni sans les colonies (United Kingdom), les États-Unis (US). De ce fait nous allons transformer le tableau de manière à seulement garder ces pays et supprimer les pays non concernés par l'analyse.

#+begin_src python :results output :session :exports both
# Countries kept for the analysis, spelled as in the 'Country/Region' column.
pays_retenus = [
    'Belgium', 'China', 'France', 'Germany', 'Iran', 'Italy', 'Japan',
    'Korea, South', 'Netherlands', 'Portugal', 'Spain', 'United Kingdom', 'US',
]
# Boolean mask: True for the rows whose country is in the list above.
masque_pays = df['Country/Region'].isin(pays_retenus)
df2 = df[masque_pays]
#+end_src

#+RESULTS:

Nous pouvons maintenant vérifier si l'exclusion de certains pays a fonctionné.

#+begin_src python :results output :session :exports both
df2
#+end_src

#+RESULTS:
#+begin_example
Province/State Country/Region Lat Long 1/22/20 1/23/20 1/24/20 1/25/20 ... 3/2/23 3/3/23 3/4/23 3/5/23 3/6/23 3/7/23 3/8/23 3/9/23
24 NaN Belgium 50.833300 4.469936 0 0 0 0 ... 4727795 4727795 4727795 4727795 4727795 4727795 4727795 4739365
59 Anhui China 31.825700 117.226400 1 9 15 39 ... 2275 2275 2275 2275 2275 2275 2275 2275
60 Beijing China 40.182400 116.414200 14 22 36 41 ... 40774 40774 40774 40774 40774 40774 40774 40774
61 Chongqing China 30.057200 107.874000 6 9 27 57 ... 14715 14715 14715 14715 14715 14715 14715 14715
62 Fujian China 26.078900 117.987400 1 5 10 18 ... 17122 17122 17122 17122 17122 17122 17122 17122
.. ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
274 Montserrat United Kingdom 16.742498 -62.187366 0 0 0 0 ...
1403 1403 1403 1403 1403 1403 1403 1403
275 Pitcairn Islands United Kingdom -24.376800 -128.324200 0 0 0 0 ... 4 4 4 4 4 4 4 4
276 Saint Helena, Ascension and Tristan da Cunha United Kingdom -7.946700 -14.355900 0 0 0 0 ... 2166 2166 2166 2166 2166 2166 2166 2166
277 Turks and Caicos Islands United Kingdom 21.694000 -71.797900 0 0 0 0 ... 6551 6551 6551 6551 6551 6557 6557 6561
278 NaN United Kingdom 55.378100 -3.436000 0 0 0 0 ... 24396530 24396530 24396530 24396530 24396530 24396530 24396530 24425309
[75 rows x 1147 columns]
#+end_example

Cette ligne de commande nous montre simplement le début et la fin du tableau mais ne nous renseigne pas sur les pays présents. Nous pouvons regarder plus précisément dans la colonne Country/Region si les pays demandés sont bien dans le tableau.

#+begin_src python :results output :session :exports both
# Distinct country names remaining after the filter (shown by the session echo).
pays_presents = df2['Country/Region'].unique()
pays_presents
#+end_src

#+RESULTS:
: array(['Belgium', 'China', 'France', 'Germany', 'Iran', 'Italy', 'Japan',
:        'Korea, South', 'Netherlands', 'Portugal', 'Spain', 'US',
:        'United Kingdom'], dtype=object)

Nous observons bien les pays demandés. La première transformation du jeu de données est réussie.

** Exclusion des territoires hors métropole pour la France, le Royaume-Uni et la Hollande

Nous devons maintenant exclure les territoires d'outre-mer et les résidus coloniaux de la France, du Royaume-Uni et de la Hollande.

*** France

Premièrement, nous allons regarder quels sont les territoires hors métropole présents dans le jeu de données.

#+begin_src python :results output :session :exports both
# All rows filed under France: overseas territories plus the mainland row.
est_france = df2['Country/Region'] == 'France'
df2FR = df2[est_france]
print(df2FR)
#+end_src

#+RESULTS:
#+begin_example
Province/State Country/Region Lat Long 1/22/20 1/23/20 1/24/20 1/25/20 1/26/20 ... 3/1/23 3/2/23 3/3/23 3/4/23 3/5/23 3/6/23 3/7/23 3/8/23 3/9/23
120 French Guiana France 3.933900 -53.125800 0 0 0 0 0 ... 98041 98041 98041 98041 98041 98041 98041 98041 98041
121 French Polynesia France -17.679700 149.406800 0 0 0 0 0 ...
77957 77957 77957 77957 77957 77957 77957 78055 78055
122 Guadeloupe France 16.265000 -61.551000 0 0 0 0 0 ... 201852 201852 201852 201852 201852 201852 201852 201886 201886
123 Martinique France 14.641500 -61.024200 0 0 0 0 0 ... 228875 228875 228875 228875 228875 228875 228875 229020 229020
124 Mayotte France -12.827500 45.166244 0 0 0 0 0 ... 42004 42004 42004 42004 42004 42004 42004 42004 42004
125 New Caledonia France -20.904305 165.618042 0 0 0 0 0 ... 80007 80007 80007 80007 80007 80007 80007 80017 80017
126 Reunion France -21.115100 55.536400 0 0 0 0 0 ... 494595 494595 494595 494595 494595 494595 494595 494595 494595
127 Saint Barthelemy France 17.900000 -62.833300 0 0 0 0 0 ... 5439 5439 5439 5439 5439 5439 5439 5441 5441
128 Saint Pierre and Miquelon France 46.885200 -56.315900 0 0 0 0 0 ... 3452 3452 3452 3452 3452 3452 3452 3452 3452
129 St Martin France 18.070800 -63.050100 0 0 0 0 0 ... 12257 12257 12257 12257 12257 12257 12257 12271 12271
130 Wallis and Futuna France -14.293800 -178.116500 0 0 0 0 0 ... 3427 3427 3427 3427 3427 3427 3427 3427 3427
131 NaN France 46.227600 2.213700 0 0 2 3 3 ... 38583794 38587990 38591184 38591184 38591184 38599330 38606393 38612201 38618509
[12 rows x 1147 columns]
#+end_example

Nous pouvons voir que la France métropolitaine occupe la dernière ligne de ce tableau (*Province/State = NaN*). Voici la liste des territoires à enlever: French Guiana, French Polynesia, Guadeloupe, Martinique, Mayotte, New Caledonia, Reunion, Saint Barthelemy, Saint Pierre and Miquelon, St Martin, Wallis and Futuna.
#+begin_src python :results output :session :exports both
# French overseas territories, spelled as in the 'Province/State' column.
territoires_francais = [
    'French Guiana', 'French Polynesia', 'Guadeloupe', 'Martinique', 'Mayotte',
    'New Caledonia', 'Reunion', 'Saint Barthelemy', 'Saint Pierre and Miquelon',
    'St Martin', 'Wallis and Futuna',
]
# Rows with a NaN province (mainland rows) are never matched by isin, so they stay.
hors_metropole_fr = df2['Province/State'].isin(territoires_francais)
df3 = df2[~hors_metropole_fr]
print(df3[df3['Country/Region'] == 'France'])
#+end_src

#+RESULTS:
: Province/State Country/Region Lat Long 1/22/20 1/23/20 1/24/20 1/25/20 1/26/20 1/27/20 ... 2/28/23 3/1/23 3/2/23 3/3/23 3/4/23 3/5/23 3/6/23 3/7/23 3/8/23 3/9/23
: 131 NaN France 46.2276 2.2137 0 0 2 3 3 3 ... 38579269 38583794 38587990 38591184 38591184 38591184 38599330 38606393 38612201 38618509
:
: [1 rows x 1147 columns]

Il ne reste bien que la France métropolitaine! Nous pouvons passer au Royaume-Uni.

*** Royaume-Uni

Nous allons reprendre la même méthode que pour la France.

#+begin_src python :results output :session :exports both
# All rows filed under the United Kingdom, to list its territories.
est_royaume_uni = df2['Country/Region'] == 'United Kingdom'
df2RU = df2[est_royaume_uni]
print(df2RU)
#+end_src

#+RESULTS:
#+begin_example
Province/State Country/Region Lat Long 1/22/20 1/23/20 1/24/20 1/25/20 ... 3/2/23 3/3/23 3/4/23 3/5/23 3/6/23 3/7/23 3/8/23 3/9/23
264 Anguilla United Kingdom 18.220600 -63.068600 0 0 0 0 ... 3904 3904 3904 3904 3904 3904 3904 3904
265 Bermuda United Kingdom 32.307800 -64.750500 0 0 0 0 ... 18814 18814 18814 18814 18814 18814 18828 18828
266 British Virgin Islands United Kingdom 18.420700 -64.640000 0 0 0 0 ... 7305 7305 7305 7305 7305 7305 7305 7305
267 Cayman Islands United Kingdom 19.313300 -81.254600 0 0 0 0 ... 31472 31472 31472 31472 31472 31472 31472 31472
268 Channel Islands United Kingdom 49.372300 -2.364400 0 0 0 0 ... 0 0 0 0 0 0 0 0
269 Falkland Islands (Malvinas) United Kingdom -51.796300 -59.523600 0 0 0 0 ... 1930 1930 1930 1930 1930 1930 1930 1930
270 Gibraltar United Kingdom 36.140800 -5.353600 0 0 0 0 ... 20423 20433 20433 20433 20433 20433 20433 20433
271 Guernsey United Kingdom 49.448196 -2.589490 0 0 0 0 ...
34929 34929 34929 34929 34929 34929 34991 34991
272 Isle of Man United Kingdom 54.236100 -4.548100 0 0 0 0 ... 38008 38008 38008 38008 38008 38008 38008 38008
273 Jersey United Kingdom 49.213800 -2.135800 0 0 0 0 ... 66391 66391 66391 66391 66391 66391 66391 66391
274 Montserrat United Kingdom 16.742498 -62.187366 0 0 0 0 ... 1403 1403 1403 1403 1403 1403 1403 1403
275 Pitcairn Islands United Kingdom -24.376800 -128.324200 0 0 0 0 ... 4 4 4 4 4 4 4 4
276 Saint Helena, Ascension and Tristan da Cunha United Kingdom -7.946700 -14.355900 0 0 0 0 ... 2166 2166 2166 2166 2166 2166 2166 2166
277 Turks and Caicos Islands United Kingdom 21.694000 -71.797900 0 0 0 0 ... 6551 6551 6551 6551 6551 6557 6557 6561
278 NaN United Kingdom 55.378100 -3.436000 0 0 0 0 ... 24396530 24396530 24396530 24396530 24396530 24396530 24396530 24425309
[15 rows x 1147 columns]
#+end_example

#+begin_src python :results output :session :exports both
# British overseas territories and Crown dependencies, as spelled in 'Province/State'.
territoires_britanniques = [
    'Anguilla', 'Bermuda', 'British Virgin Islands', 'Cayman Islands',
    'Channel Islands', 'Falkland Islands (Malvinas)', 'Gibraltar', 'Guernsey',
    'Isle of Man', 'Jersey', 'Montserrat', 'Pitcairn Islands',
    'Saint Helena, Ascension and Tristan da Cunha', 'Turks and Caicos Islands',
]
hors_metropole_ru = df3['Province/State'].isin(territoires_britanniques)
df4 = df3[~hors_metropole_ru]
print(df4[df4['Country/Region'] == 'United Kingdom'])
#+end_src

#+RESULTS:
: Province/State Country/Region Lat Long 1/22/20 1/23/20 1/24/20 1/25/20 1/26/20 1/27/20 ... 2/28/23 3/1/23 3/2/23 3/3/23 3/4/23 3/5/23 3/6/23 3/7/23 3/8/23 3/9/23
: 278 NaN United Kingdom 55.3781 -3.436 0 0 0 0 0 0 ... 24370150 24370150 24396530 24396530 24396530 24396530 24396530 24396530 24396530 24425309
:
: [1 rows x 1147 columns]

Enfin nous allons nous occuper de la Hollande.

*** Hollande

#+begin_src python :results output :session :exports both
# All rows filed under the Netherlands, to list its territories.
est_pays_bas = df2['Country/Region'] == 'Netherlands'
df2HO = df2[est_pays_bas]
print(df2HO)
#+end_src

#+RESULTS:
: Province/State Country/Region Lat Long 1/22/20 1/23/20 1/24/20 1/25/20 1/26/20 ...
: 3/1/23 3/2/23 3/3/23 3/4/23 3/5/23 3/6/23 3/7/23 3/8/23 3/9/23
: 196 Aruba Netherlands 12.5211 -69.9683 0 0 0 0 0 ... 44044 44044 44044 44044 44044 44044 44044 44044 44044
: 197 Bonaire, Sint Eustatius and Saba Netherlands 12.1784 -68.2385 0 0 0 0 0 ... 11804 11804 11804 11804 11804 11804 11804 11804 11804
: 198 Curacao Netherlands 12.1696 -68.9900 0 0 0 0 0 ... 45986 45986 45986 45986 45986 45986 45986 45986 45986
: 199 Sint Maarten Netherlands 18.0425 -63.0548 0 0 0 0 0 ... 11020 11020 11020 11020 11020 11020 11020 11020 11020
: 200 NaN Netherlands 52.1326 5.2913 0 0 0 0 0 ... 8596157 8596157 8598043 8598043 8598043 8598043 8599981 8599981 8599981
:
: [5 rows x 1147 columns]

#+begin_src python :results output :session :exports both
# Dutch Caribbean territories, as spelled in 'Province/State'.
territoires_neerlandais = [
    'Aruba', 'Bonaire, Sint Eustatius and Saba', 'Curacao', 'Sint Maarten',
]
hors_metropole_nl = df4['Province/State'].isin(territoires_neerlandais)
df5 = df4[~hors_metropole_nl]
print(df5[df5['Country/Region'] == 'Netherlands'])
#+end_src

#+RESULTS:
: Province/State Country/Region Lat Long 1/22/20 1/23/20 1/24/20 1/25/20 1/26/20 1/27/20 ... 2/28/23 3/1/23 3/2/23 3/3/23 3/4/23 3/5/23 3/6/23 3/7/23 3/8/23 3/9/23
: 200 NaN Netherlands 52.1326 5.2913 0 0 0 0 0 0 ... 8596157 8596157 8596157 8598043 8598043 8598043 8598043 8599981 8599981 8599981
:
: [1 rows x 1147 columns]

Nous pouvons maintenant vérifier si le jeu de données ne contient que les métropoles.

#+begin_src python :results output :session :exports both
# One membership test instead of three chained equality tests; the selected
# rows keep their original order.
pays_a_controler = ['France', 'United Kingdom', 'Netherlands']
print(df5[df5['Country/Region'].isin(pays_a_controler)])
#+end_src

#+RESULTS:
: Province/State Country/Region Lat Long 1/22/20 1/23/20 1/24/20 1/25/20 1/26/20 1/27/20 ... 2/28/23 3/1/23 3/2/23 3/3/23 3/4/23 3/5/23 3/6/23 3/7/23 3/8/23 3/9/23
: 131 NaN France 46.2276 2.2137 0 0 2 3 3 3 ... 38579269 38583794 38587990 38591184 38591184 38591184 38599330 38606393 38612201 38618509
: 200 NaN Netherlands 52.1326 5.2913 0 0 0 0 0 0 ...
: 8596157 8596157 8596157 8598043 8598043 8598043 8598043 8599981 8599981 8599981
: 278 NaN United Kingdom 55.3781 -3.4360 0 0 0 0 0 0 ... 24370150 24370150 24396530 24396530 24396530 24396530 24396530 24396530 24396530 24425309
:
: [3 rows x 1147 columns]

Nous voyons que seules les métropoles sont présentes dans le jeu de données!

** Modification des données de la région de Hong-Kong

Nous allons maintenant remplacer la valeur Chine pour Hong-Kong en 'Hong-Kong' de manière à différencier la Chine et ses provinces et Hong-Kong.

#+begin_src python :results output :session :exports both
# Locate the Hong Kong row by its province name.
print(df5.loc[df5['Province/State'] == 'Hong Kong'])
#+end_src

#+RESULTS:
: Province/State Country/Region Lat Long 1/22/20 1/23/20 1/24/20 1/25/20 1/26/20 1/27/20 1/28/20 ... 2/27/23 2/28/23 3/1/23 3/2/23 3/3/23 3/4/23 3/5/23 3/6/23 3/7/23 3/8/23 3/9/23
: 71 Hong Kong China 22.3 114.2 0 2 2 5 8 8 8 ... 2876106 2876106 2876106 2876106 2876106 2876106 2876106 2876106 2876106 2876106 2876106
:
: [1 rows x 1147 columns]

Nous voyons que la ligne de Hong-Kong correspond à la ligne 71 et que pour l'instant, la valeur de la colonne *Country/Region* correspond à la Chine. Nous allons pouvoir modifier cette ligne!

#+begin_src python :results output :session :exports both
# df5 was produced by successive boolean filters; assigning into it directly
# triggers pandas' SettingWithCopyWarning. Work on an explicit, independent copy.
df5 = df5.copy()

# Select the row by content instead of the hard-coded label 71, which would
# point at another row (or none) if the upstream file gained or lost rows.
est_hong_kong = df5['Province/State'] == 'Hong Kong'
df5.loc[est_hong_kong, 'Country/Region'] = 'Hong Kong'
#+end_src

#+RESULTS:

Nous pouvons vérifier si la ligne contenant Hong-Kong a changé de valeur pour la colonne Country/Region.

#+begin_src python :results output :session :exports both
# The row must now be filed under its own 'Hong Kong' country label.
print(df5.loc[df5['Country/Region'] == 'Hong Kong'])
#+end_src

#+RESULTS:
: Province/State Country/Region Lat Long 1/22/20 1/23/20 1/24/20 1/25/20 1/26/20 1/27/20 1/28/20 ... 2/27/23 2/28/23 3/1/23 3/2/23 3/3/23 3/4/23 3/5/23 3/6/23 3/7/23 3/8/23 3/9/23
: 71 Hong Kong Hong Kong 22.3 114.2 0 2 2 5 8 8 8 ... 2876106 2876106 2876106 2876106 2876106 2876106 2876106 2876106 2876106 2876106 2876106
:
: [1 rows x 1147 columns]

Parfait! La valeur a bien été remplacée.
Nous pouvons dorénavant nous occuper d'enlever les colonnes qui ne sont pas nécessaires.

** Retrait des colonnes inutiles pour l'analyse

Dans notre tableau certaines colonnes comme les provinces, la latitude et la longitude ne seront pas utiles lors de l'analyse; nous pouvons donc les supprimer du jeu de données.

#+begin_src python :results output :session :exports both
# Keep only the country label and the date columns.
colonnes_inutiles = ['Province/State', 'Lat', 'Long']
df6 = df5.drop(columns=colonnes_inutiles)
#+end_src

#+RESULTS:

Nous pouvons maintenant vérifier si les colonnes que nous voulions supprimer sont absentes du tableau.

#+begin_src python :results output :session :exports both
# First five rows (shown by the session echo).
df6.head()
#+end_src

#+RESULTS:
: Country/Region 1/22/20 1/23/20 1/24/20 1/25/20 1/26/20 1/27/20 1/28/20 1/29/20 1/30/20 1/31/20 ... 2/27/23 2/28/23 3/1/23 3/2/23 3/3/23 3/4/23 3/5/23 3/6/23 3/7/23 3/8/23 3/9/23
: 24 Belgium 0 0 0 0 0 0 0 0 0 0 ... 4717655 4717655 4717655 4727795 4727795 4727795 4727795 4727795 4727795 4727795 4739365
: 59 China 1 9 15 39 60 70 106 152 200 237 ... 2275 2275 2275 2275 2275 2275 2275 2275 2275 2275 2275
: 60 China 14 22 36 41 68 80 91 111 114 139 ... 40774 40774 40774 40774 40774 40774 40774 40774 40774 40774 40774
: 61 China 6 9 27 57 75 110 132 147 182 211 ... 14715 14715 14715 14715 14715 14715 14715 14715 14715 14715 14715
: 62 China 1 5 10 18 35 59 80 84 101 120 ... 17122 17122 17122 17122 17122 17122 17122 17122 17122 17122 17122
:
: [5 rows x 1144 columns]

Il ne reste que les colonnes *Country/Region* et les *colonnes temporelles*.

** Modifications du tableau

Le tableau que nous avons pour l'instant ne nous permet pas de produire des courbes du nombre de cas cumulés par pays. De ce fait nous devons transformer le tableau pour donner :

| Pays     | Date       | Nombre-de-cas |
|----------+------------+---------------|
| Belgique | 01/01/2020 |             0 |
| Belgique | 02/01/2020 |             4 |
| Belgique | 03/01/2020 |            56 |
| .        | .          |             . |
| .        | .          |             . |
| .        | .          |             . |

#+begin_src python :results output :session :exports both
# Wide -> long: one row per (country row, date) pair.
df7 = df6.melt(
    id_vars='Country/Region',
    var_name='Date',
    value_name='Nombre_de_cas',
)
print(df7)
#+end_src

#+RESULTS:
#+begin_example
Country/Region Date Nombre_de_cas
0 Belgium 1/22/20 0
1 China 1/22/20 1
2 China 1/22/20 14
3 China 1/22/20 6
4 China 1/22/20 1
... ... ...
52573 Netherlands 3/9/23 8599981
52574 Portugal 3/9/23 5570473
52575 Spain 3/9/23 13770429
52576 US 3/9/23 103802702
52577 United Kingdom 3/9/23 24425309
[52578 rows x 3 columns]
#+end_example

Nous devons maintenant transformer la date donnée dès le départ en date compréhensible par python pour permettre de trier en fonction du pays et en fonction de la date nos nombres de cas.

#+begin_src python :results output :session :exports both
# The column headers were month/day/two-digit-year strings such as '1/22/20'.
dates_converties = pd.to_datetime(df7['Date'], format='%m/%d/%y')
df7['Date'] = dates_converties
print(df7)
#+end_src

#+RESULTS:
#+begin_example
Country/Region Date Nombre_de_cas
0 Belgium 2020-01-22 0
1 China 2020-01-22 1
2 China 2020-01-22 14
3 China 2020-01-22 6
4 China 2020-01-22 1
... ... ...
52573 Netherlands 2023-03-09 8599981
52574 Portugal 2023-03-09 5570473
52575 Spain 2023-03-09 13770429
52576 US 2023-03-09 103802702
52577 United Kingdom 2023-03-09 24425309
[52578 rows x 3 columns]
#+end_example

Les dates sont mises à jour et sont comprises par python; nous pouvons maintenant trier le tableau.

#+begin_src python :results output :session :exports both
# Order by country first, then chronologically within each country.
cles_de_tri = ['Country/Region', 'Date']
df8 = df7.sort_values(cles_de_tri)
print(df8)
#+end_src

#+RESULTS:
#+begin_example
Country/Region Date Nombre_de_cas
0 Belgium 2020-01-22 0
46 Belgium 2020-01-23 0
92 Belgium 2020-01-24 0
138 Belgium 2020-01-25 0
184 Belgium 2020-01-26 0
... ... ...
52393 United Kingdom 2023-03-05 24396530
52439 United Kingdom 2023-03-06 24396530
52485 United Kingdom 2023-03-07 24396530
52531 United Kingdom 2023-03-08 24396530
52577 United Kingdom 2023-03-09 24425309
[52578 rows x 3 columns]
#+end_example

Le tableau est trié!

** Données manquantes ?
Nous allons maintenant vérifier de potentielles données manquantes dans notre tableau.

#+begin_src python :results output :session :exports both
# Number of missing values per column.
NA = df8.isna().sum()
print(NA)
#+end_src

#+RESULTS:
: Country/Region 0
: Date 0
: Nombre_de_cas 0
: dtype: int64

Aucune donnée ne semble manquante; nous allons pouvoir passer à la création des graphiques. Mais avant nous allons séparer les différents pays en différents tableaux. Premièrement nous allons regarder quels sont les différents pays. Cela a déjà été fait plus haut mais nous allons le refaire ci-dessous:

** Subdivision du jeu de données

#+begin_src python :results output :session :exports both
df8['Country/Region'].unique()
#+end_src

#+RESULTS:
: array(['Belgium', 'China', 'France', 'Germany', 'Hong Kong', 'Iran',
:        'Italy', 'Japan', 'Korea, South', 'Netherlands', 'Portugal',
:        'Spain', 'US', 'United Kingdom'], dtype=object)

Nous allons créer un tableau pour chaque pays ci-dessus:

#+begin_src python :results output :session :exports both
# One frame per country; the names are reused by the plotting blocks below.
pays = df8['Country/Region']
dfBel = df8[pays == 'Belgium']
dfChina = df8[pays == 'China']
dfFR = df8[pays == 'France']
dfGER = df8[pays == 'Germany']
dfHK = df8[pays == 'Hong Kong']
dfIran = df8[pays == 'Iran']
dfIT = df8[pays == 'Italy']
dfJP = df8[pays == 'Japan']
dfSK = df8[pays == 'Korea, South']
dfNTH = df8[pays == 'Netherlands']
dfPOG = df8[pays == 'Portugal']
dfSP = df8[pays == 'Spain']
dfUS = df8[pays == 'US']
dfUK = df8[pays == 'United Kingdom']

# Sanity check: each frame must contain exactly one country label.
tableaux = (dfBel, dfChina, dfFR, dfGER, dfHK, dfIran, dfIT,
            dfJP, dfSK, dfNTH, dfPOG, dfSP, dfUS, dfUK)
for tableau in tableaux:
    print(tableau['Country/Region'].unique())

#+end_src

#+RESULTS:
#+begin_example
['Belgium']
['China']
['France']
['Germany']
['Hong Kong']
['Iran']
['Italy']
['Japan']
['Korea, South']
['Netherlands']
['Portugal']
['Spain']
['US']
['United Kingdom']
#+end_example

Nous avons correctement subdivisé nos tableaux! Cependant les différentes provinces de la Chine n'ont toujours pas été regroupées, ce qui donne plusieurs nombres de cas par jour.

#+begin_src python :results output :session :exports both
# Still one row per province and per day at this point.
print(dfChina)
#+end_src

#+RESULTS:
#+begin_example
Country/Region Date Nombre_de_cas
1 China 2020-01-22 1
2 China 2020-01-22 14
3 China 2020-01-22 6
4 China 2020-01-22 1
5 China 2020-01-22 0
... ... ...
52562 China 2023-03-09 1647
52563 China 2023-03-09 1521816
52564 China 2023-03-09 3089
52565 China 2023-03-09 9743
52566 China 2023-03-09 11848
[37719 rows x 3 columns]
#+end_example

Nous allons donc agréger le tableau pour n'avoir qu'une valeur par jour!

#+begin_src python :results output :session :exports both
# Sum the provinces for each day. as_index=False already returns the grouping
# keys as ordinary columns with a fresh 0..n-1 index, so the former trailing
# reset_index() only added a redundant 'index' column.
cles_groupe = ['Country/Region', 'Date']
dfC = dfChina.groupby(cles_groupe, as_index=False)['Nombre_de_cas'].sum()
print(dfC)
#+end_src

#+RESULTS:
#+begin_example
Country/Region Date Nombre_de_cas
0 China 2020-01-22 548
1 China 2020-01-23 641
2 China 2020-01-24 918
3 China 2020-01-25 1401
4 China 2020-01-26 2067
... ... ...
1138 China 2023-03-05 2027418
1139 China 2023-03-06 2027418
1140 China 2023-03-07 2027418
1141 China 2023-03-08 2027418
1142 China 2023-03-09 2027418
[1143 rows x 3 columns]
#+end_example

Nous allons enfin pouvoir représenter graphiquement le jeu de données. Prenons d'abord un exemple avec le Japon!
* Représentation graphique

#+begin_src python :results output graphics :file JPL.png :session :exports both
import matplotlib.pyplot as plt
import matplotlib.axes as axs

# Open a fresh figure: within a shared session pyplot keeps drawing on the
# current figure, so curves and axis scales from other blocks would otherwise
# end up in this file.
plt.figure()
plt.plot('Date', 'Nombre_de_cas', data=dfJP)
plt.xlabel('Date')
plt.ylabel('Nombre de cas')
plt.suptitle('Nombre de cas au Japon')
plt.savefig('JPL.png')
#+end_src

#+RESULTS:

La représentation graphique est sauvegardée dans le dossier avec le document computationnel! Nous allons maintenant essayer avec une échelle logarithmique.

#+begin_src python :results output graphics :file JPLOG.png :session :exports both
# New figure so the linear Japan curve drawn above is not plotted a second time.
plt.figure()
plt.plot('Date', 'Nombre_de_cas', data=dfJP)
# Logarithmic y axis (same effect as the former bare plt.semilogy() call).
plt.yscale('log')
plt.xlabel('Date')
plt.ylabel('Nombre de cas')
plt.suptitle('Nombre de cas au Japon')
plt.savefig('JPLOG.png')
#+end_src

#+RESULTS:

Nous allons pouvoir passer au plot de chaque pays! Tout d'abord avec une échelle linéaire:

#+begin_src python :results output graphics :file DATAL.png :session :exports both
# New figure: without it this "linear" plot would inherit the log scale and
# the two Japan curves left on the figure by the previous blocks.
plt.figure()

# (frame, legend label) pairs, in legend order.
courbes = [
    (dfBel, 'Belgium'), (dfC, 'China'), (dfFR, 'France'), (dfGER, 'Germany'),
    (dfHK, 'Hong Kong'), (dfIran, 'Iran'), (dfIT, 'Italy'), (dfJP, 'Japan'),
    (dfSK, 'South Korea'), (dfNTH, 'Netherlands'), (dfPOG, 'Portugal'),
    (dfSP, 'Spain'), (dfUS, 'US'), (dfUK, 'United Kingdom'),
]
for tableau, etiquette in courbes:
    plt.plot('Date', 'Nombre_de_cas', data=tableau, label=etiquette)

plt.xlabel('Date')
plt.ylabel('Nombre de cas')
plt.legend()
plt.savefig('DATAL.png')
#+end_src

#+RESULTS:

Et enfin avec une échelle logarithmique.
#+begin_src python :results output graphics :file DATALOG.png :session :exports both
# Open a fresh figure: within a shared session pyplot keeps drawing on the
# current figure, so the curves of the earlier plots would be drawn twice.
plt.figure()

# (frame, legend label) pairs, in legend order.
courbes = [
    (dfBel, 'Belgium'), (dfC, 'China'), (dfFR, 'France'), (dfGER, 'Germany'),
    (dfHK, 'Hong Kong'), (dfIran, 'Iran'), (dfIT, 'Italy'), (dfJP, 'Japan'),
    (dfSK, 'South Korea'), (dfNTH, 'Netherlands'), (dfPOG, 'Portugal'),
    (dfSP, 'Spain'), (dfUS, 'US'), (dfUK, 'United Kingdom'),
]
for tableau, etiquette in courbes:
    plt.plot('Date', 'Nombre_de_cas', data=tableau, label=etiquette)

# Logarithmic y axis (same effect as the former bare plt.semilogy() call).
plt.yscale('log')
plt.xlabel('Date')
plt.ylabel('Nombre de cas')
plt.legend()
plt.savefig('DATALOG.png')
#+end_src

#+RESULTS:

Tous les graphiques doivent être dans votre dossier où se situe le document computationnel.