#+TITLE: Exercice 4
#+AUTHOR: Waad ALMASRI
#+DATE: 25/08/2020
#+LANGUAGE: fr
# #+PROPERTY: header-args :eval never-export

#+HTML_HEAD:
#+HTML_HEAD:
#+HTML_HEAD:
#+HTML_HEAD:
#+HTML_HEAD:
#+HTML_HEAD:

* Exploration du répertoire
D'abord, on fait un git pull pour récupérer les données qu'on a téléversées dans le répertoire GIT.
*Attention!* Si tu as commencé à écrire dans le notebook, enregistre les données pour ne pas les perdre.
Ensuite, on s'assure que nous avons les données dans le répertoire avec la commande "listdir()".

#+begin_src python :results output :exports both
import os

# Print the entries of the current working directory so the reader can
# check that data.csv is there.
print(os.listdir("."))
#+end_src

#+RESULTS:
: ['exercice_en.ipynb', 'exercice.ipynb', 'exercice_python_en.org', 'exercice_python_fr.org', 'data.csv', 'exercice_R_en.org', 'bar-chart.html', 'exercice_R_fr.org', 'cosxsx.png', 'exercice_fr.Rmd', 'exercice_en.Rmd', 'exercice_fr.ipynb']

* Exploration du jeu de données
Maintenant qu'on a les données, on va commencer à les explorer.
*NB:* les données suivantes sont déjà formattées en .csv.

#+begin_src python :results output :exports both
import pandas as pd

# Load the tab-separated data file.
print("Reading Data...")
df = pd.read_csv("./data.csv", sep="\t")

print("Checking Data...")
print("In this dataframe there are ",len(df), "data samples")
print(df.head())

# The first four characters of every date value are converted to an integer year.
print("Adding a column for 'year'")
df["year"] = df["date"].map(lambda stamp: int(stamp[:4]))

# Number of missing values per column.
print("Checking Missing Data...")
print(df.isna().sum())

# Keep only the rows whose 'job' field is filled in.
print("Dropping rows having a Null job i.e. missing job info")
df_ = df.dropna(subset=["job"])
print("The number of data samples left are",len(df_))

# Number of distinct values in the three categorical columns.
print("\n Statistiques de Base")
for column, suffix in (
    ("job", " unique jobs."),
    ("edited_by", " unique editors."),
    ("state", " unique states."),
):
    print("There are ",len(set(df_[column])), suffix)

# Cross table: one row per state, one column per year, cells hold job counts.
print("Number of jobs per state per year")
jobs_by_state_year = df_.pivot_table(
    index=["state"],
    columns=["year"],
    values=["job"],
    aggfunc="count",
    fill_value=0,
)
print(jobs_by_state_year)
#+end_src

#+RESULTS:

* Representations graphiques
We will start by plotting the Nbr of jobs per year of New York versus Texas.
#+begin_src python :results file :session :var matplot_lib_filename="fig1_python_org.png" :exports both
import pandas as pd
import plotly
import plotly.graph_objs as go

df = pd.read_csv("./data.csv", sep="\t")
df_ = df.dropna(subset=['job']).copy()
# This block re-reads the CSV, so the derived 'year' column has to be rebuilt
# here; it is not stored in the file.
df_['year'] = df_.date.apply(lambda x: int(x[:4]))

# Count the job entries per year for each state, so the bars show numbers
# rather than the raw job strings.
counts_ny = df_[df_.state == "New York"].groupby('year')['job'].count()
counts_tx = df_[df_.state == "Texas"].groupby('year')['job'].count()

# One trace per state, drawn side by side for every year.
trace1 = go.Bar(x=counts_ny.index, y=counts_ny.values, name="New York")
trace2 = go.Bar(x=counts_tx.index, y=counts_tx.values, name="Texas")
data = [trace1, trace2]

# Layout: title, axis labels and grouped bars.
layout = go.Layout(title="Nbr of jobs per state per year",
                   xaxis=dict(title="Year"),
                   yaxis=dict(title="Count of Jobs"),
                   barmode="group")

fig = go.Figure(data=data, layout=layout)
fig.write_image(matplot_lib_filename)
# With ":results file" the value of the last expression is taken as the path
# of the generated file.
matplot_lib_filename
#+end_src

Checking the top 7 jobs present in the United States

#+begin_src python :results output :exports both
import pandas as pd

df = pd.read_csv("./data.csv", sep="\t")
df_ = df.dropna(subset=['job'])
# ":results output" only captures what is printed, so the counts are printed
# explicitly.
print(df_.job.value_counts().head(7))
#+end_src

#+begin_src python :results file :session :var matplot_lib_filename="fig2_python_org.png" :exports both
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

df = pd.read_csv("./data.csv", sep="\t")
df_ = df.dropna(subset=['job'])

# The seven most frequent jobs.
top_jobs = df_.job.value_counts().head(7).index

# Start from a new figure: the session is shared between blocks, so the
# current figure may still hold an earlier plot.
plt.figure()
sns.countplot(x=df_[df_.job.isin(top_jobs)].job)
plt.xticks(rotation=90)
# bbox_inches="tight" keeps the rotated tick labels inside the saved image.
plt.savefig(matplot_lib_filename, bbox_inches="tight")
# Return the path so that org-babel links the generated file.
matplot_lib_filename
#+end_src

** Réflexion
It seems that this database is more about politics since we see that the top 2 jobs are Republicans and Democrats. Let us check the rate of Republicans versus Democrats in the top states of the US. But first let us identify the top states of the US.
Checking the top 7 US states present in the dataset

#+begin_src python :results output :exports both
import pandas as pd

df = pd.read_csv("./data.csv", sep="\t")
df_ = df.dropna(subset=['job'])
# ":results output" only captures what is printed, so the counts are printed
# explicitly.
print(df_.state.value_counts().head(7))
#+end_src

#+begin_src python :results file :session :var matplot_lib_filename="fig3_python_org.png" :exports both
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

df = pd.read_csv("./data.csv", sep="\t")
df_ = df.dropna(subset=['job'])

# The seven most frequent states.
top_states = df_.state.value_counts().head(7).index

# Start from a new figure: the session is shared between blocks, so the
# current figure may still hold an earlier plot.
plt.figure()
sns.countplot(x=df_[df_.state.isin(top_states)].state)
plt.xticks(rotation=90)
# bbox_inches="tight" keeps the rotated tick labels inside the saved image.
plt.savefig(matplot_lib_filename, bbox_inches="tight")
# Return the path so that org-babel links the generated file.
matplot_lib_filename
#+end_src

Now let us compare the distribution of the Republican versus Democrat in the top 7 US states.

#+begin_src python :results file :session :var matplot_lib_filename="fig4_python_org.png" :exports both
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

df = pd.read_csv("./data.csv", sep="\t")
df_ = df.dropna(subset=['job'])

# Keep the two parties, restricted to the seven most frequent states.
top_states = df_.state.value_counts().head(7).index
df1 = df_[df_.job.isin(["Republican", "Democrat"]) & df_.state.isin(top_states)]

# New figure for the same reason as above: the session keeps the previous one.
plt.figure()
sns.countplot(data=df1, x='state', hue='job')
plt.title("Distribution of Republican vs Democrat in the top 7 US states in the database")
plt.xticks(rotation=90)
plt.savefig(matplot_lib_filename, bbox_inches="tight")
# Return the path so that org-babel links the generated file.
matplot_lib_filename
#+end_src

* Word Cloud
We could have also found the top states and top jobs using word cloud.
#+begin_src python :results file :session :var matplot_lib_filename="fig5_python_org.png" :exports both
import pandas as pd
import matplotlib.pyplot as plt
from wordcloud import WordCloud

df = pd.read_csv("./data.csv", sep="\t")
df_ = df.dropna(subset=['job'])

# Concatenate every job label into a single text for the word cloud.
text = ' '.join(df_.job.tolist())
wordcloud = WordCloud(background_color="white").generate(text)

plt.figure(figsize=(15,8))
plt.imshow(wordcloud)
plt.axis("off")
# Save without calling plt.show() first: showing the figure before saving can
# leave an empty canvas in the file.
plt.savefig(matplot_lib_filename)
# Return the path so that org-babel links the generated file.
matplot_lib_filename
#+end_src

#+begin_src python :results file :session :var matplot_lib_filename="fig6_python_org.png" :exports both
import pandas as pd
import matplotlib.pyplot as plt
# Imported here as well so this block does not depend on the previous block
# having been run in the same session.
from wordcloud import WordCloud

df = pd.read_csv("./data.csv", sep="\t")
df_ = df.dropna(subset=['job'])

# Only rows with a missing 'job' were dropped, so 'state' may still hold
# missing values; they are removed because str.join accepts strings only.
text = ' '.join(df_.state.dropna().astype(str).tolist())
wordcloud = WordCloud(background_color="pink").generate(text)

plt.figure(figsize=(15,8))
plt.imshow(wordcloud)
plt.axis("off")
# Save without calling plt.show() first: showing the figure before saving can
# leave an empty canvas in the file.
plt.savefig(matplot_lib_filename)
# Return the path so that org-babel links the generated file.
matplot_lib_filename
#+end_src