#+TITLE: Exercice 4
#+AUTHOR: Waad ALMASRI
#+DATE: 25/08/2020
#+LANGUAGE: fr
# #+PROPERTY: header-args :eval never-export
#+HTML_HEAD:
#+HTML_HEAD:
#+HTML_HEAD:
#+HTML_HEAD:
#+HTML_HEAD:
#+HTML_HEAD:
* Exploration du répertoire
D'abord, on fait un git pull pour récupérer les données qu'on a téléversées dans le répertoire GIT.
*Attention!* Si tu as commencé à écrire dans le notebook, enregistre les données pour ne pas les perdre.
Ensuite, on s'assure que nous avons les données dans le répertoire
avec la commande "listdir()".
#+begin_src python :results output :exports both
import os

# Enumerate the current working directory so we can check that the
# dataset is present before going any further.
files = os.listdir(".")
print(files)
#+end_src
#+RESULTS:
: ['exercice_en.ipynb', 'exercice.ipynb', 'exercice_python_en.org', 'exercice_python_fr.org', 'data.csv', 'exercice_R_en.org', 'bar-chart.html', 'exercice_R_fr.org', 'cosxsx.png', 'exercice_fr.Rmd', 'exercice_en.Rmd', 'exercice_fr.ipynb']
* Exploration du jeu de données
Maintenant qu'on a les données, on va commencer à les explorer.
*NB:* les données suivantes sont déjà formatées en .csv.
#+begin_src python :results output :exports both
import pandas as pd

# Load the tab-separated dataset that sits next to this document.
print("Reading Data...")
df = pd.read_csv("./data.csv", sep="\t")

print("Checking Data...")
print("In this dataframe there are ",len(df), "data samples")
print(df.head())

# The first four characters of `date` are taken as the year
# (assumes `date` is read as text starting with YYYY).
print("Adding a column for 'year'")
df['year'] = df['date'].str[:4].astype(int)

print("Checking Missing Data...")
print(df.isnull().sum())

# Rows without a job cannot be used in the per-job statistics below.
print("Dropping rows having a Null job i.e. missing job info")
df_ = df.dropna(subset=['job'])
print("The number of data samples left are",len(df_))

# nunique() ignores missing values, whereas len(set(column)) would count
# NaN as if it were a real editor or state.
print("\n Statistiques de Base")
print("There are ",df_['job'].nunique(), " unique jobs.")
print("There are ",df_['edited_by'].nunique(), " unique editors.")
print("There are ",df_['state'].nunique(), " unique states.")

# One row per state, one column per year, each cell = number of job records.
print("Number of jobs per state per year")
pivot_table = pd.pivot_table(df_, index=['state'], columns=['year'], values=['job'], aggfunc='count', fill_value=0)
print(pivot_table)
#+end_src
#+RESULTS:
* Représentations graphiques
We will start by plotting the number of jobs per year in New York versus Texas.
#+begin_src python :results file :session :var matplot_lib_filename="fig1_python_org.png" :exports both
import pandas as pd
import plotly
import plotly.graph_objs as go

df = pd.read_csv("./data.csv", sep="\t")
df_ = df.dropna(subset=['job']).copy()
# This block re-reads the CSV, so the 'year' column has to be rebuilt here
# from the first four characters of 'date' (assumes a YYYY... text format).
df_['year'] = df_['date'].str[:4].astype(int)


def jobs_per_year(state):
    """Return the number of job records per year for *state*, ordered by year."""
    return df_.loc[df_['state'] == state, 'year'].value_counts().sort_index()


# Bars must show counts per year, not the raw job names.
ny_counts = jobs_per_year("New York")
tx_counts = jobs_per_year("Texas")

# Create two traces, first "New York" and second "Texas"
trace1 = go.Bar(x=ny_counts.index, y=ny_counts.values, name="New York")
trace2 = go.Bar(x=tx_counts.index, y=tx_counts.values, name="Texas")

# Side-by-side bars with a title and axis labels
layout = go.Layout(
    title="Nbr of jobs per state per year",
    xaxis=dict(title="Year"),
    yaxis=dict(title="Count of Jobs"),
    barmode="group",
)

fig = go.Figure(data=[trace1, trace2], layout=layout)
fig.write_image(matplot_lib_filename)
# The value of the last expression is what Org uses as the file link.
matplot_lib_filename
#+end_src
Checking the top 7 jobs present in the United States.
#+begin_src python :results output :exports both
import pandas as pd

df = pd.read_csv("./data.csv", sep="\t")
df_ = df.dropna(subset=['job'])
# Only printed text is captured in output mode, so the table must be
# printed explicitly; a bare expression would produce no result.
print(df_['job'].value_counts().head(7))
#+end_src
#+begin_src python :results file :session :var matplot_lib_filename="fig2_python_org.png" :exports both
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

df = pd.read_csv("./data.csv", sep="\t")
df_ = df.dropna(subset=['job'])

# The seven most frequent jobs in the dataset
top_jobs = df_['job'].value_counts().head(7).index

# pyplot state persists between blocks of the same session: start from a
# fresh figure so earlier plots are not drawn underneath this one.
plt.figure()
# Pass the series by keyword; recent seaborn versions no longer accept it
# positionally as the x variable.
sns.countplot(x=df_.loc[df_['job'].isin(top_jobs), 'job'])
plt.xticks(rotation=90)
# bbox_inches="tight" keeps the rotated tick labels inside the saved image.
plt.savefig(matplot_lib_filename, bbox_inches="tight")
# The value of the last expression is what Org uses as the file link.
matplot_lib_filename
#+end_src
** Réflexion
It seems that this database is mostly about politics, since the top 2 jobs are Republican and Democrat.
Let us check the rate of Republicans versus Democrats in the top states of the US.
But first, let us identify the top states of the US.
Checking the top 7 US states present in the dataset.
#+begin_src python :results output :exports both
import pandas as pd

df = pd.read_csv("./data.csv", sep="\t")
df_ = df.dropna(subset=['job'])
# Only printed text is captured in output mode, so the table must be
# printed explicitly; a bare expression would produce no result.
print(df_['state'].value_counts().head(7))
#+end_src
#+begin_src python :results file :session :var matplot_lib_filename="fig3_python_org.png" :exports both
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

df = pd.read_csv("./data.csv", sep="\t")
df_ = df.dropna(subset=['job'])

# The seven states with the most records
top_states = df_['state'].value_counts().head(7).index

# pyplot state persists between blocks of the same session: start from a
# fresh figure so earlier plots are not drawn underneath this one.
plt.figure()
# Pass the series by keyword; recent seaborn versions no longer accept it
# positionally as the x variable.
sns.countplot(x=df_.loc[df_['state'].isin(top_states), 'state'])
plt.xticks(rotation=90)
# bbox_inches="tight" keeps the rotated tick labels inside the saved image.
plt.savefig(matplot_lib_filename, bbox_inches="tight")
# The value of the last expression is what Org uses as the file link.
matplot_lib_filename
#+end_src
Now let us compare the distribution of Republicans versus Democrats
in the top 7 US states.
#+begin_src python :results file :session :var matplot_lib_filename="fig4_python_org.png" :exports both
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

df = pd.read_csv("./data.csv", sep="\t")
df_ = df.dropna(subset=['job'])

# Keep only Republican/Democrat records from the seven most frequent states
top_states = df_['state'].value_counts().head(7).index
is_party = df_['job'].isin(["Republican", "Democrat"])
df1 = df_[is_party & df_['state'].isin(top_states)]

# pyplot state persists between blocks of the same session: start from a
# fresh figure so earlier plots are not drawn underneath this one.
plt.figure()
sns.countplot(data=df1, x='state', hue='job')
plt.title("Distribution of Republican vs Democrat in the top 7 US states in the database")
plt.xticks(rotation=90)
# bbox_inches="tight" keeps the rotated tick labels inside the saved image.
plt.savefig(matplot_lib_filename, bbox_inches="tight")
# The value of the last expression is what Org uses as the file link.
matplot_lib_filename
#+end_src
* Word Cloud
We could also have found the top states and top jobs using a word cloud.
#+begin_src python :results file :session :var matplot_lib_filename="fig5_python_org.png" :exports both
import pandas as pd
import matplotlib.pyplot as plt
from wordcloud import WordCloud, ImageColorGenerator

df = pd.read_csv("./data.csv", sep="\t")
df_ = df.dropna(subset=['job'])

# Concatenate every job label into one text; word size reflects frequency.
text = ' '.join(df_['job'].tolist())
wordcloud = WordCloud(background_color="white").generate(text)

# Display the generated image:
plt.figure(figsize=(15,8))
plt.imshow(wordcloud)
plt.axis("off")
# Save BEFORE show(): once the figure has been shown it may be closed,
# and savefig() would then write an empty image.
plt.savefig(matplot_lib_filename)
plt.show()
# The value of the last expression is what Org uses as the file link.
matplot_lib_filename
#+end_src
#+begin_src python :results file :session :var matplot_lib_filename="fig6_python_org.png" :exports both
import pandas as pd
import matplotlib.pyplot as plt
# Import explicitly so the block does not depend on an earlier block of the
# session having been evaluated first.
from wordcloud import WordCloud

df = pd.read_csv("./data.csv", sep="\t")
df_ = df.dropna(subset=['job'])

# Only rows with a missing job were dropped, so `state` may still contain
# NaN; str.join() would fail on those, hence the dropna().
text = ' '.join(df_['state'].dropna().tolist())
wordcloud = WordCloud(background_color="pink").generate(text)

# Display the generated image:
plt.figure(figsize=(15,8))
plt.imshow(wordcloud)
plt.axis("off")
# Save BEFORE show(): once the figure has been shown it may be closed,
# and savefig() would then write an empty image.
plt.savefig(matplot_lib_filename)
plt.show()
# The value of the last expression is what Org uses as the file link.
matplot_lib_filename
#+end_src