#+TITLE: Exercice 4
#+AUTHOR: Waad ALMASRI
#+DATE: 25/08/2020
#+LANGUAGE: fr
# #+PROPERTY: header-args :eval never-export

#+HTML_HEAD:
#+HTML_HEAD:
#+HTML_HEAD:
#+HTML_HEAD:
#+HTML_HEAD:
#+HTML_HEAD:

* Exploration du répertoire
D'abord, on fait un git pull pour récupérer les données qu'on a téléversées dans le répertoire GIT.
*Attention!* Si tu as commencé à écrire dans le notebook, enregistre les données pour ne pas les perdre.
Ensuite, on s'assure que nous avons les données dans le répertoire avec la commande "list.files()".

#+begin_src R :results output :session *R* :exports both
# List the contents of the current working directory.
list.files(path = ".")
#+end_src

#+RESULTS:
:  [1] "#exercice_python_fr.org#" "#exercice_R_fr.org#"
:  [3] "bar-chart.html"           "cosxsx.png"
:  [5] "data.csv"                 "exercice_en.ipynb"
:  [7] "exercice_en.Rmd"          "exercice_fr.ipynb"
:  [9] "exercice_fr.Rmd"          "exercice_python_en.org"
: [11] "exercice_python_fr.org"   "exercice_R_en.org"
: [13] "exercice_R_fr.org"        "exercice.ipynb"
: [15] "fig2_python_org.png"      "fig3_python_org.png"
: [17] "fig4_python_org.png"      "fig5_python_org.png"

* Exploration du jeu de données
Maintenant qu'on a les données, on va commencer à les explorer.
*NB:* les données suivantes sont déjà formatées en .csv.

#+begin_src R :results output :session *R* :exports both
# data.csv is tab-separated despite its extension; read.delim() is read.csv()
# with sep = "\t" as the default separator.
df <- read.delim(file = "data.csv")
print(nrow(df))
head(df)
#+end_src

#+RESULTS:
#+begin_example
[1] 7569
        date           edited_by        job       researched_by
1 2013-08-29 Angie Drobnic Holan Republican       Jon Greenberg
2 2013-08-29 Angie Drobnic Holan Republican      Louis Jacobson
3 2013-08-29       Greg Borowski                  Tom Kertscher
4 2013-08-28    Aaron Sharockman                  Rochelle Koff
5 2013-08-28    Aaron Sharockman            Angie Drobnic Holan
6 2013-08-28    W. Gardner Selby Republican            Sue Owen
                                       source     state
1                                Scott Walker Wisconsin
2                               Mike Huckabee  Arkansas
3               League of Conservation Voters
4 National Republican Congressional Committee
5                            Janet Napolitano
6                              Steve Stockman     Texas
  statement
1 In the Wisconsin health insurance exchange, "the Society of Actuaries points out that there'll be, according to them, an 82 percent increase in individual premiums over the next couple years under Obamacare."
2 "America’s gun-related homicide rate … would be about the same as Belgium’s if you left out California, Illinois, D.C. and New Jersey, places with some of the strictest gun control laws in the U.S."
3 Says U.S. Sen. Ron Johnson voted to let oil and gas companies emit "unlimited carbon pollution into our air"
4 "Congressman Patrick Murphy voted to keep the scandal-ridden IRS in charge of enforcing Obamacare."
5 The 2010 DREAM Act failed despite "strong bipartisan support."
6 Says U.N. arms treaty will mandate a "new international gun registry."
                                                       subjects truth
1                                               ['Health Care']     3
2                                  ['Crime', 'Guns', 'Pundits']     0
3 ['Climate Change', 'Energy', 'Environment', 'Transportation']     5
4                                               ['Health Care']     2
5                             ['Bipartisanship', 'Immigration']     2
6                                                      ['Guns']     1
#+end_example

Let us add the year column to the dataframe. 
#+begin_src R :results output :session *R* :exports both
# The date column is a "YYYY-MM-DD" string, so its first four characters are
# the year. The result stays a character vector.
df$year <- substring(df$date, 1, 4)
#+end_src

#+RESULTS:

Now let us check what's in the dataframe:

#+begin_src R :results output :session *R* :exports both
summary(df)
#+end_src

#+RESULTS:
#+begin_example
     date            edited_by             job            researched_by
 Length:7569        Length:7569        Length:7569        Length:7569
 Class :character   Class :character   Class :character   Class :character
 Mode  :character   Mode  :character   Mode  :character   Mode  :character
    source             state            statement           subjects
 Length:7569        Length:7569        Length:7569        Length:7569
 Class :character   Class :character   Class :character   Class :character
 Mode  :character   Mode  :character   Mode  :character   Mode  :character
     truth           year
 Min.   :0.000   Length:7569
 1st Qu.:1.000   Class :character
 Median :3.000   Mode  :character
 Mean   :2.741
 3rd Qu.:4.000
 Max.   :5.000
#+end_example

Let us remove the missing data.

#+begin_src R :results output :session *R* :exports both
library(tidyr)
library(plyr)   # attached before dplyr so that dplyr's functions mask plyr's
library(dplyr)

# read.csv() stores empty fields of character columns as "" rather than NA,
# so drop_na() on its own would not remove them. Turn blank (empty or
# whitespace-only) job/state values into NA first.
blank_to_na <- function(x) replace(x, !nzchar(trimws(x)), NA)
df$job <- blank_to_na(df$job)
df$state <- blank_to_na(df$state)

# Keep only the rows where both job and state are known.
df <- df %>% drop_na(job, state)
#+end_src

#+RESULTS:

# NOTE(review): the cached #+RESULTS below were produced before blank values
# were treated as missing; re-evaluate the blocks to refresh them.

* Statistiques de base

#+begin_src R :results output :session *R* :exports both
# Number of distinct values in the job, editor and state columns.
print(paste0("There are ", length(unique(df$job)), " unique jobs."))
print(paste0("There are ", length(unique(df$edited_by)), " unique editors."))
print(paste0("There are ", length(unique(df$state)), " unique state"))
#+end_src

#+RESULTS:
: [1] "There are 20 unique jobs."
:
: [1] "There are 127 unique editors."
:
: [1] "There are 60 unique state"

Number of jobs per state per year

#+begin_src R :results output :session *R* :exports both
# One row per (state, year) pair with the number of records in that pair,
# sorted from the largest count to the smallest.
jobs_per_state_year <- ddply(df, .(state, year), summarise,
                             number_of_jobs = length(job))
jobs_per_state_year <- jobs_per_state_year[
  order(jobs_per_state_year$number_of_jobs, decreasing = TRUE),
]
#+end_src

#+RESULTS:

* Représentations graphiques
We will start by plotting the Nbr of jobs per year of New York versus Texas. 
#+begin_src R :results output graphics :file (org-babel-temp-file "figure" ".png") :exports both :width 600 :height 400 :session *R*
library(ggplot2)

# Count the rows per (state, year) for the two states being compared and draw
# them as bars filled by state.
# Columns are referenced by bare name (state, not df$state) so that filter()
# evaluates them in the data flowing through the pipe, not in the global df.
df %>%
  filter(state %in% c("Texas", "New York")) %>%
  group_by(state, year) %>%
  dplyr::summarise(Nbr_of_jobs = n()) %>%
  ungroup() %>%
  ggplot(aes(x = year, y = Nbr_of_jobs)) +
  geom_bar(aes(fill = state), stat = "identity") +
  theme_bw()
#+end_src

#+RESULTS:
[[file:/var/folders/7s/_r7s0qgj0nlbng33j4v38z9h0000gn/T/babel-dXCm2H/figureLo0GIk.png]]

Let us check the top 7 jobs present in the US:

#+begin_src R :results output graphics :file (org-babel-temp-file "figure" ".png") :exports both :width 600 :height 400 :session *R*
# Number of records per job, sorted from the most to the least frequent.
top_jobs <- ddply(df, .(job), summarise, number_of_jobs = length(state))
top_jobs <- top_jobs[order(top_jobs$number_of_jobs, decreasing = TRUE), ]

# All jobs are plotted; reorder() puts the bars in decreasing order of count.
ggplot(data = top_jobs,
       aes(x = reorder(job, -number_of_jobs), y = number_of_jobs)) +
  geom_bar(stat = "identity", color = "blue", fill = "white") +
  theme(axis.text.x = element_text(angle = 90))
#+end_src

#+RESULTS:
[[file:/var/folders/7s/_r7s0qgj0nlbng33j4v38z9h0000gn/T/babel-dXCm2H/figurejq9iem.png]]

** Réflexion
It seems that this database is more about politics since we see that the top 2 jobs are Republicans and Democrats. Let us check the rate of Republicans versus Democrats in the top states of the US. But first let us identify the top states of the US. 
#+begin_src R :results output graphics :file (org-babel-temp-file "figure" ".png") :exports both :width 600 :height 400 :session *R*
# Number of records per state, sorted from the largest count to the smallest.
jobs_per_state <- ddply(df, .(state), summarise, number_of_jobs = length(job))
jobs_per_state <- jobs_per_state[
  order(jobs_per_state$number_of_jobs, decreasing = TRUE),
]

# reorder() puts the bars in decreasing order of count.
ggplot(data = jobs_per_state,
       aes(x = reorder(state, -number_of_jobs), y = number_of_jobs)) +
  geom_bar(stat = "identity", color = "white", fill = "red") +
  theme(axis.text.x = element_text(angle = 90))
#+end_src

#+RESULTS:
[[file:/var/folders/7s/_r7s0qgj0nlbng33j4v38z9h0000gn/T/babel-dXCm2H/figureDIwvHt.png]]

Thus, we can conclude that the top 7 US states having the highest jobs availability are: Texas, Florida, Illinois, Ohio, Wisconsin, Georgia and Rhode Island. Now let us compare the distribution of the Republican versus Democrat in the top 7 US states:

#+begin_src R :results output graphics :file (org-babel-temp-file "figure" ".png") :exports both :width 600 :height 400 :session *R*
# Keep the seven states listed above and the two parties, count the rows per
# (state, job) and draw them as bars filled by party.
# Columns are referenced by bare name (state, job) rather than df$state and
# df$job so that filter() evaluates them in the piped data, not the global df.
df %>%
  filter(
    state %in% c("Texas", "Florida", "Illinois", "Ohio", "Wisconsin",
                 "Georgia", "Rhode Island"),
    job %in% c("Republican", "Democrat")
  ) %>%
  group_by(state, job) %>%
  dplyr::summarise(Nbr_of_jobs = n()) %>%
  ungroup() %>%
  ggplot(aes(x = state, y = Nbr_of_jobs)) +
  geom_bar(aes(fill = job), stat = "identity") +
  theme_bw()
#+end_src

#+RESULTS:
[[file:/var/folders/7s/_r7s0qgj0nlbng33j4v38z9h0000gn/T/babel-dXCm2H/figurekHTcAX.png]]

** Word Cloud
We could have also found the top states and top jobs using word cloud.
Top Jobs:

#+begin_src R :results output graphics :file (org-babel-temp-file "figure" ".png") :exports both :width 600 :height 400 :session *R*
library(wordcloud)
library(RColorBrewer)

# Eight-colour "Set2" palette (the original author also noted
# length(unique(top_jobs$job)) as a possible palette size).
pal2 <- brewer.pal(n = 8, name = "Set2")

# Each job is drawn with a size driven by its record count; words are placed
# in random order and 10% of them are rotated.
wordcloud(
  words = top_jobs$job,
  freq = top_jobs$number_of_jobs,
  random.order = TRUE,
  rot.per = 0.10,
  colors = pal2,
  vfont = c("sans serif", "plain")
)
#+end_src

#+RESULTS:
[[file:/var/folders/7s/_r7s0qgj0nlbng33j4v38z9h0000gn/T/babel-dXCm2H/figureSVtKM2.png]]

Top US states:

#+begin_src R :results output graphics :file (org-babel-temp-file "figure" ".png") :exports both :width 600 :height 400 :session *R*
# Eight-colour "Accent" palette for the state cloud.
pal2 <- brewer.pal(n = 8, name = "Accent")

# Each state is drawn with a size driven by its record count; words are placed
# in decreasing order of frequency and 15% of them are rotated.
wordcloud(
  words = jobs_per_state$state,
  freq = jobs_per_state$number_of_jobs,
  random.order = FALSE,
  rot.per = 0.15,
  colors = pal2,
  vfont = c("sans serif", "plain")
)
#+end_src

#+RESULTS:
[[file:/var/folders/7s/_r7s0qgj0nlbng33j4v38z9h0000gn/T/babel-dXCm2H/figure3i2g5j.png]]