From 5dcb435de4e4e0add9fc9f8226da5bb187bc671e Mon Sep 17 00:00:00 2001
From: NourElh <734092651fcdd5add927271f472626a6@app-learninglab.inria.fr>
Date: Sat, 5 Nov 2022 09:42:01 +0000
Subject: [PATCH] Delete SMPE-HM1.Rmd

---
 data-analysis-visualization/SMPE-HM1.Rmd | 171 -----------------------
 1 file changed, 171 deletions(-)
 delete mode 100644 data-analysis-visualization/SMPE-HM1.Rmd

diff --git a/data-analysis-visualization/SMPE-HM1.Rmd b/data-analysis-visualization/SMPE-HM1.Rmd
deleted file mode 100644
index 49b4dde..0000000
--- a/data-analysis-visualization/SMPE-HM1.Rmd
+++ /dev/null
@@ -1,171 +0,0 @@
----
-title: "SMPE-HM1"
-output:
-  pdf_document: default
-  html_document: default
----
-
-```{r setup, include=FALSE}
-knitr::opts_chunk$set(echo = TRUE)
-```
-# Getting the wrong picture from the data
-## Read the data
-A good way to store the data is to organize it in a csv file.
-```{r}
-# Load the score tables of the two groups. The paths are relative, so they
-# resolve against the working directory in effect when the document is knitted.
-group1 <- read.csv("activite-histo-group1.csv", header = TRUE)
-group2 <- read.csv("activite-histo-group2.csv", header = TRUE)
-
-# Print both tables in full so the raw data appears in the report.
-group1
-group2
-```
-## Data Visualization
-### Petite section
-We plot the scores of both groups for each section in the same graph to see the difference. We need to make a plot that represents the scores of each group. For example, we use a solid line for the first group and a dashed line for group 2. A legend is added to the graph to make it clear to the reader what the two lines represent.
-```{r}
-# Line plot of the petite section scores, one x position per row of each group.
-# x positions and axis limits are derived from the data, so the chunk keeps
-# working if the group sizes change and neither curve is clipped.
-petite_1 <- group1$score.in.petite.section
-petite_2 <- group2$score.in.petite.section
-
-# solid line for group 1; this call also draws the axes, title and labels
-plot(x = seq_along(petite_1), y = petite_1, type = "l", lty = 1,
-     xlim = c(1, max(length(petite_1), length(petite_2))),
-     ylim = range(petite_1, petite_2, finite = TRUE),
-     main = "Petite section scores for group 1 and 2",
-     xlab = "students", ylab = "scores")
-
-# dashed line for group 2
-lines(x = seq_along(petite_2), y = petite_2, lty = 2)
-
-# xpd = TRUE disables clipping to the plot region, so a legend anchored at
-# y = -1 is still drawn when that lies outside the plotted range
-par(xpd=TRUE)
-legend(x=5, y=-1, legend=c("group1", "group2"), lty=1:2, box.lty=0, ncol=2)
-```
-It is difficult to see from this plot whether the classical pedagogy is better than the alternative one. To make it clearer, we can visualize the distribution of the scores, i.e. how frequently each score range occurs in each group, and draw a conclusion from that.
-
-```{r}
-# no longer required by this chunk; kept so the set of attached packages is unchanged
-library(plyr)
-
-# Overlaid histograms of the raw petite section scores of both groups.
-# The histograms are built from the score columns themselves: building them from
-# the distinct values returned by count() would give every distinct score a
-# weight of one and ignore how many students obtained it.
-scores_1 <- group1$score.in.petite.section
-scores_2 <- group2$score.in.petite.section
-
-# shared breaks so the bars of the two groups are directly comparable
-breaks <- pretty(range(scores_1, scores_2, finite = TRUE), n = 10)
-hg1 <- hist(scores_1, breaks = breaks, plot = FALSE) # Save first histogram data
-hg2 <- hist(scores_2, breaks = breaks, plot = FALSE) # Save 2nd histogram data
-
-# semi-transparent fills; adjustcolor() comes from grDevices, so no extra
-# package has to be attached for the transparency
-col_1 <- adjustcolor("blue", alpha.f = 0.2)
-col_2 <- adjustcolor("red", alpha.f = 0.4)
-
-# ylim covers both histograms so the taller one is not cut off
-plot(hg1, col = col_1, ylim = c(0, max(hg1$counts, hg2$counts)),
-     xlab = "Petite section scores",
-     main = "Petite section scores frequency for group 1 and 2")
-plot(hg2, col = col_2, add = TRUE)
-legend("topright",
-       legend = c("group 1", "group 2"),
-       fill = c(col_1, col_2), # same colours as the bars
-       border = "black")
-```
-
-<!-- NOTE(review): the histogram above is now computed from the raw scores; re-knit and confirm that the interpretation below still matches the figure. -->
-
-According to the graph, the classical pedagogy has a higher frequency of low scores than the alternative pedagogy. However, the classical pedagogy got the best scores [3, 3.5].
-
-### Moyenne section
-Let's see for the moyenne section.
-```{r}
-# Line plot of the moyenne section scores, one x position per row of each group.
-# x positions and axis limits are derived from the data, so the chunk keeps
-# working if the group sizes change and neither curve is clipped.
-moyenne_1 <- group1$score.in.moyenne.section
-moyenne_2 <- group2$score.in.moyenne.section
-
-# solid line for group 1; this call also draws the axes, title and labels
-plot(x = seq_along(moyenne_1), y = moyenne_1, type = "l", lty = 1,
-     xlim = c(1, max(length(moyenne_1), length(moyenne_2))),
-     ylim = range(moyenne_1, moyenne_2, finite = TRUE),
-     main = "Moyenne section scores for group 1 and 2",
-     xlab = "students", ylab = "scores")
-
-# dashed line for group 2
-lines(x = seq_along(moyenne_2), y = moyenne_2, lty = 2)
-
-# xpd = TRUE disables clipping to the plot region, so a legend anchored at
-# y = -1 is still drawn when that lies outside the plotted range
-par(xpd=TRUE)
-legend(x=5, y=-1, legend=c("group1", "group2"), lty=1:2, box.lty=0, ncol=2)
-
-```
-```{r}
-# no longer required by this chunk; kept so the set of attached packages is unchanged
-library(ggplot2)
-library(plyr)
-
-# Overlaid histograms of the raw moyenne section scores of both groups.
-# The histograms are built from the score columns themselves: building them from
-# the distinct values returned by count() would give every distinct score a
-# weight of one and ignore how many students obtained it.
-scores_1 <- group1$score.in.moyenne.section
-scores_2 <- group2$score.in.moyenne.section
-
-# shared breaks so the bars of the two groups are directly comparable
-breaks <- pretty(range(scores_1, scores_2, finite = TRUE), n = 10)
-hg1 <- hist(scores_1, breaks = breaks, plot = FALSE) # Save first histogram data
-hg2 <- hist(scores_2, breaks = breaks, plot = FALSE) # Save 2nd histogram data
-
-# semi-transparent fills, reused for the legend so it matches the bars
-col_1 <- adjustcolor("blue", alpha.f = 0.2)
-col_2 <- adjustcolor("red", alpha.f = 0.4)
-
-# ylim covers both histograms so the taller one is not cut off
-plot(hg1, col = col_1, ylim = c(0, max(hg1$counts, hg2$counts)),
-     xlab = "Moyenne section scores",
-     main = "Moyenne section scores frequency for group 1 and 2")
-plot(hg2, col = col_2, add = TRUE)
-legend("topright",
-       legend = c("group 1", "group 2"),
-       fill = c(col_1, col_2), # same colours as the bars
-       border = "black")
-```
-
-<!-- NOTE(review): the histogram above is now computed from the raw scores; re-knit and confirm that the interpretation below still matches the figure. -->
-
-Again, the classical pedagogy has a higher frequency of low scores than the alternative pedagogy, but it has the best scores [7,8].
-
-### Grande section
-Let's see now for the grande section.
-```{r}
-# Line plot of the grande section scores, one x position per row of each group.
-# x positions and axis limits are derived from the data, so the chunk keeps
-# working if the group sizes change and neither curve is clipped.
-grande_1 <- group1$score.in.grande.section
-grande_2 <- group2$score.in.grande.section
-
-# solid line for group 1; this call also draws the axes, title and labels
-plot(x = seq_along(grande_1), y = grande_1, type = "l", lty = 1,
-     xlim = c(1, max(length(grande_1), length(grande_2))),
-     ylim = range(grande_1, grande_2, finite = TRUE),
-     main = "Grande section scores for group 1 and 2",
-     xlab = "students", ylab = "scores")
-
-# dashed line for group 2
-lines(x = seq_along(grande_2), y = grande_2, lty = 2)
-
-# xpd = TRUE disables clipping to the plot region, so a legend anchored at
-# y = -1 is still drawn when that lies outside the plotted range
-par(xpd=TRUE)
-legend(x=5, y=-1, legend=c("group1", "group2"), lty=1:2, box.lty=0, ncol=2)
-```
-```{r}
-# no longer required by this chunk; kept so the set of attached packages is unchanged
-library(ggplot2)
-library(plyr)
-
-# Overlaid histograms of the raw grande section scores of both groups.
-# The histograms are built from the score columns themselves: building them from
-# the distinct values returned by count() would give every distinct score a
-# weight of one and ignore how many students obtained it.
-scores_1 <- group1$score.in.grande.section
-scores_2 <- group2$score.in.grande.section
-
-# shared breaks so the bars of the two groups are directly comparable
-breaks <- pretty(range(scores_1, scores_2, finite = TRUE), n = 10)
-hg1 <- hist(scores_1, breaks = breaks, plot = FALSE) # Save first histogram data
-hg2 <- hist(scores_2, breaks = breaks, plot = FALSE) # Save 2nd histogram data
-
-# semi-transparent fills, reused for the legend so it matches the bars
-col_1 <- adjustcolor("blue", alpha.f = 0.2)
-col_2 <- adjustcolor("red", alpha.f = 0.4)
-
-# ylim covers both histograms so the taller one is not cut off
-plot(hg1, col = col_1, ylim = c(0, max(hg1$counts, hg2$counts)),
-     xlab = "Grande section scores",
-     main = "Grande section scores frequency for group 1 and 2")
-plot(hg2, col = col_2, add = TRUE)
-legend("topright",
-       legend = c("group 1", "group 2"),
-       fill = c(col_1, col_2), # same colours as the bars
-       border = "black")
-```
-<!-- NOTE(review): the histogram above is now computed from the raw scores; re-knit and confirm that the interpretation below still matches the figure. -->
-
-However, the last graph shows that the alternative pedagogy is better than the classical one in the grande section.
-
-We can also visualize the mean of each section for both groups in the same plot.
-```{r}
-library(ggplot2)
-
-# Mean score per section: row 1 holds group 1, row 2 holds group 2.
-df <- data.frame(
-  Petite = c(mean(group1$score.in.petite.section), mean(group2$score.in.petite.section)),
-  Moyenne = c(mean(group1$score.in.moyenne.section), mean(group2$score.in.moyenne.section)),
-  Grande = c(mean(group1$score.in.grande.section), mean(group2$score.in.grande.section))
-)
-print(df)
-
-# one vector of three means per group, in the order Petite, Moyenne, Grande
-means_1 <- unlist(df[1, ], use.names = FALSE)
-means_2 <- unlist(df[2, ], use.names = FALSE)
-
-# The sections are on the x axis and the average scores on the y axis, so the
-# axis titles are set accordingly. ylim spans both groups so that the second
-# line cannot fall outside the plot.
-plot(x = 1:3, y = means_1, xaxt = "n", type = "o", col = 1,
-     ylim = range(means_1, means_2, finite = TRUE),
-     xlab = "Sections", ylab = "Average scores",
-     main = "Average scores of sections for group 1 & 2")
-lines(x = 1:3, y = means_2, type = "o", col = 2)
-
-# the default x axis was suppressed above; draw ticks at the three sections only
-axis(1, at = seq(1, 3, by = 1), las=2)
-
-# key that maps the tick numbers to section names
-legend("bottomright",
-       legend = c("1: Petite section", "2: Moyenne section","3: Grande section"),
-       border = "black")
-par(xpd=TRUE)
-legend("topleft",
-       legend = c("group 1", "group 2"),
-       fill = c(1,2), # Color of the squares
-       border = "black")
-```
-
-# Getting the wrong picture from the data - Correlation, causality
-## Read the data
-```{r}
-# Load the foot size / dictation mistakes table (path relative to the knitting
-# working directory) and print it in full.
-data <- read.csv("foot_size_data.csv", header = TRUE)
-data
-```
-
-## Data Visualization
-- The graph I propose to represent the data is a *box plot*, which shows the range and median of the number of mistakes made in each foot size category.
-
-- To build this graph, we can use the *ggplot* library. Our data is given to ggplot, the x-axis is the foot size, the y-axis is the number of mistakes, and we want to group the latter by foot size, so we use the group argument. Then, we tell ggplot to plot the boxes, and finally to set a black & white theme for a good visualization.
-```{r}
-library(ggplot2)
-# one box per distinct feet_size value, drawn with the black & white theme
-ggplot(data = data,
-       aes(x = feet_size, y = nb_mistakes, group = factor(feet_size))) +
-  geom_boxplot() +
-  theme_bw()
-
-```
-- I made this graph because it summarizes the data well, and serves as a statistic for the reader to make a conclusion just by seeing the graph.
-
-- We can use a linear regression to determine the relationship between these two variables to make a summary.
-```{r}
-# Regress the number of mistakes on foot size.
-# The predictor is named explicitly: with `data$nb_mistakes ~ .` the dot expands
-# to every column of `data`, nb_mistakes included, so the response would be
-# regressed on itself and the summary would report a meaningless perfect fit.
-reg <- lm(nb_mistakes ~ feet_size, data = data)
-summary(reg)
-```
-
-- From the graph, we can say that for small sizes of students' feet, the number of mistakes made in dictation is large, and as the foot size grows, the number of mistakes decreases.
-
-- We deduce from the graph that the small ones (i.e. students with small foot sizes) have a high probability of making a mistake; however, with time (when they grow up and therefore their foot sizes grow), they start to master the language and to make fewer mistakes. This is logical and corresponds to my initial intuition.
-
-- Yes there is a negative correlation between the two quantities. I looked for causality and found that correlation doesn't imply causality. This means that two variables could be linked together without one of them being the reason for the other's observed behavior.
-
-- 
2.18.1