From 204498aacccab9db47e3f60702bd959845551d90 Mon Sep 17 00:00:00 2001
From: NourElh <734092651fcdd5add927271f472626a6@app-learninglab.inria.fr>
Date: Thu, 5 Jan 2023 12:04:42 +0000
Subject: [PATCH] Upload New File

---
 Linear Model/tree_lab.Rmd | 125 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 125 insertions(+)
 create mode 100644 Linear Model/tree_lab.Rmd

diff --git a/Linear Model/tree_lab.Rmd b/Linear Model/tree_lab.Rmd
new file mode 100644
index 0000000..ca736e4
--- /dev/null
+++ b/Linear Model/tree_lab.Rmd
@@ -0,0 +1,125 @@
+---
+title: "Linear Regression on trees"
+author: "Nour El hassane"
+date: "2022-11-24"
+output: html_document
+---
+
+```{r setup, include=FALSE}
+knitr::opts_chunk$set(echo = TRUE)
+library(tidyverse)
+library(dplyr)
+library(ggplot2)
+
+# Return the "nodename" entry of Sys.info() with every space removed.
+get_hostname <- function() {
+  gsub(" ", "", as.character(Sys.info()["nodename"]), fixed = TRUE)
+}
+```
+
+## Import the data
+
+```{r}
+# The input is a semicolon-separated file with a header row.
+myData <- read.table(file = "arbres-tot.csv", sep = ";", header = TRUE)
+# Drop rows whose height is 0. Checking is.na() first keeps a missing height
+# from turning into an all-NA row, which `df[NA, ]` subsetting would produce.
+has_height <- !is.na(myData$HAUTEUR) & myData$HAUTEUR != 0
+myData <- myData[has_height, ]
+myData
+```
+
+```{r}
+# Two-column frame used for modelling: circumference and height.
+circ <- myData$CIRCONFERENCE
+height <- myData$HAUTEUR
+arbres <- data.frame(circ = circ, height = height)
+arbres
+```
+
+```{r}
+ggplot(arbres, aes(x = circ, y = height)) +
+  geom_point() +
+  xlab("circ") +
+  ylab("height")
+```
+
+```{r}
+# Simple linear regression of height on circumference.
+simple_reg <- lm(height ~ circ, data = arbres)
+names(simple_reg)
+```
+
+```{r}
+anova(simple_reg)
+# 1509 is the variability captured by the model
+# 1131 is the variability not captured by the model
+```
+
+```{r}
+summary(simple_reg)
+# 0.09 is the estimate of beta_2
+# We keep circ because beta_2 is significantly different from 0
+```
+
+```{r}
+# Use `arbres`, which holds the `circ` and `height` columns. `myData` has
+# neither, so plotting from it would rely on the global vectors of that name.
+ggplot(arbres, aes(x = circ, y = height)) +
+  geom_point() +
+  stat_smooth(method = "lm", se = FALSE) +
+  xlab("circ") +
+  ylab("height")
+```
+
+```{r}
+# Autocorrelation of the residuals: it tells how far the points are from the
+# line, and shows that it is okay here. A big value in between could mean that
+# there is no correlation, or that there is a problem.
+acf(residuals(simple_reg))
+```
+
+```{r}
+# Normal Q-Q plot of the residuals.
+plot(simple_reg, 2)
+```
+
+```{r}
+# Graph 1: the line is plotted in a horizontal direction.
+# Graph 2: variability of the errors; the bigger the height, the more probable
+#   the error, because in the dataset the points go below the line for larger
+#   values.
+# Graph 3: the x-axis is the fitted values, i.e. the heights if the model was
+#   perfect.
+plot(simple_reg$residuals)
+plot(simple_reg, 3)
+plot(simple_reg, 1)
+```
+
+```{r}
+# Cook's distance plot, used to detect outliers: observations 74, 33 and 102
+# are the indexes of the outliers.
+plot(simple_reg, 4)
+```
+
+```{r}
+# fit is the prediction; lwr and upr are the bounds of the prediction interval
+# (interval = "prediction"), not of a confidence interval for the mean.
+predict(simple_reg, data.frame(circ = 75), interval = "prediction")
+```
+
+```{r}
+# R is better, but the coefficients aren't significantly far from 0, so we
+# remove circ.
+arbres$circ_sqrt <- sqrt(arbres$circ)
+multi_reg <- lm(height ~ circ + circ_sqrt, data = arbres)
+summary(multi_reg)
+```
+
+```{r}
+ggplot(arbres, aes(x = circ_sqrt, y = height)) +
+  geom_point() +
+  stat_smooth(method = "lm", se = FALSE) +
+  xlab("circ_sqrt") +
+  ylab("height")
+```
-- 
2.18.1