---
title: "Linear Regression on trees"
author: "Nour El hassane"
date: "2022-11-24"
output: html_document
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
library(tidyverse)
library(dplyr)
library(ggplot2)

# Return this machine's node name (as reported by Sys.info()) with every
# space character removed.
get_hostname <- function() {
  gsub(" ", "", as.character(Sys.info()["nodename"]))
}
```

## Import the data

```{r}
# Read the semicolon-separated file and drop the rows whose HAUTEUR is 0.
myData <- read.table(file = "arbres-tot.csv", sep = ";", header = TRUE)
myData <- myData[myData$HAUTEUR != 0, ]
myData
```

```{r}
# Keep only the two variables used by the regression, under short names.
circ <- myData$CIRCONFERENCE
height <- myData$HAUTEUR
arbres <- data.frame(circ = circ, height = height)
arbres
```

```{r}
# Scatter plot of height against circ.
ggplot(arbres, aes(x = circ, y = height)) +
  geom_point() +
  xlab("circ") +
  ylab("height")
```

```{r}
# Simple linear regression of height on circ.
simple_reg <- lm(height ~ circ, data = arbres)
names(simple_reg)
```

```{r}
anova(simple_reg)
# 1509 is the variance captured by the model
# 1131 is the variance not captured by the model
```

```{r}
summary(simple_reg)
# 0.09 is the estimate of beta_2
# We keep circ because beta_2 is significantly different from 0
```

```{r}
# Scatter plot with the fitted regression line. The data argument is `arbres`
# (the frame the model was fitted on): `myData` has no `circ`/`height` columns,
# so mapping them against `myData` only worked through the global vectors of
# the same name.
ggplot(arbres, aes(x = circ, y = height)) +
  geom_point() +
  stat_smooth(method = "lm", se = FALSE) +
  xlab("circ") +
  ylab("height")
```

```{r}
# It tells how far the points are from the line, and shows that it's okay; but
# if we have a big value in between, then it could be that there is no
# correlation, or that there is a problem.
acf(residuals(simple_reg))
```

```{r}
plot(simple_reg, 2)
```

```{r}
# Graph 1: plot the line in a horizontal direction.
# Graph 2: variability of the errors; the bigger the height, the more probable
#          the error, because in the dataset points go below the line for
#          larger values.
# Graph 3: x is the fitted values if the model was perfect.
plot(simple_reg$residuals)
plot(simple_reg, 3)
plot(simple_reg, 1)
```

```{r}
# Graph to detect all outliers; obs 74, 33, and 102 are the indexes of the
# outliers.
plot(simple_reg, 4)
```

```{r}
# fit is the prediction; lwr and upr are the bounds of the prediction interval
# (interval = "prediction"), not of a confidence interval.
predict(simple_reg, data.frame(circ = 75), interval = "prediction")
```

```{r}
# R is better, but coefficients aren't significantly far from 0. We remove circ.
arbres$circ_sqrt <- sqrt(arbres$circ)
multi_reg <- lm(height ~ circ + circ_sqrt, data = arbres)
summary(multi_reg)
```

```{r}
# Scatter plot of height against sqrt(circ) with a fitted straight line.
ggplot(arbres, aes(x = circ_sqrt, y = height)) +
  geom_point() +
  stat_smooth(method = "lm", se = FALSE) +
  xlab("circ_sqrt") +
  ylab("height")
```