From 204498aacccab9db47e3f60702bd959845551d90 Mon Sep 17 00:00:00 2001
From: NourElh <734092651fcdd5add927271f472626a6@app-learninglab.inria.fr>
Date: Thu, 5 Jan 2023 12:04:42 +0000
Subject: [PATCH] Upload New File

---
 Linear Model/tree_lab.Rmd | 92 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 92 insertions(+)
 create mode 100644 Linear Model/tree_lab.Rmd

diff --git a/Linear Model/tree_lab.Rmd b/Linear Model/tree_lab.Rmd
new file mode 100644
index 0000000..ca736e4
--- /dev/null
+++ b/Linear Model/tree_lab.Rmd	
@@ -0,0 +1,92 @@
+---
+title: "Linear Regression on trees"
+author: "Nour El hassane"
+date: "2022-11-24"
+output: html_document
+---
+
+```{r setup, include=FALSE}
+knitr::opts_chunk$set(echo = TRUE)
+library(tidyverse)
+library(dplyr)
+library(ggplot2)
+get_hostname <- function() { # node name from Sys.info(), with spaces stripped
+  gsub(" ", "", as.character(Sys.info()["nodename"]), fixed = TRUE)
+}
+```
+## Import the data
+```{r}
+myData <- read.table(file = "arbres-tot.csv", sep = ";", header = TRUE)  # semicolon-separated file with a header row
+myData <- myData[!is.na(myData$HAUTEUR) & myData$HAUTEUR != 0, ]  # drop rows whose HAUTEUR is 0 or NA (a bare != would turn NA values into all-NA rows)
+myData
+```
+
+```{r}
+circ <- myData$CIRCONFERENCE  # also kept as a standalone vector
+height <- myData$HAUTEUR
+arbres <- data.frame(circ, height)  # columns take the names of the two vectors
+arbres
+```
+
+```{r}
+ggplot(arbres, aes(x = circ, y = height)) + geom_point() + labs(x = "circ", y = "height")  # scatter of height against circ
+```
+
+```{r}
+simple_reg <- lm(formula = height ~ circ, data = arbres)  # simple linear regression of height on circ
+names(simple_reg)  # list the components stored in the fitted object
+```
+
+```{r}
+anova(simple_reg)
+# 1509 is the variability captured by the model (presumably the circ "Sum Sq" entry of the table; confirm against the output)
+# 1131 is the variability not captured by the model (presumably the residual "Sum Sq" entry; confirm against the output)
+```
+```{r}
+summary(simple_reg)
+# 0.09 is the estimate of beta_2 (presumably the coefficient on circ; confirm against the output)
+# We keep circ because beta_2 is significantly different from 0
+```
+
+```{r}
+ggplot(arbres, aes(x = circ, y = height)) + geom_point() + stat_smooth(method = "lm", se = FALSE) + xlab("circ") + ylab("height")  # plot arbres: myData has no circ/height columns
+```
+
+```{r}
+# The residuals tell how far the points are from the line; acf() plots their autocorrelation at each lag. It looks okay here, but a big value at some lag would point to correlated errors or another problem with the model
+acf(residuals(simple_reg))
+
+```
+
+```{r}
+plot(simple_reg, which = 2)  # normal Q-Q plot of the residuals
+```
+
+```{r}
+# graph 1 (residuals by observation index): the points should spread around a horizontal line
+# graph 2 (scale-location): variability of the errors; the bigger the height, the more likely an error, because in the dataset points go below the line for larger values
+# graph 3 (residuals vs fitted): x is the fitted values. NOTE(review): the original remark ("if the model was perfect,") was left unfinished
+plot(simple_reg$residuals)
+plot(simple_reg, which = 3)
+plot(simple_reg, which = 1)
+```
+```{r}
+# Cook's distance plot, used to detect outliers: obs 74, 33 and 102 are the indexes of the outliers
+plot(simple_reg, which = 4)
+```
+
+```{r}
+# fit is the prediction; lwr and upr are the bounds of the prediction interval (interval = "prediction"), not of a confidence interval
+predict(simple_reg, newdata = data.frame(circ = 75), interval = "prediction")
+```
+
+```{r}
+# R (presumably R-squared, compared with simple_reg) is better, but the coefficients aren't significantly different from 0. We remove circ.
+arbres[["circ_sqrt"]] <- sqrt(arbres[["circ"]])
+multi_reg <- lm(formula = height ~ circ + circ_sqrt, data = arbres)
+summary(multi_reg)
+```
+
+```{r}
+ggplot(arbres, aes(x = circ_sqrt, y = height)) + geom_point() + stat_smooth(method = "lm", se = FALSE) + labs(x = "circ_sqrt", y = "height")  # height against sqrt(circ) with a fitted line
+```
-- 
2.18.1