Commit 204498aa authored by NourElh's avatar NourElh

Upload New File

parent 181713f5
---
title: "Linear Regression on trees"
author: "Nour El hassane"
date: "2022-11-24"
output: html_document
---
```{r setup, include=FALSE}
# Default chunk option for the whole document: echo = TRUE, i.e. show each
# chunk's source code alongside its output.
knitr::opts_chunk$set(echo = TRUE)
library(tidyverse)
library(dplyr)
library(ggplot2)
# Return the "nodename" entry of Sys.info() as a plain character value with
# every space character removed.
get_hostname <- function() {
  node_name <- as.character(Sys.info()["nodename"])
  gsub(" ", "", node_name)
}
```
## Import the data
```{r}
# Read the semicolon-separated data file; its first line holds the column
# names.
myData <- read.table(file = "arbres-tot.csv", sep = ";", header = TRUE)
# Keep only rows with a recorded, non-zero HAUTEUR. The explicit is.na() test
# matters: a bare `HAUTEUR != 0` evaluates to NA for a missing value, and an NA
# in a logical row index yields a row made entirely of NA instead of dropping
# the row.
myData <- myData[!is.na(myData$HAUTEUR) & myData$HAUTEUR != 0, ]
myData
```
```{r}
# Extract the two variables used by the regression under short names, and
# gather them in a two-column data frame. `circ` and `height` also remain
# available as standalone vectors.
circ <- myData$CIRCONFERENCE
height <- myData$HAUTEUR
arbres <- data.frame(circ, height)
arbres
```
```{r}
# Scatter plot of height against circumference.
ggplot(data = arbres, mapping = aes(x = circ, y = height)) +
  geom_point() +
  labs(x = "circ", y = "height")
```
```{r}
# Fit the simple linear regression of height on circ, then list the
# components stored in the fitted model object.
simple_reg <- lm(formula = height ~ circ, data = arbres)
names(simple_reg)
```
```{r}
# Analysis-of-variance table for the simple regression.
anova(simple_reg)
# Author's reading of the table (figures not re-checked here):
# 1509 is the variability captured by the model
# 1131 is the variability not captured by the model
# NOTE(review): anova() reports sums of squares ("Sum Sq") and mean squares
# ("Mean Sq"); confirm which column these two figures come from before calling
# them variances.
```
```{r}
# Coefficient estimates, standard errors and significance tests for the model.
summary(simple_reg)
# Author's reading of the output (figures not re-checked here):
# 0.09 is the estimate of beta_2, the coefficient of circ.
# We keep circ because beta_2 is significantly different from 0.
```
```{r}
# Scatter plot of height against circumference with the least-squares line
# overlaid (se = FALSE: no confidence band). The data argument is `arbres`
# because that is the data frame holding the `circ` and `height` columns that
# aes() maps; this keeps the plot independent of same-named global vectors.
ggplot(arbres, aes(x = circ, y = height)) +
  geom_point() +
  stat_smooth(method = "lm", se = FALSE) +
  xlab("circ") +
  ylab("height")
```
```{r}
# Autocorrelation plot of the model residuals at successive lags.
# Author's reading: the plot looks okay; a large value at some lag in between
# would mean either that the residuals are not behaving as expected or that
# there is a problem with the model.
# NOTE(review): acf() treats the residuals as a sequence, so this is only
# meaningful if the row order of the data carries information - confirm.
acf(residuals(simple_reg))
```
```{r}
# Diagnostic panel 2 of plot.lm: normal Q-Q plot of the residuals.
plot(simple_reg, which = 2)
```
```{r}
# Graph 1: raw residuals in observation order; ideally they scatter around a
# horizontal line.
plot(simple_reg$residuals)
# Graph 2 (panel 3 of plot.lm, scale-location): variability of the errors.
# Author's reading: the larger the height, the more likely an error, because
# in the dataset the points fall below the line for larger values.
plot(simple_reg, which = 3)
# Graph 3 (panel 1 of plot.lm): residuals against the fitted values on the
# x axis.
plot(simple_reg, which = 1)
```
```{r}
# Diagnostic panel 4 of plot.lm: Cook's distance per observation, used to spot
# influential points. Author's reading: observations 74, 33 and 102 are the
# indexes of the outliers.
plot(simple_reg, which = 4)
```
```{r}
# Predict the height for a circumference of 75.
# `fit` is the prediction; `lwr` and `upr` are the lower and upper bounds of
# the prediction interval requested with interval = "prediction".
predict(
  simple_reg,
  newdata = data.frame(circ = 75),
  interval = "prediction"
)
```
```{r}
# Add the square root of the circumference as a second predictor and refit.
# Author's reading of the summary: R is better, but the coefficients aren't
# significantly different from 0, so circ is removed.
arbres[["circ_sqrt"]] <- sqrt(arbres[["circ"]])
multi_reg <- lm(formula = height ~ circ + circ_sqrt, data = arbres)
summary(multi_reg)
```
```{r}
# Scatter plot of height against the square root of the circumference, with
# the least-squares line overlaid (no confidence band).
ggplot(data = arbres, mapping = aes(x = circ_sqrt, y = height)) +
  geom_point() +
  stat_smooth(method = "lm", se = FALSE) +
  labs(x = "circ_sqrt", y = "height")
```
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment