#+TITLE: Latency and capacity estimation for a network connection from asymmetric measurements
#+AUTHOR: Alexandre D. Jesus
#+DATE: 2020-05-30
#+LANGUAGE: en
#+OPTIONS: toc:nil
#+PROPERTY: header-args:R :session *R*
#+LATEX_CLASS: article
#+LATEX_CLASS_OPTIONS: [a4paper]
#+LATEX_HEADER: \usepackage{geometry}
#+LATEX_HEADER: \linespread{1.2}
#+LATEX_HEADER: \setminted{breaklines}
#+LATEX_HEADER: \usepackage[parfill]{parskip}

* Introduction

A simple and commonly used model for the performance of a network connection is given by

\[ T(S) = L + S/C \]

where $S$ denotes the size of a message in bytes, $L$ denotes the latency of the connection, and $C$ denotes the capacity of the connection, so that $T(S)$ is the time required to send a message of size $S$. Since the measurements we use report round-trip times in milliseconds, we will express $L$ in milliseconds and $C$ in bytes per millisecond. In this work, we are interested in estimating the values of $L$ and $C$ for a network connection given sample values of $S$ and $T(S)$.

* Reading the data

** Dataset format

In this work we consider two datasets for two connections:

- A short on-campus connection: file://./liglab2.log.gz
- A connection to =stackoverflow.com=: file://./stackoverflow.log.gz

These files contain the raw output of a =ping= command. Each line is of the form

#+begin_example
[TIMESTAMP] SIZE bytes from DOMAIN (IP): icmp_seq=ICMP_SEQ ttl=TTL time=TIME ms
#+end_example

where:

- =TIMESTAMP= denotes the date of the measurement, expressed in seconds since January 1, 1970
- =SIZE= denotes the size of the message expressed in bytes, which corresponds to the variable $S$ of our model
- =DOMAIN= and =IP= denote the domain and IP address of the target machine, which should be fixed for each file
- =ICMP_SEQ= and =TTL= are not relevant for our analysis
- =TIME= denotes the round-trip time of the message in milliseconds, which corresponds to $T(S)$

An example line looks like this:

#+begin_example
[1421761682.052172] 665 bytes from lig-publig.imag.fr (129.88.11.7): icmp_seq=1 ttl=60 time=22.5 ms
#+end_example

** Reading the data

To process these files we devise a function that takes a file as input and returns a data frame with the relevant data for our analysis.

#+begin_src R :results none :noweb yes
read_data <- function(path) {
  <<read-data>>
}
#+end_src

In this function, we start by checking whether the file exists.

#+begin_src R :noweb-ref read-data :eval never
if(!file.exists(path)) {
  stop("File does not exist")
}
#+end_src

Then we define regular expressions that look for the timestamp, size and time data in a line.

#+begin_src R :noweb-ref read-data :eval never
timestamp_r <- "\\[[0-9]+(\\.[0-9]+)?\\]"
size_r <- "[0-9]+[[:space:]]bytes"
time_r <- "time=[0-9]+(\\.[0-9]+)?"
all_r <- sprintf("%s.*%s.*%s", timestamp_r, size_r, time_r)
#+end_src
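As a quick sanity check, the sketch below applies these expressions to the example line from above; the =line= literal is only for illustration and is not part of =read_data=.

#+begin_src R :eval never
# Illustrative only: match each regular expression against the example line
line <- "[1421761682.052172] 665 bytes from lig-publig.imag.fr (129.88.11.7): icmp_seq=1 ttl=60 time=22.5 ms"
regmatches(line, regexpr(timestamp_r, line))  # "[1421761682.052172]"
regmatches(line, regexpr(size_r, line))       # "665 bytes"
regmatches(line, regexpr(time_r, line))       # "time=22.5"
grepl(all_r, line)                            # TRUE: the line has no missing data
#+end_src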
Then, we find all the lines with no missing data, and gather the data from each of them.

#+begin_src R :noweb-ref read-data :eval never
lines <- readLines(path)
ind <- grep(all_r, lines)
timestamps <- sapply(
  regmatches(lines[ind], regexpr(timestamp_r, lines[ind])),
  function(s) as.numeric(substr(s, 2, nchar(s)-1)),
  USE.NAMES = F
)
sizes <- sapply(
  regmatches(lines[ind], regexpr(size_r, lines[ind])),
  function(s) as.numeric(substr(s, 1, nchar(s)-6)),
  USE.NAMES = F
)
times <- sapply(
  regmatches(lines[ind], regexpr(time_r, lines[ind])),
  function(s) as.numeric(substr(s, 6, nchar(s))),
  USE.NAMES = F
)
#+end_src

Finally, we aggregate this data into a data frame

#+begin_src R :noweb-ref read-data :eval never
df <- data.frame(
  timestamp = timestamps,
  size = sizes,
  time = times
)
df$timestamp <- as.POSIXct(
  df$timestamp,
  origin = "1970-01-01"
)
df
#+end_src

and quickly check that the function is working correctly.

#+begin_src R :results output :exports both
head(read_data("./stackoverflow.log.gz"))
#+end_src

#+RESULTS:
:             timestamp size time
: 1 2015-01-20 16:26:43 1257  120
: 2 2015-01-20 16:26:43  454  120
: 3 2015-01-20 16:26:43  775  126
: 4 2015-01-20 16:26:44 1334  112
: 5 2015-01-20 16:26:44   83  111
: 6 2015-01-20 16:26:44  694  111

* Analysis of the dataset =liglab2=

** Effect of timestamp on time

We start by looking at the effect of the timestamp on the time by plotting all samples from our dataset.

#+begin_src R :exports code :results none
dat <- read_data("./liglab2.log.gz")
#+end_src

#+begin_src R :exports results :results file graphics :file ./liglab1.pdf :width 6 :height 4
library("ggplot2")
ggplot(dat, aes(x = timestamp, y = time)) +
  geom_point() +
  theme_bw()
#+end_src

#+RESULTS:
[[file:./liglab1.pdf]]

This plot does not show any clear impact of the timestamp on the time. Another way to look at this data is through a stacked bar plot that shows the frequency of different time ranges over timestamp intervals.

#+begin_src R :exports results :results file graphics :file ./liglab2.pdf :width 6 :height 4
steps <- 16
dif <- (max(dat$timestamp) - min(dat$timestamp)) / steps
lower_ts <- c()
mean_ts <- c()
upper_ts <- c()
counts <- c()
types <- c()
for(i in 1:steps) {
  # lower, middle and upper bound of the i-th timestamp interval
  mi <- min(dat$timestamp) + dif*(i-1)
  me <- mi + dif / 2
  ma <- min(dat$timestamp) + dif*i
  if(i == steps) {
    ma <- ma + 1
  }
  lower_ts <- c(lower_ts, mi, mi, mi, mi, mi)
  mean_ts <- c(mean_ts, me, me, me, me, me)
  upper_ts <- c(upper_ts, ma, ma, ma, ma, ma)
  # count samples with time in [tl, ts) and timestamp in [tsl, tsu)
  count_times <- function(tl, ts, tsl, tsu) {
    sum(dat$time >= tl & dat$time < ts &
        dat$timestamp >= tsl & dat$timestamp < tsu)
  }
  types <- c(
    types,
    "0 <= t < 10",
    "10 <= t < 20",
    "20 <= t < 50",
    "50 <= t < 100",
    "100 <= t"
  )
  counts <- c(
    counts,
    count_times(0, 10, mi, ma),
    count_times(10, 20, mi, ma),
    count_times(20, 50, mi, ma),
    count_times(50, 100, mi, ma),
    count_times(100, max(dat$time)+1, mi, ma)
  )
}
aux <- data.frame(
  lower_ts = lower_ts,
  upper_ts = upper_ts,
  mean_ts = mean_ts,
  type = types,
  count = counts
)
aux$lower_ts <- as.POSIXct(aux$lower_ts, origin = "1970-01-01")
aux$mean_ts <- as.POSIXct(aux$mean_ts, origin = "1970-01-01")
aux$upper_ts <- as.POSIXct(aux$upper_ts, origin = "1970-01-01")
ggplot(aux, aes(x = mean_ts, y = count, fill = type)) +
  geom_bar(stat = "identity") +
  xlab("Timestamp") +
  ylab("Count") +
  theme_bw()
#+end_src

#+RESULTS:
[[file:./liglab2.pdf]]

This plot also shows no significant impact of the timestamp on the time. There are a couple of intervals where the time seems to increase slightly, but nothing too significant. We also note that the number of samples per timestamp interval is consistent.
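For reference, the same binning can be expressed more compactly with =cut()= and =table()=. This is only a sketch of an alternative, assuming the =dat= data frame from above is loaded.

#+begin_src R :eval never
# Bin the timestamps into 16 equal-width intervals and the times into
# the same classes as above; table() then counts each combination.
ts_bin <- cut(as.numeric(dat$timestamp), breaks = 16)
time_class <- cut(dat$time, breaks = c(0, 10, 20, 50, 100, Inf), right = FALSE)
table(ts_bin, time_class)
#+end_src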
** Effect of size on time

We start by plotting the time over the size.

#+begin_src R :exports results :results file graphics :file ./liglab3.pdf :width 6 :height 4
library("ggplot2")
ggplot(dat, aes(x = size, y = time)) +
  geom_point() +
  theme_bw()
#+end_src

#+RESULTS:
[[file:./liglab3.pdf]]

This plot shows that there is an increase in time after a certain size. To better determine at what size this increase happens, we zoom in around that point.

#+begin_src R :exports results :results file graphics :file ./liglab4.pdf :width 6 :height 4
library("ggplot2")
ggplot(dat[dat$size >= 1450 & dat$size <= 1500,], aes(x = size, y = time)) +
  geom_point() +
  theme_bw()
#+end_src

#+RESULTS:
[[file:./liglab4.pdf]]

From this plot, the increase in time seems to happen after size 1480. This threshold is likely explained by the Ethernet MTU of 1500 bytes: the 1480 bytes reported by =ping= plus the 20-byte IP header exactly fill one frame, so larger messages must be split across multiple IP packets. To be able to deal with these two cases separately, we introduce a class column into the dataset.

#+begin_src R :exports code :results none
dat$class <- NA
dat[dat$size <= 1480,]$class <- "small"
dat[dat$size > 1480,]$class <- "large"
dat$class <- as.factor(dat$class)
#+end_src

** Estimating values of L and C

We will estimate the values of $L$ and $C$ for each of the two classes previously defined.

*** Linear regression

To estimate the parameters of our network model we start by considering two linear regression models (one for each class). For the class "small" we have the linear regression model

#+begin_src R :exports both :results output
lmMod.small <- lm(time ~ size, data = dat[dat$class == "small",])
lmMod.small
#+end_src

#+RESULTS:
: 
: Call:
: lm(formula = time ~ size, data = dat[dat$class == "small", ])
: 
: Coefficients:
: (Intercept)         size
:   3.2756742    0.0003263

From this model, we can estimate the latency and capacity values for the network model. In particular, the linear regression gives us the equation

\[ T(S) = 3.2756742 + 0.0003263 \times S \]

Then, since our network model is given by $T(S) = L + S/C$, we can estimate a latency $L = 3.2756742$ ms and a capacity $C = 1 / 0.0003263 = 3064.664$ bytes/ms. For the class "large" we have the linear regression model

#+begin_src R :exports both :results output
lmMod.large <- lm(time ~ size, data = dat[dat$class == "large",])
lmMod.large
#+end_src

#+RESULTS:
: 
: Call:
: lm(formula = time ~ size, data = dat[dat$class == "large", ])
: 
: Coefficients:
: (Intercept)         size
:    5.289833     0.002579

From this we estimate a latency $L = 5.289833$ ms and a capacity $C = 1 / 0.002579 = 387.7472$ bytes/ms. Finally, we plot both regression models against our samples.

#+begin_src R :exports results :results file graphics :file ./liglab5.pdf :width 6 :height 4
library("ggplot2")
ggplot(dat, aes(x = size, y = time)) +
  geom_point(color = "gray") +
  geom_smooth(method = "lm", formula = y ~ x, se = F, color = "black") +
  facet_wrap(vars(class), scales = "free") +
  theme_bw()
#+end_src

#+RESULTS:
[[file:./liglab5.pdf]]

Note that both models seem to overestimate most of the data points. In the next section we will look into how we can improve on this.
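Rather than transcribing coefficients by hand, the estimates can also be extracted programmatically. Below is a minimal sketch; the =estimate_lc= helper is our own illustration, not part of any package, and =coef()= works equally on the quantile regression fits used later.

#+begin_src R :eval never
# Hypothetical helper: turn an intercept/slope fit into (L, C) estimates.
# L is the intercept (latency in ms), C the inverse slope (bytes/ms).
estimate_lc <- function(model) {
  cf <- coef(model)
  c(L = unname(cf[1]), C = 1 / unname(cf[2]))
}
estimate_lc(lmMod.small)  # L ~ 3.28, C ~ 3065
estimate_lc(lmMod.large)  # L ~ 5.29, C ~ 388
#+end_src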
*** Quantile regression

If we look at a box plot of our data

#+begin_src R :exports results :results file graphics :file ./liglab5-box.pdf :width 6 :height 4
library("ggplot2")
ggplot(dat, aes(class, time)) +
  geom_boxplot() +
  theme_bw()
#+end_src

#+RESULTS:
[[file:./liglab5-box.pdf]]

it indicates that the large times that dominate the previous plots are mostly outliers. We can zoom in further to take a better look at the box plot.

#+begin_src R :exports results :results file graphics :file ./liglab6.pdf :width 6 :height 4
library("ggplot2")
ggplot(dat, aes(class, time)) +
  geom_boxplot() +
  coord_cartesian(ylim = c(0, 3)) +
  theme_bw()
#+end_src

#+RESULTS:
[[file:./liglab6.pdf]]

Given this asymmetry in the data, it is probably better to use quantile regression with respect to the median.

#+begin_src R :results none
library(quantreg)
#+end_src

#+begin_src R :exports both :results output
rqMod.small <- rq(time ~ size, tau = 0.5, data = dat[dat$class == "small",])
rqMod.small
#+end_src

#+RESULTS:
#+begin_example
Call: rq(formula = time ~ size, tau = 0.5, data = dat[dat$class == "small", ])

Coefficients:
 (Intercept)         size 
1.1390594059 0.0002475248 

Degrees of freedom: 32667 total; 32665 residual
#+end_example

For the "small" class we have an estimated latency $L = 1.139$ ms and capacity $C = 1 / 0.0002475248 = 4039.999$ bytes/ms.

#+begin_src R :exports both :results output
rqMod.large <- rq(time ~ size, tau = 0.5, data = dat[dat$class == "large",])
rqMod.large
#+end_src

#+RESULTS:
#+begin_example
Call: rq(formula = time ~ size, tau = 0.5, data = dat[dat$class == "large", ])

Coefficients:
 (Intercept)         size 
1.8853521127 0.0002464789 

Degrees of freedom: 11369 total; 11367 residual
#+end_example

For the "large" class we have an estimated latency $L = 1.885$ ms and capacity $C = 1 / 0.0002464789 = 4057.142$ bytes/ms. The quantile regression thus gives noticeably different estimates; in particular, the latency estimates are significantly lower, as expected once the outliers no longer pull the fit upwards. We can plot our models

#+begin_src R :exports results :results file graphics :file ./liglab7.pdf :width 6 :height 4
library("ggplot2")
ggplot(dat, aes(x = size, y = time)) +
  geom_point(color = "gray") +
  geom_smooth(method = "rq", formula = y ~ x, se = F,
              method.args = list(tau = 0.5), color = "black") +
  facet_wrap(vars(class), scales = "free") +
  theme_bw()
#+end_src

#+RESULTS:
[[file:./liglab7.pdf]]

and zoom in to take a better look.

#+begin_src R :exports results :results file graphics :file ./liglab8.pdf :width 6 :height 4
library("ggplot2")
ggplot(dat, aes(x = size, y = time)) +
  geom_point(color = "gray") +
  geom_smooth(method = "rq", formula = y ~ x, se = F,
              method.args = list(tau = 0.5), color = "black") +
  facet_wrap(vars(class), scales = "free") +
  coord_cartesian(ylim = c(0, 3)) +
  theme_bw()
#+end_src

#+RESULTS:
[[file:./liglab8.pdf]]

Indeed, this seems to be a better fit than the linear regression, which we plot below on a similar scale.

#+begin_src R :exports results :results file graphics :file ./liglab9.pdf :width 6 :height 4
library("ggplot2")
ggplot(dat, aes(x = size, y = time)) +
  geom_point(color = "gray") +
  geom_smooth(method = "lm", formula = y ~ x, se = F, color = "black") +
  facet_wrap(vars(class), scales = "free") +
  coord_cartesian(ylim = c(0, 10)) +
  theme_bw()
#+end_src

#+RESULTS:
[[file:./liglab9.pdf]]
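To make "better fit" a bit more concrete, one can compare how closely each model tracks the bulk of the data, for instance via the median absolute residual. The sketch below assumes the four fitted models from this section are still in the session.

#+begin_src R :eval never
# The median absolute residual is insensitive to outliers, so a lower
# value means the line sits closer to the bulk of the measurements.
sapply(list(lm.small = lmMod.small, rq.small = rqMod.small,
            lm.large = lmMod.large, rq.large = rqMod.large),
       function(m) median(abs(residuals(m))))
#+end_src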
* Analysis of the dataset =stackoverflow=

Now we will perform a similar analysis on the =stackoverflow= dataset.

** Effect of timestamp on time

We start by looking at the effect of the timestamp on the time.

#+begin_src R :exports code :results none
dat <- read_data("./stackoverflow.log.gz")
#+end_src

#+begin_src R :exports results :results file graphics :file ./stackoverflow1.pdf :width 6 :height 4
library("ggplot2")
ggplot(dat, aes(x = timestamp, y = time)) +
  geom_point() +
  theme_bw()
#+end_src

#+RESULTS:
[[file:./stackoverflow1.pdf]]

Like in the previous dataset, there seems to be no significant impact of the timestamp on the time. We can also look at a frequency bar plot.

#+begin_src R :exports results :results file graphics :file ./stackoverflow2.pdf :width 6 :height 4
steps <- 16
dif <- (max(dat$timestamp) - min(dat$timestamp)) / steps
lower_ts <- c()
mean_ts <- c()
upper_ts <- c()
counts <- c()
types <- c()
for(i in 1:steps) {
  # lower, middle and upper bound of the i-th timestamp interval
  mi <- min(dat$timestamp) + dif*(i-1)
  me <- mi + dif / 2
  ma <- min(dat$timestamp) + dif*i
  if(i == steps) {
    ma <- ma + 1
  }
  lower_ts <- c(lower_ts, mi, mi, mi)
  mean_ts <- c(mean_ts, me, me, me)
  upper_ts <- c(upper_ts, ma, ma, ma)
  # count samples with time in [tl, ts) and timestamp in [tsl, tsu)
  count_times <- function(tl, ts, tsl, tsu) {
    sum(dat$time >= tl & dat$time < ts &
        dat$timestamp >= tsl & dat$timestamp < tsu)
  }
  types <- c(
    types,
    "0 <= t < 115",
    "115 <= t < 140",
    "140 <= t"
  )
  counts <- c(
    counts,
    count_times(0, 115, mi, ma),
    count_times(115, 140, mi, ma),
    count_times(140, max(dat$time)+1, mi, ma)
  )
}
aux <- data.frame(
  lower_ts = lower_ts,
  upper_ts = upper_ts,
  mean_ts = mean_ts,
  type = types,
  count = counts
)
aux$lower_ts <- as.POSIXct(aux$lower_ts, origin = "1970-01-01")
aux$mean_ts <- as.POSIXct(aux$mean_ts, origin = "1970-01-01")
aux$upper_ts <- as.POSIXct(aux$upper_ts, origin = "1970-01-01")
ggplot(aux, aes(x = mean_ts, y = count, fill = type)) +
  geom_bar(stat = "identity") +
  xlab("Timestamp") +
  ylab("Count") +
  theme_bw()
#+end_src

#+RESULTS:
[[file:./stackoverflow2.pdf]]

This plot shows that there are a couple of intervals with a slight increase in time (e.g. around 16:40). However, this does not seem to warrant a separation of the data.

** Effect of size on time

We start by plotting the time over the size.

#+begin_src R :exports results :results file graphics :file ./stackoverflow3.pdf :width 6 :height 4
library("ggplot2")
ggplot(dat, aes(x = size, y = time)) +
  geom_point() +
  theme_bw()
#+end_src

#+RESULTS:
[[file:./stackoverflow3.pdf]]

This plot shows that the size seems to have an impact on the time. If we zoom in

#+begin_src R :exports results :results file graphics :file ./stackoverflow4.pdf :width 6 :height 4
library("ggplot2")
ggplot(dat, aes(x = size, y = time)) +
  geom_point() +
  coord_cartesian(xlim = c(1440, 1520)) +
  theme_bw()
#+end_src

#+RESULTS:
[[file:./stackoverflow4.pdf]]

we see that, as in the previous dataset, this difference occurs after size 1480. As such, we again split the data into two classes.

#+begin_src R :exports code :results none
dat$class <- NA
dat[dat$size <= 1480,]$class <- "small"
dat[dat$size > 1480,]$class <- "large"
dat$class <- as.factor(dat$class)
#+end_src

** Estimating values of L and C

*** Linear regression

We again start by estimating the parameters of our network model with two linear regression models (one for each class).

#+begin_src R :exports both :results output
lmMod.small <- lm(time ~ size, data = dat[dat$class == "small",])
lmMod.small
#+end_src

#+RESULTS:
: 
: Call:
: lm(formula = time ~ size, data = dat[dat$class == "small", ])
: 
: Coefficients:
: (Intercept)         size
:   1.132e+02    4.521e-05

For the class "small" (size less than or equal to 1480) the linear model gives us the equation

\[ T(S) = 113.2 + 0.00004521 \times S \]

meaning that we can estimate a latency $L = 113.2$ ms and capacity $C = 1 / 0.00004521 = 22119$ bytes/ms.

#+begin_src R :exports both :results output
lmMod.large <- lm(time ~ size, data = dat[dat$class == "large",])
lmMod.large
#+end_src

#+RESULTS:
: 
: Call:
: lm(formula = time ~ size, data = dat[dat$class == "large", ])
: 
: Coefficients:
: (Intercept)         size
:  120.053588    -0.001803

For the class "large" we estimate a latency $L = 120.053588$ ms and capacity $C = 1 / -0.001803 = -554.63$ bytes/ms. Note that a negative capacity does not make sense in terms of the network model. However, we may assume that this is an artifact of our dataset. A plot of the regression models against the data indicates that both models overestimate the latency due to outliers, just as for the previous dataset.

#+begin_src R :exports results :results file graphics :file ./stackoverflow5.pdf :width 6 :height 4
library("ggplot2")
ggplot(dat, aes(x = size, y = time)) +
  geom_point(color = "gray") +
  geom_smooth(method = "lm", formula = y ~ x, color = "black", se = F) +
  facet_wrap(vars(class), scales = "free") +
  theme_bw()
#+end_src

#+RESULTS:
[[file:./stackoverflow5.pdf]]
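One way to check that the negative slope is an artifact rather than a real effect is to look at its confidence interval. This is a short sketch assuming =lmMod.large= from above is still in the session.

#+begin_src R :eval never
# If the interval for the size coefficient contains zero, the negative
# slope (and hence the negative capacity) is not distinguishable from
# "size has no effect".
confint(lmMod.large, "size", level = 0.95)
#+end_src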
*** Quantile regression

As in the previous case, we notice an asymmetry in the data, and the linear regression model seems to be influenced by the outliers.

#+begin_src R :exports results :results file graphics :file ./stackoverflow6.pdf :width 6 :height 4
library("ggplot2")
ggplot(dat, aes(class, time)) +
  geom_boxplot() +
  theme_bw()
#+end_src

#+RESULTS:
[[file:./stackoverflow6.pdf]]

As such, we will once again look into quantile regression with respect to the median.

#+begin_src R :results none
library(quantreg)
#+end_src

#+begin_src R :exports both :results output
rqMod.small <- rq(time ~ size, tau = 0.5, data = dat[dat$class == "small",])
rqMod.small
#+end_src

#+RESULTS:
#+begin_example
Call: rq(formula = time ~ size, tau = 0.5, data = dat[dat$class == "small", ])

Coefficients:
 (Intercept)         size 
1.110000e+02 7.013151e-18 

Degrees of freedom: 5015 total; 5013 residual
#+end_example

For the "small" class we estimate a latency $L = 111$ ms and a capacity $C = 1 / (7.013151 \times 10^{-18}) = 1.425893 \times 10^{17}$ bytes/ms. Note that the slope is essentially zero, so the size has no measurable impact on the time and the capacity estimate is effectively meaningless.

#+begin_src R :exports both :results output
rqMod.large <- rq(time ~ size, tau = 0.5, data = dat[dat$class == "large",])
rqMod.large
#+end_src

#+RESULTS:
#+begin_example
Call: rq(formula = time ~ size, tau = 0.5, data = dat[dat$class == "large", ])

Coefficients:
  (Intercept)          size 
 1.120000e+02 -4.405647e-19 

Degrees of freedom: 1809 total; 1807 residual
#+end_example

For the "large" class we estimate a latency $L = 112$ ms and a capacity $C = 1 / (-4.405647 \times 10^{-19}) = -2.269814 \times 10^{18}$ bytes/ms. Once again we obtain a negative capacity, but the slope is so close to zero that this is best seen as an artifact of the data: we could flip the sign of the slope with no significant detriment to the model. Finally, we plot the models along with the data.

#+begin_src R :exports results :results file graphics :file ./stackoverflow7.pdf :width 6 :height 4
library("ggplot2")
ggplot(dat, aes(x = size, y = time)) +
  geom_point(color = "gray") +
  geom_smooth(method = "rq", formula = y ~ x, se = F,
              method.args = list(tau = 0.5), color = "black") +
  facet_wrap(vars(class), scales = "free") +
  theme_bw()
#+end_src

#+RESULTS:
[[file:./stackoverflow7.pdf]]

This plot indicates that, as for the previous dataset, the quantile model is closer to the bulk of the data.

# LocalWords: liglab stackoverflow
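As a closing summary, the hypothetical =estimate_lc= helper sketched in the =liglab2= analysis can collect the estimates for this dataset in one table, assuming it has been defined and the four model objects are still in the session.

#+begin_src R :eval never
# L in ms, C in bytes/ms; the huge or negative C values simply reflect
# slopes that are indistinguishable from zero for this connection.
sapply(list(lm.small = lmMod.small, lm.large = lmMod.large,
            rq.small = rqMod.small, rq.large = rqMod.large),
       estimate_lc)
#+end_src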