# -*- coding: utf-8 -*- # -*- org-confirm-babel-evaluate: nil -*- # -*- mode: org -*- #+TITLE: #+LANGUAGE: en #+OPTIONS: H:5 author:nil email:nil creator:nil timestamp:nil skip:nil toc:nil ^:nil #+TAGS: ARNAUD(a) CHRISTIAN(c) TOM(T) #+TAGS: noexport(n) DEPRECATED(d) ignore(i) #+TAGS: EXPERIMENT(e) LU(l) EP(e) #+STARTUP: overview indent inlineimages logdrawer hidestars #+EXPORT_SELECT_TAGS: export #+EXPORT_EXCLUDE_TAGS: noexport #+SEQ_TODO: TODO(t!) STARTED(s!) WAITING(w@) | DONE(d!) CANCELLED(c@) DEFERRED(@) FLAWED(f@) #+LATEX_CLASS: IEEEtran #+LATEX_CLASS_OPTIONS: [nofonttune] #+PROPERTY: header-args :eval never-export * LaTeX Preamble :ignore: #+LATEX_HEADER: \usepackage{DejaVuSansMono} #+LATEX_HEADER: \usepackage[T1]{fontenc} #+LATEX_HEADER: \usepackage[utf8]{inputenc} #+LATEX_HEADER: %\usepackage{fixltx2e} #+LATEX_HEADER: \usepackage{ifthen,figlatex} #+LATEX_HEADER: \usepackage{longtable} #+LATEX_HEADER: \usepackage{float} #+LATEX_HEADER: \usepackage{wrapfig} #+LATEX_HEADER: \usepackage{subfigure} #+LATEX_HEADER: \usepackage{graphicx} #+LATEX_HEADER: \usepackage{color,soul} #+LATEX_HEADER: \usepackage[export]{adjustbox} #+LATEX_HEADER: \usepackage{xspace} #+LATEX_HEADER: \usepackage{amsmath,amssymb} #+LATEX_HEADER: \usepackage[american]{babel} #+LATEX_HEADER: \usepackage{relsize} #+LATEX_HEADER: \AtBeginDocument{ #+LATEX_HEADER: \definecolor{pdfurlcolor}{rgb}{0,0,0.6} #+LATEX_HEADER: \definecolor{pdfcitecolor}{rgb}{0,0.6,0} #+LATEX_HEADER: \definecolor{pdflinkcolor}{rgb}{0.6,0,0} #+LATEX_HEADER: \definecolor{light}{gray}{.85} #+LATEX_HEADER: \definecolor{vlight}{gray}{.95} #+LATEX_HEADER: } #+LATEX_HEADER: %\usepackage[paper=letterpaper,margin=1.61in]{geometry} #+LATEX_HEADER: \usepackage{url} \urlstyle{sf} #+LATEX_HEADER: \usepackage[normalem]{ulem} #+LATEX_HEADER: \usepackage{todonotes} #+LATEX_HEADER: \usepackage{fancyvrb} #+LATEX_HEADER: \usepackage[colorlinks=true,citecolor=pdfcitecolor,urlcolor=pdfurlcolor,linkcolor=pdflinkcolor,pdfborder={0 0 0}]{hyperref} #+LATEX_HEADER: \usepackage{color,colortbl} #+LATEX_HEADER: \definecolor{gray98}{rgb}{0.98,0.98,0.98} #+LATEX_HEADER: \definecolor{gray20}{rgb}{0.20,0.20,0.20} #+LATEX_HEADER: \definecolor{gray25}{rgb}{0.25,0.25,0.25} #+LATEX_HEADER: \definecolor{gray16}{rgb}{0.161,0.161,0.161} #+LATEX_HEADER: \definecolor{gray60}{rgb}{0.6,0.6,0.6} #+LATEX_HEADER: \definecolor{gray30}{rgb}{0.3,0.3,0.3} #+LATEX_HEADER: \definecolor{bgray}{RGB}{248, 248, 248} #+LATEX_HEADER: \definecolor{amgreen}{RGB}{77, 175, 74} #+LATEX_HEADER: \definecolor{amblu}{RGB}{55, 126, 184} #+LATEX_HEADER: \definecolor{amred}{RGB}{228,26,28} #+LATEX_HEADER: \definecolor{amdove}{RGB}{102,102,122} #+LATEX_HEADER: \usepackage{xcolor} #+LATEX_HEADER: \usepackage[procnames]{listings} #+LATEX_HEADER: \lstset{ % #+LATEX_HEADER: backgroundcolor=\color{gray98}, % choose the background color; you must add \usepackage{color} or \usepackage{xcolor} #+LATEX_HEADER: basicstyle=\tt\scriptsize, % the size of the fonts that are used for the code #+LATEX_HEADER: breakatwhitespace=false, % sets if automatic breaks should only happen at whitespace #+LATEX_HEADER: breaklines=true, % sets automatic line breaking #+LATEX_HEADER: showlines=true, % sets automatic line breaking #+LATEX_HEADER: captionpos=b, % sets the caption-position to bottom #+LATEX_HEADER: commentstyle=\color{gray30}, % comment style #+LATEX_HEADER: extendedchars=true, % lets you use non-ASCII characters; for 8-bits encodings only, does not work with UTF-8 #+LATEX_HEADER: frame=single, % adds a frame around the code 
#+LATEX_HEADER: keepspaces=true, % keeps spaces in text, useful for keeping indentation of code (possibly needs columns=flexible) #+LATEX_HEADER: keywordstyle=\color{amblu}, % keyword style #+LATEX_HEADER: procnamestyle=\color{amred}, % procedures style #+LATEX_HEADER: language=[95]fortran, % the language of the code #+LATEX_HEADER: numbers=left, % where to put the line-numbers; possible values are (none, left, right) #+LATEX_HEADER: numbersep=5pt, % how far the line-numbers are from the code #+LATEX_HEADER: numberstyle=\tiny\color{gray20}, % the style that is used for the line-numbers #+LATEX_HEADER: rulecolor=\color{gray20}, % if not set, the frame-color may be changed on line-breaks within not-black text (\eg comments (green here)) #+LATEX_HEADER: showspaces=false, % show spaces everywhere adding particular underscores; it overrides 'showstringspaces' #+LATEX_HEADER: showstringspaces=false, % underline spaces within strings only #+LATEX_HEADER: showtabs=false, % show tabs within strings adding particular underscores #+LATEX_HEADER: stepnumber=2, % the step between two line-numbers. If it's 1, each line will be numbered #+LATEX_HEADER: stringstyle=\color{amdove}, % string literal style #+LATEX_HEADER: tabsize=2, % sets default tabsize to 2 spaces #+LATEX_HEADER: % title=\lstname, % show the filename of files included with \lstinputlisting; also try caption instead of title #+LATEX_HEADER: procnamekeys={call} #+LATEX_HEADER: } #+LATEX_HEADER: \definecolor{colorfuncall}{rgb}{0.6,0,0} #+LATEX_HEADER: \newcommand{\prettysmall}{\fontsize{6}{8}\selectfont} #+LATEX_HEADER: \let\oldtexttt=\texttt #+LATEX_HEADER: \renewcommand\texttt[1]{\oldtexttt{\smaller[1]{#1}}} # #+LATEX_HEADER: \usepackage[round-precision=3,round-mode=figures,scientific-notation=true]{siunitx} #+LATEX_HEADER: \usepackage[binary-units]{siunitx} #+LATEX_HEADER: \DeclareSIUnit\flop{Flop} #+LATEX_HEADER: \DeclareSIUnit\flops{\flop\per\second} #+LATEX_HEADER:\usepackage{tikz} #+LATEX_HEADER:\usetikzlibrary{arrows,shapes,positioning,shadows,trees,calc} #+LATEX_HEADER:\usepackage{pgfplots} #+LATEX_HEADER:\pgfplotsset{compat=1.13} #+LATEX_HEADER: \usepackage{enumitem} #+LATEX_HEADER: \setlist[itemize,1]{leftmargin=\dimexpr 26pt-.2in} #+LATEX_HEADER: \usepackage[mode=buildnew]{standalone} #+LATEX_HEADER: \usepackage[ruled,vlined,english]{algorithm2e} #+LATEX_HEADER: \DontPrintSemicolon #+LaTeX: \newcommand\myemph[1]{\color{colorfuncall}\textbf{#1}}% #+LaTeX: \newcommand\labspace[1][-0cm]{\vspace{#1}} #+LaTeX: \renewcommand\O{\ensuremath{\mathcal{O}}\xspace}% #+BEGIN_EXPORT latex \makeatletter \newcommand{\removelatexerror}{\let\@latex@error\@gobble} \makeatother #+END_EXPORT * LaTeX IEEE title and authors :ignore: #+BEGIN_EXPORT latex \let\oldcite=\cite \renewcommand\cite[2][]{~\ifthenelse{\equal{#1}{}}{\oldcite{#2}}{\oldcite[#1]{#2}}\xspace} \let\oldref=\ref \def\ref#1{~\oldref{#1}\xspace} \def\eqref#1{~(\oldref{#1})\xspace} \def\ie{i.e.,\xspace} \def\eg{e.g.,\xspace} \def\etal{~\textit{et al.\xspace}} \newcommand{\AL}[2][inline]{\todo[caption={},color=green!50,#1]{\small\sf\textbf{AL:} #2}} \newcommand{\TC}[2][inline]{\todo[caption={},color=blue!50,#1]{\small\sf\textbf{TOM:} #2}} \newcommand{\CH}[2][inline]{\todo[color=red!30,#1]{\small\sf \textbf{CH:} #2}} %\newcommand{\AL}[2][inline]{} %\newcommand{\TC}[2][inline]{} %\newcommand{\CH}[2][inline]{} %% Omit the copyright space. 
%\makeatletter %\def\@copyrightspace{} %\makeatother %\def\IEEEauthorblockN#1{\gdef\IEEEauthorrefmark##1{\ensuremath{{}^{\textsf{##1}}}}#1} %\newlength{\blockA} %\setlength{\blockA}{.35\linewidth} %\def\IEEEauthorblockA#1{ % \scalebox{.9}{\begin{minipage}{\blockA}\normalsize\sf % \def\IEEEauthorrefmark##1{##1: } % #1 % \end{minipage}} %} % \def\IEEEauthorrefmark#1{#1: }
\title{Emulating High Performance Linpack on a Commodity Server at the Scale of a Supercomputer}
%\title{Simulating the Energy Consumption of MPI~Applications} % Predicting the Performance and the Power Consumption of MPI Applications With SimGrid %\titlerunning{Power-aware simulation for large-scale systems with SimGrid} %
\author{ \begin{minipage}{.55\linewidth}\centering \IEEEauthorblockN{Tom Cornebize, Franz C. Heinrich, Arnaud Legrand}\\ \IEEEauthorblockA{Univ. Grenoble Alpes, CNRS, Inria, Grenoble INP, LIG\\ 38000 Grenoble, France\\ firstname.lastname@inria.fr} \end{minipage} \begin{minipage}{.35\linewidth}\centering \IEEEauthorblockN{Jérôme Vienne}\\ \IEEEauthorblockA{Texas Advanced Computing Center\\Austin, Texas, USA\\ viennej@tacc.utexas.edu} \end{minipage} }
\maketitle % typeset the title of the contribution
#+END_EXPORT
* Abstract :ignore:
#+LaTeX: \begin{abstract}
The Linpack benchmark, in particular the High-Performance Linpack (HPL) implementation, has emerged as the de-facto standard benchmark to rank supercomputers in the TOP500. Since a TOP500 machine draws several MW of power, test-running HPL on the whole machine for hours is extremely expensive. With core counts commonly beyond 100,000 and sometimes even ranging into the millions, an optimization of HPL parameters (problem size, grid arrangement, granularity, collective operation algorithms, etc.) specifically suited to the network topology and performance is essential. Such optimization can be particularly time consuming and can hardly be done through simple mathematical performance models. In this article, we explain how we both extended SimGrid's SMPI simulator and slightly modified HPL to allow a fast emulation of HPL on a single commodity computer at the scale of a supercomputer. More precisely, we take as a motivating use case the large-scale run performed on the Stampede cluster at TACC in 2013, when it was ranked 6th in the TOP500. While this qualification run required the dedication of 6,006 computing nodes of the supercomputer and more than 120\nbsp{}TB of RAM for more than 2\nbsp{}hours, we manage to simulate a similar configuration on a commodity computer with 19\nbsp{}GB of RAM in about 62\nbsp{}hours. Combined with a careful modeling of Stampede, this simulation allows us to evaluate the performance that would have been obtained using the freely available version of HPL. This performance turns out to be significantly lower than the one reported, which was obtained with a closed-source version of HPL specifically tuned by Intel engineers. Our simulation allows us to hint at where the main algorithmic improvements to HPL must have been made.
#+LaTeX: \end{abstract}
#+BEGIN_EXPORT latex
% this is needed to trim the number of authors and et al. for more than 3 authors
\bstctlcite{IEEEexample:BSTcontrol}
#+END_EXPORT
* Introduction
The world's largest and fastest machines are ranked twice a year in the so-called TOP500 list.
Among the benchmarks that are often used to evaluate those machines, the Linpack benchmark, in particular the High-Performance Linpack (HPL) implementation, has emerged as the de-facto standard benchmark, although other benchmarks such as HPCG and HPGMG have recently been proposed to become the new standard.
Today, machines with 100,000\nbsp{}cores and more are common and several machines beyond the 1,000,000\nbsp{}cores mark are already in production. This high density of computation units requires diligent optimization of application parameters, such as problem size, process organization or choice of algorithm, as these have an impact on load distribution and network utilization. Furthermore, to yield the best benchmark results, runtimes (such as Open MPI) and supporting libraries (such as BLAS) need to be fine-tuned and adapted to the underlying platform.
Alas, it typically takes several hours to run HPL on the list's number one system. This duration, combined with the power consumption that often reaches several MW for TOP500 machines, makes it financially infeasible to test-run HPL on the whole machine just to tweak parameters. Yet, performance results of an already deployed, current-generation machine typically also play a role in the funding process for future machines. Results near the optimal performance for the current machine are hence considered critical for HPC centers and vendors. These entities would benefit from being able to tune parameters without actually running the benchmark for hours.
# This estimation can be done either via (mathematical) performance models (e.g., by
# estimating performance of specific functions) or by a simulation based approach.
# While performance models neglect the
# oftentimes serious impact of the network (\eg due to congestion, shared bandwidth,
# ...), this is not in general true for the simulation approach.
# \CH{Furthermore, simulations can be used to validate/check that the execution went well (operated near the peak performance) but can also help to find the right parameters for the application, runtime and network.}
In this article, we explain how to predict the performance of HPL through simulation with the SimGrid/SMPI simulator. We detail how we obtained faithful models for several functions (\eg =DGEMM= and =DTRSM=) and how we managed to reduce the memory consumption from more than a hundred terabytes to several gigabytes, allowing us to emulate HPL on a commonly available server node. We evaluate the effectiveness of our solution by simulating a scenario similar to the run conducted on the Stampede cluster (TACC) in 2013 for the TOP500.
This article is organized as follows: Section\ref{sec:con} presents the main characteristics of the HPL application and provides details on the run that was conducted at TACC in 2013. Section\ref{sec:relwork} discusses existing related work and explains why emulation (or /online simulation/) is the only relevant approach when studying an application as complex as HPL. In Section\ref{sec:smpi}, we briefly present the simulator we used for this work, SimGrid/SMPI, followed by an extensive discussion in Section\ref{sec:em} about the optimizations on all levels (\ie simulator, application, system) that were necessary to make a large-scale run tractable. The scalability of our approach is evaluated in Section\ref{sec:scalabilityevol}. The modeling of the Stampede platform and the comparison of our simulation with the 2013 execution are detailed in Section\ref{sec:science}.
Lastly, Section\ref{sec:cl} concludes this article by summarizing our contributions. * Context #+LaTeX: \label{sec:con} # The HPLinpack benchmark consists of a set of rules: A set of linear # equations, $Ax = b$, needs to be solved and it requires furthermore that the input matrix can be of # arbitrary dimension =n= and that O(n³) + O(n²) operations be used # (hence, Strassen's matrix multiplication is prohibited). ** High-Performance Linpack \label{sec:hpl} #+BEGIN_EXPORT latex \begin{figure} \newcommand{\mykwfn}[1]{{\bf\textsf{#1}}}% \SetAlFnt{\sf}% \SetKwSty{mykwfn}% \SetKw{KwStep}{step}% \centering \begin{minipage}[m]{0.4\linewidth} % \vspace{0.3cm} % ugly, could not align the drawing with the algorithm with minipages or tabular... \begin{tikzpicture}[scale=0.23] \draw (0, 0) -- (0, 12) -- (12, 12) -- (12, 0) -- cycle; \foreach \i in {2}{ \draw [fill=lightgray] (\i, 0) -- (\i, 12-\i) -- (12, 12-\i) -- (12, 0) -- cycle; \draw [fill=gray] (\i, 12-\i) -- (\i, 12-\i-1) -- (\i+1, 12-\i-1) -- (\i+1, 12-\i) -- cycle; \draw[very thick, -latex] (\i,12-\i) -- (\i+2,12-\i-2); \draw[<->] (\i, 12-\i+0.5) -- (\i+1, 12-\i+0.5) node [pos=0.5, yshift=+0.15cm] {\scalebox{.8}{\texttt{NB}}}; } \foreach \i in {3}{ \draw [fill=white] (\i, 0) -- (\i, 12-\i) -- (12, 12-\i) -- (12, 0) -- cycle; \draw (\i,12-\i) -- (\i,0); \draw[very thick, -latex] (\i,12-\i) -- (\i+2,12-\i-2); } \draw[dashed] (0, 12) -- (12, 0); \node(L) at (2, 2) {\ensuremath{\boldsymbol{L}}}; \node(U) at (10, 10) {\ensuremath{\boldsymbol{U}}}; \node(A) at (8, 4) {\ensuremath{\boldsymbol{A}}}; \draw[<->] (0, -0.5) -- (12, -0.5) node [pos=0.5, yshift=-0.3cm] {$N$}; \end{tikzpicture} \end{minipage}% \begin{minipage}[m]{0.6\linewidth} \removelatexerror \begin{algorithm}[H] allocate and initialize $A$\; \For{$k=N$ \KwTo $0$ \KwStep \texttt{NB}}{ allocate the panel\; factor the panel\; broadcast the panel\; update the sub-matrix; } \end{algorithm} \vspace{1em} \end{minipage} \caption{Overview of High Performance Linpack}\vspace{-1em} \label{fig:hpl_overview} \end{figure} #+END_EXPORT For this work, we use the freely-available reference-implementation of the High-Performance Linpack benchmark\cite{HPL}, HPL, which is used to benchmark systems for the TOP500\cite{top500} list. HPL requires MPI to be available and implements a LU decomposition, \ie a factorization of a square matrix $A$ as the product of a lower triangular matrix $L$ and an upper triangular matrix $U$. HPL checks the correctness of this factorization by solving a linear system $A\cdot{}x=b$, but only the factorization step is benchmarked. The factorization is based on a right-looking variant of the LU factorization with row partial pivoting and allows multiple look-ahead depths. The working principle of the factorization is depicted in Figure\ref{fig:hpl_overview} and consists of a series of panel factorizations followed by an update of the trailing sub-matrix. HPL uses a two-dimensional block-cyclic data distribution of $A$ and implements several custom collective communication algorithms to efficiently overlap communication with computation. The main parameters of HPL are listed subsequently: - $N$ is the order of the square matrix $A$. - =NB= is the ``blocking factor'', \ie the granularity at which HPL operates when panels are distributed or worked on. - $P$ and $Q$ denote the number of process rows and the number of process columns, respectively. - =RFACT= determines the panel factorization algorithm. Possible values are Crout, left- or right-looking. 
- =SWAP= specifies the swapping algorithm used while pivoting. Two algorithms are available: one based on /binary exchange/ (along a virtual tree topology) and the other one based on a /spread-and-roll/ (with a higher number of parallel communications). HPL also provides a panel-size threshold triggering a switch from one variant to the other. - =BCAST= sets the algorithm used to broadcast the panel of columns to the other process columns. Legacy versions of the MPI standard only supported non-blocking point-to-point communications but did not support non-blocking collective communications, which is why HPL ships with in total 6 self-implemented variants to efficiently overlap the time spent waiting for an incoming panel with updates to the trailing matrix: =ring=, =ring-modified=, =2-ring=, =2-ring-modified=, =long=, and =long-modified=. The =modified= versions guarantee that the process right after the root (\ie the process that will become the root in the next iteration) receives data first and does not participate further in the broadcast. This process can thereby start working on the panel as soon as possible. The =ring= and =2-ring= versions correspond to the name-giving two virtual topologies while the =long= version is a /spread and roll/ algorithm where messages are chopped into $Q$ pieces. This generally leads to better bandwidth exploitation. The =ring= and =2-ring= variants rely on =MPI_Iprobe=, meaning they return control if no message has been fully received yet and hence facilitate partial overlapping of communication with computations. In HPL 2.2 and 2.1, this capability has been deactivated for the =long= and =long-modified= algorithms. A comment in the source code states that some machines apparently get stuck when there are too many ongoing messages. - =DEPTH= controls how many iterations of the outer loop can overlap with each other. #+BEGIN_EXPORT latex \begin{figure}[t] \centering \includegraphics[width=.95\linewidth,page=1]{./figures/stampede.pdf} \caption{The fat-tree network topology of Stampede.} \label{fig:fat_tree_topology} \labspace \end{figure} #+END_EXPORT The sequential complexity of this factorization is $\mathrm{flop}(N) = \frac{2}{3}N^3 + 2N^2 + \O(N)$ where $N$ is the order of the matrix to factorize. The time complexity can be approximated by $$T(N) \approx \frac{\left(\frac{2}{3}N^3 + 2N^2\right)}{P\cdot{}Q\cdot{}w} + \Theta((P+Q)\cdot{}N^2),$$ where $w$ is the flop rate of a single node and the second term corresponds to the communication overhead which is influenced by the network capacity and by the previously listed parameters (=RFACT=, =SWAP=, =BCAST=, =DEPTH=, \ldots). After each run, HPL reports the overall flop rate $\mathrm{flop}(N)/T(N)$ (expressed in \si{\giga\flops}) for the given configuration. See Figure\ref{fig:hpl_output} for a (shortened) example output. A large-scale execution of HPL on a real machine in order to submit to the TOP500 can therefore be quite time consuming as all the BLAS kernels, the MPI runtime, and HPL's numerous parameters need to be tuned carefully in order to reach optimal performance. ** A Typical Run on a Supercomputer \label{sec:stampede} In June 2013, the Stampede supercomputer at TACC was ranked 6th in the TOP500 by achieving \SI{5168.1}{\tera\flops} and was still ranked 20th in June 2017. In 2017, this machine got upgraded and renamed Stampede2. The Stampede platform consisted of 6400 Sandy Bridge nodes, each with two 8-core Xeon E5-2680 and one Intel Xeon Phi KNC MIC coprocessor. 
The nodes were interconnected through a \SI{56}{\giga\bit\per\second} FDR InfiniBand 2-level Clos fat-tree topology built on Mellanox switches. As can be seen in Figure\ref{fig:fat_tree_topology}, the 6400 nodes are divided into groups of 20, with each group being connected to one of the 320 36-port switches (\SI{4}{\tera\bit\per\second} capacity), which are themselves connected to 8 648-port ``core\nbsp{}switches'' (each with a capacity of \SI{73}{\tera\bit\per\second}). The peak performance of the 2 Xeon CPUs per node was approximately \SI{346}{\giga\flops}, while the peak performance of the KNC co-processor was about \SI{1}{\tera\flops}. The theoretical peak performance of the platform was therefore \SI{8614}{\tera\flops}. However, in the TOP500, Stampede was ranked with \SI{5168}{\tera\flops}. According to the log submitted to the TOP500 (see Figure\ref{fig:hpl_output}) that was provided to us, this execution took roughly two hours and used $77\times78 = 6,006$ processes. The matrix of order $N = 3,875,000$ occupied approximately \SI{120}{\tera\byte} of memory, \ie \SI{20}{\giga\byte} per node. One MPI process per node was used and each node's computational resources (the 16 CPU-cores and the Xeon Phi) must have been controlled by OpenMP and/or Intel's MKL. #+BEGIN_EXPORT latex \begin{figure}%[!htb] \centering \scalebox{.73}{\begin{minipage}[b]{.68\textwidth} \lstset{frame=bt,language=html,numbers=none,escapechar=£}\lstinputlisting{fullrun_hpl.txt} \end{minipage}} \null\vspace{-2em}\caption{HPL output submitted in June 2013 for the ranking of Stampede in the TOP500.}\vspace{-1em} \label{fig:hpl_output} \end{figure} #+END_EXPORT *** Hidden information about the Stampede execution :noexport: #+BEGIN_SRC C :exports none :tangle fullrun_hpl.txt ================================================================================ HPLinpack 2.1 -- High-Performance Linpack benchmark -- October 26, 2012 Written by A. Petitet and R. Clint Whaley, Innovative Computing Laboratory, UTK Modified by Piotr Luszczek, Innovative Computing Laboratory, UTK Modified by Julien Langou, University of Colorado Denver ================================================================================ The following parameter values will be used: £\myemph{N}£ : £\myemph{3875000}£ £\myemph{NB}£ : £\myemph{1024}£ PMAP : Column-major process mapping £\myemph{P}£ : £\myemph{77}£ £\myemph{Q}£ : £\myemph{78}£ PFACT : Right NBMIN : 4 NDIV : 2 RFACT : Crout BCAST : BlongM DEPTH : 0 SWAP : Binary-exchange L1 : no-transposed form U : no-transposed form EQUIL : no ALIGN : 8 double precision words -------------------------------------------------------------------------------- [...] Peak Performance = 5172687.23 GFlops / 861.25 GFlops per node ================================================================================ T/V N NB P Q Time Gflops -------------------------------------------------------------------------------- WC05C2R4 3875000 1024 77 78 7505.72 £\myemph{5.16811e+06}£ HPL_pdgesv() start time Sun Jun 2 13:04:59 2013 HPL_pdgesv() end time Sun Jun 2 15:10:04 2013 -------------------------------------------------------------------------------- ||Ax-b||_oo/(eps*(||A||_oo*||x||_oo+||b||_oo)*N)= 0.0007822 ...... PASSED #+END_SRC ** Performance Evaluation Challenges :LOGBOOK: - State "TODO" from [2017-11-15 mer. 16:26] :END: #+LaTeX: \label{sec:con:diff} The performance achieved by Stampede, \SI{5168}{\tera\flops}, needs to be compared to the peak performance of the 6,006 nodes, \ie \SI{8084}{\tera\flops}. 
This difference may be attributed to the node usage (\eg the MKL), to the MPI library, to the network topology that may be unable to deal with the very intensive communication workload, to load imbalance among nodes because some node happens to be slower for some reason (defect, system noise, \ldots), to the algorithmic structure of HPL, etc. All these factors make it difficult to know precisely what performance to expect without running the application at scale. It is clear that due to the level of complexity of both HPL and the underlying hardware, simple performance models (analytic expressions based on $N, P, Q$ and estimations of platform characteristics as presented in Section\ref{sec:hpl}) may be able to provide trends but can by no means predict the performance for each configuration (\eg capture the exact effect of HPL's 6 different broadcast algorithms on network contention). Additionally, these expressions do not allow engineers to improve the performance through actively identifying performance bottlenecks. For complex optimizations such as partially non-blocking collective communication algorithms intertwined with computations, very faithful modeling of both the application and the platform is required. Given the scale of this scenario (3,785\nbsp{}steps on 6,006 nodes in two hours), detailed simulations quickly become intractable without significant effort.
* Related Work
#+LaTeX: \label{sec:relwork}
Performance prediction of MPI applications through simulation has been widely studied over the last decades, with today's literature distinguishing mainly between two approaches: offline and online simulation.
With the most common approach, /offline simulation/, a time-independent trace of the application is first obtained on a real platform. This trace comprises sequences of MPI operations and CPU bursts and can be given as an input to a simulator that implements performance models for the CPUs and the network to derive timings. Researchers interested in finding out how their application reacts to changes to the underlying platform can replay the trace on commodity hardware at will with different platform models. Most HPC simulators available today, notably BigSim\cite{bigsim_04}, Dimemas\cite{dimemas} and CODES\cite{CODES}, rely on this approach. The main limitation of this approach comes from the trace acquisition requirement. Additionally, tracing an application provides only information about its behavior at the time of the run. Even light modifications (\eg to communication patterns) may make the trace inaccurate. For simple applications (\eg =stencil=) it is sometimes possible to extrapolate behavior from small-scale traces\cite{scalaextrap,pmac_lspp13}, but the execution is non-deterministic whenever the application relies on non-blocking communication patterns, which is unfortunately the case for HPL.
The second approach discussed in the literature is /online simulation/. Here, the application is executed (emulated) on top of a simulator that is responsible for determining when each process is run. This approach allows researchers to directly study the behavior of MPI applications, but only a few recent simulators such as SST Macro\cite{sstmacro}, SimGrid/SMPI\cite{simgrid} and the closed-source extreme-scale simulator xSim\cite{xsim} support it. To the best of our knowledge, SST Macro and SimGrid/SMPI are the only ones that are both mature enough to faithfully emulate HPL and free software.
For our work, we relied on SimGrid as we have an excellent knowledge of its internals although the developments we propose would a priori also be possible with SST Macro. Emulation of HPL comes with at least two challenges: - Firstly, the time-complexity of the algorithm is $\Theta(N^3)$. Furthermore, $\Theta(N^2)$ communications are performed, with $N$ being very large. The execution on the Stampede cluster took roughly two hours on 6,006\nbsp{}compute nodes. Using only a single node, a naive emulation of HPL at the scale of the Stampede run would take about 500\nbsp{}days if perfect scaling is reached. Although the emulation could be done in parallel, we want to use as little computing resources as possible. - Secondly, the tremendous memory consumption and consequent high number of RAM accesses for read/write operations need to be dealt with. # Real execution: # - Matrix of order 3,875,000 # - Using 6,006 MPI processes # - About 2 hours # Requirement for the emulation of Stampede's execution: # - $\ge 3, 875, 000 2 \times 8$ bytes \approx 120 terabytes of memory # - $\ge 6, 006 \times 2$ hours \approx 500 days (very optimistic) * SimGrid/SMPI in a nutshell #+LATEX: \label{sec:smpi} SimGrid\cite{simgrid} is a flexible and open-source simulation framework that was originally designed in 2000 to study scheduling heuristics tailored to heterogeneous grid computing environments. Since then, SimGrid has also been used to study peer-to-peer systems with up to two million peers\cite{simgrid_simix2_12} just as cloud and HPC infrastructures. To this end, SMPI, a simulator based on SimGrid, has been developed and used to faithfully simulate unmodified MPI applications written in C/C++ or FORTRAN\cite{smpi}. A main development goal for SimGrid has been to provide validated performance models particularly for scenarios leveraging the network. Such a validation normally consists of comparing simulation predictions with results from real experiments to confirm or debunk network and application models. In\cite{heinrich:hal-01523608}, we have for instance validated SimGrid's energy module by accurately and consistently predicting within a few percent the performance and the energy consumption of HPL and some other benchmarks on small-scale clusters (up to $12\times12$ cores in\cite{heinrich:hal-01523608} and up to $128\times1$ cores in\cite{smpi}). In this article, we aim to validate our approach through much larger experiments. This scale, however, comes at the cost of a much less controlled scenario for real-life experiments since the Stampede run of HPL was done in 2013 and we only have very limited information about the setup (\eg software versions). ** MPI Communication Modeling The complex network optimizations done in real MPI implementations need to be considered when predicting performance of MPI applications. For instance, message size not only influences the network's latency and bandwidth factors but also the protocol used, such as ``eager'' or ``rendez-vous'', as they are selected based on the message size, with each protocol having its own synchronization semantics. To deal with this, SMPI relies on a generalization of the LogGPS model\cite{smpi} and supports specifying synchronization and performance modes. This model needs to be instantiated once per platform through a carefully controlled series of messages (=MPI_Send= and =MPI_Recv=) between two nodes and through a set of piece-wise linear regressions. 
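As a rough illustration of what such an instantiation looks like, the sketch below evaluates a piece-wise linear send-time model: each message-size range gets its own latency and inverse-bandwidth coefficients, mirroring the distinct protocols (eager, rendez-vous, \ldots) of real MPI implementations. The thresholds and coefficients are made-up placeholders, not the output of an actual SMPI calibration.
#+BEGIN_SRC C
/* Sketch of a piece-wise linear send-time model in the spirit of the
 * LogGPS generalization used by SMPI.  All numeric values below are
 * illustrative placeholders; a real instantiation is obtained by
 * regression over controlled MPI_Send/MPI_Recv measurements. */
#include <stddef.h>

typedef struct {
    size_t max_size; /* upper bound (in bytes) of this regime */
    double latency;  /* seconds */
    double inv_bw;   /* seconds per byte (inverse bandwidth) */
} segment_t;

static const segment_t model[] = {
    { 1420,       1.9e-6, 1.0e-10 }, /* small messages (e.g., eager mode) */
    { 65536,      4.1e-6, 7.5e-10 }, /* medium messages */
    { (size_t)-1, 1.2e-5, 6.1e-10 }, /* large messages (e.g., rendez-vous) */
};

/* Predicted time (in seconds) to send a message of `size` bytes. */
double predicted_send_time(size_t size)
{
    const size_t n = sizeof(model) / sizeof(model[0]);
    for (size_t i = 0; i < n; i++)
        if (size <= model[i].max_size)
            return model[i].latency + model[i].inv_bw * (double)size;
    return 0.0; /* unreachable: the last segment covers all sizes */
}
#+END_SRC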
#+LABEL: \CH{This last sentence may be too long.} Modeling network topologies and contention is also difficult. SMPI relies on SimGrid's communication models where each ongoing communication is represented as a whole (as opposed to single packets) by a /flow/. Assuming steady-state, contention between active communications can be modeled as a bandwidth sharing problem that accounts for non-trivial phenomena (\eg RTT-unfairness of TCP, cross-traffic interference or network heterogeneity\cite{Velho_TOMACS13}). Communications that start or end trigger re-computation of the bandwidth sharing if needed. In this model, the time to simulate a message passing through the network is independent of its size, which is advantageous for large-scale applications frequently sending large messages. SimGrid does not model transient phenomena incurred by the network protocol but accounts for network topology and heterogeneity. Finally, collective operations are also challenging, particularly since these operations often play a key factor to an application's performance. Consequently, performance optimization of these operations has been studied intensively. As a result, MPI implementations now commonly have several alternatives for each collective operation and select one at runtime, depending on message size and communicator geometry. SMPI implements collective communication algorithms and the selection logic from several MPI implementations (\eg Open MPI, MPICH), which helps to ensure that simulations are as close as possible to real executions. Although SMPI supports these facilities, they are not required in the case of HPL as it ships with its own implementation of collective operations. #+BEGIN_EXPORT latex \tikzset{draw half paths/.style 2 args={% % From https://tex.stackexchange.com/a/292108/71579 decoration={show path construction, lineto code={ \draw [#1] (\tikzinputsegmentfirst) -- ($(\tikzinputsegmentfirst)!0.5!(\tikzinputsegmentlast)$); \draw [#2] ($(\tikzinputsegmentfirst)!0.5!(\tikzinputsegmentlast)$) -- (\tikzinputsegmentlast); } }, decorate }} \begin{figure}[b]%[htbp] \centering \begin{tikzpicture}[yscale=0.7, scale=0.7] \pgfmathtruncatemacro{\size}{4} \pgfmathtruncatemacro{\width}{2} \pgfmathtruncatemacro{\sizem}{\size-1} \pgfmathtruncatemacro{\smallbasex}{4} \pgfmathtruncatemacro{\smallbasey}{\size/2} \pgfmathtruncatemacro{\smallstopx}{\smallbasex+\width} \pgfmathtruncatemacro{\smallstopy}{\smallbasey+1} \foreach \i in {0,\sizem}{ \pgfmathtruncatemacro{\j}{\i+1} \draw (0, \i) -- (0, \j); \draw (\width, \i) -- (\width, \j); \draw[dotted] (0, \i) -- (\width, \i); \draw[dotted] (0, \j) -- (\width, \j); } \draw[dashed] (0, 1) -- (0, \sizem); \draw[dashed] (\width, 1) -- (\width, \sizem); \draw (0, 0) -- (\width, 0); \draw (0, \size) -- (\width, \size); \draw (\smallbasex,\smallbasey) -- (\smallstopx,\smallbasey) -- (\smallstopx,\smallstopy) -- (\smallbasex,\smallstopy) -- cycle; \foreach \i in {0,\sizem}{ \pgfmathtruncatemacro{\j}{\i+1} \draw[dotted] (\width, \i) -- (\smallbasex, \smallbasey); \draw[dotted] (\width, \j) -- (\smallbasex, \smallstopy); \pgfmathsetmacro{\xleft}{\width} \pgfmathsetmacro{\xright}{\smallbasex}%{\width/2.0+\smallbasex/2.0} \pgfmathsetmacro{\yleft}{\i + 0.5} \pgfmathsetmacro{\yright}{\smallbasey + 0.5} \path [draw half paths={solid, -latex}{draw=none}] (\xleft, \yleft) -- (\xright, \yright); } \draw[decorate,line width=1pt,decoration={brace,raise=0.2cm}] (0, 0) -- (0, \size) node [pos=0.5, xshift=-1cm] {virtual}; \draw[decorate,line 
width=1pt,decoration={brace,mirror,raise=0.2cm}] (\smallstopx, \smallbasey) -- (\smallstopx, \smallstopy) node [pos=0.5, xshift=1.2cm] {physical}; \end{tikzpicture} \caption{\label{fig:global_shared_malloc}SMPI shared malloc mechanism: large area of virtual memory are cyclically mapped onto the same physical pages.}\vspace{-1em} \end{figure} #+END_EXPORT ** Application Behavior Modeling #+LATEX: \label{sec:appmodeling} In Section\ref{sec:relwork} we explained that SMPI relies on the /online/ simulation approach. Since SimGrid is a sequential simulator, SMPI maps every MPI process of the application onto a lightweight simulation thread. These threads are then run one at a time, \ie in mutual exclusion. Every time a thread enters an MPI call, SMPI takes control and the time that was spent computing (isolated from the other threads) since the previous MPI call can be injected into the simulator as a virtual delay. Mapping MPI processes to threads of a single process effectively folds them into the same address space. Consequently, global variables in the MPI application are shared between threads unless these variables are /privatized/ and the simulated MPI ranks thus isolated from each other. Several technical solutions are possible to handle this issue\cite{smpi}. The default strategy in SMPI consists of making a copy of the =data= segment (containing all global variables) per MPI rank at startup and, when context switching to another rank, to remap the =data= segment via =mmap= to the private copy of that rank. SMPI also implements another mechanism relying on the =dlopen= function that saves calls to =mmap= when context switching. This causes online simulation to be expensive in terms of both simulation time and memory since the whole parallel application is executed on a single node. To deal with this, SMPI provides two simple annotation mechanisms: - *Kernel sampling*: Control flow is in many cases independent of the computation results. This allows computation-intensive kernels (\eg BLAS kernels for HPL) to be skipped during the simulation. For this purpose, SMPI supports annotation of regular kernels through several macros such as =SMPI_SAMPLE_LOCAL= and =SMPI_SAMPLE_GLOBAL=. The regularity allows SMPI to execute these kernels a few times, estimate their cost and skip the kernel in the future by deriving its cost from these samples, hence cutting simulation time significantly. Skipping kernels renders the content of some variables invalid but in simulation, only the behavior of the application and not the correctness of computation results are of concern. - *Memory folding*: SMPI provides the =SMPI_SHARED_MALLOC= (=SMPI_SHARED_FREE=) macro to replace calls to =malloc= (=free=). They indicate that some data structures can safely be shared between processes and that the data they contain is not critical for the execution (\eg an input matrix) and that it may even be overwritten. =SMPI_SHARED_MALLOC= works as follows (see Figure\ref{fig:global_shared_malloc}) : a single block of physical memory (of default size \SI{1}{\mega\byte}) for the whole execution is allocated and shared by all MPI processes. A range of virtual addresses corresponding to a specified size is reserved and cyclically mapped onto the previously obtained physical address. This mechanism allows applications to obtain a nearly constant memory footprint, regardless of the size of the actual allocations. # At the first call to =SMPI_SHARED_MALLOC=, a temporary file is created. 
# The file descriptor is a global variable,
# accessible by all the MPI processes, since they are implemented by POSIX threads.
# At every call to =SMPI_SHARED_MALLOC=, a first call to =mmap= is done with the required size and the flag =MAP_ANONYMOUS=
# (thus without any file descriptor). The effect of this call is to reserve the whole interval of virtual
# addresses. Then, for each sub-interval, a new call to =mmap= is done with the temporary file. The address of the
# sub-interval itself is passed with the flag =MAP_FIXED=, which forces the mapping to keep the same virtual address.
# As a result, each of these sub-intervals of virtual addresses are mapped onto a same interval of physical
# addresses. We therefore have a block of virtual addresses of arbitrary size backed by a constant amount of physical
# memory. Since there are almost no computations left, this is harmless with respect to the simulation. Note that such
# allocations cannot be fully removed as many parts of the code
# still access it from time to time.
* Improving SMPI Emulation Mechanisms and Preparing HPL
#+LaTeX: \label{sec:em}
We now present our changes to SimGrid and HPL that were required for a scalable and faithful simulation. We provide only a brief evaluation of our modifications and refer the reader interested in details to\cite{cornebize:hal-01544827} and our laboratory
#+LaTeX: notebook\footnote{See \texttt{journal.org} at \url{https://github.com/Ezibenroc/simulating_mpi_applications_at_scale/}}.
For our experiments in this section, we used a single core from nodes of the Nova cluster provided by the Grid'5000 testbed\cite{grid5000} with \SI{32}{\giga\byte} RAM, two 8-core Intel Xeon E5-2620 v4 processors at \SI{2.1}{\GHz} and Debian Stretch (kernel 4.9).
** Kernel modeling
As explained in Section\ref{sec:con:diff}, faithful prediction of HPL necessitates emulation, \ie executing the code. HPL relies heavily on BLAS kernels such as =dgemm= (for matrix-matrix multiplication) or =dtrsm= (for solving triangular systems of the form $Ax=b$). An analysis of an HPL simulation with $64$ processes and a very small matrix of order $30,000$ showed that roughly \SI{96}{\percent} of the time is spent in these two very regular kernels. For larger matrices, these kernels will consume an even larger percentage of the computation time. Since these kernels do not influence the control flow, simulation time can be reduced by substituting the =dgemm= and =dtrsm= function calls with a performance model for the respective kernel. Figure\ref{fig:macro_simple} shows an example of this macro-based mechanism that allows us to keep HPL code modifications to an absolute minimum. The =(1.029e-11)= value represents the inverse of the flop rate for this computation kernel and was obtained through calibration. The estimated time for the real kernel is calculated based on the parameters and eventually passed on to =smpi_execute_benched=, which makes the executing rank enter a sleep state and advances its clock by this estimate. The effect on simulation time for a small scenario is depicted in Figure\ref{fig:kernel_sampling}. On the one hand, this modification speeds up the simulation by orders of magnitude, especially when the matrix order grows. On the other hand, this kernel model leads to an optimistic estimation of the flop rate.
This may be caused by inaccuracies in our model as well as by the fact that the initial emulation is generally more sensitive to pre-emptions, \eg by the operating system, and therefore more likely to be pessimistic compared to a real execution. # #+LATEX: \CH{Re-work this. I don't like that we talk about inaccuracies in our model. Shouldn't the pre-emptions be modeled alread? We did rely on measurements! "Absence of performance variability when kernel models are used."} # #+LATEX: \TC{I don't get the explanation about the inaccuracies. I think OS preemptions is one of the smallest factors here, especially since in real executions they will certainly fix this issue (e.g. with chrt --fifo 99) whereas in the calibration I did not take care of that.} #+BEGIN_EXPORT latex \begin{figure}%[!htb] % \null\vspace{-1cm} \centering \subfigure[Non-intrusive macro replacement.\label{fig:macro_simple}]{ \begin{minipage}[b]{\linewidth} \lstset{frame=bt,language=C,numbers=none,escapechar=|}\lstinputlisting{HPL_dgemm_macro_simple.c} \end{minipage}} \subfigure[Gain in term of simulation time.\label{fig:kernel_sampling}]{ \begin{minipage}[b]{\linewidth} \includegraphics[width=\linewidth,page=2]{figures/validation_kernel_modeling.pdf} \end{minipage}} \caption{Replacing the calls to computationally expensive functions by a model allows to significantly reduce simulation time.}\vspace{-1em} \end{figure} #+END_EXPORT *** Hidden section with estimation of the quality/speed of the simulation :noexport: Inspire from the entry of [[file:~/Work/Journals/tom_cornebize/m2_internship_journal/journal.org::*2017-11-15%20Wednesday][Tom's journal]] ([[https://github.com/Ezibenroc/m2_internship_journal/tree/master/journal.org][Github version]]) "2017-11-15 Wednesday": Regenerating the validation plot for smpi_execute". #+begin_src R :results output :session *R* :exports both library(ggplot2) library(gridExtra) library(grid) old <- read.csv("/home/alegrand/Work/SimGrid/tom/m2_internship_journal/validation/result_size_L0.csv") new <- read.csv("/home/alegrand/Work/SimGrid/tom/m2_internship_journal/validation/result_size_L1.csv") old$kernel_sampling = FALSE new$kernel_sampling = TRUE results = rbind(old, new) generic_do_plot <- function(plot, fixed_shape=TRUE) { # For xrange, see https://stackoverflow.com/questions/7705345/how-can-i-extract-plot-axes-ranges-for-a-ggplot2-object # old version for xrange (broken) # xrange = ggplot_build(plot)$panel$ranges[[1]]$x.range # new version for xrange (may break in the next ggplot update...) xrange = ggplot_build(plot)$layout$panel_ranges[[1]]$x.range xwidth = xrange[2] - xrange[1] if(fixed_shape) { point = stat_summary(fun.y = mean, geom="point", shape=21) } else { point = stat_summary(fun.y = mean, geom="point") } return(plot + stat_summary(fun.data = mean_se, geom = "errorbar", width=xwidth/20)+ stat_summary(fun.y = mean, geom="line")+ point+ theme_bw()+ scale_color_brewer(palette="Set1") + expand_limits(x=0, y=0)) } # From https://stackoverflow.com/a/38420690/4110059 grid_arrange_shared_legend <- function(..., nrow = 1, ncol = length(list(...)), position = c("bottom", "top", "right")) { plots <- list(...) 
position <- match.arg(position) g <- ggplotGrob(plots[[1]] + theme(legend.position = position))$grobs legend <- g[[which(sapply(g, function(x) x$name) == "guide-box")]] lheight <- sum(legend$height) lwidth <- sum(legend$width) gl <- lapply(plots, function(x) x + theme(legend.position = "none")) gl <- c(gl, nrow = nrow, ncol = ncol) combined <- switch(position, "bottom" = arrangeGrob(do.call(arrangeGrob, gl), legend, ncol = 1, heights = unit.c(unit(1, "npc") - lheight, lheight)), "top" = arrangeGrob(legend, do.call(arrangeGrob,gl), ncol = 1, heights = unit.c(lheight, unit(1, "npc") - lheight)), "right" = arrangeGrob(do.call(arrangeGrob, gl), legend, ncol = 2, widths = unit.c(unit(1, "npc") - lwidth, lwidth))) grid.newpage() grid.draw(combined) } #+end_src #+RESULTS: #+begin_src R :file figures/validation_kernel_modeling.pdf :results value graphics :results output :session *R* :exports both :width 6.2 :height 3.5 plot1 = generic_do_plot(ggplot(results, aes(x=size, y=Gflops, color=kernel_sampling, linetype=kernel_sampling))) + labs(colour="Kernel modeling") + labs(linetype="Kernel modeling") + xlab('Matrix order') + ylab('Performance [Gflop/s]') + ggtitle("Performance estimation\n(P=Q=8, i.e., 64 MPI process)") plot2 = generic_do_plot(ggplot(results, aes(x=size, y=simulation_time, color=kernel_sampling, linetype=kernel_sampling))) + labs(colour="Kernel modeling") + labs(linetype="Kernel modeling") + xlab('Matrix order') + ylab('Time [seconds]') + ggtitle("Simulation time\n(P=Q=8, i.e., 64 MPI process)") grid_arrange_shared_legend(plot2, plot1, ncol=2, position="top") #+end_src #+RESULTS: [[file:figures/validation_kernel_modeling.pdf]] *** Hidden section with macro code :noexport: #+BEGIN_SRC C :exports none :tangle HPL_dtrsm_macro_real.c #define |\color{colorfuncall}HPL\_dtrsm|(layout, Side, Uplo, TransA, Diag, M, N, alpha, A, lda, B, ldb) ({ \ double expected_time; \ double coefficient, intercept; \ if((M) > 512 && (N) > 512) { \ coefficient = (double)SMPI_DTRSM_PHI_COEFFICIENT; \ intercept = (double)SMPI_DTRSM_PHI_INTERCEPT; \ } else { \ coefficient = (double)SMPI_DTRSM_CPU_COEFFICIENT; \ intercept = (double)SMPI_DTRSM_CPU_INTERCEPT; \ } \ if((Side) == HplLeft) { \ expected_time = coefficient*((double)(M))*((double)(M))*((double)(N)); \ } else { \ expected_time = coefficient*((double)(M))*((double)(N))*((double)(N)); \ } \ expected_time += intercept \ if(expected_time > 0) \ |\color{colorfuncall}smpi\_execute\_benched|(expected_time); \ }) #+END_SRC #+BEGIN_SRC C :exports none :tangle HPL_dtrsm_macro_simple_old.c #define |\color{colorfuncall}HPL\_dtrsm|(layout, Side, Uplo, TransA, Diag, M, N, alpha, A, lda, B, ldb) ({ \ double expected_time = (9.882e-12)*((double)M)*((double)M)*((double)N) + 4.329e-02; \ if(expected_time > 0) \ |\color{colorfuncall}smpi\_execute\_benched|(expected_time); \ }) #+END_SRC #+BEGIN_SRC C :exports none :tangle HPL_dtrsm_macro_simple.c #define |\color{colorfuncall}HPL\_dtrsm|(layout, Side, Uplo, TransA, Diag, \ M, N, alpha, A, lda, B, ldb) ({ \ double expected_time = (9.882e-12)*((double)M)* \ ((double)M)*((double)N) + 4.329e-02; \ if(expected_time > 0) \ |\color{colorfuncall}smpi\_execute\_benched|(expected_time); \ }) #+END_SRC #+BEGIN_SRC C :exports none :tangle HPL_dgemm_macro_simple.c #define |\color{colorfuncall}HPL\_dgemm|(layout, TransA, TransB, \ M, N, K, alpha, A, lda, B, ldb, beta, C, ldc) ({ \ double expected_time = (1.029e-11)*((double)M)* \ ((double)N)*((double)K) + 1.981e-12; \ if(expected_time > 0) \ 
|\color{colorfuncall}smpi\_execute\_benched|(expected_time); \
})
#+END_SRC
#+BEGIN_EXPORT latex
\CH{Found this in Tom's logbook. Check if this is the final version. Also, we can apparently just call \texttt{make SMPI\_OPTS=-DSMPI\_OPTIMIZATION} (what about \texttt{arch=SMPI}?). See his logbook}
#+END_EXPORT
** Adjusting the behavior of HPL
#+LaTeX: \label{sec:hplchanges}
HPL uses pseudo-randomly generated matrices that need to be set up every time HPL is executed. Neither the time spent on this setup nor the time spent validating the computed result is considered in the reported \si{\giga\flops} performance. Since we replaced the computations with a kernel model, result validation is meaningless anyway. As neither phase has an impact on the reported performance, we can safely skip both.
In addition to the main computation kernels =dgemm= and =dtrsm=, we identified through profiling seven other BLAS functions that are computationally expensive enough to justify a specific handling: =dgemv=, =dswap=, =daxpy=, =dscal=, =dtrsv=, =dger= and =idamax=. Similarly, a significant amount of time was spent in fifteen functions implemented in HPL: =HPL_dlaswp*N=, =HPL_dlaswp*T=, =HPL_dlacpy= and =HPL_dlatcpy=.
# =HPL_dlaswp00N=, =HPL_dlaswp01N=, =HPL_dlaswp01T=, =HPL_dlaswp02N=, =HPL_dlaswp03N=,
# =HPL_dlaswp03T=, =HPL_dlaswp04N=, =HPL_dlaswp04T=, =HPL_dlaswp05N=, =HPL_dlaswp05T=,
# =HPL_dlaswp06N=, =HPL_dlaswp06T=, =HPL_dlaswp10N=, =HPL_dlacpy= and =HPL_dlatcpy=.
All of these functions are called during the LU factorization and hence impact the performance measured by HPL; however, because of the removal of the =dgemm= and =dtrsm= computations, they all operate on bogus data and hence also produce bogus data. We also determined through experiments that their impact on the performance prediction is minimal and hence modeled them, for the sake of simplicity, as being instantaneous. Note that HPL implements an LU factorization with partial pivoting and a special treatment of the =idamax= function that returns the index of the first element equaling the maximum absolute value. Although we ignored the cost of this function as well, we set its return value to an arbitrary value to make the simulation fully deterministic. We confirmed that this modification is harmless in terms of performance prediction, while it speeds up the simulation by an additional factor of $\approx3$ to $4$ on small scenarios ($N=30,000$) and even more on large ones.
** Memory folding
As explained in Section\ref{sec:smpi}, when emulating an application with SMPI, all MPI processes are run within the same simulation process on a single node. The memory consumption of the simulation can therefore quickly reach several \si{\tera\byte} of RAM. Yet, as we no longer operate on real data, storing the whole input matrix $A$ is unnecessary. However, since only a minimal portion of the code was modified, some functions may still read or write some parts of the matrix. It is thus not possible to simply remove the memory allocations of large data structures altogether. Instead, SMPI's =SHARED_MALLOC= mechanism can be used to share unimportant data structures between all ranks, minimizing the memory footprint.
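To make this concrete, the following minimal sketch shows how a large, simulation-irrelevant allocation can be routed through SMPI's shared-malloc macros. It is not HPL's actual allocation code (the allocation sites and variable names in HPL differ) and merely assumes that the =SMPI_SHARED_MALLOC= and =SMPI_SHARED_FREE= macros are available through SMPI's =smpi.h= header.
#+BEGIN_SRC C
/* Minimal sketch (not HPL's actual allocation code): the input matrix
 * holds data that is irrelevant to the simulation, so its storage can be
 * folded onto a small pool of physical pages shared by all ranks. */
#include <stddef.h>
#include <smpi/smpi.h> /* assumed to provide SMPI_SHARED_MALLOC / SMPI_SHARED_FREE */

double *allocate_folded_matrix(size_t n)
{
    /* Virtually n*n doubles, physically only a few shared pages. */
    double *A = (double *)SMPI_SHARED_MALLOC(n * n * sizeof(double));
    /* The content of A is bogus by construction: it can be read and
     * written, but all ranks see the same recycled physical pages. */
    return A;
}

void free_folded_matrix(double *A)
{
    SMPI_SHARED_FREE(A);
}
#+END_SRC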
#+BEGIN_EXPORT latex \tikzstyle{switch}=[draw, circle, minimum width=1cm, minimum height = 1cm] \tikzstyle{compute}=[draw, rectangle, minimum width=0.5cm, minimum height = 0.5cm, node distance=0.5cm] \tikzstyle{base}=[ellipse, minimum width=2cm, minimum height = 0.5cm, node distance = 0.5cm] \tikzstyle{bigswitch}=[base, draw] \begin{figure}%[htbp] \centering {\begin{minipage}{1.0\linewidth} \subfigure[Structure of the panel in HPL.\label{fig:panel_structure}]{\small \begin{minipage}[b]{\linewidth}\centering \begin{tikzpicture}[yscale=.6,scale=0.8] \draw [fill=gray] (3, 2) -- (6, 2) -- (6, 3) -- (3, 3) -- cycle; \draw (0, 2) -- (9, 2) -- (9, 3) -- (0, 3) -- cycle; \draw[dashed] (3, 2) -- (3, 3); \draw[dashed] (6, 2) -- (6, 3); \node(1) at (1.5, 2.5) {matrix parts}; \node(2) at (4.5, 2.5) {indices}; \node(3) at (7.5, 2.5) {matrix parts}; \draw[decorate,line width=1pt,decoration={brace,raise=0.2cm}] (0, 3) -- (3, 3) node [pos=0.5, yshift=0.5cm] {can be shared}; \draw[decorate,line width=1pt,decoration={brace,raise=0.2cm}] (6, 3) -- (9, 3) node [pos=0.5, yshift=0.5cm] {can be shared}; \draw[decorate,line width=1pt,decoration={brace,raise=0.2cm, mirror}] (3, 2) -- (6, 2) node [pos=0.5, yshift=-0.5cm] {must not be shared}; \end{tikzpicture} \end{minipage}} \subfigure[Reusing panel allocation from an iteration to another.\label{fig:panel_reuse}]{\small \begin{minipage}[b]{\linewidth}\centering \begin{tikzpicture}[yscale=.6] \draw [fill=gray] (2, 1) -- (4, 1) -- (4, 1.5) -- (2, 1.5) --cycle; \draw (0, 1) -- (6, 1) -- (6, 1.5) -- (0, 1.5) -- cycle; \draw[dashed] (2, 1) -- (2, 1.5); \draw[dashed] (4, 1) -- (4, 1.5); \draw [fill=gray] (2, 0) -- (3, 0) -- (3, .5) -- (2, .5) --cycle; \draw (1, 0) -- (4, 0) -- (4, .5) -- (1, .5) -- cycle; \draw[dashed] (2, 0) -- (2, .5); \draw[dashed] (3, 0) -- (3, .5); \draw[-latex] (2, 1) -- (2, .5); \draw[decorate,line width=1pt,decoration={brace,raise=0.2cm}] (0, 1.5) -- (6, 1.5) node [pos=0.5, yshift=0.5cm] {initial buffer}; \draw[decorate,line width=1pt,decoration={brace,raise=0.2cm, mirror}] (1, 0) -- (4, 0) node [pos=0.5, yshift=-0.5cm] {current buffer}; \end{tikzpicture} \end{minipage} } \end{minipage}} \caption{Panel structure and allocation strategy when simulating.\label{fig:panel}}\vspace{-1em} \end{figure} #+END_EXPORT The largest two allocated data structures in HPL are the input matrix =A= (with a size of typically several \si{\giga\byte} per process) and the =panel= which contains information about the sub-matrix currently being factorized. This sub-matrix typically occupies a few hundred \si{\mega\byte} per process. Although using the default =SHARED_MALLOC= mechanism works flawlessly for =A=, a more careful strategy needs to be used for the =panel=. Indeed, the =panel= is an intricate data structure with both \texttt{int}s (accounting for matrix indices, error codes, MPI tags, and pivoting information) and \texttt{double}s (corresponding to a copy of a sub-matrix of =A=). To optimize data transfers, HPL flattens this structure into a single allocation of \texttt{double}s (see Figure\ref{fig:panel_structure}). Using a fully shared memory allocation for the =panel= therefore leads to index corruption that results in classic invalid memory accesses as well as communication deadlocks, as processes may not send to or receive from the correct process. Since \texttt{int}s and \texttt{double}s are stored in non-contiguous parts of this flat allocation, it is therefore essential to have a mechanism that preserves the process-specific content. 
We have thus introduced the macro =SMPI_PARTIAL_SHARED_MALLOC= that works as follows: ~mem = SMPI_PARTIAL_SHARED_MALLOC(500, {27,42 , 100,200}, 2)~. In this example, 500 bytes are allocated in =mem= with the elements =mem[27]=, ..., =mem[41]= and =mem[100]=, ..., =mem[199]= being shared between processes (they are therefore generally completely corrupted) while all other elements remain private. To apply this to HPL's =panel= data\-structure and partially share it between processes, we only had to modify a few lines. Designating memory explicitly as private, shared or partially shared helps with both memory management and overall performance. As SMPI is internally aware of the memory's visibility, it can avoid calling =memcopy= when large messages containing shared segments are sent from one MPI rank to another. For fully private or partially shared segments, SMPI identifies and copies only those parts that are process-dependent (private) into the corresponding buffers on the receiver side. HPL simulation times were considerably improved in our experiments because the =panel= as the most frequently transferred datastructure is partially shared with only a small part being private. The additional error introduced by this technique was negligible (below \SI{1}{\percent}) while the memory consumption was lowered significantly: for a matrix of order $40,000$ and $64$ MPI processes, the memory consumption decreased from about \SI{13.5}{\giga\byte} to less than \SI{40}{\mega\byte}. ** Panel reuse HPL \texttt{malloc}s/\texttt{free}s panels in each iteration, with the size of the panel strictly decreasing from iteration to iteration. As we explained above, the partial sharing of panels requires many calls to =mmap= and introduces an overhead that makes these repeated allocations / frees become a bottleneck. Since the very first allocation can fit all subsequent panels, we modified HPL to allocate only the first panel and reuse it for subsequent iterations (see Figure\ref{fig:panel_reuse}). We consider this optimization harmless with respect to simulation accuracy as the maximum additional error that we observed was always less than \SI{1}{\percent}. Simulation time is reduced significantly, albeit the reached speed-up is less impressive than for previous optimizations: For a very small matrix of order $40,000$ and $64$ MPI processes, the simulation time decreases by four seconds, from \SI{20.5}{\sec} to \SI{16.5}{\sec}. Responsible for this is a reduction of system time, namely from \SI{5.9}{\sec} to \SI{1.7}{\sec}. The number of page faults decreased from $2$ million to $0.2$ million, confirming the devastating effect these allocations/deallocations would have at scale. ** MPI process representation (mmap vs. dlopen) We already explained in Section\ref{sec:appmodeling} that SMPI supports two mechanisms to keep local static and global variables private to each rank, even though they run in the same process. In this section, we discuss the impact of the choice. - *mmap* When =mmap= is used, SMPI copies the =data= segment on startup for each rank into the heap. When control is transferred from one rank to another, the =data= segment is =mmap='ed to the location of the other rank's copy on the heap. All ranks have hence the same addresses in the virtual address space at their disposition although =mmap= ensures they point to different physical addresses. 
  This inevitably means that caches must be flushed to ensure that no data of one rank
  leaks into another rank, making =mmap= a rather expensive operation.
  # \TOM{Can you tell me how often these operations were executed, as
  # you've already done in your journal on 2017-04-11 ("Looking at the
  # syscalls")?}
- *dlopen* With =dlopen=, copies of the global variables are still made, but they are
  stored inside the =data= segment as opposed to the heap. When switching from one rank to
  another, the starting virtual address of the storage is readjusted rather than the target
  of the addresses. This means that each rank has distinct addresses for its global
  variables. The main advantage of this approach is that, contrary to the =mmap= approach,
  caches do not need to be flushed, because data consistency is always guaranteed.

\noindent *Impact of choice of mmap/dlopen* The choice of =mmap= or =dlopen= influences the
simulation time indirectly through its impact on system/user time and page faults, \eg for
a matrix of order $80,000$ and $32$ MPI processes, the number of minor page faults drops
from \num{4412047} (with =mmap=) to \num{6880} (with =dlopen=). This results in a reduction
of system time from \SI{10.64}{\sec} (out of \SI{51.47}{\sec} in total) to \SI{2.12}{\sec}.
Obviously, the larger the matrix and the number of processes, the larger the number of
context switches during the simulation, and thus the higher the gain.
# See Tom's journal (Performance evaluation of the privatization
# mechanism: =dlopen= vs =mmap= ) ; there are some graphs that we might be
# able to use, such as in
# https://github.com/Ezibenroc/m2_internship_journal/blob/master/simgrid_privatization/
** Huge pages
For larger matrix orders (\ie $N$ larger than a few hundred thousand), the performance of
the simulation quickly deteriorates as the memory consumption rises rapidly. We already
explained how we fold the memory in order to reduce the /physical/ memory usage. The
/virtual/ memory, on the other hand, is still allocated for every process since the
allocation calls are still executed. Without a reduction of the allocated virtual address
range, the page table rapidly becomes too large to fit in a single node. More precisely,
the size of the page table containing pages of size \SI{4}{\kibi\byte} can be computed as:
#+LATEX: \[ PT_{size}(N) = \frac{N^2 \cdot \texttt{sizeof(double)}}{4,096} \cdot \texttt{sizeof(pointer)} \]
This means that the page table for a matrix of order $N=4,000,000$ consumes
$PT_{size}(4,000,000) = \num{2.5e11}$ bytes, \ie \SI{250}{\giga\byte}, on a system where
double-precision floating-point numbers and addresses take 8 bytes. Thankfully, the x86-64
architecture supports several page sizes, known as ``huge pages'' in Linux. Typically,
these pages are \SI{2}{\mebi\byte} large (instead of \SI{4}{\kibi\byte}), although other
sizes (\SIrange{2}{256}{\mebi\byte}) are possible as well. Changing the page size requires
administrator (root) privileges, as the Linux kernel support for /hugepages/ needs to be
activated and a =hugetlbfs= file system must be mounted. After at least one huge page has
been allocated, the mount point of this file system can then be passed on to SimGrid.
Setting the page size to \SI{2}{\mebi\byte} drastically reduces the page table size. For
example, for a matrix of order $N=4,000,000$, it shrinks from \SI{250}{\giga\byte} to
\SI{0.488}{\giga\byte}.
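For illustration, the following sketch (ours, simplified) shows what a huge-page-backed
allocation boils down to. SimGrid itself maps a file created in the mounted =hugetlbfs= so
that the folded pages can additionally be shared between ranks, but the page-table saving
stems from the same mechanism:
#+begin_src C
/* Minimal sketch (ours): backing an anonymous allocation with 2 MiB huge
 * pages.  This assumes that the administrator has reserved huge pages
 * (e.g., via vm.nr_hugepages).  With 2 MiB pages, the page table holds
 * 512 times fewer entries than with 4 KiB pages. */
#define _GNU_SOURCE
#include <stddef.h>
#include <sys/mman.h>

#define HUGE_PAGE_SIZE (2UL * 1024 * 1024) /* assumed 2 MiB huge pages */

static void *huge_alloc(size_t size)
{
  /* Round the length up to a multiple of the huge page size
   * (required for the matching munmap). */
  size_t len = (size + HUGE_PAGE_SIZE - 1) & ~(HUGE_PAGE_SIZE - 1);
  void *mem = mmap(NULL, len, PROT_READ | PROT_WRITE,
                   MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);
  return mem == MAP_FAILED ? NULL : mem;
}
#+end_src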
# Unfortunately, changing the page size requires administrator (root) privileges as the
# Linux kernel support for /hugepages/ needs to be activated and a
# =hugetlbfs= file system must be mounted. After at least one huge
# page was allocated, the path of the allocated file system can then be
# passed on to SimGrid that will then pass the flag =MAP_HUGETLB=
# to =mmap= in =SMPI_SHARED_MALLOC= and replace the file given to =mmap= by
# a file opened in the =hugetlbfs= file system.
# #+LATEX: \CH{I think this is too detailed. Who cares if we pass MAP\_HUGETLB?}
* Scalability Evaluation
#+LaTeX: \label{sec:scalabilityevol}
#+BEGIN_EXPORT latex
\begin{figure}[t]
  \centering
  \includegraphics[width=\linewidth,page=2]{./figures/scalability_plot_size.pdf}
%  \includegraphics[width=\linewidth,page=2]{./figures/scalability_plot_nbproc.pdf}
  \caption{Time complexity and memory consumption are linear in the number of processes but remain mildly quadratic with matrix rank.}\vspace{-1em}
  \label{fig:hpl_scalability}
  \labspace
\end{figure}
#+END_EXPORT
#+BEGIN_EXPORT latex
\begin{figure*}%[!htb]
  \centering
%  \begin{minipage}[b]{.27\textwidth}
%    \includegraphics[width=\linewidth,page=2]{./figures/stampede_knc_model.pdf}
%    \vspace{-2em}
%    \caption{Automatic offloading on the KNC depends on matrix dimensions.}
%    \vspace{-1em}
%    \label{fig:hpl_mkl}
%  \end{minipage}~~~
  \begin{minipage}[b]{.7\textwidth}\centering
    \scalebox{.88}{\begin{tabular}{l|r|r|r|r}
       & \multicolumn{2}{c|}{CPU (\texttt{CPU})} & \multicolumn{2}{c}{KNC (\texttt{PHI}) }\\
       & Coefficient $[\si{\sec\per\flop}]$& Intercept $[\sec]$ & Coefficient $[\si{\sec\per\flop}]$& Intercept $[\sec]$ \\ \hline
      \texttt{DGEMM} & \num{1.029e-11} & \num{2.737e-02} & \num{1.981e-12} & \num{6.316e-01} \\
      \texttt{DTRSM} & \num{9.882e-12} & \num{4.329e-02} & \num{1.954e-12} & \num{5.222e-01}
    \end{tabular}}\medskip\\
    \lstset{frame=bt,language=C,numbers=none,escapechar=|}\lstinputlisting{HPL_dtrsm_macro_real.c}
    \caption{Modeling automatic offloading on KNC in MKL BLAS kernels.}
    \vspace{-1em}
    \label{fig:macro_real}
  \end{minipage}~~~\begin{minipage}[b]{.27\textwidth}
    \centering
    \includegraphics[width=\linewidth,page=1]{./figures/stampede_calibration_send.png}
    \caption{Modeling communication time on Stampede. Each color corresponds to a different synchronization mode (eager, rendez-vous, ...), whose breakpoints were adjusted manually.}\vspace{-1em}
    \label{fig:stampede_calibration}
    \labspace
  \end{minipage}
\end{figure*}
#+END_EXPORT
# SMPI_DGEMM_COEFFICIENT=1.029e-11 SMPI_DGEMM_INTERCEPT=2.737e-02 SMPI_DGEMM_PHI_COEFFICIENT=1.981e-12 SMPI_DGEMM_PHI_INTERCEPT=6.316e-01 \
# SMPI_DTRSM_COEFFICIENT=9.882e-12 SMPI_DTRSM_INTERCEPT=4.329e-02 SMPI_DTRSM_PHI_COEFFICIENT=1.954e-12 SMPI_DTRSM_PHI_INTERCEPT=5.222e-01"
In Section\ref{sec:em} we explained the problems we encountered when trying to run a
large-scale simulation on a single node and how we solved them. For the most part, we
identified and eliminated bottlenecks one after another while simultaneously making sure
that the accuracy of our performance prediction was not impacted. The main goal was to
reduce the complexity from $\O(N^3) + \O(N^2\cdot{}P\cdot{}Q)$ to something more
reasonable. The $\O(N^3)$ term was removed by skipping most computations. Ideally, since
there are $N/NB$ iterations (steps), the complexity of simulating one step should be
decreased to something independent of $N$. SimGrid's fluid models, used to simulate
communications, do not depend on $N$.
Therefore, the time to simulate a step of HPL should mostly depend on $P$ and $Q$. Yet,
some memory operations on the panel that are related to pivoting are intertwined in HPL
with collective communications, meaning that it is impossible to completely get rid of the
$\O(N)$ complexity without modifying HPL more profoundly. Although our goal was to model
and simulate HPL on the Stampede platform, we decided to conduct a first evaluation on a
similar, albeit non-existent, platform comprising 4,096 8-core nodes interconnected through
a $\langle2;16,32;1,16;1,1\rangle$ fat-tree topology built on ideal network links with a
bandwidth of \SI{50}{\giga\byte\per\sec} and a latency of \SI{5}{\micro\sec}. We ran
simulations with $512$; $1,024$; $2,048$ or $4,096$ MPI processes and with matrices of
orders \num{5e5}, \num{1e6}, \num{2e6} or \num{4e6}. The impact of the matrix order on
total makespan and memory is illustrated in Figure\ref{fig:hpl_scalability}. With all
previously described optimizations enabled, the simulation with the largest matrix took
close to $47$ hours and consumed \SI{16}{\giga\byte} of memory whereas the smallest one
took $20$ minutes and \SI{282}{\mega\byte} of memory. One can also see that, when the
matrix order $N$ increases, memory consumption and simulation time both grow mildly
quadratically, as the number of matrix elements is $N^{2}$ and the number of steps of the
algorithm also grows linearly with $N$. Moreover, all the simulations spent less than
\SI{10}{\percent} of their execution time in kernel mode, which means the number of system
calls is reasonably low.
** Hidden section :noexport:
Got data and code from the "2017-06-05 Monday: Plots for scalability test" section of Tom's
journal:
#+begin_src R :results output :session *R* :exports both
library(ggplot2)
library(ggrepel)
library(reshape2)
library(gridExtra)
results = rbind(
    read.csv('/home/alegrand/Work/SimGrid/tom/m2_internship_journal/scalability/result_500000_512.csv'),
    read.csv('/home/alegrand/Work/SimGrid/tom/m2_internship_journal/scalability/result_500000_1024.csv'),
    read.csv('/home/alegrand/Work/SimGrid/tom/m2_internship_journal/scalability/result_500000_2048.csv'),
    read.csv('/home/alegrand/Work/SimGrid/tom/m2_internship_journal/scalability/result_500000_4096.csv'),
    read.csv('/home/alegrand/Work/SimGrid/tom/m2_internship_journal/scalability/result_1000000_512.csv'),
    read.csv('/home/alegrand/Work/SimGrid/tom/m2_internship_journal/scalability/result_1000000_1024.csv'),
    read.csv('/home/alegrand/Work/SimGrid/tom/m2_internship_journal/scalability/result_1000000_2048.csv'),
    read.csv('/home/alegrand/Work/SimGrid/tom/m2_internship_journal/scalability/result_1000000_4096.csv'),
    read.csv('/home/alegrand/Work/SimGrid/tom/m2_internship_journal/scalability/result_2000000_512.csv'),
    read.csv('/home/alegrand/Work/SimGrid/tom/m2_internship_journal/scalability/result_2000000_1024.csv'),
    read.csv('/home/alegrand/Work/SimGrid/tom/m2_internship_journal/scalability/result_2000000_2048.csv'),
    read.csv('/home/alegrand/Work/SimGrid/tom/m2_internship_journal/scalability/result_2000000_4096.csv'),
    read.csv('/home/alegrand/Work/SimGrid/tom/m2_internship_journal/scalability/result_4000000_512.csv'),
    read.csv('/home/alegrand/Work/SimGrid/tom/m2_internship_journal/scalability/result_4000000_1024.csv'),
    read.csv('/home/alegrand/Work/SimGrid/tom/m2_internship_journal/scalability/result_4000000_2048.csv'),
    read.csv('/home/alegrand/Work/SimGrid/tom/m2_internship_journal/scalability/result_4000000_4096.csv')
)
results$simulation_time =
results$simulation_time/3600 results$memory_size = results$memory_size * 1e-9 number_verb <- function(n) { return(format(n,big.mark=",",scientific=FALSE)) } results$size_verb = factor(unlist(lapply(results$size, number_verb)), levels = c('500,000','1,000,000','2,000,000','4,000,000')) results$nb_proc_verb = factor(unlist(lapply(results$nb_proc, number_verb)), levels = c('512', '1,024', '2,048', '4,096')) results #+end_src #+RESULTS: #+begin_example topology nb_roots nb_proc size full_time time Gflops 1 2;16,32;1,16;1,1;8 16 512 500000 91246.1 91246.02 913.3 2 2;16,32;1,16;1,1;8 16 1024 500000 46990.1 46990.02 1773.0 3 2;16,32;1,16;1,1;8 16 2048 500000 24795.5 24795.50 3361.0 4 2;16,32;1,16;1,1;8 16 4096 500000 13561.0 13561.01 6145.0 5 2;16,32;1,16;1,1 16 512 1000000 716521.0 716521.00 930.4 6 2;16,32;1,16;1,1 16 1024 1000000 363201.0 363201.04 1836.0 7 2;16,32;1,16;1,1 16 2048 1000000 186496.0 186495.70 3575.0 8 2;16,32;1,16;1,1;8 16 4096 1000000 97836.6 97836.54 6814.0 9 2;16,32;1,16;1,1 16 512 2000000 5685080.0 5685077.72 938.1 10 2;16,32;1,16;1,1 16 1024 2000000 2861010.0 2861012.55 1864.0 11 2;16,32;1,16;1,1 16 2048 2000000 1448900.0 1448899.09 3681.0 12 2;16,32;1,16;1,1;8 16 4096 2000000 742691.0 742690.59 7181.0 13 2;16,32;1,16;1,1;8 16 512 4000000 45305100.0 45305083.56 941.8 14 2;16,32;1,16;1,1;8 16 1024 4000000 22723800.0 22723820.45 1878.0 15 2;16,32;1,16;1,1;8 16 2048 4000000 11432900.0 11432938.62 3732.0 16 2;16,32;1,16;1,1;8 16 4096 4000000 5787160.0 5787164.09 7373.0 simulation_time application_time user_time system_time major_page_fault 1 0.3311083 204.992 1098.25 93.12 0 2 0.6895222 441.897 2296.51 184.70 0 3 1.4144361 872.425 4741.26 349.79 0 4 3.1448889 1947.320 10640.63 679.53 0 5 0.7319722 500.970 2367.19 259.91 0 6 1.6771917 1036.960 5515.36 515.05 0 7 3.4421944 2092.950 11389.36 995.39 0 8 7.2368056 4362.660 24082.38 1966.10 0 9 1.9263500 1169.660 6193.80 683.73 0 10 4.2217500 2551.100 13714.01 1430.93 0 11 8.9621111 5236.560 29357.92 2844.89 0 12 18.0156389 10643.600 59444.40 5402.24 0 13 4.8156944 3030.400 15090.31 1945.23 0 14 10.6613611 6435.870 34249.71 3827.36 0 15 23.2042222 13080.500 75523.95 7684.52 0 16 47.1275000 26745.400 154314.76 15085.08 0 minor_page_fault cpu_utilization uss rss page_table_size 1 960072 0.99 155148288 2055086080 10604000 2 1054062 0.99 369696768 4383203328 21240000 3 1282294 0.99 1012477952 9367576576 42912000 4 1852119 0.99 3103875072 15318568960 87740000 5 1916208 0.99 153665536 2317279232 10600000 6 2002989 0.99 369676288 4837175296 21252000 7 2154982 0.99 1010696192 7774138368 42908000 8 2768705 0.99 3103895552 16934834176 87748000 9 3801905 0.99 150765568 2758770688 10604000 10 3872820 0.99 365555712 5273034752 21220000 11 4038099 0.99 1009606656 7415914496 42884000 12 4704339 0.99 3102445568 19464646656 87748000 13 7663911 0.98 151576576 2056916992 10604000 14 7725625 0.99 369872896 4120702976 21212000 15 7917525 0.99 1012191232 9221050368 42880000 16 8550745 0.99 3113381888 20408209408 87808000 memory_size size_verb nb_proc_verb 1 0.2825585 500,000 512 2 0.4299489 500,000 1,024 3 0.9628262 500,000 2,048 4 2.8140421 500,000 4,096 5 0.8944435 1,000,000 512 6 1.0553098 1,000,000 1,024 7 1.5811707 1,000,000 2,048 8 3.4254070 1,000,000 4,096 9 3.3384202 2,000,000 512 10 3.4971116 2,000,000 1,024 11 4.0274084 2,000,000 2,048 12 5.9101348 2,000,000 4,096 13 13.0790605 4,000,000 512 14 13.2755579 4,000,000 1,024 15 13.8251837 4,000,000 2,048 16 15.7636690 4,000,000 4,096 #+end_example #+begin_src R :results output :session *R* :exports 
both library(ggplot2) library(gridExtra) library(grid) generic_do_plot <- function(plot, fixed_shape=TRUE) { # For xrange, see https://stackoverflow.com/questions/7705345/how-can-i-extract-plot-axes-ranges-for-a-ggplot2-object # old version for xrange (broken) # xrange = ggplot_build(plot)$panel$ranges[[1]]$x.range # new version for xrange (may break in the next ggplot update...) xrange = ggplot_build(plot)$layout$panel_ranges[[1]]$x.range xwidth = xrange[2] - xrange[1] if(fixed_shape) { point = stat_summary(fun.y = mean, geom="point", shape=21) } else { point = stat_summary(fun.y = mean, geom="point") } return(plot + stat_summary(fun.data = mean_se, geom = "errorbar", width=xwidth/20)+ stat_summary(fun.y = mean, geom="line")+ point+ theme_bw()+ expand_limits(x=0, y=0)) } do_plot <- function(df, x, y, color, color_title, fixed_val, other_fixed_val=-1) { if(y == "simulation_time") { y_title = "Simulation time (seconds)" title = "Simulation time" } else if(y == "memory_size") { y_title = "Memory consumption (bytes)" title = "Memory consumption" } else { stopifnot(y == "Gflops") y_title = "Performance estimation (Gflops)" title = "Performance estimation" } if(x == "size") { fixed_arg = "nb_proc" x_title = "Matrix size" title = paste(title, "for different matrix sizes\nUsing", fixed_val, "MPI processes") } else { stopifnot(x == "nb_proc") fixed_arg = "size" x_title = "Number of processes" title = paste(title, "for different number of processes\nUsing a matrix size of", format(fixed_val,big.mark=",",scientific=FALSE)) } sub_df = df[df[fixed_arg] == fixed_val,] p = generic_do_plot(ggplot(sub_df, aes_string(x=x, y=y, linetype=color, color=color, group=color))) + ggtitle(title)+ xlab(x_title)+ ylab(y_title)+ labs(colour=color_title)+ labs(linetype=color_title) if(other_fixed_val != -1) { rect <- data.frame(xmin=-Inf, xmax=Inf, ymin=-Inf, ymax=Inf) my_xmin = other_fixed_val * 0.9 my_xmax = other_fixed_val * 1.1 my_ymax = max(sub_df[sub_df[x] == other_fixed_val,][y]) y_delta = my_ymax * 0.1 my_ymax = my_ymax + y_delta my_ymin = min(sub_df[sub_df[x] == other_fixed_val,][y]) - y_delta p = p + geom_rect(data=rect, aes(xmin=my_xmin, xmax=my_xmax, ymin=my_ymin, ymax=my_ymax),color="grey20", alpha=0.1, inherit.aes=FALSE) } return(p) } # From https://stackoverflow.com/a/38420690/4110059 grid_arrange_shared_legend <- function(..., nrow = 1, ncol = length(list(...)), position = c("bottom", "right")) { plots <- list(...) 
position <- match.arg(position) g <- ggplotGrob(plots[[1]] + theme(legend.position = position))$grobs legend <- g[[which(sapply(g, function(x) x$name) == "guide-box")]] lheight <- sum(legend$height) lwidth <- sum(legend$width) gl <- lapply(plots, function(x) x + theme(legend.position = "none")) gl <- c(gl, nrow = nrow, ncol = ncol) combined <- switch(position, "bottom" = arrangeGrob(do.call(arrangeGrob, gl), legend, ncol = 1, heights = unit.c(unit(1, "npc") - lheight, lheight)), "right" = arrangeGrob(do.call(arrangeGrob, gl), legend, ncol = 2, widths = unit.c(unit(1, "npc") - lwidth, lwidth))) grid.newpage() grid.draw(combined) } do_multiple_plot <- function(df, x1, x2, y, color, color_title, fixed_x1, fixed_x2) { my_ymax = max(df[y]) return( grid_arrange_shared_legend( do_plot(df, x1, y, color, color_title, fixed_x1, fixed_x2) + expand_limits(x=0, y=my_ymax), do_plot(df, x2, y, color, color_title, fixed_x2, fixed_x1) + expand_limits(x=0, y=my_ymax), nrow=1, ncol=2 )) } do_four_plot <- function(df, x1, x2, y1, y2, color, color_title, fixed_x1, fixed_x2) { my_y1max = max(df[y1]) my_y2max = max(df[y2]) return( grid_arrange_shared_legend( do_plot(df, x1, y1, color, color_title, fixed_x1, fixed_x2) + expand_limits(x=0, y=my_y1max), do_plot(df, x2, y1, color, color_title, fixed_x2, fixed_x1) + expand_limits(x=0, y=my_y1max), do_plot(df, x1, y2, color, color_title, fixed_x1, fixed_x2) + expand_limits(x=0, y=my_y2max), do_plot(df, x2, y2, color, color_title, fixed_x2, fixed_x1) + expand_limits(x=0, y=my_y2max), nrow=2, ncol=2 )) } #+end_src #+RESULTS: #+begin_src R :file figures/scalability_2.pdf :results value graphics :results output :session *R* :exports both :width 4 :height 2.5 nbproc_time = generic_do_plot(ggplot(results, aes(x=nb_proc, y=simulation_time, color=size_verb))) + xlab("Number of processes") + ylab("Simulation time (hours)") + labs(colour="Matrix size")+ ggtitle("Simulation time for different number of processes")+ theme(legend.position = "none")+ geom_text_repel( data = subset(results, nb_proc == max(nb_proc)), aes(label = size_verb), nudge_x = 45, segment.color = NA, show.legend = FALSE ) nbproc_time #+end_src #+RESULTS: [[file:figures/scalability_2.pdf]] #+begin_src R :file figures/scalability_4.pdf :results value graphics :results output :session *R* :exports both :width 4 :height 2.5 nbproc_mem = generic_do_plot(ggplot(results, aes(x=nb_proc, y=memory_size, color=size_verb))) + xlab("Number of processes") + ylab("Memory consumption (gigabytes)") + labs(colour="Matrix size")+ ggtitle("Memory consumption for different number of processes")+ theme(legend.position = "none")+ geom_text_repel( data = subset(results, nb_proc == max(nb_proc)), aes(label = size_verb), nudge_x = 45, segment.color = NA, show.legend = FALSE ) nbproc_mem #+end_src #+RESULTS: [[file:figures/scalability_4.pdf]] #+begin_src R :file figures/scalability_1.pdf :results value graphics :results output :session *R* :exports both :width 4 :height 2.5 size_time = generic_do_plot(ggplot(results, aes(x=size, y=simulation_time, color=nb_proc_verb))) + xlab("Matrix rank") + ylab("Simulation time (hours)") + labs(colour="Number of processes")+ scale_color_brewer(palette="Set1")+ # ggtitle("Simulation time for different matrix sizes")+ theme(legend.position = "none")+ geom_text_repel( data = subset(results, size == max(size)), aes(label = nb_proc_verb), nudge_x = 45, segment.color = NA, show.legend = FALSE ) size_time #+end_src #+RESULTS: [[file:figures/scalability_1.pdf]] #+begin_src R :file figures/scalability_3.pdf 
:results value graphics :results output :session *R* :exports both :width 4 :height 2.5 size_mem = generic_do_plot(ggplot(results, aes(x=size, y=memory_size, color=nb_proc_verb))) + xlab("Matrix rank") + ylab("Memory consumption (gigabytes)") + labs(colour="Number of processes")+ # ggtitle("Memory consumption for different matrix sizes")+ theme(legend.position = "none")+scale_color_brewer(palette="Set1")+ geom_text_repel( data = subset(results, size == max(size)), aes(label = nb_proc_verb), nudge_x = 45, segment.color = NA, show.legend = FALSE ) size_mem #+end_src #+RESULTS: [[file:figures/scalability_3.pdf]] #+begin_src R :file figures/scalability_plot_size.pdf :results value graphics :results output :session *R* :exports both :width 7 :height 4 grid_arrange_shared_legend(size_time, size_mem, nrow=1, ncol=2) #+end_src #+RESULTS: [[file:figures/scalability_plot_size.pdf]] #+begin_src R :file figures/scalability_plot_nbproc.pdf :results value graphics :results output :session *R* :exports both :width 8 :height 3.5 grid_arrange_shared_legend(nbproc_time, nbproc_mem, nrow=1, ncol=2) #+end_src #+RESULTS: [[file:figures/scalability_plot_nbproc.pdf]] #+begin_src R :results output :session *R* :exports both fit_sim = lm(data=results, simulation_time ~ nb_proc*(size+I(size^2))) summary(fit_sim) #+end_src #+RESULTS: #+begin_example Call: lm(formula = simulation_time ~ nb_proc * (size + I(size^2)), data = results) Residuals: Min 1Q Median 3Q Max -0.192256 -0.050079 -0.004809 0.045721 0.231054 Coefficients: Estimate Std. Error t value Pr(>|t|) (Intercept) -1.522e-01 1.866e-01 -0.815 0.4339 nb_proc -1.162e-04 7.907e-05 -1.469 0.1725 size 6.919e-08 2.214e-07 0.313 0.7610 I(size^2) -8.691e-14 4.689e-14 -1.853 0.0935 . nb_proc:size 1.608e-09 9.379e-11 17.142 9.64e-09 *** nb_proc:I(size^2) 3.450e-16 1.987e-17 17.366 8.49e-09 *** --- Signif. 
codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 Residual standard error: 0.1343 on 10 degrees of freedom Multiple R-squared: 0.9999, Adjusted R-squared: 0.9999 F-statistic: 2.46e+04 on 5 and 10 DF, p-value: < 2.2e-16 #+end_example #+begin_src R :results output :session *R* :exports both grid.lines = 26 x.pred <- seq(min(results$nb_proc), max(results$nb_proc), length.out = grid.lines) y.pred <- seq(min(results$size), max(results$size), length.out = grid.lines) xy <- expand.grid( nb_proc = x.pred, size = y.pred) z.pred <- matrix(predict(fit_sim, newdata = xy), nrow = grid.lines, ncol = grid.lines) # fitted points for droplines to surface fitpoints <- predict(fit_sim) #+end_src #+RESULTS: #+begin_src R :results output graphics :file (org-babel-temp-file "figure" ".png") :exports both :width 600 :height 400 :session *R* library("plot3D") scatter3D( results$nb_proc, results$size, results$simulation_time, ticktype = "detailed", phi = 20, theta = -50, bty ="g", pch = 20, cex = 2, type="l", r=10, surf = list(x = x.pred, y = y.pred, z = z.pred, facets = NA, fit = fitpoints),colvar=NULL) #+end_src #+RESULTS: [[file:/tmp/babel-23284Iao/figure23284S2p.png]] #+begin_src R :results output graphics :file (org-babel-temp-file "figure" ".png") :exports both :width 600 :height 400 :session *R* library("plot3D") scatter3D(results$nb_proc, results$size, results$simulation_time, ticktype = "detailed", phi = 0, theta = -50, bty ="g", surf = list(x = unique(results$nb_proc), y = unique(results$size), z = matrix(results$simulation_time, nrow=length(unique(results$nb_proc))), facets = NA)) #+end_src #+RESULTS: [[file:/tmp/babel-23284Iao/figure23284QCE.png]] #+begin_src R :results output :session *R* :exports both fit_sim = lm(data=results, memory_size ~ (nb_proc + I(nb_proc^2)) + I(size^2)) summary(fit_sim) #+end_src #+RESULTS: #+begin_example Call: lm(formula = memory_size ~ (nb_proc + I(nb_proc^2)) + I(size^2), data = results) Residuals: Min 1Q Median 3Q Max -0.046408 -0.005840 0.001738 0.011710 0.058452 Coefficients: Estimate Std. Error t value Pr(>|t|) (Intercept) -3.785e-02 2.247e-02 -1.685 0.1179 nb_proc 1.264e-04 2.519e-05 5.019 0.0003 *** I(nb_proc^2) 1.288e-07 5.211e-09 24.712 1.17e-11 *** I(size^2) 8.166e-13 1.063e-15 767.967 < 2e-16 *** --- Signif. 
codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.02691 on 12 degrees of freedom
Multiple R-squared: 1, Adjusted R-squared: 1
F-statistic: 2.043e+05 on 3 and 12 DF, p-value: < 2.2e-16
#+end_example
#+begin_src R :results output :session *R* :exports both
grid.lines = 26
x.pred <- seq(min(results$nb_proc), max(results$nb_proc), length.out = grid.lines)
y.pred <- seq(min(results$size), max(results$size), length.out = grid.lines)
xy <- expand.grid( nb_proc = x.pred, size = y.pred)
z.pred <- matrix(predict(fit_sim, newdata = xy), nrow = grid.lines, ncol = grid.lines)
# fitted points for droplines to surface
fitpoints <- predict(fit_sim)
#+end_src

#+RESULTS:

#+begin_src R :results output graphics :file (org-babel-temp-file "figure" ".png") :exports both :width 600 :height 400 :session *R*
library("plot3D")
scatter3D( results$nb_proc, results$size, results$memory_size, ticktype = "detailed", phi = -10, theta = -50, bty ="g", pch = 18, cex = 2,
          surf = list(x = x.pred, y = y.pred, z = z.pred, facets = NA, fit = fitpoints),colvar=NULL)
#+end_src

#+RESULTS:
[[file:/tmp/babel-23284Iao/figure23284e_o.png]]
* Modeling Stampede and Simulating HPL
#+LaTeX: \label{sec:science}
** Modeling Stampede
*** Computations
Each node of the Stampede cluster comprises two 8-core Intel Xeon E5-2680 8C
\SI{2.7}{\GHz} CPUs and one 61-core Intel Xeon Phi SE10P (KNC) \SI{1.1}{\GHz} accelerator
that is roughly three times more powerful than the two CPUs and can be used in two ways:
either as a classical accelerator, \ie for offloading expensive computations from the CPU,
or by compiling binaries specifically for and executing them directly on the Xeon Phi.
While the accelerator's \SI{8}{\gibi\byte} of RAM are rather small, the main advantage of
the second approach is that data does not need to be transferred back and forth between the
node's CPUs and the accelerator via the x16 PCIe bus.

The HPL output submitted to the TOP500 (Figure\ref{fig:hpl_output}) does not indicate how
the KNC was used. However, because of the values assigned to $P$ and $Q$, we are certain
that only a single MPI process per node was run. For this reason, it is likely that the KNC
was used as an accelerator. With Intel's Math Kernel Library (MKL), this is effortless as
the MKL comes with support for automatic offloading for selected BLAS functions.
Unfortunately, we do not know which MKL version was used in 2013 and therefore decided to
use the default version available on Stampede at the beginning of 2017, \ie version
11.1.1. The MKL documentation states that, depending on the matrix geometry, the
computation will run on either all the cores of the CPU or exclusively on the KNC. In the
case of =DGEMM=, the computation of $C=\alpha\cdot{}A\times{}B+\beta\cdot{}C$ with $A, B,
C$ of dimensions $M\times{}K$, $K\times{}N$ and $M\times{}N$, respectively, is offloaded
onto the KNC whenever $M$ and $N$ are both larger than $1280$ while $K$ is simultaneously
larger than $256$. Similarly, offloading for =DTRSM= is used when both $M$ and $N$ are
larger than $512$, which results in a better throughput but incurs a higher latency. The
complexity of =DGEMM= is always of the order of $M\cdot{}N\cdot{}K$ ($M\cdot{}N^2$ for
=DTRSM=) but the model that describes the time it takes to run =DGEMM= (=DTRSM=) is very
different for small and large matrices. The table in Figure\ref{fig:macro_real} indicates
the parameters of the linear regression for the four scenarios (=DGEMM= or =DTRSM= and CPU
or Phi).
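For =DGEMM=, this amounts to modeling the duration of a call as follows (notation ours),
where $c$ and $i$ denote the coefficient and intercept of the corresponding row of the
table; the =DTRSM= case is analogous with $M\cdot{}N^2$ and thresholds of $512$:
#+BEGIN_EXPORT latex
\[ T_{\text{DGEMM}}(M,N,K) \approx
   \begin{cases}
     c_{\text{PHI}} \cdot M N K + i_{\text{PHI}} & \text{if $M>1280$, $N>1280$ and $K>256$,}\\
     c_{\text{CPU}} \cdot M N K + i_{\text{CPU}} & \text{otherwise.}
   \end{cases} \]
#+END_EXPORT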
The measured performance was close to the peak performance: \eg =DGEMM= on the Phi reached
$2/\num{1.981e-12} = \SI{1.009}{\tera\flops}$ (the factor $2$ accounts for the
$2\cdot{}M\cdot{}N\cdot{}K$ floating-point operations performed by a =DGEMM= call). Since
the granularity used in HPL (see Figure\ref{fig:hpl_output}) is 1024, all calls (except
maybe for the very last iteration) are offloaded to the KNC. In any case, this behavior can
easily be accounted for by replacing the macro in Figure\ref{fig:macro_simple} by the one
in Figure\ref{fig:macro_real}.
# The accelerators are essential to the performance of the cluster,
# delivering \SI{7}{\peta\flops} of sustainable performance whereas
# the CPUs are only capable of delivering \SI{2}{\peta\flops}. On
# matrices of the size used for this work, however, CPUs are barely
# used.

# See CH's journal from [2017-10-03 Tue] to see how the version was determined
**** R figures :noexport:
#+begin_src R :results output :session *R* :exports both
library(gridExtra)
library(ggplot2)
dgemm <- read.csv('/home/alegrand/Work/SimGrid/tom/m2_internship_journal/stampede/offloading_dgemm.csv')
dgemm$m = as.double(dgemm$m)
dtrsm <- read.csv('/home/alegrand/Work/SimGrid/tom/m2_internship_journal/stampede/offloading_dtrsm.csv')
dtrsm$m = as.double(dtrsm$m)
dgemm_new = dgemm[dgemm$automatic_offload == 'True',]
dtrsm_new = dtrsm[dtrsm$automatic_offload == 'True',]
dgemm_new$offload = (dgemm_new$m > 1280 & dgemm_new$n > 1280 & dgemm_new$k > 256)
dtrsm_new$offload = (dtrsm_new$m > 512 & dtrsm_new$n > 512)
dgemm_new$flops = dgemm_new$m * dgemm_new$n * dgemm_new$k;
dtrsm_new$flops = dtrsm_new$m * dtrsm_new$n^2;
dgemm_new$type = "dgemm";
dtrsm_new$type = "dtrsm";
df = dtrsm_new
df$k = NA;
df$lead_C = NA;
df = rbind(df,dgemm_new)
head(df)
tail(df)
#+end_src

#+RESULTS:
#+begin_example
       time    m    n lead_A lead_B automatic_offloading offload        flops
1  0.029975 7251  261   7251   7251                 True   FALSE    493945371
4  2.227428  578 4619   4619   4619                 True    TRUE  12331723058
5  0.042097 4424  420   4424   4424                 True   FALSE    780393600
8  0.018786 3115  305   3115   3115                 True   FALSE    289772875
10 2.931274  650 5466   5466   5466                 True    TRUE  19420151400
12 3.240624 5606 6490   6490   6490                 True    TRUE 236125280600
    type  k lead_C
1  dtrsm NA     NA
4  dtrsm NA     NA
5  dtrsm NA     NA
8  dtrsm NA     NA
10 dtrsm NA     NA
12 dtrsm NA     NA
         time    m    n lead_A lead_B automatic_offloading offload        flops
89   0.083594  244 5757   5757   5757                 True   FALSE    847038924
91   4.932572 5527 6493   6493   6493                 True    TRUE 189518248891
931  2.943795 1425 6127   6127   6127                 True    TRUE  33954761775
96   0.262358   62 6621   6621   6621                 True   FALSE   2151851484
981  2.749753 4991 2256   4991   4991                 True    TRUE  17002140960
1001 2.383139 1421 1348   1646   1646                 True    TRUE   3152926168
      type    k lead_C
89   dgemm  603   5757
91   dgemm 5281   6493
931  dgemm 3889   6127
96   dgemm 5242   6621
981  dgemm 1510   4991
1001 dgemm 1646   1646
#+end_example
#+begin_src R :results output :session *R* :exports both
get_legend<-function(myggplot){
    tmp <- ggplot_gtable(ggplot_build(myggplot))
    leg <- which(sapply(tmp$grobs, function(x) x$name) == "guide-box")
    legend <- tmp$grobs[[leg]]
    return(legend)
}
#+end_src

#+RESULTS:

#+begin_src R :results output graphics :file figures/stampede_knc_model.pdf :exports both :width 3.5 :height 5.3 :session *R*
labels = rbind(
    data.frame(x = 2E11, y=3.5, label="Offloading on the KNC", offload = T),
    data.frame(x = 1.1E11, y=1, label="Computation on the CPU", offload = F));
p1 = ggplot(dgemm_new, aes_string(x='m*n*k', y='time', color='offload')) + geom_point() +
    geom_smooth(method="lm",fullrange=T) + theme(legend.position="top") +
    geom_text(data = labels, aes(x=x, y=y, color=offload, label=label)) +
    ylim(0,1.1*max(dgemm_new$time)) +
    ggtitle('Duration of DGEMM') + theme_bw() + xlab("M.N.K [Flop]") + ylab("Duration [s]") +
    scale_color_brewer(palette="Set1")
p1_legend = get_legend(p1);
p1 = p1 + theme(legend.position="none")
labels = rbind(
    data.frame(x = 3E11, y=2.8, label="Offloading on the KNC", offload = T),
    data.frame(x = 1.6E11, y=.6, label="Computation on the CPU", offload = F));
p2 = ggplot(dtrsm_new, aes_string(x='m*n*n', y='time', color='offload')) + geom_point() +
    geom_smooth(method="lm",fullrange=T) + ylim(0,1.1*max(dtrsm_new$time)) +
    geom_text(data = labels, aes(x=x, y=y, color=offload, label=label)) +
    ggtitle('Duration of DTRSM') + theme_bw() + xlab("M.N² [Flop]") + ylab("Duration [s]") +
    scale_color_brewer(palette="Set1")
p2 = p2 + theme(legend.position="none")
lay <- rbind(c(1),
             c(2));
grid.arrange(p1,p2, layout_matrix = lay,widths=c(1), heights=c(2,2));
#+end_src

#+RESULTS:
[[file:figures/stampede_knc_model.pdf]]

#+begin_src sh :results output :exports both
pdfcrop figures/stampede_knc_model.pdf figures/stampede_knc_model.pdf
#+end_src

#+begin_src R :results output graphics :file (org-babel-temp-file "figure" ".png") :exports both :width 600 :height 400 :session *R*
ggplot(df, aes(x=flops, y=time, color=offload)) + geom_point() + geom_smooth(method="lm") + facet_wrap(~type)
#+end_src

#+RESULTS:
[[file:/tmp/babel-1674kfe/figure1674sWN.png]]
*** Communications
# #+BEGIN_EXPORT latex
# \begin{figure}[t]
#   \centering
#   \includegraphics[width=\linewidth,page=1]{./figures/stampede_calibration_send.png}
#   \caption{Modeling communication time on stampede. Each color corresponds to a
#     manually adjusted breakpoint corresponding to a synchronization mode
#     (eager, rendez-vous,...). }\vspace{-1em}
#   \label{fig:stampede_calibration}
#   \labspace
# \end{figure}
# #+END_EXPORT
We unfortunately do not know for sure which version of Intel MPI was used in 2013, so we
decided to use the default one on Stampede in May 2017, \ie version 3.1.4. As explained in
Section\ref{sec:smpi}, SMPI's communication model is a hybrid model between the LogP family
and a fluid model. For each message, the send mode (\eg fully asynchronous, detached or
eager) is determined solely by the message size. It is hence possible to model the
resulting performance of communication operations through a piece-wise linear model
(formalized below), as depicted in Figure\ref{fig:stampede_calibration}. For a thorough
discussion of the calibration techniques used to obtain this model, see\cite{smpi}. As
illustrated, the results for =MPI_Send= are quite stable and piece-wise regular, but the
behavior of =MPI_Recv= is surprising: for small messages with a size of less than
\SI{17420}{\byte} (represented by purple, blue and red dots), one can observe two modes,
namely ``slow'' and ``fast'' communications. ``Slow'' operations take twice as long and are
much more common than the ``fast'' ones. We observed this behavior in several experiments
even though both MPI processes used in the calibration were connected through the same
local switch. When observed, this ``perturbation'' was present throughout the execution of
that calibration. Since small messages are scarce in HPL, we eventually decided to ignore
this phenomenon and opted for the more favorable scenario (fast communications) for small
messages. We believe that the impact of this choice on the simulation accuracy is minimal
as HPL primarily sends large, bulk messages that make use of the /rendez-vous/ mode
(depicted in dark green).
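To summarize, the calibration yields, for each such interval of message sizes, a latency
and a bandwidth term, so that the duration of a point-to-point operation on a message of
size $s$ is modeled as a piece-wise affine function (notation ours):
#+BEGIN_EXPORT latex
\[ T(s) = \alpha_k + \beta_k \cdot s \qquad \text{for } s_k \leq s < s_{k+1}, \]
#+END_EXPORT
where each interval $[s_k, s_{k+1})$ corresponds to one of the synchronization modes
visible in Figure\ref{fig:stampede_calibration}.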
Furthermore, we configured SMPI to use Stampede's network topology, \ie Mellanox FDR
InfiniBand technology with \SI{56}{\giga\bit\per\second}, set up in a fat-tree topology
(see Figure\ref{fig:fat_tree_topology}). We assumed that the routing was done through
D-mod-K\cite{dmodk} as it is commonly used on this topology.
**** Stampede network calibration figures :noexport:
This figure is generated in [[file:~/Work/SimGrid/platform-calibration/data/stampede_17_06_01-17:14/calibration/analysis.org][the platform calibration repository]]. Data should be read from
there. Final adjustments (in the "Combined plot section") were done here:
#+begin_src R :results output :session *R* :exports both
library(gridExtra)
get_legend<-function(myggplot){
    tmp <- ggplot_gtable(ggplot_build(myggplot))
    leg <- which(sapply(tmp$grobs, function(x) x$name) == "guide-box")
    legend <- tmp$grobs[[leg]]
    return(legend)
}
p1 = eth$p_send + theme(legend.position="top", legend.background = element_rect(fill = "white", colour = NA)) +
    guides(colour = guide_legend(override.aes = list(alpha = 1)))
# + annotate("text",x=1E2,y=2.3E-6, label="40GB IB model",color="black")
p2 = eth$p_recv + theme(legend.position="top", legend.background = element_rect(fill = "white", colour = NA))
# + annotate("text",x=1E2,y=2.3E-6, label="40GB IB model",color="black")
p1_legend = get_legend(p1);
p1 = p1 + theme(legend.position="none")
p2 = p2 + theme(legend.position="none")
# lay <- rbind(c(1,1),
#              c(2,3));
# p = grid.arrange(p1_legend,p1,p2, layout_matrix = lay,widths=c(2,2), heights=c(1.3,4));
# ggsave(filename="/tmp/taurus_send_recv.pdf",plot=p,width = 6, height = 4)
lay <- rbind(c(1,2));
p = grid.arrange(p1,p2, layout_matrix = lay,widths=c(2,2), heights=c(4));
ggsave(filename="/tmp/stampede_send_recv_eth.pdf",plot=p,width = 6, height = 3)
#+end_src

#+RESULTS:
: Warning message:
: Transformation introduced infinite values in continuous x-axis
: Warning messages:
: 1: Transformation introduced infinite values in continuous x-axis
: 2: Transformation introduced infinite values in continuous x-axis

#+begin_src R :results output :session *R* :exports both
ggsave(filename="/tmp/stampede_send_recv_eth.png",plot=p,width = 6, height = 3)
#+end_src

#+RESULTS:

#+begin_src sh :results output :exports both
cp /tmp/stampede_send_recv_eth.png ./figures/stampede_calibration_send.png
#+end_src

#+RESULTS:
*** Summary of modeling uncertainties
For the compiler, Intel MPI and the MKL, we were unable to determine which versions were
used in 2013 and decided to go for rather optimistic choices. The models for the MKL and
for Intel MPI are close to the peak performance. It is plausible that the compiler managed
to optimize computations in HPL. While most of these computations are still executed in our
simulations, their duration is not accounted for in the simulated time. This allows us to
obtain fully deterministic simulations without harming the outcome of the simulation, as
these parts only represent a tiny fraction of the total execution time of HPL. A few HPL
compilation flags (\eg =HPL_NO_MPI_DATATYPE= and =HPL_COPY_L=, which control whether and
how MPI datatypes should be used, respectively) could not be deduced from HPL's original
output on Stampede, but we believe their impact to be minimal. Finally, the HPL output
reports the use of HPL v2.1, but the main difference between v2.1 and v2.2 is the option to
continuously report factorization progress. We hence decided to apply our modifications to
the latter version of HPL.
With all these modifications in place, we expected the prediction of our simulations to be
optimistic yet close to the results obtained by a real-life execution.
# - iMPI version ???
# - HPL compilation ? Possible modifications s.a. using openMP to have thread taking care of MPI communications and progressions.
** Simulating HPL
*** Performance Prediction
Figure\ref{fig:stampede_prediction} compares two simulation scenarios with the original
result from 2013. The solid red line represents the HPL performance prediction obtained
with SMPI using the Stampede model that we described in the previous section. Although we
expected SMPI to be optimistic, the prediction was surprisingly much lower than the TOP500
result. We verified that no part of HPL was left unmodeled and decided to investigate
whether a flaw in our network model resulting in too much congestion could explain this
performance. Alas, even a congestion-free network model (represented by the dashed blue
line in Figure\ref{fig:stampede_prediction}) only results in minor improvements. In our
experiments to model =DGEMM= and =DTRSM=, either the CPU or the KNC (but never both at the
same time) seemed to be used, and a specifically optimized version of the MKL may have been
used in 2013. Yet, even removing the offloading latency and modeling each node as a single
\SI{1.2}{\tera\flops} node does not suffice to explain the gap between our results and
reality.
#+BEGIN_EXPORT latex
\begin{figure}[t]
  \centering
  \includegraphics[width=\linewidth,page=1]{./figures/stampede_simgrid.pdf}
  \caption{Performance prediction of HPL on Stampede using SimGrid.}\vspace{-1em}
  \label{fig:stampede_prediction}
  \labspace
\end{figure}
#+END_EXPORT
**** HPL prediction :noexport:
Starting from Tom's journal, entry "2017-09-27 Wednesday : Complete experiments with the crosstraffic desabled"
#+begin_src R :results output :session *R* :exports both
library(ggplot2)
# files <- dir('/home/alegrand/Work/SimGrid/tom/m2_internship_journal/stampede/simulation/8', pattern = '\\.csv', full.names = TRUE)
# tables <- lapply(files, read.csv)
# results = do.call(rbind, tables)
classical = rbind(read.csv('/home/alegrand/Work/SimGrid/tom/m2_internship_journal/stampede/simulation/8/result_1000000.csv'),
                  read.csv('/home/alegrand/Work/SimGrid/tom/m2_internship_journal/stampede/simulation/8/result_2000000.csv'),
                  read.csv('/home/alegrand/Work/SimGrid/tom/m2_internship_journal/stampede/simulation/8/result_3875000.csv'))
classical$mode = 'classical'
highbw = rbind(read.csv('/home/alegrand/Work/SimGrid/tom/m2_internship_journal/stampede/simulation/8/result_highbw_1000000.csv'),
               read.csv('/home/alegrand/Work/SimGrid/tom/m2_internship_journal/stampede/simulation/8/result_highbw_2000000.csv'),
               read.csv('/home/alegrand/Work/SimGrid/tom/m2_internship_journal/stampede/simulation/8/result_highbw_3875000.csv'))
highbw$mode = 'highbw'
fatpipe = rbind(read.csv('/home/alegrand/Work/SimGrid/tom/m2_internship_journal/stampede/simulation/8/result_fatpipe_1000000.csv'),
                read.csv('/home/alegrand/Work/SimGrid/tom/m2_internship_journal/stampede/simulation/8/result_fatpipe_2000000.csv'),
                read.csv('/home/alegrand/Work/SimGrid/tom/m2_internship_journal/stampede/simulation/8/result_fatpipe_3875000.csv'))
fatpipe$mode = 'fatpipe'
results = rbind(classical, highbw, fatpipe)
#+end_src

#+RESULTS:

#+begin_src R :results output graphics :file figures/stampede_simgrid.pdf :exports both :width 5 :height 3.5 :session *R*
res_lab = results[results$mode != 'highbw' & results$size > 3E6,];
res_lab$x=res_lab$size;
res_lab$y=res_lab$Gflops;
res_lab$xl=res_lab$x*.9;
res_lab$yl=res_lab$y*.9;
res_lab$xl=res_lab$x*.8;
res_lab$yl=res_lab$y*.9;
res_lab[res_lab$mode=='classical',]$xl=3.4e6
res_lab[res_lab$mode=='classical',]$yl=2.5e6
res_lab[res_lab$mode=='fatpipe',]$xl=2.e6
res_lab[res_lab$mode=='fatpipe',]$yl=4.1e6
res_lab$label = NA;
res_lab[res_lab$mode=='fatpipe',]$label = "Simulation\n (No Contention)";
res_lab[res_lab$mode=='classical',]$label = "Simulation\n (Fat Tree)";
ggplot(results[results$mode != 'highbw',], aes(x=size, y=Gflops, color=mode, linetype=mode)) +
    # Inner labels
    geom_segment(data=res_lab, aes(x=xl, xend=x, y=yl, yend=y), linetype="solid", color="black") +
    geom_label(data=res_lab, aes(label = factor(label), x=xl, y=yl, fill=mode), colour = "white", fontface = "bold") +
    # SMPI lines
    geom_point() + geom_line() +
    # Top500 perf
    geom_hline(yintercept=5.16811e+06) +
    annotate("text",x=3875000, 1E2,y=5.4e+06, hjust="right", label="Top 500 performance",color="black") +
    annotate("text",x=3875000, 1E2,y=4.96e+06, hjust="right", label="(5.168 TeraFlop/s)",color="black") +
    annotate("point",x=3875000, 1E2,y=5.16811e+06,color="black") +
    # Cosmetics
    guides(fill=FALSE, color=FALSE, linetype=FALSE) +
    ylab("GFlop/s") + xlab("Matrix rank") + theme_bw() +
    scale_color_brewer(palette="Set1") + scale_fill_brewer(palette="Set1")+
    theme(legend.position="top") + ggtitle('Performance of HPL')
#+end_src

#+RESULTS:
[[file:figures/stampede_simgrid.pdf]]

#+begin_src R :results output graphics :file (org-babel-temp-file "figure" ".png") :exports both :width 600 :height 400 :session *R*
ggplot(results, aes(x=size, y=simulation_time, color=mode, linetype=mode)) + geom_point() + geom_line() + ggtitle('Simulation time')
#+end_src

#+RESULTS:
[[file:/tmp/babel-23284Iao/figure232842Rn.png]]
*** Performance Gap Investigation
# The simulation time to get the full-scale trace was:
# - 420 seconds for two iterations (250 seconds spent in HPL),
# - 609 seconds for five iterations (265 seconds spent in HPL).
In this section, we explain our investigation and give possible reasons for the
aforementioned mismatch (apparent in Figure\ref{fig:stampede_prediction}). With SMPI, it is
simple to trace the first iterations of HPL to get an idea of what could be improved (the
trace for the first five iterations can be obtained in about 609 seconds on a commodity
computer and is about \SI{175}{\mega\byte} large once compressed).
Figure\ref{fig:hpl_gantt} illustrates the very synchronous and iterative nature of the
first iterations: one can first identify a factorization of the panel, then a broadcast to
all the nodes, and finally an update of the trailing matrix. More than one fifth of each
iteration is spent communicating (although the first iterations are the ones with the
lowest communication-to-computation ratio), which prevents HPL from reaching the Top500
performance. Overlapping these heavy communication phases with computation would improve
performance significantly. The fact that this barely happens can be explained by the
look-ahead ~DEPTH~ parameter, which was supposedly set to =0= (see
Figure\ref{fig:hpl_output}). This is quite surprising as even the tuning section of the HPL
documentation indicates that a depth of 1 is supposed to yield the best results, even
though a large problem size could be needed to see some performance gain. We discussed this
surprising behavior with the Stampede team and were informed that the run in 2013 was
executed with an HPL binary provided by Intel and probably specifically modified for
Stampede.
We believe that some configuration values have been hardcoded to enforce an overlap between
iterations. Indeed, the shortened part (marked ``[...]'') in Figure\ref{fig:hpl_output}
provides information about the progress of HPL throughout the iterations and statistics for
the panel-owning process about the time spent in the most important parts. According to
these statistics, the total time spent in the =Update= section was \SI{9390}{\sec} whereas
the total execution time was \SI{7505}{\sec}, which is impossible unless iterations have
overlapped. The broadcast and swapping algorithms use very heavy communication patterns.
This is not at all surprising since, for a matrix of this order, several hundred megabytes
need to be broadcast. Although the output states that the =blongM= algorithm was used,
another algorithm may actually have been used. We tried the other broadcast algorithms HPL
comes with (six in total) but did not achieve significantly better overall performance. An
analysis of the symbols in the Intel binary revealed that another broadcast algorithm named
=HPL_bcast_bpush= was available. Unlike the others, this algorithm relies on non-blocking
sends, which could contribute to the performance obtained in 2013. Likewise, the swapping
algorithm that was used (~SWAP=Binary-exchange~) involves communications that are rather
long and organized in trees, which is surprising as the ~spread-roll~ algorithm is
recommended for large matrices.
#+BEGIN_EXPORT latex
\begin{figure}[t]
  \centering
  \includegraphics[width=\linewidth,page=1]{./figures/fullscale_unzoomed.png}
  \caption{Gantt chart of the first two iterations of HPL. Communication states are in red
    while computations are in cyan. Each communication between two processes is represented
    with a white arrow, which results in very cluttered white areas.}\vspace{-1em}
  \label{fig:hpl_gantt}
  \labspace
\end{figure}
#+END_EXPORT
We do not aim to reverse engineer the Intel HPL code. We can, however, already draw two
conclusions from our simple analysis: 1) it is apparent that many optimizations have been
done on the communication side and 2) it is very likely that the reported parameters are
not the ones used in the real execution, probably because these values were hardcoded and
the configuration output file was not updated accordingly.
*** Gantt charts :noexport:
#+begin_src sh :results output :exports both
cp /home/alegrand/Work/SimGrid/tom/m2_internship_journal/stampede/communications/fullscale_unzoomed.png figures/
#+end_src

#+RESULTS:
* Conclusions
#+LaTeX: \label{sec:cl}
Studying HPC applications at scale can be very time- and resource-consuming. Simulation is
often an effective approach in this context and SMPI has previously been successfully
validated in several small-scale studies with standard HPC
applications\cite{smpi,heinrich:hal-01523608}. In this article, we proposed and evaluated
extensions to the SimGrid/SMPI framework that allowed us to emulate HPL at the scale of a
supercomputer. Our application of choice, HPL, is particularly challenging in terms of
simulation as it implements its own set of non-blocking collective operations that rely on
=MPI_Iprobe= in order to facilitate overlapping with computations. More specifically, we
tried to reproduce the execution of HPL on the Stampede supercomputer conducted in $2013$
for the TOP500, which involved a \SI{120}{\tera\byte} matrix and took two hours on
6,006\nbsp{}nodes.
Our emulation of a similar configuration ran on a single machine for about $62$ hours and
required less than \SI{19}{\giga\byte} of RAM. This emulation employed several non-trivial
operating-system-level optimizations (memory mapping, dynamic library loading, huge pages)
that have since been integrated into the latest version of SimGrid/SMPI.

The downside of scaling this high is a less well-controlled scenario. The reference run of
HPL on Stampede was done several years ago and we only have very limited information about
the setup (\eg software versions and configuration), and a reservation and re-execution on
the whole machine was impossible for us. We nevertheless modeled Stampede carefully, which
allowed us to predict the performance that would have been obtained using an unmodified,
freely available version of HPL. Unfortunately, despite all our efforts, the predicted
performance was much lower than what was reported in 2013. We determined that this
discrepancy comes from the fact that a modified, closed-source version of HPL supplied by
Intel was used in 2013. We believe that some of the HPL configuration parameters were
hardcoded and therefore misreported in the output. A quick analysis of the optimized HPL
binary confirmed that algorithmic differences were likely to be the reason for the
performance differences. We conclude that a large-scale (in)validation is unfortunately not
possible since the modified source code is unavailable to us. We claim that the
modifications we made are minor and are applicable to that optimized version. In fact,
while HPL comprises 16K lines of ANSI C over 149 files, our modifications only changed 14
files, with 286 line insertions and 18 deletions.

We believe that being capable of precisely predicting an application's performance on a
given platform will become invaluable in the future to help compute centers decide whether
a new machine (and which technology) will work best for a given application, or whether an
upgrade of the current machine should be considered. As future work, we intend to conduct
similar studies with other HPC benchmarks (\eg HPCG or HPGMG) and with other TOP500
machines. From our experience, we believe that a faithful and public reporting of the
experimental conditions (compiler options, library versions, HPL output, etc.) is
invaluable and allows researchers to better understand how these platforms actually behave.
# This goal will be subject to a more thorough investigation
# in the very near future.

# As we saw in Section\ref{sec:hplchanges}, two BLAS functions (=dgemm=
# and =dtrsm=) were the dominating factor with regards to the runtime although other BLAS
# functions were called as well. For this study, we neglected the other
# functions but with a fully automatic calibration procedure for any
# BLAS function results could effortlessly become more precise as the
# application could just be linked against a BLAS-replacement
# library.
# #+LaTeX: \CH{Problem here: HPL uses \texttt{HPL\_dtrsm()} wrappers.}
# #+LaTeX: \CH{I like the idea of pointing out again that our simulation takes much longer (48 hours instead of 2?) but that we use 1/6000 of the ressources}
* Acknowledgements
Experiments presented in this paper were carried out using the Grid'5000 testbed, supported
by a scientific interest group hosted by Inria and including CNRS, RENATER and several
Universities as well as other organizations (see https://www.grid5000.fr).
We warmly thank our TACC colleagues for their support in this study and providing us with as much information as they could. ** References :ignore: # See next section to understand how refs.bib file is created. #+LATEX: \bibliographystyle{IEEEtran} #+LATEX: \bibliography{refs} * Bib file is here :noexport: Tangle this file with C-c C-v t #+begin_src bib :tangle refs.bib @IEEEtranBSTCTL{IEEEexample:BSTcontrol, CTLuse_article_number = "yes", CTLuse_paper = "yes", CTLuse_url = "yes", CTLuse_forced_etal = "yes", CTLmax_names_forced_etal = "6", CTLnames_show_etal = "3", CTLuse_alt_spacing = "yes", CTLalt_stretch_factor = "4", CTLdash_repeated_names = "yes", CTLname_format_string = "{f. ~}{vv ~}{ll}{, jj}", CTLname_latex_cmd = "", CTLname_url_prefix = "[Online]. Available:" } @mastersthesis{cornebize:hal-01544827, TITLE = {{Capacity Planning of Supercomputers: Simulating MPI Applications at Scale}}, AUTHOR = {Cornebize, Tom}, URL = {https://hal.inria.fr/hal-01544827}, SCHOOL = {{Grenoble INP ; Universit{\'e} Grenoble - Alpes}}, YEAR = {2017}, MONTH = Jun, KEYWORDS = {Simulation ; MPI runtime and applications ; Performance prediction and extrapolation ; High Performance LINPACK}, PDF = {https://hal.inria.fr/hal-01544827/file/report.pdf}, HAL_ID = {hal-01544827}, HAL_VERSION = {v1}, } @incollection{grid5000, title = {Adding Virtualization Capabilities to the {Grid'5000} Testbed}, author = {Balouek, Daniel and Carpen-Amarie, Alexandra and Charrier, Ghislain and Desprez, Fr{\'e}d{\'e}ric and Jeannot, Emmanuel and Jeanvoine, Emmanuel and L{\`e}bre, Adrien and Margery, David and Niclausse, Nicolas and Nussbaum, Lucas and Richard, Olivier and P{\'e}rez, Christian and Quesnel, Flavien and Rohr, Cyril and Sarzyniec, Luc}, booktitle = {Cloud Computing and Services Science}, publisher = {Springer International Publishing}, OPTpages = {3-20}, volume = {367}, editor = {Ivanov, IvanI. and Sinderen, Marten and Leymann, Frank and Shan, Tony }, series = {Communications in Computer and Information Science }, isbn = {978-3-319-04518-4 }, doi = {10.1007/978-3-319-04519-1\_1 }, year = {2013}, } %%% Online simulation of MPI applications @article{xsim, author = "Christian Engelmann", title = {{Scaling To A Million Cores And Beyond: {Using} Light-Weight Simulation to Understand The Challenges Ahead On The Road To Exascale}}, journal = "FGCS", volume = 30, pages = "59--65", month = jan, year = 2014, publisher = "Elsevier"} @Article{sstmacro, author = {Curtis L. Janssen and Helgi Adalsteinsson and Scott Cranford and Joseph P. Kenny and Ali Pinar and David A. 
Evensky and Jackson Mayo}, journal = {International Journal of Parallel and Distributed Systems}, title = {A Simulator for Large-scale Parallel Architectures}, volume = {1}, number = {2}, pages = {57--73}, year = {2010}, note = "\url{http://dx.doi.org/10.4018/jdst.2010040104}", doi = {10.4018/jdst.2010040104} } @article{SST, author = {Rodrigues, Arun and Hemmert, Karl and Barrett, Brian and Kersey, Chad and Oldfield, Ron and Weston, Marlo and Riesen, Rolf and Cook, Jeanine and Rosenfeld, Paul and CooperBalls, Elliot and Jacob, Bruce }, title = {{The Structural Simulation Toolkit}}, journal = {{SIGMETRICS} Performance Evaluation Review}, volume = 38, number = 4, pages = {37--42}, year = 2011 } @article{dickens_tpds96, title={{Parallelized Direct Execution Simulation of Message-Passing Parallel Programs}}, author={Dickens, Phillip and Heidelberger, Philip and Nicol, David}, journal={IEEE Transactions on Parallel and Distributed Systems}, volume=7, number=10, year=1996, pages={1090--1105} } @ARTICLE{bagrodia_ijhpca01, author={Bagrodia, Rajive and Deelman, Ewa and Phan, Thomas}, title={{Parallel Simulation of Large-Scale Parallel Applications}}, journal={International Journal of High Performance Computing and Applications}, volume=15, number=1, year=2001, pages={3--12} } %%% Offline simulation of MPI applications @INPROCEEDINGS{loggopsim_10, title={{LogGOPSim - Simulating Large-Scale Applications in the LogGOPS Model}}, author={Hoefler, Torsten and Siebert, Christian and Lumsdaine, Andrew}, month=Jun, year={2010}, pages = {597--604}, booktitle={Proc. of the LSAP Workshop}, } @inproceedings{hoefler-goal, author={T. Hoefler and C. Siebert and A. Lumsdaine}, title={{Group Operation Assembly Language - A Flexible Way to Express Collective Communication}}, year={2009}, booktitle={Proc. of the 38th ICPP} } @inproceedings{bigsim_04, author={Zheng, Gengbin and Kakulapati, Gunavardhan and Kale, Laxmikant}, title={{BigSim: A Parallel Simulator for Performance Prediction of Extremely Large Parallel Machines}}, year=2004, booktitle={Proc. of the 18th IPDPS}, } @inproceedings{dimemas, title = {{Dimemas: Predicting MPI Applications Behaviour in Grid Environments}}, year = {2003}, month = jun, booktitle = {Proc. of the Workshop on Grid Applications and Programming Tools}, author = {Rosa M. Badia and Jes{\'u}s Labarta and Judit Gim{\'e}nez and Francesc Escal{\'e}} } @article {CODES, title = {Enabling Parallel Simulation of Large-Scale {HPC} Network Systems}, journal = {IEEE Transactions on Parallel and Distributed Systems}, year = {2016}, author = {Mubarak, M. and C. D. Carothers and Robert B. Ross and Philip H. Carns} } @article{ROSS_SC12, author = {Misbah Mubarak and Christopher D. 
Carothers and Robert Ross and Philip Carns}, title = {{Modeling a Million-Node Dragonfly Network Using Massively Parallel Discrete-Event Simulation}}, journal = {SC Companion}, year = {2012}, pages = {366--376}, } %%% Self citations on previous work @Article{simgrid, title = {{Versatile, Scalable, and Accurate Simulation of Distributed Applications and Platforms}}, author = {Casanova, Henri and Giersch, Arnaud and Legrand, Arnaud and Quinson, Martin and Suter, Fr{\'e}d{\'e}ric}, publisher = {Elsevier}, pages = {2899--2917}, journal = {Journal of Parallel and Distributed Computing}, volume = {74}, number = {10}, year = {2014} } @InProceedings{simetierre, author = {Bobelin, Laurent and Legrand, Arnaud and M{\'a}rquez, David Alejandro Gonz{\'a}lez and Navarro, Pierre and Quinson, Martin and Suter, Fr{\'e}d{\'e}ric and Thiery, Christophe}, title = {{Scalable Multi-Purpose Network Representation for Large Scale Distributed System Simulation}}, booktitle = {Proc. of the 12th IEEE/ACM International Symposium on Cluster, Cloud and Grid Computing}, year = 2012, pages = {220--227}, address = {Ottawa, Canada} } @InProceedings {simgrid_simix2_12, author = {Martin Quinson and Cristian Rosa and Christophe Thi{\'e}ry}, title = {Parallel Simulation of Peer-to-Peer Systems}, booktitle = {{P}roc. of the 12th {IEEE/ACM} {I}ntl. {S}ymposium on {C}luster, Cloud and Grid {C}omputing}, year = {2012}, address = {Ottawa, Canada} } @InProceedings {DCLV_LSAP_10, title = {{Fast and Scalable Simulation of Volunteer Computing Systems Using SimGrid}}, booktitle = {Proc. of the Workshop on Large-Scale System and Application Performance}, year = {2010}, month = Jun, address = {Chicago, IL}, author = {Donassolo, Bruno and Casanova, Henri and Legrand, Arnaud and Velho, Pedro}, category = {core} } @InProceedings{SMPI_IPDPS, author = {Clauss, Pierre-Nicolas and Stillwell, Mark and Genaud, St\'ephane and Suter, Fr\'ed\'eric and Casanova, Henri and Quinson, Martin}, title = {{Single Node On-Line Simulation of MPI Applications with SMPI}}, booktitle = {Proc. of the 25th IEEE Intl. Parallel and Distributed Processing Symposium}, year = 2011, address = {Anchorage, AK} } @Article{Velho_TOMACS13, author = {Velho, Pedro and Schnorr, Lucas and Casanova, Henri and Legrand, Arnaud}, title = {{On the Validity of Flow-level {TCP} Network Models for Grid and Cloud Simulations}}, journal = {ACM Transactions on Modeling and Computer Simulation}, year = {2013}, PUBLISHER = {ACM}, VOLUME = 23, NUMBER = 4, pages = 23, MONTH = Oct } @article{smpi, TITLE = {Simulating {MPI} applications: the {SMPI} approach}, AUTHOR = {Degomme, Augustin and Legrand, Arnaud and Markomanolis, Georges and Quinson, Martin and Stillwell, Mark S. and Suter, Frédéric}, JOURNAL = {{IEEE Transactions on Parallel and Distributed Systems}}, PUBLISHER = {{Institute of Electrical and Electronics Engineers}}, volume = "28", number = "8", pages = "2387--2400", YEAR = {2017}, MONTH = Feb, DOI = {10.1109/TPDS.2017.2669305}, KEYWORDS = {Simulation ; MPI runtime and applications ; Performance prediction and extrapolation}, PDF = {https://hal.inria.fr/hal-01415484/file/smpi_article.pdf}, HAL_ID = {hal-01415484}, HAL_VERSION = {v2}, category = "core", } @InProceedings{heinrich:hal-01523608, title = "{Predicting the Energy Consumption of {MPI} Applications at Scale Using a Single Node}", author = "Franz C.
Heinrich and Tom Cornebize and Augustin Degomme and Arnaud Legrand and Alexandra Carpen-Amarie and Sascha Hunold and Anne-Cécile Orgerie and Martin Quinson", URL = "https://hal.inria.fr/hal-01523608", booktitle = "Proc. of the 19th IEEE Cluster Conference", year = "2017", keywords = "simulation ; HPC ; energy ; platform modeling", pdf = "https://hal.inria.fr/hal-01523608/file/predicting-energy-consumption-at-scale.pdf", hal_id = "hal-01523608", category = "core", } % Trace extrapolation @InProceedings{scalaextrap, author = {Xing Wu and Frank Mueller}, title = {{S}cala{E}xtrap: Trace-Based Communication Extrapolation for {SPMD} Programs}, booktitle = {Proc. of the 16th ACM Symp. on Principles and Practice of Parallel Programming}, year = {2011}, pages = {113--122}, } @InProceedings{pmac_lspp13, author = {Laura Carrington and Michael Laurenzano and Ananta Tiwari}, title = {Inferring Large-scale Computation Behavior via Trace Extrapolation}, booktitle = {Proc. of the Workshop on Large-Scale Parallel Processing}, year = {2013}, } @Misc{hpl, author = {Antoine Petitet and Clint Whaley and Jack Dongarra and Andy Cleary and Piotr Luszczek}, title = {{HPL} - A Portable Implementation of the {High-Performance Linpack} Benchmark for Distributed-Memory Computers}, howpublished = {\url{http://www.netlib.org/benchmark/hpl}}, month = {February}, year = {2016}, note = {Version 2.2} } @book{top500, author = {Meuer, Hans Werner and Strohmaier, Erich and Dongarra, Jack and Simon, Horst D.}, title = {The {TOP500}: History, Trends, and Future Directions in {High Performance Computing}}, year = {2014}, isbn = {143981595X, 9781439815953}, edition = {1st}, publisher = {Chapman \& Hall/CRC}, } @techreport{dmodk, author = {Eitan Zahavi}, title = {{D-Mod-K} Routing Providing Non-Blocking Traffic for Shift Permutations on Real Life Fat Trees}, institution = {Technion Israel Institute of Technology}, year = {2010}, } #+end_src * Emacs Setup :noexport: # Local Variables: # eval: (require 'org-install) # eval: (org-babel-do-load-languages 'org-babel-load-languages '( (shell . t) (R . t) (perl . t) (ditaa . t) )) # eval: (setq org-confirm-babel-evaluate nil) # eval: (unless (boundp 'org-latex-classes) (setq org-latex-classes nil)) # eval: (add-to-list 'org-latex-classes '("IEEEtran" # "\\documentclass[conference, 10pt]{IEEEtran}\n \[NO-DEFAULT-PACKAGES]\n \[EXTRA]\n \\usepackage{graphicx}\n \\usepackage{hyperref}" ("\\section{%s}" . "\\section*{%s}") ("\\subsection{%s}" . "\\subsection*{%s}") ("\\subsubsection{%s}" . "\\subsubsection*{%s}") ("\\paragraph{%s}" . "\\paragraph*{%s}") ("\\subparagraph{%s}" . "\\subparagraph*{%s}"))) # eval: (add-to-list 'org-latex-classes '("llncs" "\\documentclass{llncs2e/llncs}\n \[NO-DEFAULT-PACKAGES]\n \[EXTRA]\n" ("\\section{%s}" . "\\section*{%s}") ("\\subsection{%s}" . "\\subsection*{%s}") ("\\subsubsection{%s}" . "\\subsubsection*{%s}") ("\\paragraph{%s}" . "\\paragraph*{%s}") ("\\subparagraph{%s}" . "\\subparagraph*{%s}"))) # eval: (add-to-list 'org-latex-classes '("acm-proc-article-sp" "\\documentclass{acm_proc_article-sp}\n \[NO-DEFAULT-PACKAGES]\n \[EXTRA]\n" ("\\section{%s}" . "\\section*{%s}") ("\\subsection{%s}" . "\\subsection*{%s}") ("\\subsubsection{%s}" . "\\subsubsection*{%s}") ("\\paragraph{%s}" . "\\paragraph*{%s}") ("\\subparagraph{%s}" . "\\subparagraph*{%s}"))) # eval: (add-to-list 'org-latex-classes '("sig-alternate" "\\documentclass{sig-alternate}\n \[NO-DEFAULT-PACKAGES]\n \[EXTRA]\n" ("\\section{%s}" . "\\section*{%s}") ("\\subsection{%s}" . 
"\\subsection*{%s}") ("\\subsubsection{%s}" . "\\subsubsection*{%s}") ("\\paragraph{%s}" . "\\paragraph*{%s}") ("\\subparagraph{%s}" . "\\subparagraph*{%s}"))) # eval: (setq org-alphabetical-lists t) # eval: (setq org-src-fontify-natively t) # eval: (setq ispell-local-dictionary "american") # eval: (eval (flyspell-mode t)) # eval: (setq org-todo-keyword-faces '(("FLAWED" . (:foreground "RED" :weight bold)))) # eval: (custom-set-variables '(org-babel-shell-names (quote ("sh" "bash" "csh" "ash" "dash" "ksh" "mksh" "posh" "zsh")))) # eval: (add-to-list 'load-path ".") # eval: (require 'ox-extra) # eval: (setq org-latex-tables-centered nil) # eval: (ox-extras-activate '(ignore-headlines)) # End: