diff --git a/module2/exo4/stat_activity.html b/module2/exo4/stat_activity.html deleted file mode 100644 index 76b7ed142801be3e693ca279159daa1016af4d40..0000000000000000000000000000000000000000 --- a/module2/exo4/stat_activity.html +++ /dev/null @@ -1,687 +0,0 @@ - - - - - - - -Analyse des mots-clés de mon journal - - - - - - - - - - - - - -
-

Keyword analysis of my journal

-
-

Table of Contents

- -
-

-I am lucky enough not to have to account too precisely for the time I spend on this or that task. That's just as well, because I don't really enjoy tracking, precisely and daily, how I spend my time. However, as you may have seen in one of the videos of this module, I write down a lot of information in my journal and I tag (when I think of it) this information. I figured it could be interesting to see whether the evolution of the use of these tags reveals something about my professional interests. I don't expect to draw anything statistically significant from this, since I know that the rigor with which I use these tags, and their semantics, have evolved over the years, but let's see what we find.

- -
-

1 Data formatting

-
-

-My journal is stored in /home/alegrand/org/journal.org. Level-1 entries (one star) indicate the year, level-2 entries (2 stars) the month, level-3 entries (3 stars) the date of the day and, finally, deeper entries describe what I worked on that day. These are generally the entries that are tagged with keywords between ":" at the end of the line.

- -

-I will therefore try to extract the lines starting with three * at the beginning of the line, as well as those starting with a * and ending with keywords (colons, possibly followed by a space). The regular expression is not necessarily perfect, but it gives me a first idea of what I will need to do in terms of reformatting.

- -
-
grep -e '^\*\*\* ' -e '^\*.*:.*: *$' ~/org/journal.org | tail -n 20
-
-
- -
-*** 2018-06-01 vendredi
-**** CP Inria du 01/06/18                                  :POLARIS:INRIA:
-*** 2018-06-04 lundi
-*** 2018-06-07 jeudi
-**** The Cognitive Packet Network - Reinforcement based Network Routing with Random Neural Networks (Erol Gelenbe) :Seminar:
-*** 2018-06-08 vendredi
-**** The credibility revolution in psychological science: the view from an editor's desk (Simine Vazire, UC DAVIS) :Seminar:
-*** 2018-06-11 lundi
-**** LIG leaders du 11 juin 2018                             :POLARIS:LIG:
-*** 2018-06-12 mardi
-**** geom_ribbon with discrete x scale                                  :R:
-*** 2018-06-13 mercredi
-*** 2018-06-14 jeudi
-*** 2018-06-20 mercredi
-*** 2018-06-21 jeudi
-*** 2018-06-22 vendredi
-**** Discussion Nicolas Benoit (TGCC, Bruyère)                    :SG:WP4:
-*** 2018-06-25 lundi
-*** 2018-06-26 mardi
-**** Point budget/contrats POLARIS                         :POLARIS:INRIA:
-
- -

-OK, I'm on the right track. I can see there are quite a few entries without any annotation. Too bad. There are also often several keywords for a single date, and to properly put the date in front of each keyword, I will try a real language rather than doing this with shell commands. I'm from the old generation, so I'm more used to Perl than Python for this kind of thing. Strangely, it is much easier to write (it took me 5 minutes) than to read back… ☺

- -
-
open INPUT, "/home/alegrand/org/journal.org" or die $!;
-open OUTPUT, "> ./org_keywords.csv" or die $!;
-$date="";
-print OUTPUT "Date,Keyword\n";
-%skip = map { $_ => 1 } ("", "ATTACH", "Alvin", "Fred", "Mt", "Henri", "HenriRaf");
-
-while(defined($line=<INPUT>)) {
-    chomp($line);
-    if($line =~ '^\*\*\* (20[\d\-]*)') {
-        $date=$1;
-    }
-    if($line =~ '^\*.*(:\w*:)\s*$') {
-        @kw=split(/:/,$1);
-        if($date eq "") { next;}
-        foreach $k (@kw) {
-            if(exists($skip{$k})) { next;}
-            print OUTPUT "$date,$k\n";
-        }
-    }
-}
-
-
- -
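-For the record, a rough R equivalent of the Perl script above (a sketch with the same logic and the same regular expressions; like the Perl version, it only keeps the last keyword of each tagged line):
-
-lines <- readLines("/home/alegrand/org/journal.org")
-skip <- c("", "ATTACH", "Alvin", "Fred", "Mt", "Henri", "HenriRaf")
-out <- data.frame()
-date <- ""
-for (l in lines) {
-  m <- regmatches(l, regexec("^\\*\\*\\* (20[0-9-]*)", l))[[1]]
-  if (length(m) > 1) date <- m[2]             # remember the current day
-  k <- regmatches(l, regexec("^\\*.*(:\\w*:)\\s*$", l))[[1]]
-  if (length(k) > 1 && date != "")            # a tagged entry under a known date
-    for (kw in setdiff(strsplit(k[2], ":")[[1]], skip))
-      out <- rbind(out, data.frame(Date = date, Keyword = kw))
-}
-write.csv(out, "./org_keywords.csv", row.names = FALSE, quote = FALSE)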

-Let's check what the result looks like:

-
-
head org_keywords.csv
-echo "..."
-tail org_keywords.csv
-
-
- -
-Date,Keyword
-2011-02-08,R
-2011-02-08,Blog
-2011-02-08,WP8
-2011-02-08,WP8
-2011-02-08,WP8
-2011-02-17,WP0
-2011-02-23,WP0
-2011-04-05,Workload
-2011-05-17,Workload
-...
-2018-05-17,POLARIS
-2018-05-30,INRIA
-2018-05-31,LIG
-2018-06-01,INRIA
-2018-06-07,Seminar
-2018-06-08,Seminar
-2018-06-11,LIG
-2018-06-12,R
-2018-06-22,WP4
-2018-06-26,INRIA
-
- -

-Perfect!

-
-
- -
-

2 Basic statistics

-
-

-I am much more comfortable with R than with Python. I will use the tidyverse packages as soon as the need arises. Let's start by reading these data:

-
-
library(lubridate) # install with install.packages("tidyverse")
-library(dplyr)
-df=read.csv("./org_keywords.csv",header=T)
-df$Year=year(date(df$Date))
-
-
- -
-
-Attachement du package : ‘lubridate’
-
-The following object is masked from ‘package:base’:
-
-    date
-
-Attachement du package : ‘dplyr’
-
-The following objects are masked from ‘package:lubridate’:
-
-    intersect, setdiff, union
-
-The following objects are masked from ‘package:stats’:
-
-    filter, lag
-
-The following objects are masked from ‘package:base’:
-
-    intersect, setdiff, setequal, union
-
- -

-So, what do these data look like:

-
-
str(df)
-summary(df)
-
-
- -
-'data.frame':	566 obs. of  3 variables:
- $ Date   : Factor w/ 420 levels "2011-02-08","2011-02-17",..: 1 1 1 1 1 2 3 4 5 6 ...
- $ Keyword: Factor w/ 36 levels "Argonne","autotuning",..: 22 3 36 36 36 30 30 29 29 36 ...
- $ Year   : num  2011 2011 2011 2011 2011 ...
-         Date         Keyword         Year     
- 2011-02-08:  5   WP4     : 77   Min.   :2011  
- 2016-01-06:  5   POLARIS : 56   1st Qu.:2013  
- 2016-03-29:  5   R       : 48   Median :2016  
- 2017-12-11:  5   LIG     : 40   Mean   :2015  
- 2017-12-12:  5   Teaching: 38   3rd Qu.:2017  
- 2016-01-26:  4   WP7     : 36   Max.   :2018  
- (Other)   :537   (Other) :271
-
- -

-The types look correct, 566 entries, everything is fine.

-
-
df %>% group_by(Keyword, Year) %>% summarize(Count=n()) %>% 
-   ungroup() %>% arrange(Keyword,Year) -> df_summarized
-df_summarized
-
-
- -
-# A tibble: 120 x 3
-   Keyword     Year Count
-   <fct>      <dbl> <int>
- 1 Argonne     2012     4
- 2 Argonne     2013     6
- 3 Argonne     2014     4
- 4 Argonne     2015     1
- 5 autotuning  2012     2
- 6 autotuning  2014     1
- 7 autotuning  2016     4
- 8 Blog        2011     2
- 9 Blog        2012     6
-10 Blog        2013     4
-# ... with 110 more rows
-
- -

-Let's start by counting how many annotations I make per year.

-
-
df_summarized_total_year = df_summarized %>% group_by(Year) %>% summarize(Count=sum(Count))
-df_summarized_total_year
-
-
- -
-# A tibble: 8 x 2
-   Year Count
-  <dbl> <int>
-1  2011    24
-2  2012    57
-3  2013    68
-4  2014    21
-5  2015    80
-6  2016   133
-7  2017   135
-8  2018    48
-
- -

-Ah, apparently I am improving over time, and in 2014 I forgot to do it regularly.

- -

-Since the annotation is free-form, some keywords may be very rare. Let's have a look.

-
-
df_summarized %>% group_by(Keyword) %>% summarize(Count=sum(Count)) %>%  arrange(Count) %>% as.data.frame()
-
-
- -
-         Keyword Count
-1       Gradient     1
-2          LaTeX     1
-3         Orange     1
-4             PF     1
-5        twitter     2
-6            WP1     2
-7            WP6     2
-8   Epistemology     3
-9           BULL     4
-10 Vulgarization     4
-11      Workload     4
-12    GameTheory     5
-13      noexport     5
-14    autotuning     7
-15        Python     7
-16         Stats     7
-17           WP0     7
-18            SG     8
-19           git     9
-20     HACSPECIS    10
-21          Blog    12
-22         BOINC    12
-23          HOME    12
-24           WP3    12
-25       OrgMode    14
-26       Argonne    15
-27        Europe    18
-28       Seminar    28
-29           WP8    28
-30         INRIA    30
-31           WP7    36
-32      Teaching    38
-33           LIG    40
-34             R    48
-35       POLARIS    56
-36           WP4    77
-
- -

-OK, in the following I will probably restrict myself to the keywords that appear more than three times.

-
-
- -
-

3 Graphical representations

-
-

-To do this properly, I should define a semantics and a hierarchy for these keywords, but I'm short on time right now. Since I am removing the infrequent keywords, I will still also add the total number of keywords, to get an idea of what I have lost. Let's try a first graphical representation:

-
-
library(ggplot2)
-df_summarized %>% filter(Count > 3) %>%
-    ggplot(aes(x=Year, y=Count)) + 
-    geom_bar(aes(fill=Keyword),stat="identity") + 
-    geom_point(data=df_summarized %>% group_by(Year) %>% summarize(Count=sum(Count))) +
-    theme_bw()
-
-
- - -
-

barchart1.png -

-
- -

-Ouch. This is unreadable with such a color palette but, since there are many different values, it is hard to use a more discriminating one. I'll still give it a quick try, just to say I did… For this, I will use a color palette ("Set1") whose colors are all quite distinct, but it only has 9 colors. So I will start by selecting the 9 most frequent keywords.

- -
-
library(ggplot2)
-frequent_keywords = df_summarized %>% group_by(Keyword) %>% 
-    summarize(Count=sum(Count)) %>%  arrange(Count) %>% 
-    as.data.frame() %>% tail(n=9)
-
-df_summarized %>% filter(Keyword %in% frequent_keywords$Keyword) %>%
-    ggplot(aes(x=Year, y=Count)) + 
-    geom_bar(aes(fill=Keyword),stat="identity") + 
-    geom_point(data=df_summarized %>% group_by(Year) %>% summarize(Count=sum(Count))) +
-    theme_bw() + scale_fill_brewer(palette="Set1")
-
-
- - -
-

barchart2.png -

-
- -

-OK. Clearly, the share related to administration (Inria, LIG, POLARIS) and to teaching (Teaching) is increasing. The increase of the parts about R is, in my eyes, a sign that my mastery of the tool is improving. The increase of the Seminar part does not mean much, because it is only recently that I started to systematically tag the notes I take when attending a talk. The WP tags relate to the terminology of an old ANR project that I kept using (WP4 = HPC performance prediction, WP7 = analysis and visualization, WP8 = experiment design and experiment engines…). The fact that WP4 decreases is rather due to the information on this topic now mostly living in the journals of my PhD students, who actually do the things that I merely supervise.

- -

-Well, an analysis of this kind would not be worthy of the name without a wordcloud (often unreadable, but so sexy! ☺). For this, I freely draw on this post: -http://onertipaday.blogspot.com/2011/07/word-cloud-in-r.html

- -
-
library(wordcloud) # install with install.packages("wordcloud")
-library(RColorBrewer)
-pal2 <- brewer.pal(8,"Dark2")
-df_summarized %>% group_by(Keyword) %>% summarize(Count=sum(Count)) -> df_summarized_keyword
-wordcloud(df_summarized_keyword$Keyword, df_summarized_keyword$Count,
-     random.order=FALSE, rot.per=.15, colors=pal2, vfont=c("sans serif","plain"))
-
-
- - -
-

wordcloud.png -

-
- -

-Well… there you go, it's "pretty" but of little interest, especially when there are so few distinct words.

-
-
-
-
-

Author: Arnaud Legrand

-

Created: 2018-09-05 Wed. 07:41

-

Validate

-
- - diff --git a/module2/exo5/challenger.html b/module2/exo5/challenger.html deleted file mode 100644 index 48c246cb304020a805f92e339fea3482e9f17fec..0000000000000000000000000000000000000000 --- a/module2/exo5/challenger.html +++ /dev/null @@ -1,577 +0,0 @@ - - - - - - -Analyse du risque de défaillance des joints toriques de la navette Challenger - - - - - - - - - - - - - - - -
-

Analysis of the risk of O-ring failure on the space shuttle Challenger

-

-Preamble: The explanations given in this document about the context of the study are largely taken from Edward R. Tufte's excellent book Visual Explanations: Images and Quantities, Evidence and Narrative, published in 1997 by Graphics Press and reissued in 2005, as well as from the article by Dalal et al. entitled Risk Analysis of the Space Shuttle: Pre-Challenger Prediction of Failure, published in 1989 in the Journal of the American Statistical Association.

- -
-

1 Context

-
-

-In this study, we propose to revisit the accident of the space shuttle Challenger. On January 28, 1986, 73 seconds after liftoff, the shuttle Challenger disintegrated (see Figure 1), killing the seven astronauts on board. The explosion was caused by the failure of the two O-rings sealing the junction between the upper and lower parts of the boosters (see Figure 2). These rings lost their efficiency because of the unusual cold at the time of launch. Indeed, the temperature that morning was just below 0°C, whereas all previous flights had taken place at a temperature at least 7 to 10°C higher.

- - -
-

challenger5.jpg -

-

Figure 1: Photos of the Challenger disaster.

-
- - - -
-

o-ring.png -

-

Figure 2: Diagram of the boosters of the Challenger shuttle. The rubber O-rings (a primary ring and a secondary ring), more than 11 meters in circumference, seal the junction between the upper and the lower part of the booster.

-
- -

-The most astonishing part is that the precise cause of this accident had been intensely debated several days earlier and was still being discussed the very day before liftoff, during a three-hour teleconference between engineers from Morton Thiokol (the manufacturer of the engines) and NASA. While the immediate cause of the accident (the failure of the O-rings) was quickly identified, the deeper reasons that led to this disaster regularly serve as a case study, whether in management courses (work organization, technical decisions under political pressure, communication problems), statistics courses (risk assessment, modeling, data visualization) or sociology courses (symptom of a history, of bureaucracy and of conformity to organizational norms).

- -

-In the study we propose, we are mainly interested in the statistical aspect, which is therefore only one (extremely limited) facet of the problem, and we invite you to read the documents given as references in the preamble by yourself. The following study thus reproduces part of the analyses carried out that night, whose goal was to evaluate the potential influence of the temperature and of the pressure to which the O-rings are subjected on their probability of malfunction. For this, we have the results of the experiments carried out by NASA engineers during the 6 years preceding the launch of the Challenger shuttle.

- -

-In the module2/exo5/ directory of your gitlab space, you will find the original data as well as an analysis for each of the proposed paths. This analysis consists of four steps:

-
    -
  1. Loading the data
  2. Graphical inspection of the data
  3. Estimation of the influence of temperature
  4. Estimation of the probability of O-ring malfunction
- -

-The first two steps only require basic skills in R or Python. The third step assumes some familiarity with logistic regression (generally covered in the 3rd year of a bachelor's or the 1st year of a master's degree in statistics, econometrics, bio-statistics…), and the fourth step requires basic notions of probability (high-school level). In the next section we therefore present an introduction to logistic regression that does not dwell on the details of the computation, but only on the meaning given to the results of this regression.

-
-
- -
-

2 Introduction to logistic regression

-
-

-Suppose we have the following data, which indicate, for a cohort of individuals, whether they developed a particular disease or not. I show the analysis with R here, but the Python code would not be very different. The data are stored in a data frame, of which here is a brief summary:

- -
-
summary(df)
-str(df)
-
-
- -
-      Age            Malade     
- Min.   :22.01   Min.   :0.000  
- 1st Qu.:35.85   1st Qu.:0.000  
- Median :50.37   Median :1.000  
- Mean   :50.83   Mean   :0.515  
- 3rd Qu.:65.37   3rd Qu.:1.000  
- Max.   :79.80   Max.   :1.000
-'data.frame':	400 obs. of  2 variables:
- $ Age   : num  75.1 76.4 38.6 70.2 59.2 ...
- $ Malade: int  1 1 0 1 1 1 0 0 1 1 ...
-
- -

-Here is a graphical representation of the data, which gives a better sense of the link there may be between age and whether or not this disease is contracted:

-
-
ggplot(df,aes(x=Age,y=Malade)) + geom_point(alpha=.3,size=3) + theme_bw()
-
-
- - -
-

-[Figure: scatter plot of Malade (0/1) as a function of Age]

-
- -

-It clearly appears from these data that the older you are, the higher the probability of developing this disease. But how can we estimate this probability from these binary values (sick/not sick) alone? For each age group (of 5 years, for example), we could look at the frequency of the disease (the following code is a bit complicated, because computing the confidence interval for this type of data requires a specific treatment via the binconf function).

- -
-
library(Hmisc) # provides the binconf function used below
-age_range=5
-df_grouped = df %>% mutate(Age=age_range*(floor(Age/age_range)+.5)) %>%
-                    group_by(Age) %>% summarise(Malade=sum(Malade),N=n()) %>% 
-                    rowwise() %>% 
-                    do(data.frame(Age=.$Age, binconf(.$Malade, .$N, alpha=0.05))) %>%
-                    as.data.frame()
-
-ggplot(df_grouped,aes(x=Age)) + geom_point(data=df,aes(y=Malade),alpha=.3,size=3)  +
-    geom_errorbar(data=df_grouped,
-                  aes(x=Age,ymin=Lower, ymax=Upper, y=PointEst), color="darkred") +
-    geom_point(data=df_grouped, aes(x=Age, y=PointEst), size=3, shape=21, color="darkred") +
-    theme_bw() 
-
-
- - -
-

-[Figure: disease frequency per 5-year age bin, with 95% confidence intervals, overlaid on the raw data]

-
- -

-The drawback of this approach is that the computation is carried out independently for each age group, that the width of the age groups is arbitrary, and that we get no real idea of how the probability evolves. To model this evolution in a more continuous way, we could try a linear regression (the simplest possible model to account for the influence of a parameter) and thereby estimate the effect of age on the probability of being sick:

-
-
ggplot(df,aes(x=Age,y=Malade)) + geom_point(alpha=.3,size=3) + 
-    theme_bw() + geom_smooth(method="lm")
-
-
- - -
-

-[Figure: linear regression of Malade on Age, with its 95% confidence band]

-
- -

-The blue line is the least-squares linear regression and the gray area is the 95% confidence region of this estimate (with the data at hand and this linearity assumption, the blue line is the most probable one and there is a 95% chance that the true line lies within this gray area).

- -

-But this graphical representation clearly shows that this estimate makes no sense. A probability must lie between 0 and 1, and with a linear regression we will inevitably end up, for somewhat extreme values (young or old), with absurd predictions (negative or greater than 1). This is simply because a linear regression assumes that \(\textsf{Malade} = \alpha.\textsf{Age} + \beta + \epsilon\), where \(\alpha\) and \(\beta\) are real numbers and \(\epsilon\) is a noise term (a random variable with zero mean), and estimates \(\alpha\) and \(\beta\) from the data.

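-To see the problem concretely, here is a minimal sketch (on the same df) that fits this linear model and predicts at ages outside the observed range:
-
-fit_lm <- lm(Malade ~ Age, data = df)  # least-squares estimate of alpha and beta
-predict(fit_lm, newdata = data.frame(Age = c(0, 40, 120)))
-# ages far outside the data yield values below 0 or above 1,
-# which cannot be probabilities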
- -

-This technique makes no sense for estimating a probability, and one should therefore use what is called a logistic regression:

-
-
ggplot(df,aes(x=Age,y=Malade)) + geom_point(alpha=.3,size=3) + 
-    theme_bw() + 
-    geom_smooth(method = "glm", 
-        method.args = list(family = "binomial")) + xlim(20,80)
-
-
- - -
-

-[Figure: logistic regression of Malade on Age, with its 95% confidence band]

-
- -

-Here, the ggplot library does all the logistic regression computations for us and only shows us the "graphical" result, but in the analysis we propose for Challenger, we perform the regression and the prediction by hand (in R or in Python, depending on the path you choose), so as to allow a finer inspection if needed. As before, the blue curve indicates the estimate of the probability of being sick as a function of age, and the gray area gives us an indication of the uncertainty of this estimate, i.e., "under these hypotheses, and given the few data we have and their variability, there is a 95% chance that the true curve lies somewhere (anywhere) within the gray area".

- -

-In this model, we assume that \(P[\textsf{Malade}] = \pi(\textsf{Age})\) with \(\displaystyle\pi(x)=\frac{e^{\alpha.x + \beta}}{1+e^{\alpha.x + \beta}}\). This formula (strange at first sight) has the nice property of always giving a value between 0 and 1, and of quickly tending to \(0\) as age tends to \(-\infty\) and to \(1\) as age tends to \(+\infty\) (but this is of course not the only motivation).

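-To make the "by hand" version announced above concrete, here is a minimal sketch (same df) that estimates \(\alpha\) and \(\beta\) with glm and evaluates \(\pi(x)\) from the coefficients:
-
-fit <- glm(Malade ~ Age, data = df, family = "binomial")
-beta <- coef(fit)[1]   # intercept
-alpha <- coef(fit)[2]  # effect of one additional year of age
-pi_x <- function(x) exp(alpha * x + beta) / (1 + exp(alpha * x + beta))
-pi_x(c(30, 50, 70))    # estimated P[Malade] at ages 30, 50 and 70
-# same result with the built-in prediction on the probability scale:
-predict(fit, newdata = data.frame(Age = c(30, 50, 70)), type = "response")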
- -

-In conclusion, when we have event (binary) data and we wish to estimate the influence of a parameter on the probability of occurrence of the event (disease, failure…), the most natural and simplest model is logistic regression. Note that, even when restricting ourselves to a small part of the data (for example, only the patients under 50), it is possible to obtain a fairly reasonable estimate, even if, as one might expect, the uncertainty increases markedly.

- -
-
ggplot(df[df$Age<50,],aes(x=Age,y=Malade)) + geom_point(alpha=.3,size=3) + 
-    theme_bw() + 
-    geom_smooth(method = "glm", 
-        method.args = list(family = "binomial"),fullrange = TRUE) + xlim(20,80)
-
-
- - -
-

-[Figure: logistic regression fitted on patients under 50 only, extrapolated over the 20-80 age range]

-
-
-
-
-
-

Date: June 2018

-

Author: Konrad Hinsen, Arnaud Legrand, Christophe Pouzat

-

Validate

-
- - diff --git a/module2/ressources/video_examples/README.html b/module2/ressources/video_examples/README.html deleted file mode 100644 index 7f33fd21fd3cb7b1d0c1125552afa192d069b476..0000000000000000000000000000000000000000 --- a/module2/ressources/video_examples/README.html +++ /dev/null @@ -1,312 +0,0 @@ - - - - - - -Org document examples - - - - - -
-

Org document examples

-

-In the MOOC video, I quickly demo how org-mode can be used in various contexts. Here are the (sometimes trimmed) corresponding org-files. These documents depend on many other external data files and are not meant to lead to reproducible documents, but they will give you an idea of how such files can be organized:

- -
    -
  1. journal.org: an excerpt (I've only left a few code samples and links to some resources on R, Stats, …) from my own journal. This is a personal document where everything (meeting notes, hacking, random thoughts, …) goes by default. Entries are created with the C-c c shortcut.
  2. labbooksingle.org: this is an excerpt from the laboratory notebook Tom Cornebize wrote during his Master thesis internship under my supervision. This is a personal labbook. I consider this notebook to be excellent: it had the ideal level of detail for us to communicate without any ambiguity and for him to move forward with confidence.
  3. paper.org: this is an ongoing paper based on the previous labbook of Tom Cornebize. As such it is not reproducible, as there are hardcoded paths and uncleaned dependencies, but writing it from the labbook was super easy as we just had to cut and paste the parts we needed. What may be interesting is the organization and the org tricks to export to the right LaTeX style. As you may notice, at the end of the document, there is a commented section with emacs commands that are automatically executed when opening the file. It is an effective way to depend less on the .emacs/init.el, which is generally customized by everyone.
  4. labbookseveral.org: this is a labbook for a specific project shared by several persons. As a consequence it starts with information about installation and common scripts, has a section with notes about all our meetings, a section with information about experiments and another one about analysis. Entries could have been labeled with who wrote them, but there were only a few of us and this information was available in git, so we did not bother. In such a labbook, it is common to find annotations indicating that an experiment was :FLAWED: as it had some issues.
  5. technicalreport.org: this is a short technical document I wrote after a colleague sent me a PDF describing an experiment he was conducting and asked me how reproducible I felt it was. It turned out I had to cut and paste the C code from the PDF, then remove all the line numbers and fix the syntax, etc. Obviously I got quite different performance results, but writing everything in org-mode made it very easy to generate both HTML and PDF and to explicitly explain how the measurements were done.
- -

-Here are a few links to other kinds of examples:

- -
-
-

Validate

-
- - diff --git a/module2/ressources/video_examples/labbook_single.html b/module2/ressources/video_examples/labbook_single.html deleted file mode 100644 index 0cbb165cd9cc24445c727a77cb36202ab5eebe86..0000000000000000000000000000000000000000 --- a/module2/ressources/video_examples/labbook_single.html +++ /dev/null @@ -1,10103 +0,0 @@ - - - - - - -Research journal - - - - - - - -
-

Research journal

-
-

Table of Contents

-
- -
-
- -
-

1 2017

-
-
-
-

1.1 2017-02 February

-
-
-
-

1.1.1 2017-02-06 Monday

-
-
-
    -
  1. DONE Read An Evaluation of Network Architectures for Next Generation Supercomputers   PAPER
    -
    -

    -Bibtex: Chen:2016:ENA:3019057.3019059 -

    -
      -
    • The authors roughly do what we want to do: they use a simulator to do performance evaluation of different topologies, -with different workloads and routing algorithms.
    • -
    • In a first part, they detail what are these topologies, routing algorithms and workloads. This could give us some -ideas of what to test. Maybe we could try to reproduce their results?
    • -
    • They focus on topologies having: -
        -
      • Full uniform bandwidth.
      • -
      • Have good partitionability and can be grown modularly.
      • -
      • Come at a lower cost than a 3-level fat tree (which is the state of the art in terms of pure performances).
      • -
    • -
    • They test an adversarial traffic pattern (task i sends to task (i+D) mod G, tuned to “be bad”).
        -
      • Fat tree has great performances, regardless of the routing algorithms.
      • -
      • Other topologies (Dragonfly+, Stacked all-to-all, Stacked 2D hyperX) have terrible performances with direct -routing. For indirect or adaptive routing, performances are much better (but still a factor 2 lower than the fat -tree).
      • -
    • -
    • Then, they test neighbor traffic (the logical topology is a grid for instance). -
        -
      • Again, the fat tree has nearly full performances, regardless of the routing algorithm.
      • -
      • Other topologies have lower performances with indirect routing. Their performances are ok with direct or adaptive -routing.
      • -
    • -
    • Next, they look at AMR. -
        -
      • Here, all topologies and routing algorithms have poor performances.
      • -
      • The average throughput is high at the beginning, but decreases very quickly to nearly 0. This long tail with low -throughput accounts for the major part of the execution time.
      • -
      • Thus, AMR seems to be inherently bad for parallelism.
      • -
    • -
    • To sum up, the best routing algorithm is the adaptive routing (except maybe for the fat tree), the best topology is -the fat tree.
    • -
    • The authors then had a look at random mappings of the processes to the nodes (until now, the mapping was ideal). This could reflect what a scheduler that is not topology-aware would do. In general, with adaptive routing, the Fat Tree and the Dragonfly+ are very robust to irregular placements; the completion time is not impacted too much. This is not the case for stacked topologies (due to a lack of path diversity). Thus, we should use a topology-aware job scheduler, especially for stacked topologies. With non-adaptive routing, all the topologies suffer from performance degradation.
    • -
    -
    -
  2. -
  3. DONE Talk with Arnaud about the internship.   MEETING
    -
      -
    1. Two possible things to have a look at.
      -
      -
        -
      • Simulate the impact of network failures on the performances. -
          -
        • May need to work on Simgrid implementation, to handle failures.
        • -
        • A recent paper has shown that, in their case, removing one of the two root switches of their fat tree did not significantly impact performance.
        • -
        • A reason is that jobs rarely occupy the full tree, they are localized in one of its sub-trees. Thus, nearly no -communication go to the top switches.
        • -
      • -
      • Model the Stampede supercomputer in Simgrid.
          -
        • It uses a fat tree topology.
        • -
        • We have access to real benchmark results.
        • -
        • We have access to its configuration (e.g. OS and compiler used).
        • -
      • -
      -
      -
    2. -
    3. A first step for this internship would be to run HPL on a fat tree, with Simgrid.
    4. -
    5. Some features of Simgrid to speedup a simulation.
      -
      -
        -
      • A macro to only run the first steps of a loop and infer the total time from it.
      • -
      • An allocator (replacing malloc/free) to share memory between processes.
      • -
      -
      -
    6. -
    7. Some technical details about Simgrid
      -
      -
        -
      • For every process, we run each piece of code until we reach a MPI operation. This gives us the execution time of this -code block.
      • -
      • We know all the communication flows of the “current step”, thanks to the routing. We thus have a list of linear constraints (e.g. the bandwidth of all the flows going through the same link must not exceed the capacity of that link). We solve this by maximizing the minimum bandwidth over all flows (empirically, this is close to reality, where flows get a fair share of the resources); a toy version of this max-min sharing is sketched at the end of this day's notes.
      • -
      • Routing is made with an AS hierarchy. There are local routing decisions (within an -AS) and global routing decisions (between two AS).
      • -
      -
      -
    8. -
    9. There exists other simulators.
      -
      -

-Mainly CODES/ROSS. Discrete event simulators, so they consider the problem at a lower level. But being too precise has some drawbacks:

      -
        -
      • The exact version of every piece of code can have noticeable impact → tedious to calibrate.
      • -
      • The simulation takes much more time, does not scale as much as Simgrid.
      • -
      -
      -
    10. -
    -
  4. -
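-A toy R version of the max-min bandwidth sharing mentioned above (my own illustration, not SimGrid's actual solver): raise every unfrozen flow's bandwidth uniformly until some link saturates, freeze the flows crossing it, and repeat.
-
-max_min_share <- function(capacity, routes) {
-  bw <- rep(0, length(routes))           # bandwidth allotted to each flow
-  frozen <- rep(FALSE, length(routes))   # flows crossing a saturated link
-  crosses <- function(l) sapply(routes, function(r) l %in% r)
-  while (any(!frozen)) {
-    slack <- sapply(seq_along(capacity), function(l) {
-      n <- sum(!frozen & crosses(l))     # unfrozen flows on link l
-      if (n == 0) Inf else (capacity[l] - sum(bw[crosses(l)])) / n
-    })
-    inc <- min(slack)                    # largest uniform increase possible
-    bw[!frozen] <- bw[!frozen] + inc
-    for (l in which(slack == inc)) frozen[crosses(l)] <- TRUE
-  }
-  bw
-}
-# two flows share link 1 (capacity 1); flow 2 also crosses link 2 (capacity 10):
-max_min_share(c(1, 10), list(1, c(1, 2)))   # -> 0.5 0.5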
-
-
-

1.1.2 2017-02-07 Tuesday

-
-
-
    -
  1. DONE Begin writing a journal \o/
  2. -
  3. DONE Read Characterizing Parallel Scientific Applications on Commodity Clusters: An Empirical Study of a Tapered Fat-Tree   PAPER
    -
    -

    -Bibtex: Leon:2016:CPS:3014904.3015009 -

    -
      -
    • The authors want to characterize the behavior of applications that run on their clusters, with an emphasis on -communication requirements.
    • -
    • This should help to make more informed choices when building new clusters (should we use our budget to get more links -or more nodes?).
    • -
    • They measured the utilization of their cluster during one week. It has a fat tree topology. The measurements show -that the network is not used very much: the maximal link utilization is approximately 50%, the average link -utilization is 2.4%.
    • -
    • They did the same measures with a tapered fat tree (they removed one of the root switches). Except for some outliers reaching a 90% link utilization at some point, this modification did not have a major impact on the link utilization, which was 3.9%.
    • -
    • The authors recorded which type of jobs were submitted. A great majority of them was really small. 95% of jobs have -at most 16 nodes, 76% have only one node. Jobs of less than 64 nodes consume 75% of the time. Thus, if the jobs are -well placed, the need for distant communications is very low, which explains the good performances of the tapered fat -tree. Of course this may change from one cluster to another, so we should reproduce these measurements and make our -own conclusions.
    • -
    • Then, the authors removed one of their two top switches.
    • -
    • A first micro-benchmark shows that it only impacts the aggregate bisection bandwidth, for large messages (> 32kB).
    • -
    • Then, they evaluated the impact of the tapering on the performances of several “real-life” applications.
    • -
    • They found that only one of these applications was sensitive to the tapering. This application performs collective communications as well as point-to-point communications of large messages.
    • -
    • However, the impact on the execution time of this application remains small: only 1-2% (it impacts its communication -time by 6-7.5% which itself accounts for only 9-15%). Furthermore, this only happens for a large number of nodes (> -512).
    • -
    • Finally, the authors claim that next generation hardware (faster CPU, memory and network, accelerators…) will lead to some rewriting of the applications to leverage this new hardware. In some applications, message sizes will be larger. Thus, a tapered fat tree may have more impact with this new hardware; new experiments will be needed to find out.
    • -
    -
    -
  4. -
  5. DONE Some thoughts regarding previous paper, to discuss with Arnaud   PAPER MEETING
    -
    -
    -
      -
    1. Can we have data about the utilization of clusters we will work on (Stampede, Bull)?
      -
      -
        -
      • It would help us to find relevant hypothesis (e.g. “pruning the fat-tree will not have any impact”).
      • -
      • We need this for the simulation. What kind of jobs should we run? Small ones? Large ones?
      • -
      -
      -
    2. -
    3. Can we have data about the nature of the jobs submitted on these clusters?
      -
      -
        -
      • What are these applications?
      • -
      • What fraction of the time do they use for communications?
      • -
      • Small or large messages?
      • -
      • Again, it will help us to make hypothesis and perform meaningful experimentations.
      • -
      -

      -→ It changes a lot from one cluster to another, or even across time. It is also hard to record (a batch scheduler does -not know the nature of the jobs that it handles). -

      -
      -
    4. -
    5. How to simulate “big nodes”?
      -
      -
        -
      • Can we simulate MPI+OpenMP programs with Simgrid?
      • -
      • The paper from Christian explains briefly how Simgrid simulates multi-core machines (with one MPI process per core, no -threads). Why don't they talk about it in the other paper? Both papers are from the same year.
      • -
      -

      -→ It would be very hard to support OpenMP in Simgrid, the standard is quite big. Also, in OpenMP, communications are -made with shared memory, so much more difficult to track than MPI communications. -

      -
      -
    6. -
    7. Are time-independent traces larger than the “classical” traces?
      -
      -

      -→ No, same thing. -

      -
      -
    8. -
    9. With ScalaTrace, traces have “near-constant size”. How?
      -
      -

      -→ Compression, lossless or lossy. -

      -
      -
    10. -
    11. What is “detached mode” in point-to-point communication?
      -
      -
        -
      • Does the OS of the sender interrupt it, to ask it to send the data?
      • -
      • If so, why is large mode slower for the sender? In detached mode, the sender has to stop what it is doing, -whereas in synchronous mode it is waiting.
      • -
      -

      -→ Yes, the kernel interrupts the sender when the receiver is ready. Simgrid does not model the small messages used for -the synchronization. -

      -
      -
    12. -
    13. What does the community think of closed-source simulators, like xSim? Researchers behind xSim are making strong claims that cannot be verified by independent researchers…
    14. -
    15. Why are there never confidence intervals in the plots of Simgrid's papers?
      -
      -

-→ They are often not needed, because the variation is too small.

      -
      -
    16. -
    17. About the paper Git/Org-mode
      -
      -
        -
      • Is there an implementation somewhere? Creating custom git commands seems really easy. -→ Yes, but not packaged yet. To test the “beta version”, ask Vincent.
      • -
      • Was not convinced by the subsection 4.3.3 (“fixing code”). When terminating an experiment, they revert all the changes -made to the source code since these may be ad hoc changes. Then the user has to cherry pick the changes (s)he wants to -keep. Sounds really dirty… It seems better to have generic scripts that you configure by giving command line -arguments and/or configuration files. Then you can simply put these arguments/files in the journal.
      • -
      -
      -
    18. -
    19. Routing in Simgrid (according to the doc)
      -
      -
        -
      • Routing tables are static (to achieve high performance). → Does it mean that handling link failures and dynamic re-routing will require a large code refactoring? What about the performance penalty?
      • -
      • Routing algorithms are either based on short path (e.g. Floyd, Dijkstra) or manually entered. What about “classical” -algorithms like D-mod-K? An example is provided on Github. The example implements a two levels fat-tree with -D-mod-K. However, D-mod-K is not specified in the XML, it seems to be implicit. Does it mean that we are forced to use -this routing algorithm for fat trees?
      • -
      -

      -→ Read the code. Shortest path routing is a feature introduced by some Belgian researchers. For specific topologies like -fat-trees, the routing algorithm is hard-coded. -

      -
      -
    20. -
    -
  6. -
  7. DONE Read Simulating MPI applications: the SMPI approach   PAPER
    -
    -

    -Bibtex: degomme:hal-01415484 -

    -
      -
    • This paper is about simulation of HPC systems.
    • -
    • The authors claim that some research papers are based on simulation made with one-off programs with poor -documentation, making simplifying assumptions. Worse, these programs are sometimes not public. This is a big issue for -reproducibility.
    • -
    • The whole paper consider several important aspects that a good simulator should take care of.
    • -
    • Several use cases for simulation. -
        -
      • Quantitative performance evaluation (what will be the performances if we take a bigger version of our hardware?).
      • -
      • Qualitative performance evaluation (what will be the performances if we take different hardware?).
      • -
      • Detection of hardware misconfiguration (leading to unexpected performance behaviors).
      • -
      • MPI runtime tuning (e.g. choosing the algorithms of MPI collective operations).
      • -
      • Teaching (supercomputers are expensive, we cannot let the students play with them).
      • -
    • -
    -
    -
      -
    1. Capturing the behavior of an application.
      -
      -
        -
      • Off-line simulation. A trace of MPI communication events is first obtained and then replayed. -
          -
        • We measure the durations of the CPU bursts. Then, when replaying the application, we modify them to account for the -performance differences between the target platform and the platform used to get the traces.
        • -
        • One problem is the size of the traces, which can be very large. To fix this, we may only record aggregated -statistics. They can be enough to detect some anomalies, but we cannot do more in-depth analysis.
        • -
        • Another issue is extrapolation. Being able to extrapolate in the general case requires assumptions that are hard to justify.
        • -
        • In SMPI, they use “time-independent traces”. Instead of recording time durations, they log the number of -instructions and the number of bytes transferred by MPI primitives. These are independent of the hardware, so the -extrapolation issue is fixed.
        • -
        • It does not solve anything for applications that adapt their behavior to the platform. But this is hopeless with -off-line simulation.
        • -
        • There is still the issue of very large traces, they grow linearly with the problem size and the number of -processes. It seems to be fixed by ScalaTrace, but no explanation is given.
        • -
      • -
      • On-line simulation. The actual application code is executed, part of the instruction stream is intercepted and passed -to a simulator. -
          -
          • Several challenges: intercepting MPI calls, interactions between the application and the simulation kernel, obtaining full coverage of the MPI standard, over-subscribing resources.
        • -
          • Several possibilities to capture MPI calls. Use the PMPI interface (provided by every MPI implementation), but limited to the high-level calls. Design a specific MPICH or OpenMPI driver, but this ties the solution to a specific implementation. One can also develop an ad-hoc implementation of the MPI standard.
        • -
          • Many tools fold the application into a single process with several threads. This raises an issue for global variables: they must be protected. One can duplicate the memory area of the global variables, or use a trick based on the Global Offset Table (GOT).
        • -
        • SMPI is based on a complete reimplementation of MPI standard. No full-coverage yet (e.g. remote memory access or -multithreaded MPI applications).
        • -
        • Run MPICH internal compliance tests as part of their automatic testing.
        • -
        • To protect global variables, duplicate their memory zone using mmap (smart thing, much more efficient thanks to -COW).
        • -
      • -
      -
      -
    2. -
    3. Modeling the infrastructure (network and CPU)
      -
      -
        -
      • Network modeling. -
          -
          • Several solutions exist to model the network.
        • -
        • Packet-level simulation, here we look at individual packets. It is very precise, but it is hard to know precisely -what we are modeling. Being precise with a wrong model is useless. Moreover, this model is very costly in terms of -simulations.
        • -
          • Flow model. The finest grain here is the communication. Time to transfer a message of size S from i to j: L(i,j) + S/B(i,j), where L(i,j) is the latency and B(i,j) the bandwidth allotted to the flow. The B(i,j) are not constant, they need to be re-evaluated at every moment. This model captures some complex behaviors (e.g. the RTT unfairness of TCP). Quite complex to implement, more costly than the delay model. Also, until recently, contentions could be neglected.
        • -
          • Delay model: we have equations describing the communication times (e.g. LogP, LogGPS). It is elegant and cheap in terms of simulation, but very imprecise. It does not take into account the network topology (and possible contention) and assumes a processor can only send one message at a time (single-port model).
        • -
        • SMPI uses a hybrid network model. Point-to-point communications are divided in three modes: asynchronous, detached -and synchronous. Each mode has different values of bandwidth and latency, estimated by doing some benchmarks and -then a linear regression.
        • -
        • To modelize network contentions, SMPI has three logical links for any physical link: a downlink, an uplink, and a -limiter link. The bandwidth of uploads (resp. downloads) must be lower than the capacity of uplinks -(resp. downlinks). The sum of the bandwidths must be lower than the capacity of the limiter link.
        • -
      • -
      • CPU modeling. -
          -
        • Like network modeling, several solutions.
        • -
        • Microscopic models, very precise, but also very costly.
        • -
        • Models with a coarser grain. For instance, we neglect the CPU load induced by communications → focus on Sequential -Execution Blocks (SEB).
        • -
          • Most simplistic model: “CPU A is x times faster than CPU B”. Results are OK for similar architectures, but not precise at all if they are too different: for instance, number of registers, number of hyperthreading cores, speed of floating-point computations, bandwidth to memory, etc.
        • -
        • Thus, impossible to predict precisely without a perfect knowledge of the system state (and therefore a microscopic -model).
        • -
          • Approach of SMPI: run SEBs on a processor of the target architecture. Predict the performance of similar architectures by applying a constant factor.
        • -
        • Also, not all the code logic is data dependent. We can therefore greatly decrease the simulation time with two -tricks. -
            -
          • Kernel sampling. Annotate some regions with macros. Execute them only a few times to obtain estimations, then skip -them.
          • -
          • Memory folding. Share some data structures across processes.
          • -
        • -
      • -
      -
      -
    4. -
    5. Modeling the collective operations
      -
      -
        -
        • Again, several solutions for the modeling.
      • -
        • More analytical ones: each collective operation has a cost equation (depending for instance on the message size and the number of processes). As discussed for the network modeling, such approaches do not capture possible network contention.
      • -
      • Another approach is to benchmark each collective operation on the target platform, with various parameters and -communicators. Then, the obtained timings are reinjected in the simulation. We cannot do performance extrapolation -with this approach. Also, the benchmarking phase may be very long.
      • -
      • Some replace every collective operation by the corresponding sequence of point-to-point communications (at compile -time). This does not capture the logic of selecting the right algorithm.
      • -
      • Others capture this decomposition into point-to-point communication during the execution, then replay it. But this is -limited to off-line analysis.
      • -
        • Simgrid implements all the collective algorithms and selection logics of both OpenMPI and MPICH. We are sure to capture correctly the behavior of the operations, but this is a significant amount of work. Another interesting feature is that the user can choose the selector or the algorithm from the command line.
      • -
      -
      -
    6. -
    7. Efficient simulation engine
      -
      -
        -
        • Rely on an efficient Discrete Event Simulation (DES) kernel.
      • -
        • Some simulators parallelized this part (using MPI). But this results in a more complex implementation.
      • -
      • In the way Simgrid works, there is not much potential parallelism. They therefore decided to keep a sequential DES.
      • -
        • Simulation cost comes from the application itself (which can be greatly reduced, cf. CPU modeling) and from the flow-level model.
      • -
      -
      -
    8. -
    9. Evaluation
      -
      -

-Here, the authors show that the use cases mentioned at the beginning of the paper are all covered by Simgrid.

      -
        -
        • Simgrid is very scalable, more than xSim, which is already one of the most scalable simulators (self-proclaimed).
      • -
      • Kernel sampling and memory folding enable simulations of non-trivial applications with a very large number of cores.
      • -
        • Then, the ability to make good predictions is demonstrated with a Mont Blanc project example. Here, Simgrid is much closer to reality than the LogGPS model. However, no comparison is done with other simulators, so this result is hard to evaluate.
      • -
      • A quantitative performance extrapolation is demonstrated, showing good results.
      • -
        • Empirically, the largest error made by SMPI in terms of time prediction is 5%. This makes it possible to use SMPI to detect hardware misconfigurations. Indeed, it already happened to the Simgrid team.
      • -
        • Similarly to the previous point, the good accuracy of SMPI makes it possible to investigate which MPI parameters lead to the best performance.
      • -
      • Finally, for obvious reasons, using a simulator is great for teaching MPI (rather than using a real cluster).
      • -
      -
      -
    10. -
    11. Conclusion
      -
      -
        -
      • The paper focused on MPI applications.
      • -
      • But Simgrid has other use cases: formal verification of HPC applications, hybrid applications (MPI+CUDA).
      • -
      -
      -
    12. -
    -
  8. -
  9. DONE Read Predicting the Performance and the Power Consumption of MPI Applications With SimGrid   PAPER
    -
    -

    -Bibtex: heinrich:hal-01446134 -

    -
      -
    • The paper is about using Simgrid to predict energy consumption.
    • -
      • This is a challenging question; the modeling is tricky.
        -
      • Power consumption of nodes has a static part (consumption of the node when idle) and a dynamic part.
      • -
      • The static part is very significant (~50%), so we should really do something when the core is idle.
      • -
      • A first solution is to power off the node, but the latency to power it on is large.
      • -
      • Another solution is to use Dynamic Voltage Frequency Scaling (DVFS). This is not limited to the case where the core -is idle, it can also be used when the load is low but non-null. Performance loss is linear in the decrease of the -frequency, but the power consumption is quadratic.
      • -
        • No HPC simulator other than Simgrid embeds a power model yet.
      • -
    • -
    -
    -
      -
    1. Modeling multi-core architecture
      -
      -
        -
        • If two processes are on the same node (either on the same core, or on two cores of the same CPU), the simulation becomes tricky.
          -
          • The “naive” approach is to simply give a fair share to each of these processes. But it does not take into account some memory effects.
        • -
        • Simgrid can be pessimistic for processes heavily exploiting the L1 cache. In the simulation, the cache will be cold -after each MPI call, in reality the cache would be hot.
        • -
          • Simgrid can be optimistic for processes heavily exploiting the L3 cache and the memory. In the simulation, they will have exclusive access; in reality, they will interfere with each other.
        • -
      • -
      -
      -
    2. -
    3. Modeling energy consumption
      -
      -
        -
      • The instantaneous power consumption is P_{i,f,w}(u) = Pstatic_{i,f} + Pdynamic_{i,f,w} · u, for a machine i, a frequency f, a computational workload w and a usage u (a small numeric example is sketched at the end of this day's notes).
      • -
      • In general, we assume that Pstatic_{i,f} = Pstatic_{i} (idle state, the frequency does not matter).
      • -
      • Users can specify an arbitrary relation (linear in the usage) for each possible frequency (in general, they should be quadratic in the frequency, but this may change with new technologies).
      • -
      • Each machine can have its own model, accounting for heterogeneity in the platform.
      • -
      • Power consumption of each host is exposed to the application, allowing it to dynamically decide to change (or not) the -current frequency.
      • -
      -
      -
    4. -
    5. Modeling multicore computation
      -
      -
        -
      • A first step is to run the target application with a small workload using all the cores of a single node, on the target platform.
      • -
      • Then, re-execute the application with the same workload on top of the simulator (hence using a single core).
      • -
      • From these measures, associate to each code region a speedup factor that should be applied when emulating.
      • -
      • In some applications, speedups are very close to 1. In other applications, some regions have a speedup of 0.16 while other regions have a speedup of 14. Not taking this into account can result in a large inaccuracy (~20-30%).
      • -
      -
      -
    6. -
    7. Modeling the network
      -
      -
        -
      • See the other paper for the details on the network model of SMPI.
      • -
      • The authors also speak about local communications, within a node. They are implemented with shared memory. The model -here is also piecewise linear, but with less variability and higher speed. However, they did not implement this model, -they kept the classical network model since local communications were rare enough.
      • -
      -
      -
    8. -
    9. Validation
      -
      -
        -
      • The authors obtain a very good accuracy for performance estimations (as stated in the previous paper).
      • -
      • For two of the three applications, they also have a very good accuracy for energy consumption estimations.
      • -
      • With the last application, the accuracy is bad. The reason is that the application (HPL) does busy waiting on -communications (with MPIProbe). In the current model, they assume that it does not cost energy.
      • -
      -
      -
    10. -
    11. Experimental environment
      -
      -

      -Minor modifications to the setup can have a major impact on the performances and/or the power consumption. The authors -therefore give a list of settings to track. -

      -
        -
      • Hardware. If we suppose that the cluster is homogeneous, it has to be the case. Two CPUs of the same model can still exhibit different performance (e.g. if they come from two different batches/factories).
      • -
      • Date of the measurements. A lot of things having an impact can change in time: temperature of the machine room, -vibrations, BIOS and firmware version, etc.
      • -
      • Operating system. The whole software stack and how it is compiled can have a huge impact. Also, always observe a delay -between the boot and the beginning of experiments.
      • -
      • Kernel configuration. For instance, its version, the scheduling algorithm, technologies like hyperthreading, etc.
      • -
      • The application itself and the runtime (e.g. the algorithms used for collective operations).
      • -
      -
      -
    12. -
    13. Conclusion / future work
      -
      -
        -
      • The approach to simulating power consumption is accurate only if the application is regular in time. To handle applications with very different computation patterns, we could specify the power consumption for each code region. But to do so, Simgrid has to be modified, and there need to be very precise measurements to instantiate the model (impossible with the hardware of Grid'5000, whose sampling rate is only 1 Hz).
      • -
      • In Simgrid, we can currently not have different network models at the same time, to account for local and remote -communications. A refactoring of the code is underway to fix this.
      • -
      -
      -
    14. -
    -
  10. -
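-A small numeric illustration of the power model above (assumed wattages, purely illustrative): energy is power integrated over time, so the static part makes low-utilization runs expensive.
-
-power <- function(u, p_static = 95, p_dynamic = 95) {
-  stopifnot(u >= 0, u <= 1)
-  p_static + p_dynamic * u   # instantaneous power (watts) at usage u
-}
-100 * power(1)     # 100 s at full load: 19000 J
-200 * power(0.5)   # same dynamic work at half load: 28500 J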
-
-
-

1.1.3 2017-02-08 Wednesday

-
-
-
    -
  1. DONE Paper reading.
    -
    -
      -
    • Notes have been added in the relevant section.
    • -
    • One paper read today: “Simulating MPI applications: the SMPI approach”.
    • -
    -
    -
  2. -
-
-
-

1.1.4 2017-02-09 Thursday

-
-
-
    -
  1. TODO Read Scheduling for Large Scale Distributed Computing Systems: Approaches and Performance Evaluation Issues   PAPER
    -
    -

    -Bibtex: legrand:tel-01247932 -

    -
    -
  2. -
  3. DONE Read An Effective Git And Org-Mode Based Workflow For Reproducible Research   ORGMODE GIT PAPER
    -
    -

    -Bibtex: stanisic:hal-01112795 -

    -
      -
    • -A branching scheme for git, based on four types of branches. -

      -
        -
      • One src branch, where the code to run the experiments is located. This branch is quite light.
      • -
      • One xp branch per experiment, which exists only during the period of the experiment. We can find here all the data specific to this experiment. Also a light branch, since it is limited to one experiment.
      • -
      • One data branch, in which all xp branches are merged when they are terminated. Quite a heavy branch, since it accumulates everything.
      • -
      • One art branch per article, where only the code and data related to the article are pulled from the data branch.
      • -
      -

      When an xp branch is merged in data and deleted, a tag is added. Then, we can easily check out this experiment in the future.

    • -
    • Org-mode used as a laboratory notebook. All details about the experiments (what, why, how…) are written here. Thanks to literate programming, the command lines to execute are also contained in the notebook.
    • -
    -
    -
  4. -
  5. Presentation about org-mode by Christian.   ORGMODE
    -
    -
      -
    • Have a per-day entry in the journal. If you work more than an hour without writing anything in the journal, there is an issue.
    • -
    • Put tags in the headlines, to be able to search them (e.g. :SMPI: or :PRESENTATION:). Search with the “match” keyword. Hierarchy of tags, described in the headline.
    • -
    • About papers, tags READ/UNREAD. Also the bibtex included in the file. Attach files to the org file (different from a simple link). Use C-a, then move.
    • -
    • Spacemacs: add a lot of stuff to evil mode.
    • -
    • Can also use links to entries, using the CUSTOM_ID property.
    • -
    • Can use org mode to put some code.
    • -
    -
    -
  6. -
  7. DONE Paper reading.
    -
    -
      -
    • One paper read today: “Predicting the Performance and the Power Consumption of MPI Applications With SimGrid”.
    • -
    • Notes have been added in the relevant section.
    • -
    -
    -
  8. -
  9. DONE Apply the things learnt at the org-mode presentation.
  10. -
-
-
-

1.1.5 2017-02-10 Friday

-
-
-
    -
  1. Tried to get good org-mode settings.   ORGMODE
    -
    - -
    -
  2. -
  3. DONE Paper reading.
    -
    -
      -
    • One paper read today: “An Effective Git and Org-Mode Based Workflow for Reproducible Research”.
    • -
    • Notes have been added in the relevant section.
    • -
    -
    -
  4. -
  5. Begin looking at the documentation.
    -
    - -
    -
  6. -
  7. Run a matrix product MPI code in a fat tree
    -
    -
      -
    • Code from the parallel system course.
    • -
    • Tried Github example (fat tree 2;4,4;1,2;1,2, 2 levels and 16 nodes).
    • -
    • Tried a personal example (fat tree 3;4,4,4;1,4,2;1,1,1, 3 levels and 64 nodes). A sketch of the node-count arithmetic follows this list.
    • -
    -
    -
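    For the record, the number of terminal nodes of such a description is just the product of the per-level down-port counts, as in this small sketch (reading the string as SimGrid's "levels;down;up;parallel" fat-tree syntax):

      #include <stdio.h>

      /* Terminal nodes of a fat-tree "L;d1,...,dL;u1,...,uL;p1,...,pL":
         the product d1 * ... * dL of the down-port counts. */
      int fat_tree_nodes(int levels, const int down[]) {
          int nodes = 1;
          for (int i = 0; i < levels; i++)
              nodes *= down[i];
          return nodes;
      }

      int main(void) {
          int github[]   = {4, 4};     /* 2;4,4;1,2;1,2       -> 16 nodes */
          int personal[] = {4, 4, 4};  /* 3;4,4,4;1,4,2;1,1,1 -> 64 nodes */
          printf("%d %d\n", fat_tree_nodes(2, github), fat_tree_nodes(3, personal));
          return 0;
      }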
  8. -
  9. DONE Find something to automatically draw a fat tree.
    -
    -
      -
    • Maybe such tools already exist? Did not find any, however.
    • -
    • Maybe Simgrid has a way to export a topology in a graphical way? Would be very nice.
    • -
    • Could adapt the TikZ code I wrote during my 2015 internship?
    • -
    -
    -
  10. -
-
-
-

1.1.6 2017-02-13 Monday

-
-
-
    -
  1. Keep working on the matrix product.   SMPI C BUG
    -
    -
      -
    • -Observe strange behavior. -

      -
        -
      • Commit: 719a0fd1775340628ef8f1ec0e7aa4033470356b
      • -
      • Compilation: smpicc -O4 matmul.c -o matmul
      • -
      • Execution: smpirun --cfg=smpi/bcast:mpich --cfg=smpi/running-power:6217956542.969 -np 64 -hostfile ./hostfile_64.txt -platform ./cluster_fat_tree_64.xml ./matmul 2000
      • -
      -

      Then, processes 0 and 63 behave very differently from the others.

      -
        -
      • Processes 0 and 63 have a communication time of about 0.21 and a computation time of about 1.52.
      • -
      • Other processes have a communication time of about 0.85 and a computation time of about 0.75.
      • -
      -

      With other topologies and/or matrix sizes, we still observe this behavior (more or less pronounced).

    • -
    • If we change the order of the loops of the sequential matrix product from i-j-k to k-i-j (see the sketch at the end of this entry):
        -
      • The execution time is shorter. Hypothesis: this ordering makes better use of the cache.
      • -
      • The computation times are decreased (expected), but the communication times are also decreased (unexpected).
      • -
      • Still observe the same trend as above for processes 0 and 63.
      • -
    • -
    • Checked with some printf: all processes are the root of a row broadcast and of a column broadcast exactly once (expected).
    • -
    • Tried several broadcast algorithms (default, mpich, ompi), still have the same behavior.
    • -
    • Adding a call to MPI_Barrier at the beginning of the for loop fixes the issue for the communication (all processes now have a communication time of about 0.22) but not for the computation (still the same differences for processes 0 and 63).
    • -
    • When using a smaller number of processes (16 or 4), communication times and computation times are more consistent (with still some variability).
    • -
    • With one process and a matrix size of 250, we have a computation time of 0.10 to 0.12. When we have 64 processes and a matrix size of 2000, each block has a size of 250. Thus, we can extrapolate that the “normal” computation time in this case should be about 0.8 (8 iterations, so 8*0.10). Thus, processes 0 and 63 have an abnormal behavior; the others are ok.
    • -
    • Also tried other topologies, e.g. a simple cluster. Still have the same behavior (with different times). -
        -
      • Again, normal behavior with less processes (e.g. 16).
      • -
      • We get a normal behavior if we take hostfile1600.txt, very strange.
      • -
    • -
    • Bug fixed: the problem came from the hostfile. For some unknown reason, it was missing an end-of-line character on the last line. I suspect that two processes (0 and 63) were therefore mapped to the same host, because the last host was not parsed correctly by smpi. The two versions of the file have been added to the repository.
    • -
    • Issue reported on Github.
    • -
    -
    -
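    A minimal illustration of the two loop orders (my own variable names, assuming row-major n×n arrays; the journal's actual code is in matmul.c):

      /* i-j-k: the inner loop reads B[k][j] with varying k, i.e. with a
         stride of n doubles, which is cache-unfriendly. */
      for (int i = 0; i < n; i++)
          for (int j = 0; j < n; j++)
              for (int k = 0; k < n; k++)
                  C[i][j] += A[i][k] * B[k][j];

      /* k-i-j: for fixed k and i, both C[i][j] and B[k][j] are read with
         stride 1, so consecutive accesses hit the same cache lines. */
      for (int k = 0; k < n; k++)
          for (int i = 0; i < n; i++)
              for (int j = 0; j < n; j++)
                  C[i][j] += A[i][k] * B[k][j];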
  2. -
  3. Try to optimize the matrix product code.   SMPI C
    -
    -
      -
    • For the record, the following command yields communication times between 0.27 and 0.31 and computation times between 0.78 and 0.83, for a total time of about 1.14: smpirun --cfg=smpi/bcast:mpich --cfg=smpi/running-power:6217956542.969 -np 64 -hostfile ./hostfile_64.txt -platform ./cluster_fat_tree_64.xml ./matmul 2000
    • -
    • Replaced malloc/free by SMPI_SHARED_MALLOC/SMPI_SHARED_FREE. Got similar times (approximately).
    • -
    • Added SMPI_SAMPLE_GLOBAL(0.5*size, 0.01) to the outer loop of the sequential matrix product. Got similar times (approximately). A sketch of both optimizations follows this list.
    • -
    • Remark: we should verify more rigorously that these optimizations do not change the estimated time.
    • -
    • Greatly reduced simulation time (from 8.2s to 0.5s).
    • -
    • Other optimization: stop initializing the content of the matrices (since we do not care about their content).
    • -
    -
    -
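    A sketch of the two SMPI optimizations mentioned above (variable names are mine; I assume the usual block-wrapping usage of the macros, to be checked against the SMPI documentation):

      /* The buffer returned by SMPI_SHARED_MALLOC is shared between all
         simulated processes, so 64 ranks do not each allocate a real block. */
      double *a = SMPI_SHARED_MALLOC(block_size * block_size * sizeof(double));

      /* Sample the outer loop of the sequential product: once enough
         iterations have been benchmarked (the two parameters bound the
         number of samples and the target precision, as I understand them),
         the remaining iterations are skipped and their time is injected. */
      for (int k = 0; k < size; k++)
          SMPI_SAMPLE_GLOBAL(0.5 * size, 0.01) {
              /* body of the k-th step of the sequential matrix product */
          }

      SMPI_SHARED_FREE(a);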
  4. -
  5. Meeting with Arnaud.   MEETING
    -
    -
      -
    • There exist some visualization tools for Simgrid, to see the bandwidth that goes through the links. May be very useful in the future, to get a better understanding of what is going on.
    • -
    • The characteristics of the jobs (number of nodes, patterns of communication) have an important impact on performance. However, it is difficult for us to have access to this; we do not own a supercomputer… Maybe Matthieu can have more information (e.g. from Bull's clients)?
    • -
    -
    -
  6. -
  7. DONE Add supervisors on Github for the journal.
  8. -
  9. Some quick performance tests.   SMPI EXPERIMENTS
    -
    -
      -
    • Run my matrix product code, with SMPI optimizations.
    • -
    • Use a 2-level fat-tree made with switches of 48 ports.
    • -
    • First case: non-tapered. We use all the switches. The fat-tree is 2;24,48;1,24;1,1 (total of 1152 nodes). -
        -
      • Use 1089 processes, matrix size of 4950.
      • -
      • Time: 1.75s.
      • -
      • Communication time: 0.94s.
      • -
      • Computation time: 0.81s.
      • -
    • -
    • Second case: tapered. We remove half of the root switches. The fat-tree is 2;24,48;1,12;1,1 (still 1152 nodes). -
        -
      • Still uses 1089 processes, matrix size of 4950.
      • -
      • Time: 1.78s.
      • -
      • Communication time: 0.94s.
      • -
      • Computation time: 0.82s.
      • -
    • -
    • The observed difference does not seem significant, but we should check with a carefully designed experiment and analysis.
    • -
    • For the record, running the same application on the same topology with only one process takes 3607s. Thus, we have a speedup of about 2026, so an efficiency of 1.86. This is a very nice (superlinear) speedup, certainly due to cache effects.
    • -
    • These quick tests suggest that we could remove root switches without impacting performance, even if we use nearly the whole fat-tree (this is obvious if we use a small subtree).
    • -
    -
    -
  10. -
  11. DONE Run another benchmark (e.g. HPL), with more carefully designed experiments.
  12. -
  13. DONE The 3-level fat-tree took very long to load (aborted). Find out why.
  14. -
-
-
-

1.1.7 2017-02-14 Tuesday

-
-
-
    -
  1. Work on experiment automation.   PYTHON
    -
    -
      -
    • Add Python functions to generate topology and host files from a given fat-tree description.
    • -
    • Adapt Python script and Jupyter notebook from parallel system course to run experiments.
    • -
    • The matrix size and the number of processes are fixed. We compute matrix products for various numbers of root switches (we test fat-trees 2;24,48;1,n;1,1 for n in [1, 24]).
    • -
    • Results seem very promising. For a matrix size of 6600, we can have as few as 10 root switches without an important impact on performance (recall that a typical 2-level fat-tree with 48-port switches would have 24 root switches). If we keep removing switches, then performance is quickly impacted.
    • -
    • Repeated the experiment with the same topology and the same matrix size, but with only 576 processes. We observe the same trend: we can remove a lot of root switches without having an impact.
    • -
    -
    -
  2. -
  3. DONE Ask if it would be possible to have SSH access to some dedicated computer.
    -
    -
      -
    • Does not need to have a lot of cores (Simgrid is not a parallel program), but it would be nice if it had a fast core.
    • -
    • Needs to be dedicated, so as not to perturb the experiments.
    • -
    -
    -
  4. -
  5. Webinar on reproducible research: Publication modes favoring reproducible research   MEETING
    -
    -
      -
    • Speakers: Konrad Hinsen and Nicolas Rougier.
    • -
    • Two parts in research: dissemination (of the results/ideas) and evaluation (of the researcher).
    • -
    • If we want reproducible research to become a norm, researchers should be rewarded for it (their reputation should also depend on the reproducibility of their research, not only on the number of citations or the impact factor).
    • -
    • The speaker compares reproducible research from two points of view, the human part and the computer part, both for dissemination and evaluation.
    • -
    -
    -
      -
    1. ActivePapers
      -
      -
        -
      • Not a tool that one should use (yet), nor a proposal for a new standard. It is mainly an idea for computer-aided research.
      • -
      • How to have more trust in the software? The “ideal” solution is reimplementation (e.g. ReScience). The speaker tried this on a dozen projects; he never got identical results. Other good ideas: good practices like version control and testing, and keeping track of the software stack (hardware, OS, tools, etc).
      • -
      • ActivePapers group scripts, software dependencies and data into a same archive.
      • -
      -
      -
    2. -
    3. ReScience
      -
      -
        -
      • Idea: replicate science.
      • -
      • For a great majority of papers, we can neither replicate them nor reuse their code.
      • -
      • It is hard to publish a replication of an original paper; most journals will reject it as not original.
      • -
      • This is why ReScience was born. It is (currently) used on Github.
      • -
      • To publish a new study, do a pull request on the ReScience repository. It is then reviewed openly by reviewers selected by the editor. The replication is improved until it is publishable.
      • -
      -
      -
    4. -
    -
  6. -
-
-
-

1.1.8 2017-02-15 Wednesday

-
-
-
    -
  1. Use Christian’s config files for org mode   ORGMODE
  2. -
  3. Work on the experiment script
    -
    -
      -
    • Parsing more generic fat-tree descriptions. For instance, our current topology description would be 2;24,48;1,1:24;1,1. It means that the L1 switches can have between 1 and 24 up ports.
    • -
    • Modify the script for experiments to be more generic. -
        -
      • Can give as command line arguments the fat-tree description, the (unique) matrix size, the (unique) number of processes.
      • -
      • Use Python’s argparse for a cleaner interface.
      • -
    • -
    -
    -
  4. -
  5. Re-run experiments with this new script
    -
    -
      -
    • Still observe the same trend: we can afford to remove a lot of up-ports for the L1 switches.
    • -
    • Some points seem to be outliers. But we do not have a lot of points, so it is difficult to say. We should do more experiments to see if these points remain significantly separated from the rest.
    • -
    -
    -
  6. -
-
-
-

1.1.9 2017-02-16 Thursday

-
-
-
    -
  1. DONE Enhance/fix Emacs configuration   ORGMODE
    -
    -
      -
    • Translate days and months in English.
    • -
    • Increase the line length limit (120 columns?).
    • -
    • Reformat the whole document with such limit.
    • -
    • Add tags where relevant.
    • -
    • Attach files, instead of putting a link.
    • -
    -
    -
  2. -
  3. Try to use even more SMPI optimizations   SMPI
    -
    -
      -
    • Currently, we use the macro SMPI_SAMPLE_GLOBAL only once: for the outer for loop of the sequential matrix product.
    • -
    • Maybe we can also use it for the two other loops? We could also reduce the number of iterations (currently, it is 0.5*size). Let’s try.
    • -
    • Currently, we get communication times of about 0.14s and computation times of about 0.42s, for a total time of 0.57s, with the following command: smpirun --cfg=smpi/bcast:mpich --cfg=smpi/running-power:6217956542.969 -np 64 -hostfile ./hostfile1152.txt -platform ./bigtaperedfattree.xml ./matmul 1600
    • -
    • FAIL. It seems we cannot use nested sample blocks. Quite strange, I do not understand why…
    • -
    -
    -
  4. -
  5. Try to run HPL with Simgrid   SMPI HPL
    -
    -
      -
    • Copied from Christian’s repository.
    • -
    • Compilation fails, don’t know why. But binaries are stored in the git repository (don’t know why either), so I can use them to do some first tests. In fact, the file Make.SMPI needed to be modified: changed mkdir to mkdir -p, ln to ln -f and cp to cp -f, and changed the top directory. Also, the Makefile couldn’t find the shared library atlas. It was in /usr/lib, but named libatlas.so.3. Added a symbolic link to libatlas.so.
    • -
    • Tested my laptop (with MPI, not SMPI). With a problem size of 10000 and 12 processes, it corresponds to 16.51 Gflops.
    • -
    • Tested with SMPI, with a problem size of 10000 and 4 processes. Command: smpirun --cfg=smpi/bcast:mpich --cfg=smpi/running-power:6217956542.969 -platform ../../../small_tests/cluster_fat_tree_64.xml -hostfile ../../../small_tests/hostfile_64.txt -np 4 ./xhpl Result: 1.849 Gflops.
    • -
    • Same thing, with 12 processes. Very similar: 1.847Gflops. Why is it not faster?
    • -
    • Same thing, with 64 processes. Very similar: 1.858Gflops. Why is it not faster?
    • -
    • Retried with a freshly compiled program. Still the same thing.
    • -
    • Understood the issue: it is not enough to specify the number of processes with -np 12; we also have to set the process grid in the file HPL.dat.
    • -
    • Tried with -np 4, P=4 and Q=1. Now, 6.6224Gflops. We have a speedup of 3.59, which seems reasonable.
    • -
    • The number of processes given with -np must be greater than or equal to P × Q.
    • -
    • Tried with -np 4, P=1 and Q=4. Did not have a noticeable impact on performances (in comparison with P=4, Q=1).
    • -
    • Tried with -np 4, P=2 and Q=2. Did not have a noticeable impact on performances (in comparison with P=4, Q=1).
    • -
    • Tried with -np 64, P=8 and Q=8. Now, 22.46Gflops. Speedup of 12, very disappointing.
    • -
    • Tried with -np 64, P=8 and Q=8 again, but with a problem size of 20000 (it was 10000). Now 52.2Gflops (speedup of 28.3).
    • -
    -
    -
  6. -
  7. Comparison with top 500
    -
    -
      -
    • For the record, the order of magnitude for an Intel desktop CPU of today is between 10 and 100 Gflops, according to this website, this website and this website. My laptop supposedly has a speed of 3.84 Gflops per core and 15.21 Gflops in total, according to the last two websites.
    • -
    • According to Wikipedia, the first-generation Raspberry Pi has a speed of 0.041 Gflops; a 64-node cluster made of those has a speed of 1.14 Gflops.
    • -
    • The first supercomputer of the top 500 has a speed of about 93 Pflops, or 93,000,000 Gflops.
    • -
    • The last one has a speed of about 349Tflops, or 349,000Gflops.
    • -
    • In June 2005, the first one had a speed of about 136Tflops, the last one 1.2Tflops.
    • -
    • In our settings with 64 nodes, each node has one core that computes at 1 Gflops; thus, our Rpeak is 64 Gflops. We have an efficiency of 52.2/64 = 0.81. This is not bad, compared to the first three supercomputers of the top 500 (respectively at 0.74, 0.61 and 0.63). But we should maybe not compare the efficiency of a 64-node cluster with these supercomputers, since it becomes harder to be efficient with a large topology.
    • -
    -
    -
  8. -
  9. DONE SMPI optimization of HPL   SMPI HPL
    -
    -
      -
    • It seems that no SMPI optimization is done in the code obtained from Christian’s repository. Maybe we could speed things up?
    • -
    • Need to check what the algorithm behind HPL is, whether it is regular (to use SMPI_SAMPLE) and data-independent (to use SMPI_SHARED).
    • -
    -
    -
  10. -
  11. DONE Adapt the experiment script to run HPL
    -
    -
      -
    • Parse the output (quite ugly to parse, but easy, use methods str.split and list.index).
    • -
    • Run the same kind of experiments as for the matrix product. It will be much longer if we cannot use SMPI optimizations.
    • -
    -
    -
  12. -
-
-
-

1.1.10 2017-02-17 Friday

-
-
-
    -
  1. Refactor the experiment script   PYTHON
    -
    -
      -
    • Aim: reuse for HPL the code already done for the matrix product.
    • -
    • Now, we have a class AbstractRunner, which runs the common logic (e.g. some basic checks on the parameters, or running the desired number of experiments).
    • -
    • We also have classes MatrixProduct and HPL, containing the pieces of code specific to the matrix product or to HPL (e.g. running one experiment).
    • -
    -
    -
  2. -
  3. Some strange things with HPL   SMPI BUG HPL
    -
    -
      -
      • The output has the following format:

      -
      ================================================================================
      T/V                N    NB     P     Q               Time                 Gflops
      --------------------------------------------------------------------------------
      WR00L2L2        2000   120     1     1               3.17              1.683e+00
      -
    • -
    • Sometimes, the last line is missing, so we do not have any information on time and flops.
    • -
    • Quite often it is present, but with wrong values: the time is 0.00 and the Gflops are absurdly high (e.g. 2.302e+03 Gflops for a cluster made of 96 machines of 1 Gflops). It may come from an erroneous measurement of the time.
    • -
    • For instance, with the script of commit dbdfeabbef3f90a3d4e2ecfbe5e8f505738cac23, the following command line: ./run_measures.py --global_csv /tmp/bla --nb_runs 10 --size 5000 --nb_proc 64 --fat_tree "2;24,48;1,24;1,1" --experiment HPL

      -
        -
      • It may get this output in one experiment:
      • -
      -
      ================================================================================
      T/V                N    NB     P     Q               Time                 Gflops
      --------------------------------------------------------------------------------
      WR00L2L2        5000   120     8     8               0.00              1.108e+05
      -
      -
        -
      • And this output in another one:
      • -
      -
      ================================================================================
      T/V                N    NB     P     Q               Time                 Gflops
      --------------------------------------------------------------------------------
      WR00L2L2        5000   120     8     8               5.35              1.560e+01
      -
      -

      Note that, between the two experiments, nothing has changed. The file HPL.dat is the same, the number of processes given to the option -np is the same, and the topology file and the host file are the same.

    • -
    -
    -
  4. -
-
-
-

1.1.11 2017-02-20 Monday

-
-
-
    -
  1. Keep investigating the HPL anomaly
  2. -
  3. Found the issue with HPL   SMPI BUG HPL
    -
    -
      -
    • Debugging with Christian, to understand what was going on.
    • -
    • This was a concurrency issue. The private variables of the processes were in fact not private. This caused two processes to write to the same variable, which led to an inconsistent value when measuring time.
    • -
    • The function is HPL_ptimer, in file testing/ptest/HPL_pdtest.c.
    • -
    • When using simgrid, need to use option --cfg=smpi/privatize-global-variables:yes to fix this.
    • -
    • Used a tool to search for a word, looks nice: cg and vg (package cgvg).
    • -
    • Another nice thing: ctags (command ctags --fields=+l -R -f ./ctags src testing).
    • -
    -
    -
  4. -
-
-
-

1.1.12 2017-02-21 Tuesday

-
-
-
    -
  1. Test the experiment script for HPL   EXPERIMENTS HPL
    -
    -
      -
    • It seems to work well, the bug is fixed.
    • -
    • Scalability issue. Testing for a size of 20k already takes a lot of time, and it is still too small to have a good efficiency with 1000 processes (performance is worse than with 100 processes).
    • -
    • Definitely need to use SMPI optimizations if we want to do anything with HPL.
    • -
    -
    -
  2. -
  3. Re-do experiments with matrix product
    -
    -
      -
    • Stuck with HPL…
    • -
    • We also output the speed of the computation, in Gflops (this is redundant with the time, but we can use it for comparison with other algorithms like HPL).
    • -
    • The plot looks nice, but nothing new.
    • -
    -
    -
  4. -
  5. Work on the drawing of fat-trees
    -
    -
      -
    • Generate all nodes and edges of a fat-tree.
    • -
    • No drawing yet.
    • -
    • Will try to output Tikz code.
    • -
    -
    -
  6. -
  7. DONE Look at where to put SMPI macros in HPL, with Christian
    -
    -
      -
    • Have a look at a trace, to see where most of the time is spent.
    • -
    -
    -
  8. -
  9. Keep working on the drawing of fat-trees.
    -
    -
      -
    • Now produce working Tikz code.
    • -
    • Figure quickly becomes unreadable for large fat-trees (not surprising).
    • -
    -
    -
  10. -
-
-
-

1.1.13 2017-02-22 Wednesday

-
-
-
    -
  1. Terminate the work on fat-tree drawing   PYTHON
    -
    -
      -
    • We can now do ./draw_topo.py bla.pdf "2;8,16;1,1:8;1,1" "2;4,8;1,1:4;1,1" to draw all the fat-trees in the file bla.pdf. Very useful to visualize the differences between the trees.
    • -
    • No limit on the fat-tree size; they should all fit on the pdf (a very large page is generated, then cropped to the right dimensions). However, a large fat-tree may not be very readable.
    • -
    -
    -
  2. -
  3. Tried to move the SMPI_SAMPLE of the matrix product
    -
    -
      -
    • Cannot use one SMPI_SAMPLE per loop (don’t know why, but it seems to be forbidden).
    • -
    • It was used for the outer loop. Tried the inner loops, but performance was greatly degraded (about ×50 in simulation time).
    • -
    • Reverting the change.
    • -
    -
    -
  4. -
  5. DONE Cannot use more than 1024 processes with Simgrid (need to fix)   SMPI BUG
    -
    -
      -
    • The open() system call fails with EMFILE error code.
    • -
    • It used to work, don’t understand what changed in the meantime.
    • -
    -
    -
  6. -
  7. Talk with Christian about SMPI optimizations in HPL   PERFORMANCE HPL
    -
    -
      -
    • He gave me a trace of HPL execution obtained with Simgrid.
    • -
    • The parts taking most of the time are the following:

      -
      50        /home/cheinrich/src/hpl-2.2/src/pgesv/hpl_rollt.c       242          /home/cheinrich/src/hpl-2.2/src/comm/hpl_recv.c     136 190.785263    498
      51        /home/cheinrich/src/hpl-2.2/src/pgesv/hpl_rollt.c       242          /home/cheinrich/src/hpl-2.2/src/comm/hpl_sdrv.c     180 372.272945    996
      52        /home/cheinrich/src/hpl-2.2/src/pgesv/hpl_rollt.c       242          /home/cheinrich/src/hpl-2.2/src/comm/hpl_send.c     133 179.711679    498
      -
    • -
    -
    -
  8. -
  9. Let’s track these pieces of code   PERFORMANCE HPL
    -
    -
      -
    • HPL_rollT.c has only one function: HPL_rollT.
    • -
    • This function is called only once: at the end of function HPL_pdlaswp01T (eponymous file).
    • -
    • This function is called once in function HPL_pdupdateNT and once in function HPL_pdupdateTT (eponymous files). There are very few differences between these two functions (4 relevant line changes, which are small variations in the arguments of a function, HPL_dtrsm). These files have 443 lines: this is a huge copy-paste, very dirty.
    • -
    • A candidate for the long function we are looking for is HPL_dlaswp10N (found by Christian). It has two nested loops. This function is also a good candidate for the most terrible piece of code ever written.
    • -
    • Added an SMPI_SAMPLE_GLOBAL after the outer loop; it did not reduce the simulation time. Also tried to remove the whole code of the function; it did not reduce the simulation time either. So we can say this function is not our big consumer.
    • -
    • Functions HPL_recv and HPL_sdrv are both called only in HPL_pdmxswp and HPL_pdlaswp00N.
    • -
    • Function HPL_pdlaswp00N is used only in HPL_pdupdateTN and HPL_pdupdateNN, which are nearly identical. These two functions are then used in the testing folder, with something like algo.upfun = HPL_pdupdateNN. Might be hard to track…
    • -
    • Function HPL_pdmxswp is used in HPL_pdpancrT, HPL_pdpanllT, HPL_pdpanllN, HPL_pdpanrlT, HPL_pdpanrlN and HPL_pdpancrN. These functions are used in the testing folder, with something like algo.pffun = HPL_pdpancrN.
    • -
    • Trying to put some printf. We use the command:

      -
      -
      smpirun --cfg=smpi/bcast:mpich --cfg=smpi/running-power:6217956542.969 --cfg=smpi/display-timing:yes
      --cfg=smpi/privatize-global-variables:yes -np 16 -hostfile ../../../small_tests/hostfile_64.txt -platform
      ../../../small_tests/cluster_fat_tree_64.xml ./xhpl
      -
      -
      -
        -
      • Function HPL_pdupdateNN is never used.
      • -
      • Function HPL_pdupdateTN is never used.
      • -
      • Thus, function HPL_pdlaswp00N is also never used (verified with a printf in this function).
      • -
      • Function HPL_pdmxswp is used and takes a significant (albeit not huge) amount of time: about 2 seconds when the total time is 41 seconds (virtual time).
      • -
    • -
    -
    -
  10. -
-
-
-

1.1.14 2017-02-23 Thursday

-
-
-
    -
  1. Try to increase the file limit   SMPI BUG
    -
    -
      -
    • First try, following this question and this question from Stackoverflow. -
        -
      • Added the following to /etc/security/limits.conf:

        -
        *     soft    nofile          40000
        *     hard    nofile          40000
        -
      • -
      • Added the following to /etc/pam.d/common-session:

        -
        session required pam_limits.so
        -
      • -
      • Rebooting.
      • -
    • -
    • Success, ulimit -Sn shows 40000 and we can now run experiments with more than 1024 processes.
    • -
    -
    -
  2. -
  3. Keep tracking the time-consuming pieces of code in HPL   PERFORMANCE HPL
    -
    -
      -
    • Function HPL_pdmxswp is used in some functions which are chosen with algo.pffun (see above).
    • -
    • They are then used (through a call to algo.pffun) in functions HPL_pdrpancrN, HPL_pdrpanrlN, HPL_pdrpanllN, HPL_pdrpanrlT, HPL_pdrpancrT and HPL_pdrpanllT.
    • -
    • Again, these functions are not used directly in src, there is something like algo.rffun = HPL_pdrpancrT in the testing folder.
    • -
    • This rffun is used only once, in HPL_pdfact.
    • -
    • Function HPL_pdfact takes between 2.5 and 2.8 seconds when the total time is 41 seconds (virtual time). This time includes the time spent in HPL_pdmxswp.
    • -
    • Function HPL_pdfact is used in functions HPL_pdgesvK1, HPL_pdgesvK2 and HPL_pdgesv0. These functions are then called in HPL_pdgesv.
    • -
    • Function HPL_pdgesv takes a time of about 3 seconds when the total time is 41 seconds (virtual time).
    • -
    • Strange thing: deleting the content of this function gives a very short run time. Maybe the way I measured time (using MPI_Wtime) is not consistent with the way HPL measures time.
    • -
    • Identified the long loop in HPL_pdgesv0. But cannot put a SMPI_SAMPLE here, there are calls to MPI primitives in the block.
    • -
    • Found the right function to measure time: use HPL_timer_walltime, not MPI_Wtime.
    • -
    • Instrumented the code of HPL_pdgesv0 to get an idea of what takes time. Measures are taken with HPL_timer_walltime. What takes time is the part “factor and broadcast current panel” in the loop. Within this part, the calls to HPL_bcast and HPL_pdupdate are what take most of the (virtual) time. In an execution of 40.96 seconds:

      -
      pdfact = 2.907908, binit = 0.002633, bcast = 11.013843, bwait = 0.000669, pdupdate = 26.709408
      -
      -

      Obviously there is nothing to do for the broadcast, but there may be hope for pdupdate.

    • -
    • Several versions of this function exist:

      -
        -
      • HPL_pdupdateTN
      • -
      • HPL_pdupdateNT
      • -
      • HPL_pdupdateTT
      • -
      • HPL_pdupdateNN
      • -
      -

      Only HPL_pdupdateTT seems to be used (with our settings). Removed the body of function HPL_pdupdateTT; the simulation time becomes about 8 seconds (was 69 seconds).

    • -
    • Might be tricky to optimize with SMPI macros, this function mixes computations and communications.
    • -
    • Tried to insert a return at line 208 (before the comment “The panel has been forwarded at that point, finish the update”). The time is not impacted and the correctness tests pass, so the part of the code after this point seems useless here. Verified by inserting a printf: this part is never executed.
    • -
    • Line 143 is executed (just after comment “1 x Q case”).
    • -
    • Adding a return statement at line 136 (just before the comment “Enable/disable the column panel probing mechanism”) gives a simulation time of 8 seconds. Same thing at line 140, after the broadcast.
    • -
    • The if block of lines 143-258 is never executed in our settings. This explains why acting on line 208 did not have any effect.
    • -
    • Adding a return statement at line 358 (just before the comment “The panel has been forwarded at that point, finish the update”) gives a simulation time of 9.7 seconds.
    • -
    • The if block of lines 360-414 seems to always be executed. The if block of lines 366-390 is executed sometimes, but not always. In this block, we execute the #else part of the #ifdef.
    • -
    • In this block, removing the call to HPL_dgemm greatly reduces the simulation time (from 68s to 13s).
    • -
    • Several definitions exist for HPL_dgemm: there is an implementation in src/blas/HPL_dgemm.c, but also a #define HPL_dgemm cblas_dgemm in include/hpl_blas.h.
    • -
    • Can disable this #define by removing the line HPL_OPTS = -DHPL_CALL_CBLAS in the file Make.SMPI. Then, HPL_dgemm is executed, but not the others (HPL_dgemm0, HPL_dgemmTT, HPL_dgemmTN, HPL_dgemmNT, HPL_dgemmNN). It seems that HPL_dgemm can call HPL_dgemm0, which can itself call the four others, but this only happens when HPL_CALL_VSIPL is defined.
    • -
    • In fact, there is maybe no need to insert the SMPI_SAMPLE macro in the dgemm function. We can put it inside HPL_pdupdateTT, for instance at line 360, just above the big if block. However, this performs really badly. With SMPI_SAMPLE_GLOBAL(10, 0.1), the real time becomes about 10 seconds (speedup of ×4) but the virtual time becomes about 90 seconds (×2 error). If we increase one of the two numbers, the real times quickly become as large as they were before. Same thing with SMPI_SAMPLE_LOCAL. Maybe this code is too irregular? Or we should “zoom in” and insert the SMPI optimizations in dgemm (which is in an external library, so not that easy).
    • -
    -
    -
  4. -
-
-
-

1.1.15 2017-02-27 Monday

-
-
-
    -
  1. Try running matrix product experiment with big fat-trees   SMPI BUG
    -
    -
      -
    • Run a medium number of processes on a big fat-tree.

      -
      -
      ./run_measures.py --global_csv big_global.csv --local_csv big_local.csv --nb_runs 3 --size 9300 --nb_proc 961
      --fat_tree "3;24,24,48;1,24,1:24;1,1,1" --experiment matrix_product

      Seems to work properly: one CPU core is quickly loaded at 100% and one experiment takes approximately two minutes.

    • Try a larger number of processes with the same topology and the same matrix size.

      ./run_measures.py --global_csv big_global.csv --local_csv big_local.csv --nb_runs 3 --size 9300 --nb_proc 8649
      --fat_tree "3;24,24,48;1,24,1:24;1,1,1" --experiment matrix_product

      The CPU is loaded at about 3% for quite a long time with the script smpirun. It finally launches matmul and becomes loaded at 100%. Then it quickly terminates with a non-null exit code: Could not map fd 8652 with size 80000: Cannot allocate memory. The memory consumption was only 3% of the total memory, which is strange. This happens in function shm_map, called by SMPI_SHARED_MALLOC.

    • -
    • Retrying the same command, with malloc instead of SMPI_SHARED_MALLOC and free instead of SMPI_SHARED_FREE. As expected, larger memory consumption (10.9% of total memory). There is no error this time. The first experiment terminates in about 20 minutes. For the record, it achieved 1525 Gflops, with communication and computation times of approximately 0.48 seconds.
    • -
    • Reverted the changes to get the SMPI_SHARED macros back. Retried smpirun with the same settings, except that the option --cfg=smpi/privatize-global-variables:yes is not passed here. No error this time either; it ran for 13 minutes. Also a large memory consumption (13.5%); maybe the 3% we observed was not the final memory consumption, since the process exited with an error?
    • -
    • Remark: for the matrix product, there is no global variable, so maybe we can safely remove this option in this case? This does not solve the problem, since we need it for HPL.
    • -
    • Try the initial command with a smaller matrix size (size=93, i.e. all processes have a sub-matrix of size 1×1). Observed the same error.
    • -
    • Also try to reproduce this with HPL, with this command:

      -
      -
      ./run_measures.py --global_csv big_global.csv --nb_runs 3 --size 5000 --nb_proc 8649 --fat_tree
      "3;24,24,48;1,24,1:24;1,1,1" --experiment HPL
      -
      -
      -

      No error at all, although we have a memory consumption of 71.2%.

    • -
    • Try the initial command, still with a size of 93, but commenting out the call to matrix_product in matmul.c. Thus, there is no allocation of temporary buffers, only the initial matrices (3 allocations instead of 5). No error.
    • -
    • Same thing, with the call to matrix_product uncommented, but a return statement placed just after the temporary buffer allocations. We get the mmap error.
    • -
    • Create a MWE from this, called mmap_error.c.
    • -
    -
    -
  2. -
  3. Work on a MWE for the mmap error   SMPI BUG
    -
    -
      -
    • File mmap_error.c is a MWE for the mmap error. It consists of 5 calls to SMPI_SHARED_MALLOC with a size of 1; we launch it with 8652 processes. We also get an error if we do 100k calls to SMPI_SHARED_MALLOC with only one process. The total number of calls to this macro seems to be the issue. We get the error with or without the option smpi/privatize-global-variables:yes.
    • -
    • The following file mmap_error.c:

      -
      -
      #include <stdio.h>
      #include <mpi.h>

      #define N 65471

      int main(int argc, char *argv[]) {

          MPI_Init(&argc, &argv);

          /* Each call creates one shared mapping; the pointers are leaked
             on purpose, since only the number of mappings matters here. */
          for(int i = 0; i < N; i++) {
              float *a = SMPI_SHARED_MALLOC(1);
          }

          MPI_Barrier(MPI_COMM_WORLD);
          printf("Success\n");
          MPI_Finalize();
          return 0;
      }
      -
      -
      -

      With the following command (commit 8eb0cf0b6993e174df58607e9492a134b85a4669 of Simgrid):

      -
      -
      smpicc -O4 mmap_error.c -o mmap_error
      smpirun -np 1 -hostfile hostfile_64.txt -platform cluster_fat_tree_64.xml ./mmap_error
      -
      -
      -

      This yields an error. Note that the host and topology files are irrelevant here.

      -
        -
      • For N<65471, we have no error (Success is printed).
      • -
      • For N>65471, we have the error Could not map fd 3 with size 1: Cannot allocate memory.
      • -
      • For N=65471, we have the error Memory callocation of 524288 bytes failed.
      • -
    • -
    • Retried with the latest version of Simgrid (commit c8db21208f3436c35d3fdf5a875a0059719bff43). Now we have the message:

      -
      Could not map folded virtual memory (Cannot allocate memory). Do you perhaps need to increase
      the STARPU_MALLOC_SIMULATION_FOLD environment variable or the sysctl vm.max_map_count?
      -
      -

      Found the issue (the numbers are consistent: 65471 shared mappings plus the few dozen mappings the process already has reach this limit):

      -
      -
      $ sysctl vm.max_map_count
      vm.max_map_count = 65530
      -
      -
      -

      To modify the value of a sysctl variable, follow this link. Temporary fix:

      -
      -
      sudo sysctl -w vm.max_map_count=100000
      -
      -
    • -
    -
    -
  4. -
  5. Run the matrix product experiment with 8649 processes
    -
    -
      -
    • Using the command:

      -
      -
      ./run_measures.py --global_csv big_global.csv --local_csv big_local.csv --nb_runs 3 --size 9300 --nb_proc 8649
      --fat_tree "3;24,24,48;1,24,1:24;1,1,1" --experiment matrix_product
      -
      -
    • -
    • The experiments are very long, about 30 minutes each. The code is already heavily optimized (SMPI macros, no initialization of the matrices) and a large part of this time is spent outside of the application, so there is not much hope of running it faster without modifying Simgrid.
    • -
    • This shows that we really need to optimize HPL if we want to run it with a large number of processes.
    • -
    • Anyway, without SMPI macros, every floating-point operation of the application is actually performed. Thus, if we are simulating a computation made on a 1000 Gflops cluster using a 1 Gflops laptop, the simulation should take at least 1000 times longer than the same computation on a real 1000 Gflops cluster.
    • -
    • First results show no large difference in the total time for small or large numbers of root switches. The communication time is about twice as large as the computation time, so maybe we should take a larger matrix. When we had 961 processes, each one had a sub-matrix of size 300×300. With 8649 processes, they have a sub-matrix of size 100×100. Problem: if we want to get back to the 300×300 sub-matrices, we need to multiply the size by 3 and thus the memory consumption by 9. It was already about 25%, so this is not feasible on this laptop. But this is strange: we should have the memory of only one process, and we successfully ran 300×300 sub-matrices; need to check.
    • -
    -
    -
  6. -
-
-
-

1.1.16 2017-02-28 Tuesday

-
-
-
    -
  1. Other benchmarks on Simgrid   SMPI EXPERIMENTS
    -
    -
      -
    • The paper “Simulating MPI applications: the SMPI approach” uses the benchmark NAS EP to demonstrate the scalability of SMPI. With SMPI optimizations, they ran it with 16384 processes in 200 to 400 seconds (depending on the topology). Where is the code for this?
        -
      • Found an old repository. Not clear if it is relevant.
      • -
      • Also a (shorter) version in the official Simgrid repository. Executable located in simgrid/build/examples/smpi/NAS/. Launch with two arguments: the number of processes (don’t know what it does, we already have the -np option given to smpirun) and the class to use (S, W, A, B, C, D, E, F).
      • -
    • -
    • The NAS EP benchmark from the Simgrid repository seems promising. Added a new class to have a larger problem (maybe we could instead give the size as an argument). With a large enough size, we can go to about 3.5 Gflops per process, i.e. an efficiency of 3.5 (recall that we use 1 Gflops nodes). It seems large; is it normal?
    • -
    • Longer than the matrix product: 745 seconds for 1152 processes and class F (custom class with m=42). Only 93 seconds were spent in the application, so the code is already correctly optimized (one call to SMPI_SAMPLE_GLOBAL).
    • -
    • Apparently not impacted by a tapered fat-tree. Roughly the same speed for 2;24,48;1,24;1,1 and 2;24,48;1,1;1,1 with 1152 processes and class F: about 3.5 Gflops. The application is made of a computation followed by three MPI_Allreduce of only one double, so very few communications (hence the name “embarrassingly parallel”).
    • -
    -
    -
  2. -
  3. Talk with Christian about benchmarks
    -
    -
      -
    • Get access to Grid 5000.
    • -
    • Profile the code, with something like smpirun -wrapper “valgrind <param>”.
    • -
    • To use SMPI macros, run the HPL_dgemm implemented in HPL, not the one from the external library.
    • -
    -
    -
  4. -
-
-
-
-

1.2 2017-03 March

-
-
-
-

1.2.1 2017-03-01 Wednesday

-
-
-
    -
  1. Trying to use HPL without external BLAS library   HPL
    -
    -
      -
    • Failed.
    • -
    • It seems that three options are available for compilation, according to this page: -
        -
      • BLAS Fortran 77 interface (the default),
      • -
      • BLAS C interface (option -DHPL_CALL_CBLAS),
      • -
      • VSIPL library (option -DHPL_CALL_VSIPL).
      • -
    • -
    • We currently use the C interface, which relies on an external library (e.g. Atlas).
    • -
    • There is an implementation of HPL_dgemm in HPL, but it seems to need either code from Fortran 77 or from VSIPL.
    • -
    • According to the HPL homepage:

      -
      The HPL software package requires the availibility on your system of an implementation of the Message Passing
      Interface MPI (1.1 compliant). An implementation of either the Basic Linear Algebra Subprograms BLAS or the Vector
      Signal Image Processing Library VSIPL is also needed. Machine-specific as well as generic implementations of MPI, the
      BLAS and VSIPL are available for a large variety of systems.
      -
      -

      -So it seems hopeless to get rid of a BLAS library. -

    • -
    -
    -
  2. -
  3. Idea: trace calls to HPL_dgemm (Arnaud’s idea)   SMPI TRACING HPL
    -
    -
      -
    • To do so, surround them with calls to trivial MPI primitives (e.g. MPI_Initialized). For instance:

      -
      -
      #define HPL_dgemm(...) ({int simgrid_test; MPI_Initialized(&simgrid_test); cblas_dgemm(__VA_ARGS__);\
      MPI_Initialized(&simgrid_test);})
      -
      -
    • -
    • Then, trace the execution (output in /tmp/trace):

      -
      -
      smpirun -trace -trace-file /tmp/trace --cfg=smpi/trace-call-location:1 --cfg=smpi/bcast:mpich\
      --cfg=smpi/running-power:6217956542.969 --cfg=smpi/display-timing:yes --cfg=smpi/privatize-global-variables:yes -np 16\
      -hostfile ../../../small_tests/hostfile_64.txt -platform ../../../small_tests/cluster_fat_tree_64.xml ./xhpl\
      -
      -
    • -
    • Finally, dump this trace in CSV format:

      -
      -
      pj_dump --user-defined --ignore-incomplete-links trace > trace.dump
      -
      -
    • -
    • Did not work: no MPI_Initialized in the trace. In fact, this primitive is currently not traced. We could modify SMPI to achieve this behavior, or use another MPI primitive that is already traced.
    • -
    -
    -
  4. -
-
-
-

1.2.2 2017-03-02 Thursday

-
-
-
    -
  1. Keep trying to trace calls to HPL_dgemm   SMPI TRACING HPL
    -
    -
      -
    • An MPI primitive is traced ⇔ the functions new_pajePushState and new_pajePopState are called (not sure, this is an intuition).
    • -
    • These functions are not called by MPI_Initialized or MPI_Wtime.
    • -
    • They are called by MPI_Test, but only if the MPI_Request object passed as argument is non-null, so we would need to do a fake asynchronous communication just before, which is probably not a good idea.
    • -
    • Anyway, it looks dirty to use an MPI primitive like this. Wouldn’t it be better to have a custom no-op primitive that forces the introduction of a trace entry? For instance, something like

      -
      -
      SMPI_Trace {
          HPL_dgemm();
      }
      -
      -
      -

      -or like -

      -
      -
      SMPI_BeginTrace();
      HPL_dgemm();
      SMPI_EndTrace();
      -
      -
    • -
    • Every MPI primitive is wrapped by a #define that calls smpi_trace_set_call_location before calling the function. For instance:

      -
      -
      #define MPI_Test(...) ({ smpi_trace_set_call_location(__FILE__,__LINE__); MPI_Test(__VA_ARGS__); })
      -
      -
      -

      However, this only records the file name and the line number; I do not think it dumps anything into the trace.

    • -
    -
    -
  2. -
  3. Arnaud’s keynote: reproducible research   MEETING
    -
    -
      -
    • Intro: article we had in exam, “Is everything we eat associated with cancer?”.
    • -
    • In most articles, we can read formulae and trust results, but much less often reproduce the results.
    • -
    • Reproducibility crisis: several scandals with falsified results (intentionally or not).
    • -
    • Video: Brendan Gregg, shouting in the data center.
    • -
    -
    -
  4. -
  5. Discussion with Arnaud   MEETING
    -
    -
      -
    • Regarding the matrix product: -
        -
      • Compare the (tapered) fat-tree with a “perfect” topology (a cluster with no latency and infinite bandwidth).
      • -
      • Run it with larger matrices for the same number of processes. Do not aim at spending as much time in communication as in computation; we want the communication time to become nearly negligible. In practice, users of a supercomputer try to fill the memory of their nodes.
      • -
    • -
    • Regarding HPL: -
        -
      • As discussed yesterday, we want to trace the calls to HPL_dgemm by putting calls to an MPI primitive just before and after.
      • -
      • The short-term goal is to get an idea of the behavior of HPL regarding this function. Are there a lot of different calls to HPL_dgemm coming from different locations? Do these calls always take the same amount of time (i.e. do we always multiply matrices of the same size)?
      • -
      • It seems that there is some variability in the duration of HPL_dgemm (to be verified with the trace). If HPL really uses the function to multiply matrices of different sizes, we cannot do something like SMPI_SAMPLE(){HPL_dgemm()}; it will not be precise. What we could do, however, is generalize SMPI_SAMPLE: we could parametrize it by a number representing the size of the problem that is sampled. If this size is always the same, then we could do what we are doing now and simply take the average. If this size changes over time, we could do something more elaborate for the prediction, like a linear regression.
      • -
      • Using MPI functions like MPI_Test is not very “clean”, but we do not want to waste time on this currently, so we stick -with existing MPI primitives. We could try to change this in the future.
      • -
      • It is always safe to call smpi_process_index. Thus, we could modify PMPI_Test to call TRACE_smpi_testing functions -even when the given request is NULL.
      • -
    • -
    -
    -
  6. -
-
-
-

1.2.3 2017-03-03 Friday

-
-
-
    -
  1. Tracing calls to HPL_dgemm   SMPI C PYTHON R EXPERIMENTS TRACING PERFORMANCE HPL
    -
    -
      -
    • Modified the function PMPI_Test of Simgrid so that MPI_Test is traced even when the MPI_Request handle is NULL. To do that, we need to get the rank of the process with smpi_process_index. The value returned is always 0 in this case. This is a problem, since we could not distinguish between calls to MPI_Test from different processes, so it would be impossible to measure time. Reverting the changes.
    • -
    • To get a non-null MPI_Request, did an MPI_Isend followed by an MPI_Recv:

      -
      -
      #define    HPL_dgemm(...)      ({\
        int my_rank, buff=0;\
        MPI_Request request;\
        MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\
        /* self-message, just to force a traced MPI event before the dgemm */\
        MPI_Isend(&buff, 1, MPI_INT, my_rank, 0, MPI_COMM_WORLD, &request);\
        MPI_Recv(&buff, 1, MPI_INT, my_rank, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\
        MPI_Wait(&request, MPI_STATUS_IGNORE);\
        cblas_dgemm(__VA_ARGS__);\
        /* and another one just after, to delimit the call in the trace */\
        MPI_Isend(&buff, 1, MPI_INT, my_rank, 0, MPI_COMM_WORLD, &request);\
        MPI_Recv(&buff, 1, MPI_INT, my_rank, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);\
        MPI_Wait(&request, MPI_STATUS_IGNORE);\
      })
      -
      -
    • -
    • Forget this. HPL was executed with only one process (-np 16, but P and Q were 1 in HPL.dat). This is why we only had a rank 0 when giving NULL as the MPI_Request. Let’s revert this and use a simple MPI_Test with NULL.
    • -
    • Calls to MPI_Test seem to be correctly traced, but the post-processing of the trace with pj_dump crashes:

      -
      terminate called after throwing an instance of 'std::out_of_range'
      what():  vector::_M_range_check: __n (which is 4) >= this->size() (which is 4)
      -
      -

      It also happened with the more complex piece of code shown above (with MPI_Test instead of MPI_Wait). Reverting again, to use the bigger piece of code above.

    • -
    • Now, the call to pj_dump succeeds, and we can see calls to MPI_Wait in the trace.
    • -
    • The call to smpirun was:
    • -
    -
    -
    smpirun -trace -trace-file /tmp/trace --cfg=smpi/trace-call-location:1 --cfg=smpi/bcast:mpich\
    --cfg=smpi/running-power:6217956542.969 --cfg=smpi/display-timing:yes --cfg=smpi/privatize-global-variables:yes -np 16\
    -hostfile ../../../small_tests/hostfile_64.txt -platform ../../../small_tests/cluster_fat_tree_64.xml ./xhpl
    -
    -
    -
      -
    • Processing of the trace. Clean the file:
    • -
    -
    -
    pj_dump --user-defined --ignore-incomplete-links /tmp/trace > /tmp/trace.csv
    grep "State," /tmp/trace.csv | grep MPI_Wait | sed -e 's/()//' -e 's/MPI_STATE, //ig'  -e 's/State, //ig' -e 's/rank-//' -e\
    's/PMPI_/MPI_/' | grep MPI_  | tr 'A-Z' 'a-z' > /tmp/trace_processed.csv
    -
    -
    - -

    Clean the paths:

    -
    -
    import re
    reg = re.compile('((?:[^/])*)(?:/[a-zA-Z0-9_-]*)*((?:/hpl-2.2(?:/[a-zA-Z0-9_-]*)*).*)')
    def process(in_file, out_file):
        with open(in_file, 'r') as in_f:
            with open(out_file, 'w') as out_f:
                for line in in_f:
                    match = reg.match(line)
                    if match is not None:  # skip lines without a recognizable path
                        out_f.write('%s%s\n' % (match.group(1), match.group(2)))
    process('/tmp/trace_processed.csv', '/tmp/trace_cleaned.csv')
    -
    -
    - -
    -
    df <- read.csv("/tmp/trace_cleaned.csv", header=F, strip.white=T, sep=",");
    names(df) = c("rank", "start", "end", "duration", "level", "state", "Filename", "Linenumber");
    head(df)
    -
    -
    - -
      rank    start      end duration level    state
    1    8 2.743960 2.743960        0     0 mpi_wait
    2    8 2.744005 2.744005        0     0 mpi_wait
    3    8 2.744005 2.744005        0     0 mpi_wait
    4    8 2.744005 2.744005        0     0 mpi_wait
    5    8 2.744005 2.744005        0     0 mpi_wait
    6    8 2.744005 2.744005        0     0 mpi_wait
                                Filename Linenumber
    1 /hpl-2.2/src/pfact/hpl_pdrpanllt.c        222
    2 /hpl-2.2/src/pfact/hpl_pdrpanllt.c        222
    3 /hpl-2.2/src/pfact/hpl_pdrpanllt.c        222
    4 /hpl-2.2/src/pfact/hpl_pdrpanllt.c        222
    5 /hpl-2.2/src/pfact/hpl_pdrpanllt.c        222
    6 /hpl-2.2/src/pfact/hpl_pdrpanllt.c        222
    -
    - -
    -
    # For each rank, compute the gaps between consecutive MPI_Wait events:
    # these gaps are the computation (i.e. dgemm) durations.
    duration_compute = function(df) {
        ndf = data.frame();
        df = df[with(df,order(rank,start)),];
        #origin = unique(df$origin)
        for(i in (sort(unique(df$rank)))) {
            start     = df[df$rank==i,]$start;
            end       = df[df$rank==i,]$end;
            l         = length(end);
            end       = c(0,end[1:(l-1)]); # Computation starts at time 0

            startline = c(0, df[df$rank==i,]$Linenumber[1:(l-1)]);
            startfile = c("", as.character(df[df$rank==i,]$Filename[1:(l-1)]));
            endline   = df[df$rank==i,]$Linenumber;
            endfile   = df[df$rank==i,]$Filename;

            ndf       = rbind(ndf, data.frame(rank=i, start=end, end=start,
                duration=start-end, state="Computing",
                startline=startline, startfile=startfile, endline=endline,
                endfile=endfile));
        }
        ndf$idx = 1:length(ndf$duration)
        ndf;
    }
    durations = duration_compute(df);
    # Keep only the intervals whose two endpoints are the MPI_Wait calls
    # surrounding HPL_dgemm in HPL_pdupdateTT (same file, same line).
    durations = durations[durations["startfile"] == "/hpl-2.2/src/pgesv/hpl_pdupdatett.c" & durations["endfile"] == "/hpl-2.2/src/pgesv/hpl_pdupdatett.c" &
        durations["startline"] == durations["endline"],]
    -
    -
    - -
    -
    library(dplyr)
    options(width=200)
    group_by(durations, startfile, startline, endfile, endline) %>% summarise(duration=sum(duration), count=n()) %>% as.data.frame()
    -
    -
    - -
                                startfile startline                             endfile endline  duration count
    1 /hpl-2.2/src/pgesv/hpl_pdupdatett.c       387 /hpl-2.2/src/pgesv/hpl_pdupdatett.c     387  683.6677   659
    2 /hpl-2.2/src/pgesv/hpl_pdupdatett.c       411 /hpl-2.2/src/pgesv/hpl_pdupdatett.c     411 2115.8129  1977
    -
    - - -
    -
    library(ggplot2)
    ggplot(durations, aes(x=idx, y=duration, color=factor(rank))) +
        geom_point(shape=1) + ggtitle("Durations of HPL_dgemm")
    -
    -
    - - -
    -

    trace1_16.png -

    -
    - - -
    -
    ggplot(durations, aes(x=start, y=duration, color=factor(rank))) +
        geom_point(shape=1) + ggtitle("Durations of HPL_dgemm")
    -
    -
    - - -
    -

    trace2_16.png -

    -
    - -

    Same results, with four processes:

    - - -
    -

    trace1_4.png -

    -
    - - -
    -

    trace2_4.png -

    -
    -
    -
  2. - -
  3. Seminar   MEETING
    -
    -

    -On the asymptotic behavior of the price of anarchy, how bad is selfish routing in highly congested networks? -

    -
      -
    • For instance, cars on a road make their own routing decisions, hence the “selfish” routing. This is not optimal (in comparison with centralized routing).
    -
    -
  4. -
  5. Discussion with Arnaud & Christian   MEETING
    -
    -
      -
    • According to the plots, it is impossible to use SMPI_SAMPLE as is, since there are huge variations in the duration of HPL_dgemm.
    • The idea of a parametrized SMPI_SAMPLE is also not great. Every process makes consecutive calls to HPL_dgemm, each call being shorter than the previous one, so we would still have to execute the expensive calls.
    • A longer-term idea would be a “SimBLAS” library that simulates the calls to HPL_dgemm (and other BLAS primitives). Christian will work on this.
    • Answers to all my questions from the paper readings.
    • -
    -
    -
  6. -
  7. TODO New tasks [3/4]
    -
    -
      -
    • [X] Do the linear regression by hand, off-line. Output the sizes of the matrices given to HPL_dgemm (with printf).
    • -
    • [X] Register on Grid5000. Compile HPL on one Grid5000 machine.
    • -
    • [X] Try to run HPL with a very large matrix, by using SMPI_SHARED_MALLOC (thus look at where all the allocations of matrices are done).
    • [ ] Have a look at the code of Simgrid, in particular the routing in fat-trees.
    • -
    -
    -
  8. -
-
-
-

1.2.4 2017-03-06 Monday

-
-
-
    -
  1. Output the matrix sizes   C PYTHON TRACING HPL
    -
    -
      -
    • -Add the following before the relevant calls to HPL_dgemm: -

      -
      -
      printf("line=%d rank=%d m=%d n=%d k=%d\n", __LINE__+3, rank, mp, nn, jb);
      -
      -
      -

      -Then, run HPL by redirecting stdout to /tmp/output. -

    • -
    • Process the output, to get a CSV file:
    • -
    -
    -
import re
import csv
reg = re.compile('line=([0-9]+) rank=([0-9]+) m=([0-9]+) n=([0-9]+) k=([0-9]+)')

def process(in_file, out_file):
    with open(in_file, 'r') as in_f:
        with open(out_file, 'w') as out_f:
            csv_writer = csv.writer(out_f)
            # Note: the header lists n before m while the regex captures m
            # first, so the two columns are swapped in the CSV. This is
            # harmless below, where only the product m*n*k is used.
            csv_writer.writerow(('line', 'rank', 'n', 'm', 'k'))
            for line in in_f:
                match = reg.match(line)
                if match is not None:
                    csv_writer.writerow(tuple(match.group(i) for i in range(1,6)))
process('/tmp/output', '/tmp/sizes.csv')
    -
    -
    -
    -
  2. - -
  3. Merge the sizes with the durations   R EXPERIMENTS PERFORMANCE
    -
    -
      -
    • Run smpirun as stated above, then process the output and the trace as before.
    • -
    • Process the data:
    • -
    -
    -
    df <- read.csv("/tmp/trace_cleaned.csv", header=F, strip.white=T, sep=",");
    -names(df) = c("rank", "start", "end", "duration", "level", "state", "Filename", "Linenumber");
    -head(df)
    -
    -
    - -
    -  rank    start      end duration level    state                           Filename Linenumber
    -1    8 2.743960 2.743960        0     0 mpi_wait /hpl-2.2/src/pfact/hpl_pdrpanllt.c        222
    -2    8 2.744005 2.744005        0     0 mpi_wait /hpl-2.2/src/pfact/hpl_pdrpanllt.c        222
    -3    8 2.744005 2.744005        0     0 mpi_wait /hpl-2.2/src/pfact/hpl_pdrpanllt.c        222
    -4    8 2.744005 2.744005        0     0 mpi_wait /hpl-2.2/src/pfact/hpl_pdrpanllt.c        222
    -5    8 2.744005 2.744005        0     0 mpi_wait /hpl-2.2/src/pfact/hpl_pdrpanllt.c        222
    -6    8 2.744005 2.744005        0     0 mpi_wait /hpl-2.2/src/pfact/hpl_pdrpanllt.c        222
    -
    - - -
    -
    sizes <- read.csv("/tmp/sizes.csv");
    -head(sizes)
    -
    -
    - -
    -  line rank    n    m   k
    -1  411   12 4920 4920 120
    -2  387    0 4920 4920 120
    -3  411    8 5000 4920 120
    -4  411    4 5040 4920 120
    -5  411   13 4920 5040 120
    -6  387    1 4920 5040 120
    -
    - - -
    -
    durations = duration_compute(df); # same function as above
    -durations = durations[durations["startfile"] == "/hpl-2.2/src/pgesv/hpl_pdupdatett.c" & durations["endfile"] == "/hpl-2.2/src/pgesv/hpl_pdupdatett.c" &
    -    durations["startline"] == durations["endline"],]
    -head(durations)
    -
    -
    - -
    -    rank     start       end duration     state startline                           startfile endline                             endfile idx
    -481    0  3.153899  6.271075 3.117176 Computing       387 /hpl-2.2/src/pgesv/hpl_pdupdatett.c     387 /hpl-2.2/src/pgesv/hpl_pdupdatett.c 481
    -486    0  7.047247 10.063367 3.016120 Computing       411 /hpl-2.2/src/pgesv/hpl_pdupdatett.c     411 /hpl-2.2/src/pgesv/hpl_pdupdatett.c 486
    -491    0 10.648367 13.716045 3.067678 Computing       411 /hpl-2.2/src/pgesv/hpl_pdupdatett.c     411 /hpl-2.2/src/pgesv/hpl_pdupdatett.c 491
    -496    0 14.104534 17.155418 3.050884 Computing       411 /hpl-2.2/src/pgesv/hpl_pdupdatett.c     411 /hpl-2.2/src/pgesv/hpl_pdupdatett.c 496
    -977    0 17.557080 20.430869 2.873789 Computing       387 /hpl-2.2/src/pgesv/hpl_pdupdatett.c     387 /hpl-2.2/src/pgesv/hpl_pdupdatett.c 977
    -982    0 21.104026 24.044767 2.940741 Computing       411 /hpl-2.2/src/pgesv/hpl_pdupdatett.c     411 /hpl-2.2/src/pgesv/hpl_pdupdatett.c 982
    -
    - - - -
    -
    insert_sizes = function(durations, sizes) {
    -    stopifnot(nrow(durations)==nrow(sizes))
    -    ndf = data.frame();
    -    for(i in (sort(unique(durations$rank)))) {
    -	tmp_dur = durations[durations$rank == i,]
    -	tmp_sizes = sizes[sizes$rank == i,]
    -	stopifnot(nrow(tmp_dur) == nrow(tmp_sizes))
    -	stopifnot(tmp_dur$startline == tmp_sizes$line)
    -	storage.mode(tmp_sizes$m) <- "double" # avoiding integer overflow when taking the product
    -	storage.mode(tmp_sizes$n) <- "double"
    -	storage.mode(tmp_sizes$k) <- "double"
    -	tmp_dur$m = tmp_sizes$m
    -	tmp_dur$n = tmp_sizes$n
    -	tmp_dur$k = tmp_sizes$k
    -	tmp_dur$size_product = tmp_sizes$m * tmp_sizes$n * tmp_sizes$k
    -	ndf = rbind(ndf, tmp_dur)
    -    }
    -    return(ndf);
    -}
    -
    -
    - -
    -
    result = insert_sizes(durations, sizes)
    -head(result)
    -
    -
    - -
    -    rank     start       end duration     state startline                           startfile endline                             endfile idx    m    n   k size_product
    -481    0  3.153899  6.271075 3.117176 Computing       387 /hpl-2.2/src/pgesv/hpl_pdupdatett.c     387 /hpl-2.2/src/pgesv/hpl_pdupdatett.c 481 4920 4920 120   2904768000
    -486    0  7.047247 10.063367 3.016120 Computing       411 /hpl-2.2/src/pgesv/hpl_pdupdatett.c     411 /hpl-2.2/src/pgesv/hpl_pdupdatett.c 486 4920 4920 120   2904768000
    -491    0 10.648367 13.716045 3.067678 Computing       411 /hpl-2.2/src/pgesv/hpl_pdupdatett.c     411 /hpl-2.2/src/pgesv/hpl_pdupdatett.c 491 4920 4920 120   2904768000
    -496    0 14.104534 17.155418 3.050884 Computing       411 /hpl-2.2/src/pgesv/hpl_pdupdatett.c     411 /hpl-2.2/src/pgesv/hpl_pdupdatett.c 496 4920 4920 120   2904768000
    -977    0 17.557080 20.430869 2.873789 Computing       387 /hpl-2.2/src/pgesv/hpl_pdupdatett.c     387 /hpl-2.2/src/pgesv/hpl_pdupdatett.c 977 4800 4800 120   2764800000
    -982    0 21.104026 24.044767 2.940741 Computing       411 /hpl-2.2/src/pgesv/hpl_pdupdatett.c     411 /hpl-2.2/src/pgesv/hpl_pdupdatett.c 982 4800 4800 120   2764800000
    -
    -
    -
  4. - -
  5. Plot and linear regression   R EXPERIMENTS PERFORMANCE
    -
    -
    -
    library(ggplot2)
    -ggplot(result, aes(x=size_product, y=duration, color=factor(rank))) +
    -    geom_point(shape=1) + ggtitle("Durations of HPL_dgemm as a function of the sizes")
    -
    -
    - - -
    -

[Figure: trace3_16.png]

    -
    - - -
    -
    reg <- lm(duration~I(m*n*k), data=result)
    -summary(reg)
    -
    -
    - -
    -
    -Call:
    -lm(formula = duration ~ I(m * n * k), data = result)
    -
    -Residuals:
    -     Min       1Q   Median       3Q      Max 
    --0.10066 -0.01700 -0.00085  0.00351  0.57745 
    -
    -Coefficients:
    -               Estimate Std. Error  t value Pr(>|t|)    
    -(Intercept)  -2.476e-03  1.235e-03   -2.005   0.0451 *  
    -I(m * n * k)  1.062e-09  9.220e-13 1151.470   <2e-16 ***
    ----
    -Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
    -
    -Residual standard error: 0.04205 on 2634 degrees of freedom
    -Multiple R-squared:  0.998,	Adjusted R-squared:  0.998 
    -F-statistic: 1.326e+06 on 1 and 2634 DF,  p-value: < 2.2e-16
    -
    - -
    -
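To make the fitted model concrete, here is a quick worked check (plain Python arithmetic) on a typical call with m = n = 4920 and k = 120, to be compared with the observed durations of about 3.0-3.1 seconds above:

    # duration ~ 1.062e-9 * m*n*k - 2.476e-3 (coefficients from the summary above)
    m, n, k = 4920, 4920, 120
    print(1.062e-09 * m * n * k - 2.476e-03)  # -> ~3.082 seconds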
    layout(matrix(c(1,2,3,4),2,2))
    -plot(reg)
    -
    -
    - - -
    -

[Figure: reg_16.png]

    -
    -
    -
  6. -
  7. Comments on the linear regression   EXPERIMENTS
    -
    -
      -
    • The plot of the duration as a function of m*n*k looks great. Maybe a bit of heteroscedasticity, but not much; it is clearly linear.
    • The linear regression, however, is not so good. We have a high R-squared (0.998), but the diagnostic plots look bad. The residuals-vs-fitted plot shows that the results are clearly heteroscedastic, and the normal Q-Q plot shows that the residuals deviate strongly from normality, growing much faster than expected in the upper tail.
    • The diagnostic plots of the linear regression seem to contradict the first plot, which is strange.
    -
    -
  8. -
  9. Investigating the linear regression   C
    -
    -
      -
    • -We can print other relevant parameters of HPL_dgemm: -

      -
      -
      printf("line=%d rank=%d m=%d n=%d k=%d a=%f lead_A=%d lead_B=%d lead_C=%d\n", __LINE__+3,
      -  rank, mp, nn, jb, -HPL_rone, ldl2, LDU, lda);
      -
      -
      -

Here, a is a scaling factor applied to the matrix; lead_A, lead_B and lead_C are the leading dimensions of matrices A, B and C. A sample of what we get is (only some lines are reported here):

      -
      -line=411 rank=2 m=2240 n=2160 k=120 a=-1.000000 lead_A=2480 lead_B=2160 lead_C=2480
      -line=387 rank=3 m=1640 n=1641 k=120 a=-1.000000 lead_A=2480 lead_B=1641 lead_C=2480
      -line=387 rank=2 m=680 n=720 k=120 a=-1.000000 lead_A=680 lead_B=720 lead_C=2480
      -line=387 rank=2 m=200 n=240 k=120 a=-1.000000 lead_A=200 lead_B=240 lead_C=2480
-line=411 rank=1 m=480 n=441 k=120 a=-1.000000 lead_A=2520 lead_B=441 lead_C=2520
      -
      -

This trend seems to roughly repeat: a is always -1, lead_C is always either 2480 or 2520, and lead_B is equal to n. For small enough values, lead_A is also equal to m; for larger values, it is not anymore, but both are large. However, there are still some noticeable variations. For instance:

      -
      -line=387 rank=0 m=600 n=600 k=120 a=-1.000000 lead_A=2520 lead_B=600 lead_C=2520
      -line=411 rank=0 m=600 n=600 k=120 a=-1.000000 lead_A=600 lead_B=600 lead_C=2520
      -
      -

In this last example, all parameters are equal, except lead_A, which is more than four times larger in one case.

    • -
    • A small leading dimension means better locality and thus better performance. These differences in the leading dimensions could explain the non-linearity and the heteroscedasticity (a small illustration of what a leading dimension is follows this list).
    -
    -
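For reference, a tiny NumPy illustration (generic, not the HPL code) of what a leading dimension is; the sizes mirror the example above:

    import numpy as np

    big = np.zeros((2520, 2520), order='F')  # column-major storage, as in BLAS/HPL
    block = big[:600, :600]                  # a 600x600 submatrix view
    # In BLAS terms, block has m = n = 600 but a leading dimension of 2520:
    # stepping to the next column jumps 2520 elements in memory, so a larger
    # leading dimension spreads the same data over a wider address range.
    print(block.shape, block.strides[1] // block.itemsize)  # (600, 600) 2520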
  10. -
-
-
-

1.2.5 2017-03-07 Tuesday

-
-
-
    -
  1. And the leading dimensions?   C PYTHON R EXPERIMENTS TRACING PERFORMANCE
    -
    -
      -
    • We have this printf before the calls to HPL_dgemm (same as before, except that a is removed):

      -
      -
      printf("line=%d rank=%d m=%d n=%d k=%d lead_A=%d lead_B=%d lead_C=%d\n", __LINE__+3,
      -  rank, mp, nn, jb, ldl2, LDU, lda);
      -
      -
    • -
    • The trace is in the file /tmp/trace; we process it as before. The output is redirected to the file /tmp/output.
    • Processing of the output:
    • -
    -
    -
    import re
    -import csv
    -reg = re.compile('line=([0-9]+) rank=([0-9]+) m=([0-9]+) n=([0-9]+) k=([0-9]+) lead_A=([0-9]+) lead_B=([0-9]+) lead_C=([0-9]+)')
    -
    -def process(in_file, out_file):
    -    with open(in_file, 'r') as in_f:
    -        with open(out_file, 'w') as out_f:
    -            csv_writer = csv.writer(out_f)
    -            csv_writer.writerow(('line', 'rank', 'n', 'm', 'k', 'lead_A', 'lead_B', 'lead_C'))
    -            for line in in_f:
    -                match = reg.match(line)
    -                if match is not None:
    -                    csv_writer.writerow(tuple(match.group(i) for i in range(1,9)))
    -process('/tmp/output', '/tmp/sizes.csv')
    -
    -
    - -

    -We have the durations dataframe, obtained as before: -

    - -
    -
    head(durations)
    -
    -
    - -
    -    rank     start       end duration     state startline                           startfile endline                             endfile idx
    -481    0  4.111176  7.158459 3.047283 Computing       387 /hpl-2.2/src/pgesv/hpl_pdupdatett.c     387 /hpl-2.2/src/pgesv/hpl_pdupdatett.c 481
    -486    0  7.827329 10.848572 3.021243 Computing       411 /hpl-2.2/src/pgesv/hpl_pdupdatett.c     411 /hpl-2.2/src/pgesv/hpl_pdupdatett.c 486
    -491    0 11.411456 14.445789 3.034333 Computing       411 /hpl-2.2/src/pgesv/hpl_pdupdatett.c     411 /hpl-2.2/src/pgesv/hpl_pdupdatett.c 491
    -496    0 14.837377 17.868118 3.030741 Computing       411 /hpl-2.2/src/pgesv/hpl_pdupdatett.c     411 /hpl-2.2/src/pgesv/hpl_pdupdatett.c 496
    -977    0 18.268679 21.142146 2.873467 Computing       387 /hpl-2.2/src/pgesv/hpl_pdupdatett.c     387 /hpl-2.2/src/pgesv/hpl_pdupdatett.c 977
    -982    0 21.809954 24.699182 2.889228 Computing       411 /hpl-2.2/src/pgesv/hpl_pdupdatett.c     411 /hpl-2.2/src/pgesv/hpl_pdupdatett.c 982
    -
    - - -

    -Then we get the sizes dataframe: -

    -
    -
    sizes <- read.csv("/tmp/sizes.csv");
    -head(sizes)
    -
    -
    - -
    -  line rank    n    m   k lead_A lead_B lead_C
    -1  387    0 4920 4920 120   5040   4920   5040
    -2  411    8 5000 4920 120   5000   4920   5000
    -3  411    4 5040 4920 120   5040   4920   5040
    -4  411   12 4920 4920 120   4920   4920   4920
    -5  387    1 4920 5040 120   4920   5040   5040
    -6  411    5 5040 5040 120   5040   5040   5040
    -
    - - -
    -
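As a quick cross-check of yesterday's observations (lead_A often equal to m, and only a few distinct values of lead_C), something along these lines could be run on this CSV; this is a sketch, not part of the original analysis:

    import csv

    with open('/tmp/sizes.csv') as f:
        rows = list(csv.DictReader(f))
    # Beware: the column labeled 'n' holds the captured m (header swap noted earlier)
    a_eq_m = sum(1 for r in rows if r['lead_A'] == r['n'])
    print('lead_A == m in %d/%d rows' % (a_eq_m, len(rows)))
    print('distinct lead_C values:', sorted({int(r['lead_C']) for r in rows}))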
    insert_sizes = function(durations, sizes) {
    -    stopifnot(nrow(durations)==nrow(sizes))
    -    ndf = data.frame();
    -    for(i in (sort(unique(durations$rank)))) {
    -	tmp_dur = durations[durations$rank == i,]
    -	tmp_sizes = sizes[sizes$rank == i,]
    -	stopifnot(nrow(tmp_dur) == nrow(tmp_sizes))
    -	stopifnot(tmp_dur$startline == tmp_sizes$line)
    -	storage.mode(tmp_sizes$m) <- "double" # avoiding integer overflow when taking the product
    -	storage.mode(tmp_sizes$n) <- "double"
    -	storage.mode(tmp_sizes$k) <- "double"
    -	storage.mode(tmp_sizes$lead_A) <- "double"
    -	storage.mode(tmp_sizes$lead_B) <- "double"
    -	storage.mode(tmp_sizes$lead_C) <- "double"
    -	tmp_dur$m = tmp_sizes$m
    -	tmp_dur$n = tmp_sizes$n
    -	tmp_dur$k = tmp_sizes$k
    -	tmp_dur$lead_A = tmp_sizes$lead_A
    -	tmp_dur$lead_B = tmp_sizes$lead_B
    -	tmp_dur$lead_C = tmp_sizes$lead_C
    -	tmp_dur$lead_product = tmp_sizes$lead_A * tmp_sizes$lead_B * tmp_sizes$lead_C
    -	tmp_dur$size_product = tmp_sizes$m * tmp_sizes$n * tmp_sizes$k
    -	tmp_dur$ratio = tmp_dur$lead_product/tmp_dur$size_product
    -	ndf = rbind(ndf, tmp_dur)
    -    }
    -    return(ndf);
    -}
    -
    -
    - -
    -
    result = insert_sizes(durations, sizes)
    -head(result)
    -
    -
    - -
    -    rank     start       end duration     state startline                           startfile endline                             endfile idx    m    n   k lead_A lead_B lead_C lead_product
    -481    0  4.111176  7.158459 3.047283 Computing       387 /hpl-2.2/src/pgesv/hpl_pdupdatett.c     387 /hpl-2.2/src/pgesv/hpl_pdupdatett.c 481 4920 4920 120   5040   4920   5040 124975872000
    -486    0  7.827329 10.848572 3.021243 Computing       411 /hpl-2.2/src/pgesv/hpl_pdupdatett.c     411 /hpl-2.2/src/pgesv/hpl_pdupdatett.c 486 4920 4920 120   4920   4920   5040 122000256000
    -491    0 11.411456 14.445789 3.034333 Computing       411 /hpl-2.2/src/pgesv/hpl_pdupdatett.c     411 /hpl-2.2/src/pgesv/hpl_pdupdatett.c 491 4920 4920 120   4920   4920   5040 122000256000
    -496    0 14.837377 17.868118 3.030741 Computing       411 /hpl-2.2/src/pgesv/hpl_pdupdatett.c     411 /hpl-2.2/src/pgesv/hpl_pdupdatett.c 496 4920 4920 120   4920   4920   5040 122000256000
    -977    0 18.268679 21.142146 2.873467 Computing       387 /hpl-2.2/src/pgesv/hpl_pdupdatett.c     387 /hpl-2.2/src/pgesv/hpl_pdupdatett.c 977 4800 4800 120   5040   4800   5040 121927680000
    -982    0 21.809954 24.699182 2.889228 Computing       411 /hpl-2.2/src/pgesv/hpl_pdupdatett.c     411 /hpl-2.2/src/pgesv/hpl_pdupdatett.c 982 4800 4800 120   4800   4800   5040 116121600000
    -    size_product    ratio
    -481   2904768000 43.02439
    -486   2904768000 42.00000
    -491   2904768000 42.00000
    -496   2904768000 42.00000
    -977   2764800000 44.10000
    -982   2764800000 42.00000
    -
    - -
    -
    library(ggplot2)
    -ggplot(result, aes(x=lead_product, y=duration, color=factor(rank))) +
    -    geom_point(shape=1) + ggtitle("Durations of HPL_dgemm as a function of the leading dimensions")
    -
    -
    - - -
    -

[Figure: trace4_16.png]

    -
    - -
    -
    library(ggplot2)
    -ggplot(result, aes(x=lead_product, y=size_product, color=factor(rank))) +
    -    geom_point(shape=1) + ggtitle("Size of the matrices of HPL_dgemm as a function of the leading dimensions")
    -
    -
    - - -
    -

[Figure: trace5_16.png]

    -
    - -
    -
    ggplot(result, aes(x=idx, y=ratio, color=factor(rank))) +
    -    geom_point(shape=1) + ggtitle("Ratios of the leading dimensions by the sizes over time")
    -
    -
    - - -
    -

[Figure: trace6_16.png]

    -
    - -
    -
    reg <- lm(duration~ I(m*n*k) + lead_A+lead_B+lead_C, data=result)
    -summary(reg)
    -
    -
    - -
    -
    -Call:
    -lm(formula = duration ~ I(m * n * k) + lead_A + lead_B + lead_C, 
    -    data = result)
    -
    -Residuals:
    -     Min       1Q   Median       3Q      Max 
    --0.09477 -0.01804 -0.00439  0.00850  1.39992 
    -
    -Coefficients:
    -               Estimate Std. Error t value Pr(>|t|)    
    -(Intercept)  -7.741e-01  9.915e-02  -7.807 8.37e-15 ***
    -I(m * n * k)  1.069e-09  4.431e-12 241.217  < 2e-16 ***
    -lead_A        2.965e-06  7.744e-07   3.828 0.000132 ***
    -lead_B       -7.048e-06  2.799e-06  -2.518 0.011863 *  
    -lead_C        1.547e-04  1.981e-05   7.810 8.16e-15 ***
    ----
    -Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
    -
    -Residual standard error: 0.04981 on 2631 degrees of freedom
    -Multiple R-squared:  0.9972,	Adjusted R-squared:  0.9972 
    -F-statistic: 2.361e+05 on 4 and 2631 DF,  p-value: < 2.2e-16
    -
    - -
    -
    layout(matrix(c(1,2,3,4),2,2))
    -plot(reg)
    -
    -
    - - -
    -

[Figure: reg2_16.png]

    -
    -
    -
  2. - -
  3. Discussion about the leading dimensions   EXPERIMENTS
    -
    -
      -
    • In the three previous plots, we see that the leading dimensions have two modes, which are directly observable in the durations of HPL_dgemm:
        -
      • One of the modes seems to be linear in the sizes: we observe a straight line.
      • The other mode is clearly non-linear. Maybe quadratic? Exponential?
      • -
    • -
    • The linear regression shows that the variables lead_A, lead_B and lead_C have a non-negligible impact on performance, albeit smaller than that of the sizes. We still get terrible diagnostic plots; adding these parameters to the model did not change anything.
    • This could explain the “bad” plots of the linear regression.
    • -
    -
    -
  4. -
  5. Performance analysis of dgemm outside of HPL   C EXPERIMENTS PERFORMANCE
    -
    -
      -
    • In the above analysis, the raw results come from a trace of HPL, so we cannot control the sizes and/or leading dimensions: we only have observational data, not experimental data.
    • To fix this, let’s write a short C program, called dgemm_test, that calls cblas_dgemm (the function to which HPL_dgemm is aliased).
    • Currently, this code takes six parameters as arguments: the three sizes and the three leading dimensions. Be careful: the meaning of these sizes and leading dimensions changes depending on how dgemm is called (CblasColMajor or CblasRowMajor, and CblasNoTrans or CblasTrans). In the current code, these are fixed to be the same as in HPL.
    • Then, a Python script (called runner.py) samples random sizes and leading dimensions (taking care of the constraints between the sizes and dimensions) and calls dgemm_test. It then writes the results to a CSV file. A sketch of such a runner is given after this list.
    • Quick analysis of these results in R: -
        -
      • We got plots with the same shape (both the plot of the raw results and the plot of the linear regression).
      • -
      • The call to dgemm is 10 times faster in dgemm_test than in HPL. We need to find out why. First, what is the time obtained in the HPL traces? Is it virtual or real?
      • As with HPL, the linear regression shows that the ratio has a significant impact, but smaller than that of the sizes.
    • -
    -
    -
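Here is a minimal sketch of what runner.py could look like. This is a reconstruction, not the original script: the command line of dgemm_test (six integer arguments) matches the description above, but the output format (the elapsed time printed on stdout) and the exact sampling strategy are assumptions.

    import csv
    import random
    import subprocess

    def run_one(m, n, k, lda, ldb, ldc):
        # Assumption: dgemm_test prints the elapsed time (in seconds) on stdout.
        out = subprocess.check_output(
            ['./dgemm_test'] + [str(x) for x in (m, n, k, lda, ldb, ldc)])
        return float(out.decode())

    with open('/tmp/result.csv', 'w') as f:
        writer = csv.writer(f)
        writer.writerow(('time', 'm', 'n', 'k', 'lead_A', 'lead_B', 'lead_C'))
        for _ in range(1000):
            # Sample the product uniformly, so that the durations (roughly
            # proportional to m*n*k) are spread uniformly, then take the cube root.
            s = int(random.randint(1, 2000**3) ** (1. / 3.))
            m = n = k = s
            # A leading dimension must be at least the number of rows of the
            # corresponding matrix; with m = n = k this simplifies to >= s.
            lda, ldb, ldc = (random.randint(s, 2 * s) for _ in range(3))
            writer.writerow((run_one(m, n, k, lda, ldb, ldc), m, n, k, lda, ldb, ldc))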
  6. -
-
-
-

1.2.6 2017-03-08 Wednesday

-
-
-
    -
  1. Keep looking at dgemm outside of HPL   C EXPERIMENTS PERFORMANCE
    -
    -
      -
    • Use dgemm_test at commit 0455edcb0af1eb673725959d216137997fc40fd2. Run 1000 experiments.
    • -
    • Here, the variable product is sampled randomly and uniformly in [1, 2000³]. Then, the three sizes are set to ⌊product^(1/3)⌋.
    • The leading dimensions are equal to the sizes.
    • -
    • -Analysis in R: -

      -
      -
      result <- read.csv('~/tmp/3/result.csv')
      -head(result)
      -
      -
      - -
      -      time size_product lead_product ratio    m    n    k lead_A lead_B lead_C
      -1 0.160235    843908625    843908625     1  945  945  945    945    945    945
      -2 0.719003   4298942376   4298942376     1 1626 1626 1626   1626   1626   1626
      -3 0.783674   4549540393   4549540393     1 1657 1657 1657   1657   1657   1657
      -4 0.472595   2656741625   2656741625     1 1385 1385 1385   1385   1385   1385
      -5 0.319670   1874516337   1874516337     1 1233 1233 1233   1233   1233   1233
      -6 1.131936   6676532387   6676532387     1 1883 1883 1883   1883   1883   1883
      -
      - - -
      -
      library(ggplot2)
      -ggplot(result, aes(x=size_product, y=time)) +
      -    geom_point(shape=1) + ggtitle("Durations of cblas_dgemm as a function of the sizes product.")
      -
      -
      - - -
      -

[Figure: dgemm_test_raw.png]

      -
      - -
      -
      reg <- lm(time ~ size_product, result)
      -summary(reg)
      -
      -
      - -
      -
      -Call:
      -lm(formula = time ~ size_product, data = result)
      -
      -Residuals:
      -      Min        1Q    Median        3Q       Max 
      --0.027295 -0.008640 -0.002781  0.005900  0.229935 
      -
      -Coefficients:
      -              Estimate Std. Error t value Pr(>|t|)    
      -(Intercept)  1.172e-02  1.087e-03   10.78   <2e-16 ***
      -size_product 1.666e-10  2.353e-13  707.87   <2e-16 ***
      ----
      -Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
      -
      -Residual standard error: 0.01716 on 998 degrees of freedom
      -Multiple R-squared:  0.998,	Adjusted R-squared:  0.998 
      -F-statistic: 5.011e+05 on 1 and 998 DF,  p-value: < 2.2e-16
      -
    • -
    - - -
    -
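As an aside, the fitted slope can be compared with the one obtained earlier inside HPL (1.062e-09 seconds per unit of m*n*k); the ratio is in the same ballpark as the “10 times faster” observation of the previous day:

    # Slope fitted inside HPL vs slope fitted with dgemm_test
    print(1.062e-09 / 1.666e-10)  # ~6.4x faster per unit of m*n*k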
    layout(matrix(c(1,2,3,4),2,2))
    -plot(reg)
    -
    -
    - - -
    -

[Figure: dgemm_test_lm.png]

    -
    - -
      -
    • In the above plots, we can observe trends similar to those observed with HPL_dgemm, albeit less pronounced. The data is slightly heteroscedastic and the residuals do not exactly follow a normal distribution. There are several “outliers” where dgemm takes significantly more time, i.e. the distribution of the residuals is skewed to the right.
    • For instance, entry n°208 was obtained with sizes of 1503 and took a time of 0.807207. Let’s run this experiment again 100 times (with the command ./dgemm_test 1503 1503 1503 1503 1503 1503). The min and max over all observed times are respectively 0.5813 and 0.6494; the mean is 0.5897 and the standard deviation is 0.0082.
    • Thus, it seems that this point is a real outlier. We can suppose that this is also true for the other similar points.
    • -
    • This outlier is 0.2 seconds above the average we got and 0.15 seconds above the max, which is a lot. Maybe the process had a “bad” context switch (e.g. it was moved to another core), but the execution time is not that high, so this seems unlikely.
    • There seems to be a pattern: the outliers appear to happen at somewhat regular intervals.

      -
      -
# Select the points lying more than 5e-2 away from the fitted line
# (coefficients taken from the regression summary above)
x = result[abs(result$time - (1.666e-10*result$size_product + 1.172e-2)) > 5e-2, ]
x$id = which(abs(result$time - (1.666e-10*result$size_product + 1.172e-2)) > 5e-2)
x$prev_id = c(0, x$id[1:(length(x$id)-1)])
x$id_diff = x$id - x$prev_id
x
      -
      -
      - -
      -        time size_product lead_product ratio    m    n    k lead_A lead_B
      -37  0.674633   3602686437   3602686437     1 1533 1533 1533   1533   1533
      -38  0.409866   2053225511   2053225511     1 1271 1271 1271   1271   1271
      -207 1.295097   7055792632   7055792632     1 1918 1918 1918   1918   1918
      -208 0.807207   3395290527   3395290527     1 1503 1503 1503   1503   1503
      -381 1.079795   5535839609   5535839609     1 1769 1769 1769   1769   1769
      -558 0.453775   1869959168   1869959168     1 1232 1232 1232   1232   1232
      -657 0.917557   4699421875   4699421875     1 1675 1675 1675   1675   1675
      -748 1.233466   6414120712   6414120712     1 1858 1858 1858   1858   1858
      -753 0.708934   3884701248   3884701248     1 1572 1572 1572   1572   1572
      -914 1.337868   7166730752   7166730752     1 1928 1928 1928   1928   1928
      -    lead_C  id prev_id id_diff
      -37    1533  37       0      37
      -38    1271  38      37       1
      -207   1918 207      38     169
      -208   1503 208     207       1
      -381   1769 381     208     173
      -558   1232 558     381     177
      -657   1675 657     558      99
      -748   1858 748     657      91
      -753   1572 753     748       5
      -914   1928 914     753     161
      -
      - -

We see here that the differences between the ids do not seem to be uniformly random: some of them are small (1, 5), others are large (161, 169, 173, 177), and others in between (37, 91, 99).

    • -
    • This pattern has been reproduced by running 1000 experiments with a size of 1503. Among the results, 26 of them are larger than 0.7 (mean of 0.6024, standard deviation of 0.0249, min of 0.5811, max of 0.8363). Here is the list of the differences between the indices of these elements; the list has been sorted (a sketch of this computation is given after the list):

      -
      -[1, 1, 1, 1, 1, 1, 2, 4, 4, 5, 7, 7, 10, 15, 20, 25, 28, 32, 42, 42, 43, 53, 108, 200, 201]
      -
      -

      -A lot of them are small or medium, and two are much larger. -

    • -
    -
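For reference, a minimal Python sketch of how such a sorted list of index differences can be computed; the input file name and its one-time-per-line format are assumptions:

    with open('/tmp/times_1503.txt') as f:
        times = [float(line) for line in f]
    # Indices of the runs considered as outliers (time above 0.7 seconds)
    outliers = [i for i, t in enumerate(times) if t > 0.7]
    # Sorted differences between consecutive outlier indices
    print(sorted(b - a for a, b in zip(outliers, outliers[1:])))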
    -
  2. -
  3. Time prediction in HPL   C PYTHON R EXPERIMENTS PERFORMANCE HPL
    -
    -
      -
    • Let’s try to predict the time that will be spent in HPL_dgemm, and compare it with the real time. The aim is eventually to have a cheap SimBLAS: replacing calls to the function with a sleep of the predicted duration. We have this printf before the calls to HPL_dgemm:

      -
      -
      printf("line=%d rank=%d m=%d n=%d k=%d lead_A=%d lead_B=%d lead_C=%d expected_time=%f\n",
      -        __LINE__+3, rank, mp, nn, jb, ldl2, LDU, lda, expected_time);
      -
      -
      -

We do as before: we run HPL with P=Q=4 and N=20000. The trace is dumped in /tmp/trace and stdout is redirected to /tmp/output.

    • -
    • Processing of the output:
    • -
    -
    -
    import re
    -import csv
    -reg = re.compile('line=([0-9]+) rank=([0-9]+) m=([0-9]+) n=([0-9]+) k=([0-9]+) lead_A=([0-9]+) lead_B=([0-9]+) lead_C=([0-9]+) expected_time=(-?[0-9]+.[0-9]+)')
    -
    -def process(in_file, out_file):
    -    with open(in_file, 'r') as in_f:
    -        with open(out_file, 'w') as out_f:
    -            csv_writer = csv.writer(out_f)
    -            csv_writer.writerow(('line', 'rank', 'n', 'm', 'k', 'lead_A', 'lead_B', 'lead_C', 'expected_time'))
    -            for line in in_f:
    -                match = reg.match(line)
    -                if match is not None:
    -                    csv_writer.writerow(tuple(match.group(i) for i in range(1,10)))
    -process('/tmp/output', '/tmp/sizes.csv')
    -
    -
    -
      -
    • -We process the trace as before, we get a dataframe durations. -

      -
      -
      head(durations)
      -
      -
      - -
      -    rank     start      end duration     state startline                           startfile endline                             endfile idx
      -481    0  3.480994  6.54468 3.063686 Computing       388 /hpl-2.2/src/pgesv/hpl_pdupdatett.c     388 /hpl-2.2/src/pgesv/hpl_pdupdatett.c 481
      -486    0  7.225255 10.24889 3.023633 Computing       413 /hpl-2.2/src/pgesv/hpl_pdupdatett.c     413 /hpl-2.2/src/pgesv/hpl_pdupdatett.c 486
      -491    0 10.803780 13.82799 3.024215 Computing       413 /hpl-2.2/src/pgesv/hpl_pdupdatett.c     413 /hpl-2.2/src/pgesv/hpl_pdupdatett.c 491
      -496    0 14.230774 17.26467 3.033897 Computing       413 /hpl-2.2/src/pgesv/hpl_pdupdatett.c     413 /hpl-2.2/src/pgesv/hpl_pdupdatett.c 496
      -977    0 17.676746 20.58197 2.905229 Computing       388 /hpl-2.2/src/pgesv/hpl_pdupdatett.c     388 /hpl-2.2/src/pgesv/hpl_pdupdatett.c 977
      -982    0 21.258337 24.16961 2.911277 Computing       413 /hpl-2.2/src/pgesv/hpl_pdupdatett.c     413 /hpl-2.2/src/pgesv/hpl_pdupdatett.c 982
      -
      - - -
      -
      sizes <- read.csv("/tmp/sizes.csv");
      -head(sizes)
      -
      -
      - -
      -  line rank    n    m   k lead_A lead_B lead_C expected_time
      -1  413    8 5000 4920 120   5000   4920   5000      3.132548
      -2  413   12 4920 4920 120   4920   4920   4920      3.082388
      -3  413    4 5040 4920 120   5040   4920   5040      3.157628
      -4  388    0 4920 4920 120   5040   4920   5040      3.082388
      -5  413    5 5040 5040 120   5040   5040   5040      3.234704
      -6  413    9 5000 5040 120   5000   5040   5000      3.209012
      -
      - - -
      -
      insert_sizes = function(durations, sizes) {
      -    stopifnot(nrow(durations)==nrow(sizes))
      -    ndf = data.frame();
      -    for(i in (sort(unique(durations$rank)))) {
      -	tmp_dur = durations[durations$rank == i,]
      -	tmp_sizes = sizes[sizes$rank == i,]
      -	stopifnot(nrow(tmp_dur) == nrow(tmp_sizes))
      -	stopifnot(tmp_dur$startline == tmp_sizes$line)
      -	storage.mode(tmp_sizes$m) <- "double" # avoiding integer overflow when taking the product
      -	storage.mode(tmp_sizes$n) <- "double"
      -	storage.mode(tmp_sizes$k) <- "double"
      -	storage.mode(tmp_sizes$lead_A) <- "double"
      -	storage.mode(tmp_sizes$lead_B) <- "double"
      -	storage.mode(tmp_sizes$lead_C) <- "double"
      -	tmp_dur$m = tmp_sizes$m
      -	tmp_dur$n = tmp_sizes$n
      -	tmp_dur$k = tmp_sizes$k
      -	tmp_dur$lead_A = tmp_sizes$lead_A
      -	tmp_dur$lead_B = tmp_sizes$lead_B
      -	tmp_dur$lead_C = tmp_sizes$lead_C
      -	tmp_dur$lead_product = tmp_sizes$lead_A * tmp_sizes$lead_B * tmp_sizes$lead_C
      -	tmp_dur$size_product = tmp_sizes$m * tmp_sizes$n * tmp_sizes$k
      -	tmp_dur$ratio = tmp_dur$lead_product/tmp_dur$size_product
      -	tmp_dur$expected_time = tmp_sizes$expected_time
      -	tmp_dur$absolute_time_diff = tmp_dur$expected_time - tmp_dur$duration
      -	tmp_dur$relative_time_diff = (tmp_dur$expected_time - tmp_dur$duration)/tmp_dur$expected_time
      -	ndf = rbind(ndf, tmp_dur)
      -    }
      -    return(ndf);
      -}
      -
      -
      - -
      -
      result = insert_sizes(durations, sizes)
      -head(result)
      -
      -
      - -
      -    rank     start      end duration     state startline                           startfile endline                             endfile idx    m    n   k lead_A lead_B lead_C lead_product
      -481    0  3.480994  6.54468 3.063686 Computing       388 /hpl-2.2/src/pgesv/hpl_pdupdatett.c     388 /hpl-2.2/src/pgesv/hpl_pdupdatett.c 481 4920 4920 120   5040   4920   5040 124975872000
      -486    0  7.225255 10.24889 3.023633 Computing       413 /hpl-2.2/src/pgesv/hpl_pdupdatett.c     413 /hpl-2.2/src/pgesv/hpl_pdupdatett.c 486 4920 4920 120   4920   4920   5040 122000256000
      -491    0 10.803780 13.82799 3.024215 Computing       413 /hpl-2.2/src/pgesv/hpl_pdupdatett.c     413 /hpl-2.2/src/pgesv/hpl_pdupdatett.c 491 4920 4920 120   4920   4920   5040 122000256000
      -496    0 14.230774 17.26467 3.033897 Computing       413 /hpl-2.2/src/pgesv/hpl_pdupdatett.c     413 /hpl-2.2/src/pgesv/hpl_pdupdatett.c 496 4920 4920 120   4920   4920   5040 122000256000
      -977    0 17.676746 20.58197 2.905229 Computing       388 /hpl-2.2/src/pgesv/hpl_pdupdatett.c     388 /hpl-2.2/src/pgesv/hpl_pdupdatett.c 977 4800 4800 120   5040   4800   5040 121927680000
      -982    0 21.258337 24.16961 2.911277 Computing       413 /hpl-2.2/src/pgesv/hpl_pdupdatett.c     413 /hpl-2.2/src/pgesv/hpl_pdupdatett.c 982 4800 4800 120   4800   4800   5040 116121600000
      -    size_product    ratio expected_time absolute_time_diff relative_time_diff
      -481   2904768000 43.02439      3.082388           0.018702        0.006067374
      -486   2904768000 42.00000      3.082388           0.058755        0.019061520
      -491   2904768000 42.00000      3.082388           0.058173        0.018872705
      -496   2904768000 42.00000      3.082388           0.048491        0.015731634
      -977   2764800000 44.10000      2.933742           0.028513        0.009718987
      -982   2764800000 42.00000      2.933742           0.022465        0.007657456
      -
    • -
    - -
    -
    ggplot(result, aes(x=idx, y=absolute_time_diff, color=factor(rank))) +
    -    geom_point(shape=1) + ggtitle("Absolute difference between the expected time and the real time")
    -
    -
    - - -
    -

[Figure: trace7_16.png]

    -
    - -
    -
    ggplot(result, aes(x=start, y=absolute_time_diff, color=factor(rank))) +
    -    geom_point(shape=1) + ggtitle("Absolute difference between the expected time and the real time")
    -
    -
    - - -
    -

[Figure: trace8_16.png]

    -
    - -
    -
    ggplot(result, aes(x=start, y=relative_time_diff, color=factor(rank))) +
    -    geom_point(shape=1) + ggtitle("Relative difference between the expected time and the real time")
    -
    -
    - - -
    -

[Figure: trace9_16.png]

    -
    - -
    -
    ggplot(result[result$start < 200,], aes(x=start, y=relative_time_diff, color=factor(rank))) +
    -    geom_point(shape=1) + ggtitle("Relative difference between the expected time and the real time\n“Large enough” matrices")
    -
    -
    - - -
    -

[Figure: trace10_16.png]

    -
    - -
    -
    for(i in (sort(unique(result$rank)))) {
    -    print(sum(result[result$rank == i,]$absolute_time_diff))
    -}
    -
    -
    - -
    -[1] 1.494745
    -[1] 1.343339
    -[1] -2.940891
    -[1] -1.11672
    -[1] 0.466087
    -[1] 1.90049
    -[1] -3.441326
    -[1] -1.564635
    -[1] -2.708597
    -[1] -1.647053
    -[1] 0.027765
    -[1] -4.653833
    -[1] 2.878523
    -[1] 3.572304
    -[1] 1.124928
    -[1] 3.749203
    -
    - -
      -
    • We can see several things. -
        -
      • There are very large differences between the ranks. We could already see it in the first plots (duration vs size_product), but it is even more obvious here. We should find out why.
      • There are some outliers that may have a very significant impact on the aggregated difference between prediction and reality.
      • The predictive ability of this approach is better than SMPI_SAMPLE, but still far from perfect.
    • -
    -
    -
  4. -
  5. Let’s try a cheap SimBLAS   SMPI C PERFORMANCE HPL
    -
    -
      -
    • -We can replace the call to HPL_dgemm by the following: -

      -
      -
double expected_time = (1.062e-09)*(double)mp*(double)nn*(double)jb - 2.476e-03;
if(expected_time > 0)
    smpi_usleep((useconds_t)(expected_time*1e6));
      -
      -
    • -
    • First test: it works pretty well. We get roughly the same results as with the true call to HPL_dgemm: 2.329e+01 Gflops, against 2.332e+01, 2.305e+01 and 2.315e+01 Gflops. The simulation time is much shorter: about 46 seconds, against about 495 seconds (8 minutes and 15 seconds). Note that with or without a real call to HPL_dgemm, the time spent outside of the application is much lower, between 6 and 8 seconds. Thus, there is room for further optimizations.
    -
    -
  6. -
  7. Tracking the other expensive BLAS functions   PERFORMANCE HPL
    -
    -
      -
    • In the file hpl_blas.h, several functions are defined like HPL_dgemm, with a #define aliasing them to the real cblas function.
    • We can try to replace them by a no-op, to see if it changes the simulation time significantly.
    • -
    • The following table sums up the (very approximate) gain we get on simulation time if we remove each of the functions. We use the same parameters as above for HPL.

      | Function   | time (s) |
      |------------|----------|
      | HPL_dswap  | 0.5      |
      | HPL_dcopy  | N/A      |
      | HPL_daxpy  | 0        |
      | HPL_dscal  | N/A      |
      | HPL_idamax | N/A      |
      | HPL_dgemv  | 1        |
      | HPL_dtrsv  | 0        |
      | HPL_dger   | 0.5      |
      | HPL_dtrsm  | 10       |
        -
      • The function HPL_idamax cannot be removed, since it returns an integer used to index an array.
      • -
      • -The functions HPL_dscal and HPL_dcopy cannot be removed either, since removing them causes the following error: -

        -
        -/home/tom/simgrid/src/simix/smx_global.cpp:557: [simix_kernel/CRITICAL] Oops ! Deadlock or code not perfectly clean.
        -
      • -
    • -
    • It is clear that we should now focus on HPL_dtrsm. This function solves a triangular system of equations.
    • -
    • It is also clear that the time spent in the application is not entirely spent in the BLAS functions; we should look for something else.
    -
    -
  8. -
  9. Forgot a call to HPL_dgemm   PERFORMANCE HPL
    -
    -
      -
    • I found out that I forgot a place where HPL_dgemm was used.
    • -
    • If we remove all additional occurrences of HPL_dgemm, we gain 6 seconds (in addition to the large gain we already had).
    • I thought that it was used only in HPL_pdupdateTT, but it appears that it is also used in HPL_pdrpanllT.
    • The call to HPL_dgemm was correctly traced. But I filtered the results in the R script and kept only the ones of HPL_pdupdateTT.
    • -
    • The printf function with the parameters was only present in HPL_pdupdateTT.
    • -
    • Consequently, all the visualizations and linear regressions were done with missing data. We should redo them to check -if this changes anything.
    • -
    -
    -
  10. -
  11. Looking at HPL_dtrsm   PERFORMANCE HPL
    -
    -
      -
    • This function is used in a lot of functions: HPL_pdrpan*** and HPL_pdupdate** (each has several variants).
    • -
    • By aliasing this function to printf("%s\n", __FILE__) and filtering the output with awk '!a[$0]++' (remove duplicates), we know that, in our settings, HPL_dtrsm is only used in HPL_pdrpanllT and HPL_pdupdateTT. By sorting with sort and then counting duplicates with uniq -dc, we know that HPL_pdrpanllT (resp. HPL_pdupdateTT) calls our function 78664 times (resp. 2636 times). A Python equivalent of this counting is sketched after this list.
    -
    -
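A hedged Python equivalent of this awk/sort/uniq counting, assuming stdout was redirected to /tmp/output and that each line printed by the aliased function is a path containing /hpl-2.2/:

    from collections import Counter

    with open('/tmp/output') as f:
        counts = Counter(line.strip() for line in f if '/hpl-2.2/' in line)
    for path, count in counts.most_common():
        print(count, path)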
  12. -
-
-
-

1.2.7 2017-03-09 Thursday

-
-
-
    -
  1. Fix HPL_dgemm trace   C TRACING HPL
    -
    -
      -
    • In the old version, the calls to MPI_Wait were done in the #define, so we were sure that every call to HPL_dgemm was traced by Simgrid. However, the printf for the parameters had to be added by hand before every call to HPL_dgemm, which is why I missed some of them.
    • Now, the printf is also done in the #define. Because we need the arguments given to HPL_dgemm here, we can no longer use variadic arguments: we have to spell out all the parameters.
    • -The code is now as follows: -

      -
      -
      #define  HPL_dgemm(layout, TransA, TransB, M, N, K, alpha, A, lda, B, ldb, beta, C, ldc)  ({\
      -    int my_rank, buff=0;\
      -    MPI_Request request;\
      -    MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\
      -    double expected_time = (1.062e-09)*(double)M*(double)N*(double)K - 2.476e-03;\
      -    printf("file=%s line=%d rank=%d m=%d n=%d k=%d lead_A=%d lead_B=%d lead_C=%d expected_time=%f\n", __FILE__, __LINE__+3, my_rank, M, N, K, lda, ldb, ldc, expected_time);\
      -    MPI_Isend(&buff, 1, MPI_INT, my_rank, 0, MPI_COMM_WORLD, &request);\
      -    MPI_Recv(&buff, 1, MPI_INT, my_rank, 0, MPI_COMM_WORLD, NULL);\
      -    MPI_Wait(&request, MPI_STATUS_IGNORE);\
      -    cblas_dgemm(layout, TransA, TransB, M, N, K, alpha, A, lda, B, ldb, beta, C, ldc);\
      -    MPI_Isend(&buff, 1, MPI_INT, my_rank, 0, MPI_COMM_WORLD, &request);\
      -    MPI_Recv(&buff, 1, MPI_INT, my_rank, 0, MPI_COMM_WORLD, NULL);\
      -    MPI_Wait(&request, MPI_STATUS_IGNORE);\
      -})
      -
      -
    • -
    -
    -
  2. -
  3. Attempted linear regression of HPL_dgemm: failed, there is a bug somewhere   PYTHON R EXPERIMENTS PERFORMANCE BUG
    -
    -
      -
    • In the other linear regressions, some calls to HPL_dgemm were missing. Thus, the analysis needs to be done again, just to check if it changes anything.
    • I tried to run roughly the same process as above, but failed; there seems to be a bug somewhere.
    • Every piece of code is written here. The trace and the output have been obtained with N=5000 and P=Q=4.
    -

    -Clean the file: -

    -
    -
    pj_dump --user-defined --ignore-incomplete-links /tmp/trace > /tmp/trace.csv
    -grep "State," /tmp/trace.csv | grep MPI_Wait | sed -e 's/()//' -e 's/MPI_STATE, //ig'  -e 's/State, //ig' -e 's/rank-//' -e\
    -'s/PMPI_/MPI_/' | grep MPI_  | tr 'A-Z' 'a-z' > /tmp/trace_processed.csv
    -
    -
    - -

    -Clean the paths: -

    -
    -
    import re
    -reg = re.compile('((?:[^/])*)(?:/[a-zA-Z0-9_-]*)*((?:/hpl-2.2(?:/[a-zA-Z0-9_-]*)*).*)')
    -def process(in_file, out_file):
    -    with open(in_file, 'r') as in_f:
    -        with open(out_file, 'w') as out_f:
    -            for line in in_f:
    -                match = reg.match(line)
    -                out_f.write('%s%s\n' % (match.group(1), match.group(2)))
    -process('/tmp/trace_processed.csv', '/tmp/trace_cleaned.csv')
    -
    -
    - -
    -
    df <- read.csv("/tmp/trace_cleaned.csv", header=F, strip.white=T, sep=",");
    -names(df) = c("rank", "start", "end", "duration", "level", "state", "Filename", "Linenumber");
    -head(df)
    -
    -
    - -
    -  rank    start      end duration level    state
    -1    8 0.207257 0.207257        0     0 mpi_wait
    -2    8 0.207275 0.207275        0     0 mpi_wait
    -3    8 0.207289 0.207289        0     0 mpi_wait
    -4    8 0.207289 0.207289        0     0 mpi_wait
    -5    8 0.207309 0.207309        0     0 mpi_wait
    -6    8 0.207309 0.207309        0     0 mpi_wait
    -                            Filename Linenumber
    -1 /hpl-2.2/src/pfact/hpl_pdrpanllt.c        222
    -2 /hpl-2.2/src/pfact/hpl_pdrpanllt.c        222
    -3 /hpl-2.2/src/pfact/hpl_pdrpanllt.c        222
    -4 /hpl-2.2/src/pfact/hpl_pdrpanllt.c        222
    -5 /hpl-2.2/src/pfact/hpl_pdrpanllt.c        222
    -6 /hpl-2.2/src/pfact/hpl_pdrpanllt.c        222
    -
    - -
    -
    duration_compute = function(df) {
    -    ndf = data.frame();
    -    df = df[with(df,order(rank,start)),];
    -    #origin = unique(df$origin)
    -    for(i in (sort(unique(df$rank)))) {
    -	start     = df[df$rank==i,]$start;
    -	end       = df[df$rank==i,]$end;
    -	l         = length(end);
    -	end       = c(0,end[1:(l-1)]); # Computation starts at time 0
    -
    -	startline = c(0, df[df$rank==i,]$Linenumber[1:(l-1)]);
    -	startfile = c("", as.character(df[df$rank==i,]$Filename[1:(l-1)]));
    -	endline   = df[df$rank==i,]$Linenumber;
    -	endfile   = df[df$rank==i,]$Filename;
    -
    -	ndf       = rbind(ndf, data.frame(rank=i, start=end, end=start,
    -	    duration=start-end, state="Computing",
    -	    startline=startline, startfile=startfile, endline=endline,
    -	    endfile=endfile));
    -    }
    -    ndf$idx = 1:length(ndf$duration)
    -    ndf;
    -}
    -durations = duration_compute(df);
    -durations = durations[as.character(durations$startfile) == as.character(durations$endfile) &
    -    durations$startline == durations$endline,]
    -
    -
    - -
    -
    head(durations)
    -
    -
    - -
    -  rank    start      end duration     state startline
    -2    0 0.207097 0.207149  5.2e-05 Computing       222
    -3    0 0.207149 0.207179  3.0e-05 Computing       222
    -4    0 0.207179 0.207179  0.0e+00 Computing       222
    -5    0 0.207179 0.207194  1.5e-05 Computing       222
    -6    0 0.207194 0.207194  0.0e+00 Computing       222
    -7    0 0.207194 0.207207  1.3e-05 Computing       222
    -                           startfile endline                            endfile
    -2 /hpl-2.2/src/pfact/hpl_pdrpanllt.c     222 /hpl-2.2/src/pfact/hpl_pdrpanllt.c
    -3 /hpl-2.2/src/pfact/hpl_pdrpanllt.c     222 /hpl-2.2/src/pfact/hpl_pdrpanllt.c
    -4 /hpl-2.2/src/pfact/hpl_pdrpanllt.c     222 /hpl-2.2/src/pfact/hpl_pdrpanllt.c
    -5 /hpl-2.2/src/pfact/hpl_pdrpanllt.c     222 /hpl-2.2/src/pfact/hpl_pdrpanllt.c
    -6 /hpl-2.2/src/pfact/hpl_pdrpanllt.c     222 /hpl-2.2/src/pfact/hpl_pdrpanllt.c
    -7 /hpl-2.2/src/pfact/hpl_pdrpanllt.c     222 /hpl-2.2/src/pfact/hpl_pdrpanllt.c
    -  idx
    -2   2
    -3   3
    -4   4
    -5   5
    -6   6
    -7   7
    -
    - -
    -
    unique(durations[c("startfile", "startline")])
    -
    -
    - -
    -                              startfile startline
    -2    /hpl-2.2/src/pfact/hpl_pdrpanllt.c       222
    -14         /hpl-2.2/src/comm/hpl_sdrv.c       191
    -478      /hpl-2.2/src/pgesv/hpl_rollt.c       242
    -481 /hpl-2.2/src/pgesv/hpl_pdupdatett.c       384
    -486 /hpl-2.2/src/pgesv/hpl_pdupdatett.c       407
    -
    - - -

We need to check each of these to see whether it is indeed a call to HPL_dgemm, or something else. It appears that HPL_rollT and HPL_sdrv are not calling HPL_dgemm; they are just calling MPI_Wait. Thus, we have to remove them.

    - -
    -
    durations = durations[durations$startfile != "/hpl-2.2/src/comm/hpl_sdrv.c" & durations$startfile != "/hpl-2.2/src/pgesv/hpl_rollt.c",]
    -unique(durations[c("startfile", "startline")])
    -
    -
    - - -
    -                              startfile startline
    -2    /hpl-2.2/src/pfact/hpl_pdrpanllt.c       222
    -481 /hpl-2.2/src/pgesv/hpl_pdupdatett.c       384
    -486 /hpl-2.2/src/pgesv/hpl_pdupdatett.c       407
    -
    - - -

    -Now, let us get what was output by the printf. -

    - -

    -Processing the output: -

    -
    -
    import re
    -import csv
    -reg = re.compile('file=([a-zA-Z0-9/_.-]+) line=([0-9]+) rank=([0-9]+) m=([0-9]+) n=([0-9]+) k=([0-9]+) lead_A=([0-9]+) lead_B=([0-9]+) lead_C=([0-9]+) expected_time=(-?[0-9]+.[0-9]+)')
    -
    -def process(in_file, out_file):
    -    with open(in_file, 'r') as in_f:
    -        with open(out_file, 'w') as out_f:
    -            csv_writer = csv.writer(out_f)
    -            csv_writer.writerow(('file', 'line', 'rank', 'n', 'm', 'k', 'lead_A', 'lead_B', 'lead_C', 'expected_time'))
    -            for line in in_f:
    -                match = reg.match(line)
    -                if match is not None:
    -                    result = list(match.group(i) for i in range(1, 11))
    -                    result[0] = result[0][result[0].index('/hpl'):].lower()
    -                    csv_writer.writerow(result)
    -process('/tmp/output', '/tmp/parameters.csv')
    -
    -
    - -
    -
    parameters <- read.csv("/tmp/parameters.csv");
    -head(parameters)
    -
    -
    - -
    -                                file line rank    n  m k lead_A lead_B lead_C
    -1 /hpl-2.2/src/pfact/hpl_pdrpanllt.c  222    0 1320 60 0   1320    120   1320
    -2 /hpl-2.2/src/pfact/hpl_pdrpanllt.c  222    8 1200 60 0   1200    120   1200
    -3 /hpl-2.2/src/pfact/hpl_pdrpanllt.c  222    0 1320 30 0   1320    120   1320
    -4 /hpl-2.2/src/pfact/hpl_pdrpanllt.c  222    4 1280 60 0   1280    120   1280
    -5 /hpl-2.2/src/pfact/hpl_pdrpanllt.c  222    0 1320 16 0   1320    120   1320
    -6 /hpl-2.2/src/pfact/hpl_pdrpanllt.c  222    0 1320  8 0   1320    120   1320
    -  expected_time
    -1     -0.002476
    -2     -0.002476
    -3     -0.002476
    -4     -0.002476
    -5     -0.002476
    -6     -0.002476
    -
    - -

A first remark: we see that some rows have k=0, which is a bit surprising. I double-checked by adding some printf calls in the files; this is not a bug. It only happens in HPL_pdrpanllT, so it went unnoticed until now.

    - -
    -
    nrow(parameters)
    -nrow(durations)
    -nrow(parameters[parameters$file == "/hpl-2.2/src/pfact/hpl_pdrpanllt.c",])
    -nrow(durations[durations$startfile == "/hpl-2.2/src/pfact/hpl_pdrpanllt.c",])
    -
    -
    - -
    -[1] 20300
    -[1] 29964
    -[1] 19664
    -[1] 29328
    -
    - - -
      -
    • There is obviously something wrong. We should have a one-to-one correspondence between the elements of the parameters dataframe and the elements of the durations dataframe. Note that the excess matches exactly (29964 - 20300 = 9664 = 29328 - 19664), so the mismatch comes entirely from hpl_pdrpanllt.c. It seems that SMPI has produced additional entries in the trace, or that some of the printf calls I added disappeared. A sketch of a finer-grained per-rank comparison is given after this list.
    • This is not an error in parsing the output (e.g. some lines not parsed because of a wrong format/regexp): the output file has 20359 lines, far fewer than the 29964 durations.
    • Tried putting a printf("blabla\n") just before HPL_dgemm in the file HPL_pdrpanllT.c and counted the number of times it appeared: exactly the same number, so it is definitely not an issue with the parsing or with the #define.
    • Checked the durations dataframe. Nothing apparently wrong: all the entries for this file are at the same line, so I did not miss a hidden MPI_Wait somewhere else in this same file.
    -
    -
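A minimal sketch of such a finer-grained comparison on the parameters side, counting the printed entries per rank for the suspicious file (the matching per-rank counts on the durations side can be obtained in R with table(durations$rank)):

    import csv
    from collections import Counter

    with open('/tmp/parameters.csv') as f:
        rows = list(csv.DictReader(f))
    counts = Counter(r['rank'] for r in rows
                     if r['file'] == '/hpl-2.2/src/pfact/hpl_pdrpanllt.c')
    for rank in sorted(counts, key=int):
        print(rank, counts[rank])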
  4. -
  5. Using another way to measure durations   C PYTHON R EXPERIMENTS TRACING PERFORMANCE HPL
    -
    -
      -
    • Let’s use something other than the SMPI trace to measure durations: we will measure the time directly in the code. But first we need to check that this new measurement is consistent with what we got from the traces.
    • Now, HPL_dgemm is defined as:
    • -
    -
    -
    #define  HPL_dgemm(layout, TransA, TransB, M, N, K, alpha, A, lda, B, ldb, beta, C, ldc)  ({\
    -  int my_rank, buff=0;\
    -  MPI_Request request;\
    -  MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\
    -  double expected_time = (1.062e-09)*(double)M*(double)N*(double)K - 2.476e-03;\
    -  struct timeval before = {};\
    -  struct timeval after = {};\
    -  gettimeofday(&before, NULL);\
    -  MPI_Isend(&buff, 1, MPI_INT, my_rank, 0, MPI_COMM_WORLD, &request);\
    -  MPI_Recv(&buff, 1, MPI_INT, my_rank, 0, MPI_COMM_WORLD, NULL);\
    -  MPI_Wait(&request, MPI_STATUS_IGNORE);\
    -  cblas_dgemm(layout, TransA, TransB, M, N, K, alpha, A, lda, B, ldb, beta, C, ldc);\
    -  gettimeofday(&after, NULL);\
    -  MPI_Isend(&buff, 1, MPI_INT, my_rank, 0, MPI_COMM_WORLD, &request);\
    -  MPI_Recv(&buff, 1, MPI_INT, my_rank, 0, MPI_COMM_WORLD, NULL);\
    -  MPI_Wait(&request, MPI_STATUS_IGNORE);\
    -  double time_before = (double)(before.tv_sec) + (double)(before.tv_usec)*1e-6;\
    -  double time_after = (double)(after.tv_sec) + (double)(after.tv_usec)*1e-6;\
    -  double real_time = time_after-time_before;\
    -  printf("file=%s line=%d rank=%d m=%d n=%d k=%d lead_A=%d lead_B=%d lead_C=%d real_time=%f expected_time=%f\n", __FILE__, __LINE__, my_rank, M, N, K, lda, ldb, ldc, real_time, expected_time);\
    -})
    -
    -
    -
      -
    • We run the same code as above to get the durations dataframe.
    -
    -
    head(durations)
    -
    -
    - -
    -  rank    start      end duration     state startline
    -2    0 0.275856 0.275896  4.0e-05 Computing       224
    -3    0 0.275896 0.275929  3.3e-05 Computing       224
    -4    0 0.275929 0.275929  0.0e+00 Computing       224
    -5    0 0.275929 0.275948  1.9e-05 Computing       224
    -6    0 0.275948 0.275948  0.0e+00 Computing       224
    -7    0 0.275948 0.275965  1.7e-05 Computing       224
    -                           startfile endline                            endfile
    -2 /hpl-2.2/src/pfact/hpl_pdrpanllt.c     224 /hpl-2.2/src/pfact/hpl_pdrpanllt.c
    -3 /hpl-2.2/src/pfact/hpl_pdrpanllt.c     224 /hpl-2.2/src/pfact/hpl_pdrpanllt.c
    -4 /hpl-2.2/src/pfact/hpl_pdrpanllt.c     224 /hpl-2.2/src/pfact/hpl_pdrpanllt.c
    -5 /hpl-2.2/src/pfact/hpl_pdrpanllt.c     224 /hpl-2.2/src/pfact/hpl_pdrpanllt.c
    -6 /hpl-2.2/src/pfact/hpl_pdrpanllt.c     224 /hpl-2.2/src/pfact/hpl_pdrpanllt.c
    -7 /hpl-2.2/src/pfact/hpl_pdrpanllt.c     224 /hpl-2.2/src/pfact/hpl_pdrpanllt.c
    -  idx
    -2   2
    -3   3
    -4   4
    -5   5
    -6   6
    -7   7
    -
    - -

    -Now, we process the parameters: -

    -
    -
    import re
    -import csv
    -reg = re.compile('file=([a-zA-Z0-9/_.-]+) line=([0-9]+) rank=([0-9]+) m=([0-9]+) n=([0-9]+) k=([0-9]+) lead_A=([0-9]+) lead_B=([0-9]+) lead_C=([0-9]+) real_time=(-?[0-9]+.[0-9]+) expected_time=(-?[0-9]+.[0-9]+)')
    -
    -def process(in_file, out_file):
    -    with open(in_file, 'r') as in_f:
    -        with open(out_file, 'w') as out_f:
    -            csv_writer = csv.writer(out_f)
    -            csv_writer.writerow(('file', 'line', 'rank', 'n', 'm', 'k', 'lead_A', 'lead_B', 'lead_C', 'real_time', 'expected_time'))
    -            for line in in_f:
    -                match = reg.match(line)
    -                if match is not None:
    -                    result = list(match.group(i) for i in range(1, 12))
    -                    result[0] = result[0][result[0].index('/hpl'):].lower()
    -                    csv_writer.writerow(result)
    -process('/tmp/output', '/tmp/parameters.csv')
    -
    -
    - -
    -
    parameters <- read.csv("/tmp/parameters.csv");
    -head(parameters)
    -
    -
    - -
    -                                file line rank    n  m k lead_A lead_B lead_C
    -1 /hpl-2.2/src/pfact/hpl_pdrpanllt.c  224    0 1320 60 0   1320    120   1320
    -2 /hpl-2.2/src/pfact/hpl_pdrpanllt.c  224    0 1320 30 0   1320    120   1320
    -3 /hpl-2.2/src/pfact/hpl_pdrpanllt.c  224    0 1320 16 0   1320    120   1320
    -4 /hpl-2.2/src/pfact/hpl_pdrpanllt.c  224    0 1320  8 0   1320    120   1320
    -5 /hpl-2.2/src/pfact/hpl_pdrpanllt.c  224    0 1320  4 0   1320    120   1320
    -6 /hpl-2.2/src/pfact/hpl_pdrpanllt.c  224    0 1320  2 0   1320    120   1320
    -  real_time expected_time
    -1   8.1e-05     -0.002476
    -2   0.0e+00     -0.002476
    -3   0.0e+00     -0.002476
    -4   0.0e+00     -0.002476
    -5   1.0e-06     -0.002476
    -6   0.0e+00     -0.002476
    -
    - -

We merge the durations and parameters dataframes, but only the entries for the file hpl_pdupdatett.c (we cannot do it for the other file since we have a mismatch).

    -
    -
    insert_sizes = function(durations, sizes) {
    -    stopifnot(nrow(durations)==nrow(sizes))
    -    ndf = data.frame();
    -    for(i in (sort(unique(durations$rank)))) {
    -	tmp_dur = durations[durations$rank == i,]
    -	tmp_sizes = sizes[sizes$rank == i,]
    -	stopifnot(nrow(tmp_dur) == nrow(tmp_sizes))
    -	stopifnot(tmp_dur$startline == tmp_sizes$line)
    -	storage.mode(tmp_sizes$m) <- "double" # avoiding integer overflow when taking the product
    -	storage.mode(tmp_sizes$n) <- "double"
    -	storage.mode(tmp_sizes$k) <- "double"
    -	storage.mode(tmp_sizes$lead_A) <- "double"
    -	storage.mode(tmp_sizes$lead_B) <- "double"
    -	storage.mode(tmp_sizes$lead_C) <- "double"
    -	tmp_dur$m = tmp_sizes$m
    -	tmp_dur$n = tmp_sizes$n
    -	tmp_dur$k = tmp_sizes$k
    -	tmp_dur$lead_A = tmp_sizes$lead_A
    -	tmp_dur$lead_B = tmp_sizes$lead_B
    -	tmp_dur$lead_C = tmp_sizes$lead_C
    -	tmp_dur$lead_product = tmp_sizes$lead_A * tmp_sizes$lead_B * tmp_sizes$lead_C
    -	tmp_dur$size_product = tmp_sizes$m * tmp_sizes$n * tmp_sizes$k
    -	tmp_dur$ratio = tmp_dur$lead_product/tmp_dur$size_product
    -	tmp_dur$real_time = tmp_sizes$real_time
    -	tmp_dur$expected_time = tmp_sizes$expected_time
    -	tmp_dur$absolute_time_diff = tmp_dur$expected_time - tmp_dur$duration
    -	tmp_dur$relative_time_diff = (tmp_dur$expected_time - tmp_dur$duration)/tmp_dur$expected_time
    -	ndf = rbind(ndf, tmp_dur)
    -    }
    -    return(ndf);
    -}
    -
    -
    - -
    -
    result = insert_sizes(durations[durations$startfile == "/hpl-2.2/src/pgesv/hpl_pdupdatett.c",], parameters[parameters$file == "/hpl-2.2/src/pgesv/hpl_pdupdatett.c",])
    -
    -
    - -

    -Now we plot the time measured by SMPI traces against the time measured by gettimeofday. -

    - -
    -
    library(ggplot2)
    -ggplot(result, aes(x=duration, y=real_time)) +
    -    geom_point(shape=1) + ggtitle("Time measured by SMPI against time measured by gettimeofday")
    -
    -
    - - -
    -

    gettimeofday.png -

    -
    - -

    -Checking with a linear regression, just to be sure: -

    -
    -
    summary(lm(duration~real_time, data=result))
    -
    -
    - -
    -
    -Call:
    -lm(formula = duration ~ real_time, data = result)
    -
    -Residuals:
    -       Min         1Q     Median         3Q        Max 
    --4.917e-05 -4.088e-06  1.075e-06  5.261e-06  6.181e-05 
    -
    -Coefficients:
    -              Estimate Std. Error    t value Pr(>|t|)    
    -(Intercept) -2.617e-06  6.285e-07     -4.163 3.57e-05 ***
    -real_time    9.999e-01  7.058e-06 141678.252  < 2e-16 ***
    ----
    -Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
    -
    -Residual standard error: 1.034e-05 on 634 degrees of freedom
    -Multiple R-squared:      1,	Adjusted R-squared:      1 
    -F-statistic: 2.007e+10 on 1 and 634 DF,  p-value: < 2.2e-16
    -
    - -

    -It is not perfect, but it looks very good: the slope is essentially 1 and the intercept is negligible. So, let’s use this to measure time. -

    -
    -
  6. - -
  7. Now we can finally re-do the analysis of HPL_dgemm   R EXPERIMENTS PERFORMANCE HPL
    -
    -
      -
    • There are fewer things to do, since all the data come from the output file.
    • -
    • Recall the aim of doing this again: in the previous analysis, some calls to HPL_dgemm were missing. Thus, it needs to be done again, just to check if it changes anything.
    • -
    • Generate the CSV file by running the same Python script as in the previous section (the output format did not change).
    • -
    • Then, analysis in R:
    • -
    -
    -
    results <- read.csv("/tmp/parameters.csv");
    -head(results)
    -
    -
    - -
    -                                file line rank    n  m k lead_A lead_B lead_C
    -1 /hpl-2.2/src/pfact/hpl_pdrpanllt.c  222    0 5040 60 0   5040    120   5040
    -2 /hpl-2.2/src/pfact/hpl_pdrpanllt.c  222    0 5040 30 0   5040    120   5040
    -3 /hpl-2.2/src/pfact/hpl_pdrpanllt.c  222    0 5040 16 0   5040    120   5040
    -4 /hpl-2.2/src/pfact/hpl_pdrpanllt.c  222    0 5040  8 0   5040    120   5040
    -5 /hpl-2.2/src/pfact/hpl_pdrpanllt.c  222    0 5040  4 0   5040    120   5040
    -6 /hpl-2.2/src/pfact/hpl_pdrpanllt.c  222    8 5000 60 0   5000    120   5000
    -  real_time expected_time
    -1   5.7e-05     -0.002476
    -2   7.0e-06     -0.002476
    -3   0.0e+00     -0.002476
    -4   0.0e+00     -0.002476
    -5   0.0e+00     -0.002476
    -6   9.0e-06     -0.002476
    -
    - -
    -
    process_results = function(results) {
    -    storage.mode(results$m) <- "double" # avoiding integer overflow when taking the product
    -    storage.mode(results$n) <- "double"
    -    storage.mode(results$k) <- "double"
    -    storage.mode(results$lead_A) <- "double"
    -    storage.mode(results$lead_B) <- "double"
    -    storage.mode(results$lead_C) <- "double"
    -    results$lead_product = results$lead_A * results$lead_B * results$lead_C
    -    results$size_product = results$m * results$n * results$k
    -    results$ratio = results$lead_product/results$size_product
    -    results$absolute_time_diff = results$expected_time - results$real_time
    -    results$relative_time_diff = (results$expected_time - results$real_time)/results$expected_time
    -    results$idx = 1:length(results$rank)
    -    return(results);
    -}
    -
    -
    - -
    -
    results = process_results(results)
    -head(results)
    -
    -
    - -
    -                                file line rank    n  m k lead_A lead_B lead_C
    -1 /hpl-2.2/src/pfact/hpl_pdrpanllt.c  222    0 5040 60 0   5040    120   5040
    -2 /hpl-2.2/src/pfact/hpl_pdrpanllt.c  222    0 5040 30 0   5040    120   5040
    -3 /hpl-2.2/src/pfact/hpl_pdrpanllt.c  222    0 5040 16 0   5040    120   5040
    -4 /hpl-2.2/src/pfact/hpl_pdrpanllt.c  222    0 5040  8 0   5040    120   5040
    -5 /hpl-2.2/src/pfact/hpl_pdrpanllt.c  222    0 5040  4 0   5040    120   5040
    -6 /hpl-2.2/src/pfact/hpl_pdrpanllt.c  222    8 5000 60 0   5000    120   5000
    -  real_time expected_time lead_product size_product ratio absolute_time_diff
    -1   5.7e-05     -0.002476   3048192000            0   Inf          -0.002533
    -2   7.0e-06     -0.002476   3048192000            0   Inf          -0.002483
    -3   0.0e+00     -0.002476   3048192000            0   Inf          -0.002476
    -4   0.0e+00     -0.002476   3048192000            0   Inf          -0.002476
    -5   0.0e+00     -0.002476   3048192000            0   Inf          -0.002476
    -6   9.0e-06     -0.002476   3000000000            0   Inf          -0.002485
    -  relative_time_diff idx
    -1           1.023021   1
    -2           1.002827   2
    -3           1.000000   3
    -4           1.000000   4
    -5           1.000000   5
    -6           1.003635   6
    -
    - -
    -
    library(ggplot2)
    -ggplot(results, aes(x=idx, y=real_time, color=factor(file))) +
    -    geom_point(shape=1) + ggtitle("Durations of HPL_dgemm")
    -
    -
    - - -
    -

    trace_gettimeofday1_16.png -

    -
    - - -

    -This is the plot of the duration of HPL_dgemm over time (analogous to the plot duration vs start that we had). The part for hpl_pdupdatett looks exactly as before. We see that the calls to HPL_dgemm in hpl_pdrpanllt are always very short. -

    - -
    -
    library(ggplot2)
    -ggplot(results, aes(x=size_product, y=real_time, color=factor(rank))) +
    -    geom_point(shape=1) + ggtitle("Durations of HPL_dgemm")
    -
    -
    - - -
    -

    trace_gettimeofday2_16.png -

    -
    - -

    -Unsurprisingly, we find exactly the same kind of plot as before, since all the new calls to HPL_dgemm are very short and thus hidden in the left part of the graph. -

    - - -
    -
    reg <- lm(duration~I(m*n*k), data=result)
    -summary(reg)
    -
    -
    - -
    -
    -Call:
    -lm(formula = duration ~ I(m * n * k), data = result)
    -
    -Residuals:
    -      Min        1Q    Median        3Q       Max 
    --0.004843 -0.001337 -0.000024  0.000280  0.055746 
    -
    -Coefficients:
    -              Estimate Std. Error t value Pr(>|t|)    
    -(Intercept)  2.393e-04  2.182e-04   1.097    0.273    
    -I(m * n * k) 1.064e-09  2.615e-12 406.932   <2e-16 ***
    ----
    -Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
    -
    -Residual standard error: 0.003594 on 634 degrees of freedom
    -Multiple R-squared:  0.9962,	Adjusted R-squared:  0.9962 
    -F-statistic: 1.656e+05 on 1 and 634 DF,  p-value: < 2.2e-16
    -
    - -
    -
    layout(matrix(c(1,2,3,4),2,2))
    -plot(reg)
    -
    -
    - - -
    -

    reg_gettimeofday_16.png -

    -
    - -

    -The summary of the linear regression shows that the coefficient of m*n*k barely changed. The intercept is very different, but its t-value is too low, so it is not meaningful. The residuals vs fitted plot looks better, with no more heteroscedasticity. My guess is that we added a lot of points with very low values, so their weight hides the problem. The QQ-plot still looks problematic. -

    -
    -
  8. -
  9. Replacing HPL_dgemm by smpi_usleep again   SMPI PERFORMANCE HPL
    -
    -
      -
    • As for the printf, we will put the smpi_usleep in the #define. We take the coefficients of the latest linear regression (a sketch of the resulting macro is given below).
    • -
    • Testing: we still get the same number of Gflops (about 23 Gflops) but the simulation runs in 41 seconds now.
    • -
    -
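    This entry does not reproduce the new macro. A minimal sketch of what it could look like, following the same pattern as the HPL_dtrsm macro further below, and assuming we keep only the significant coefficient of the regression above (the slope 1.064e-09 of m*n*k; the intercept had a too low t-value, so it is dropped here):

     #define HPL_dgemm(layout, TransA, TransB, M, N, K, alpha, A, lda, B, ldb, beta, C, ldc) ({\
         /* virtual delay predicted by the regression, instead of calling cblas_dgemm */\
         double expected_time = (1.064e-09)*(double)M*(double)N*(double)K;\
         if(expected_time > 0)\
             smpi_usleep((useconds_t)(expected_time*1e6));\
     })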
    -
  10. -
-
-
-

1.2.8 2017-03-10 Friday

-
-
-
    -
  1. Tracing HPL_dtrsm   C PYTHON R EXPERIMENTS TRACING PERFORMANCE
    -
    -
      -
    • -The goal is to do something similar for HPL_dtrsm. First, we will trace the parameters used to call it and its durations; then we will do a linear regression; finally, we will replace it by a smpi_usleep.
    • -
    • Recall that this function solves a triangular set of equations. It takes as input two m × n matrices. We expect the complexity to be O(m*n).
    • -
    • Replace the definition of HPL_dtrsm in hpl_blas.h by the following:
    • -
    -
    -
    #define HPL_dtrsm(layout, Side, Uplo, TransA, Diag, M, N, alpha, A, lda, B, ldb) ({\
    -    int my_rank, buff=0;\
    -    MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\
    -    struct timeval before = {};\
    -    struct timeval after = {};\
    -    gettimeofday(&before, NULL);\
    -    cblas_dtrsm(layout, Side, Uplo, TransA, Diag, M, N, alpha, A, lda, B, ldb);\
    -    gettimeofday(&after, NULL);\
    -    double time_before = (double)(before.tv_sec) + (double)(before.tv_usec)*1e-6;\
    -    double time_after = (double)(after.tv_sec) + (double)(after.tv_usec)*1e-6;\
    -    double real_time = time_after-time_before;\
    -    printf("file=%s line=%d rank=%d m=%d n=%d lead_A=%d lead_B=%d real_time=%f\n", __FILE__, __LINE__, my_rank, M, N, lda, ldb, real_time);\
    -})
    -
    -
    -
      -
    • Run the simulation:
    • -
    -
    -
    smpirun --cfg=smpi/bcast:mpich --cfg=smpi/running-power:6217956542.969 --cfg=smpi/display-timing:yes\
    ---cfg=smpi/privatize-global-variables:yes -np 16 -hostfile ../../../small_tests/hostfile_64.txt -platform\
    -../../../small_tests/cluster_fat_tree_64.xml ./xhpl > /tmp/output
    -
    -
    -
      -
    • Process the output file:
    • -
    -
    -
    import re
    -import csv
    -reg = re.compile('file=([a-zA-Z0-9/_.-]+) line=([0-9]+) rank=([0-9]+) m=([0-9]+) n=([0-9]+) lead_A=([0-9]+) lead_B=([0-9]+) real_time=(-?[0-9]+.[0-9]+)')
    -
    -def process(in_file, out_file):
    -    with open(in_file, 'r') as in_f:
    -        with open(out_file, 'w') as out_f:
    -            csv_writer = csv.writer(out_f)
    -            csv_writer.writerow(('file', 'line', 'rank', 'n', 'm', 'lead_A', 'lead_B', 'real_time'))
    -            for line in in_f:
    -                match = reg.match(line)
    -                if match is not None:
    -                    result = list(match.group(i) for i in range(1, 9))
    -                    result[0] = result[0][result[0].index('/hpl'):].lower()
    -                    csv_writer.writerow(result)
    -process('/tmp/output', '/tmp/parameters.csv')
    -
    -
    - -
      -
    • Analysis in R:
    • -
    -
    -
    results <- read.csv("/tmp/parameters.csv");
    -head(results)
    -
    -
    - -
    -                                file line rank  n m lead_A lead_B real_time
    -1 /hpl-2.2/src/pfact/hpl_pdrpanllt.c  171    8 60 0    120    120  0.000102
    -2 /hpl-2.2/src/pfact/hpl_pdrpanllt.c  171    8 30 0    120    120  0.000013
    -3 /hpl-2.2/src/pfact/hpl_pdrpanllt.c  171    8 16 0    120    120  0.000000
    -4 /hpl-2.2/src/pfact/hpl_pdrpanllt.c  171    8  8 0    120    120  0.000000
    -5 /hpl-2.2/src/pfact/hpl_pdrpanllt.c  171    8  4 0    120    120  0.000000
    -6 /hpl-2.2/src/pfact/hpl_pdrpanllt.c  171    8  2 0    120    120  0.000000
    -
    - - -
    -
    process_results = function(results) {
    -    storage.mode(results$m) <- "double" # avoiding integer overflow when taking the product
    -    storage.mode(results$n) <- "double"
    -    storage.mode(results$lead_A) <- "double"
    -    storage.mode(results$lead_B) <- "double"
    -    results$lead_product = results$lead_A * results$lead_B
    -    results$size_product = results$m * results$n
    -    results$ratio = results$lead_product/results$size_product
    - #  results$absolute_time_diff = results$expected_time - results$real_time
    - #  results$relative_time_diff = (results$expected_time - results$real_time)/results$expected_time
    -    results$idx = 1:length(results$rank)
    -    return(results);
    -}
    -
    -
    - -
    -
    results = process_results(results)
    -head(results)
    -
    -
    - -
    -                                file line rank  n m lead_A lead_B real_time
    -1 /hpl-2.2/src/pfact/hpl_pdrpanllt.c  171    8 60 0    120    120  0.000102
    -2 /hpl-2.2/src/pfact/hpl_pdrpanllt.c  171    8 30 0    120    120  0.000013
    -3 /hpl-2.2/src/pfact/hpl_pdrpanllt.c  171    8 16 0    120    120  0.000000
    -4 /hpl-2.2/src/pfact/hpl_pdrpanllt.c  171    8  8 0    120    120  0.000000
    -5 /hpl-2.2/src/pfact/hpl_pdrpanllt.c  171    8  4 0    120    120  0.000000
    -6 /hpl-2.2/src/pfact/hpl_pdrpanllt.c  171    8  2 0    120    120  0.000000
    -  lead_product size_product ratio idx
    -1        14400            0   Inf   1
    -2        14400            0   Inf   2
    -3        14400            0   Inf   3
    -4        14400            0   Inf   4
    -5        14400            0   Inf   5
    -6        14400            0   Inf   6
    -
    - -
    -
    library(ggplot2)
    -ggplot(results, aes(x=idx, y=real_time, color=factor(file))) +
    -    geom_point(shape=1) + ggtitle("Durations of HPL_dtrsm")
    -
    -
    - - -
    -

    trace_dtrsm1_16.png -

    -
    - -

    -We can observe a trend similar to HPL_dgemm. The function is only used in two places, HPL_pdrpanllT and HPL_pdupdateTT. In the former, all the calls are very short, whereas in the latter, the calls are long at the beginning and become shorter throughout the execution. We also have some outliers. -

    - -
    -
    library(ggplot2)
    -ggplot(results, aes(x=size_product, y=real_time, color=factor(rank))) +
    -    geom_point(shape=1) + ggtitle("Durations of HPL_dtrsm")
    -
    -
    - - -
    -

    trace_dtrsm2_16.png -

    -
    - -

    -As expected, the duration looks proportional to the product of the sizes. -

    - - -
    -
    reg <- lm(real_time~I(m*n), data=results)
    -summary(reg)
    -
    -
    - -
    -
    -Call:
    -lm(formula = real_time ~ I(m * n), data = results)
    -
    -Residuals:
    -      Min        1Q    Median        3Q       Max 
    --0.002999  0.000010  0.000010  0.000010  0.043651 
    -
    -Coefficients:
    -              Estimate Std. Error  t value Pr(>|t|)    
    -(Intercept) -1.042e-05  2.445e-06   -4.263 2.02e-05 ***
    -I(m * n)     9.246e-08  3.915e-11 2361.957  < 2e-16 ***
    ----
    -Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
    -
    -Residual standard error: 0.0006885 on 81298 degrees of freedom
    -Multiple R-squared:  0.9856,	Adjusted R-squared:  0.9856 
    -F-statistic: 5.579e+06 on 1 and 81298 DF,  p-value: < 2.2e-16
    -
    - -
    -
    layout(matrix(c(1,2,3,4),2,2))
    -plot(reg)
    -
    -
    - - -
    -

    reg_dtrsm_16.png -

    -
    - -

    -The R-squared is high and both the intercept and the sizes have a significant impact. However, the outliers are even more concerning than with HPL_dgemm. The Q-Q plot shows a large tail, and the residuals vs leverage plot shows that these outliers are non-negligible in the linear regression (i.e. if we removed them, the coefficients would change significantly). -

    -
    -
  2. -
  3. Replacing HPL_dtrsm by smpi_usleep   SMPI PERFORMANCE HPL
    -
    -
      -
    • Similarly to what was done with HPL_dgemm, we use the coefficients found with the linear regression to replace the function by a sleep.
    • -
    -
    -
    #define HPL_dtrsm(layout, Side, Uplo, TransA, Diag, M, N, alpha, A, lda, B, ldb) ({\
    -    double expected_time = (9.246e-08)*(double)M*(double)N - 1.042e-05;\
    -    if(expected_time > 0)\
    -        smpi_usleep((useconds_t)(expected_time*1e6));\
    -})
    -
    -
    -
      -
    • Running HPL again. We get the expected speed (about 23 Gflops) and a simulation time of 29 seconds (gain of 12 seconds).
    • -
    -
    -
  4. -
  5. Having a look at malloc   PYTHON R PERFORMANCE HPL
    -
    -
      -
    • To run HPL with larger matrices, we need to replace some calls to malloc (resp. free) by SMPI_SHARED_MALLOC (resp. SMPI_SHARED_FREE).
    • -
    • Firstly, let’s see where the big allocations are.
    • -
    • Define MY_MALLOC in hpl.h as follows:
    • -
    -
    -
    #define MY_MALLOC(n) ({\
    -    int my_rank;\
    -    MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\
    -    printf("file=%s line=%d rank=%d size=%lu\n", __FILE__, __LINE__, my_rank, n);\
    -    malloc(n);\
    -})
    -
    -
    -
      -
    • Replace all the calls to malloc in the files by MY_MALLOC:
    • -
    -
    -
    grep -l malloc testing/**/*.c src/**/*.c | xargs sed -i 's/malloc/MY_MALLOC/g'
    -
    -
    -
      -
    • Run smpirun (N=20000, P=Q=4) and redirect the output to /tmp/output.
    • -
    • Process the output file:
    • -
    -
    -
    import re
    -import csv
    -reg = re.compile('file=([a-zA-Z0-9/_.-]+) line=([0-9]+) rank=([0-9]+) size=([0-9]+)')
    -
    -def process(in_file, out_file):
    -    with open(in_file, 'r') as in_f:
    -        with open(out_file, 'w') as out_f:
    -            csv_writer = csv.writer(out_f)
    -            csv_writer.writerow(('file', 'line', 'rank', 'size'))
    -            for line in in_f:
    -                match = reg.match(line)
    -                if match is not None:
    -                    result = list(match.group(i) for i in range(1, 5))
    -                    result[0] = result[0][result[0].index('/hpl'):].lower()
    -                    csv_writer.writerow(result)
    -process('/tmp/output', '/tmp/malloc.csv')
    -
    -
    -
      -
    • Analysis in R:
    • -
    -
    -
    results <- read.csv("/tmp/malloc.csv");
    -head(results)
    -
    -
    - -
    -                            file line rank size
    -1 /hpl-2.2/src/grid/hpl_reduce.c  127    0    4
    -2 /hpl-2.2/src/grid/hpl_reduce.c  127    1    4
    -3 /hpl-2.2/src/grid/hpl_reduce.c  127    2    4
    -4 /hpl-2.2/src/grid/hpl_reduce.c  127    3    4
    -5 /hpl-2.2/src/grid/hpl_reduce.c  127    4    4
    -6 /hpl-2.2/src/grid/hpl_reduce.c  127    5    4
    -
    - - -
    -
    library(ggplot2)
    -ggplot(results, aes(x=file, y=size)) +
    -    geom_boxplot() + ggtitle("Sizes of malloc") + theme(axis.text.x = element_text(angle = 90, hjust = 1))
    -
    -
    - - -
    -

    trace_malloc1_16.png -

    -
    - -
    -
    storage.mode(results$size) <- "double" # avoiding integer overflow when taking the product
    -aggregated_results = aggregate(results$size, by=list(file=results$file), FUN=sum)
    -head(aggregated_results)
    -
    -
    - -
    -                                   file           x
    -1         /hpl-2.2/src/comm/hpl_packl.c     9034816
    -2        /hpl-2.2/src/grid/hpl_reduce.c     3200736
    -3 /hpl-2.2/src/panel/hpl_pdpanel_init.c 11592866048
    -4  /hpl-2.2/src/panel/hpl_pdpanel_new.c        3456
    -5     /hpl-2.2/src/pauxil/hpl_pdlange.c     2560032
    -6       /hpl-2.2/src/pfact/hpl_pdfact.c     2645504
    -
    - - -
    -
    library(ggplot2)
    -ggplot(aggregated_results, aes(x=file, y=x)) +
    -    geom_boxplot() + ggtitle("Sizes of malloc") + theme(axis.text.x = element_text(angle = 90, hjust = 1))
    -
    -
    - - -
    -

    trace_malloc2_16.png -

    -
    - - -

    -There are several things to notice: -

    -
      -
    • The biggest chunks are allocated in HPL_pdtest. These are the local matrices of each process.
    • -
    • However, regarding the total quantity of allocated memory, HPL_pdpanel_init is the clear winner.
    • -
    • In these tests, htop reported that about 20% of the 16GB of my laptop’s memory were used, i.e. about 3.2GB. We use a matrix of size 20000 and each element is a double (8 bytes), so the whole matrix takes 20000 × 20000 × 8 bytes = 3.2GB.
    • -
    • Thus, it seems that the mallocs used in HPL_pdpanel_init are in fact negligible. A hypothesis is that they are quickly followed by a free.
    • -
    • Verifying that every process allocates the same thing:
    • -
    - -
    -
    library(ggplot2)
    -ggplot(results[results$file == "/hpl-2.2/testing/ptest/hpl_pdtest.c",], aes(x="", y=size, fill=factor(rank))) +
    -    coord_polar("y", start=0) +
    -    geom_bar(width=1, stat="identity") +
    -    ggtitle("Sizes of malloc in HPL_pdtest")
    -
    -
    - - -
    -

    trace_malloc3_16.png -

    -
    - -
    -
    res_pdtest = results[results$file == "/hpl-2.2/testing/ptest/hpl_pdtest.c",]
    -unique(res_pdtest[order(res_pdtest$size),]$size)
    -
    -
    - -
    -[1] 193729992 196879432 198454152 200080072 201680392 203293512
    -
    - - -
      -
    • The different calls to malloc in HPL_pdtest have approximately the same size, but not exactly. This is understandable: P and Q may not divide the matrix size evenly. Maybe this could cause SMPI_SHARED_MALLOC to not work properly?
    • -
    -
    -
  6. -
  7. Attempt to use SMPI_SHARED_MALLOC and SMPI_SHARED_FREE in HPL   SMPI PERFORMANCE BUG HPL
    -
    -
      -
    • Revert the previous changes regarding malloc.
    • -
    • In file hpl_pdtest.c, replace malloc by SMPI_SHARED_MALLOC and free by SMPI_SHARED_FREE (a sketch is given below).
    • -
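    For illustration, the change amounts to the following (hypothetical allocation site; the names are illustrative, not the actual hpl_pdtest.c code):

     /* Before: double *A = malloc(n * n * sizeof(double)); ... free(A);
        After: every rank maps the same shared buffer instead of a private one. */
     double *A = (double *)SMPI_SHARED_MALLOC(n * n * sizeof(double));
     /* ... run the benchmark using A ... */
     SMPI_SHARED_FREE(A);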
    • -Run HPL with Simgrid. Two issues: -

      -
        -
      • The memory consumption stays the same, about 20% of my laptop’s memory. A first guess would be that the SHARED_MALLOC did not work and a new allocation was made for every process. Maybe because different sizes were given?
      • -
      • The execution time (both virtual and real) decreased significantly. The virtual time dropped from 233 to 223 seconds, the real time from 28 to 15 seconds. If we forget the first point, a guess could be that SHARED_MALLOC worked properly and resulted in fewer cache misses (since all processes share the same sub-matrix), thus improving performance. That would be an experimental bias, which we should avoid.
      • -
      -

      -The fact that we have these two issues combined is very surprising. -

    • -
    • Let’s try to see if the SHARED_MALLOC makes only one allocation or not, by adding some printf in its implementation. -
        -
      • The path shmalloc_global is taken.
      • -
      • The bogusfile is created only once, as expected.
      • -
      • Then, every process maps the file in memory, chunk by chunk. The base address is not the same for every process, but this is not an issue (we are speaking of virtual memory here).
      • -
    • -
    • Tested my matrix product program. Got 34% memory utilization, 44 virtual seconds and 8 real seconds with SMPI_SHARED_MALLOC, but 11% memory utilization, 81 virtual seconds and 7 real seconds with malloc. Very strange.
    • -
    • Hypothesis: either the measurement of the memory consumption is broken, or SHARED_MALLOC is broken.
    • -
    • -Try something other than htop: -

      -
      -
      watch -n 0,1 cat /proc/meminfo
      -
      -
      -
        -
      • With malloc and free, the available memory drops from 14.4 GB to 11.0 GB.
      • -
      • With SMPI_SHARED_MALLOC and SMPI_SHARED_FREE, the available memory drops from 14.4 GB to 14.1 GB.
      • -
      -

      -This seems more coherent, so htop seems to be a poor tool to measure memory consumption when using SMPI_SHARED_MALLOC. But this does not solve the time issue. -

    • -
    -
    -
  8. -
-
-
-

1.2.9 2017-03-12 Sunday

-
-
-
    -
  1. Experiment with SMPI macros in the matrix product code   C R EXPERIMENTS PERFORMANCE
    -
    -
      -
    • -Use the matrix product code, at commit 91633ea99463109736b900c92f2eacc84630e5b5. Run 10 tests with or without SMPI_SHARED_MALLOC and SMPI_SAMPLE, with a matrix size of 4000 and 64 processes, by running the command: -

      -
      -
      ./smpi_macros.py 10 /tmp/results.csv
      -
      -
    • -
    • Analysis, in R:
    • -
    -
    -
    results <- read.csv("/tmp/results.csv");
    -head(results)
    -
    -
    - -
    -      time size smpi_sample smpi_malloc
    -1 2.134820 4000           1           1
    -2 2.608971 4000           0           0
    -3 3.767625 4000           1           0
    -4 2.412387 4000           0           1
    -5 3.767162 4000           1           0
    -6 2.497480 4000           0           0
    -
    - - -

    -We already see that the case where we use SMPI_SAMPLE but not SMPI_SHARED_MALLOC seems to be different from the others. -

    - -
    -
    res_aov = aov(time~(smpi_sample + smpi_malloc)^2, data=results)
    -summary(res_aov)
    -
    -
    - -
    -                        Df Sum Sq Mean Sq F value   Pr(>F)    
    -smpi_sample              1  1.202   1.202   9.227  0.00442 ** 
    -smpi_malloc              1  4.579   4.579  35.163 8.62e-07 ***
    -smpi_sample:smpi_malloc  1  8.332   8.332  63.981 1.68e-09 ***
    -Residuals               36  4.688   0.130                     
    ----
    -Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
    -
    - - -
    -
    suppressWarnings(suppressMessages(library(FrF2))) # FrF2 outputs a bunch of useless messages...
    -MEPlot(res_aov, abbrev=4, select=c(1, 2), response="time")
    -
    -
    - - -
    -

    smpi_macros_1.png -

    -
    - -
    -
    IAPlot(res_aov, abbrev=4, show.alias=FALSE, select=c(1, 2))
    -
    -
    - - -
    -

    smpi_macros_2.png -

    -
    - -
    -
    mean(results[results$smpi_sample == 0 & results$smpi_malloc == 0,]$time)
    -mean(results[results$smpi_sample == 0 & results$smpi_malloc == 1,]$time)
    -mean(results[results$smpi_sample == 1 & results$smpi_malloc == 0,]$time)
    -mean(results[results$smpi_sample == 1 & results$smpi_malloc == 1,]$time)
    -
    -
    - -
    -[1] 2.513953
    -[1] 2.750056
    -[1] 3.773385
    -[1] 2.183901
    -
    - - -
      -
    • In this small experiment, we see that both macros have a non-negligible impact on the time estimated by SMPI. When none of the optimizations is used, adding one of them decreases the application’s performance. When one of the optimizations is already used, adding the other one increases the application’s performance.
    • -
    • When I added the SMPI macros in matmul.c, I first added SMPI_SHARED_MALLOC and then SMPI_SAMPLE_GLOBAL (see the entry for 13/02/2017). In that order, according to the tests above, the variation is not huge (I did not try the configuration with SMPI_SAMPLE_GLOBAL and without SMPI_SHARED_MALLOC). Furthermore, I did not perform extensive tests. This may explain why I did not notice this sooner.
    • -
    -
    -
  2. -
-
-
-

1.2.10 2017-03-13 Monday

-
-
-
    -
  1. Let’s play with Grid 5000   G5K
    -
    -
      -
    • -Connect to Grenoble’s site: -

      -
      -
      ssh tocornebize@access.grid5000.fr
      -ssh grenoble
      -
      -
    • -
    • -Reserve a node and deploy: -

      -
      -
      oarsub -I -l nodes=1,walltime=7 -t deploy
      -kadeploy3 -f $OAR_NODE_FILE -e jessie-x64-big -k
      -
      -
    • -
    • -Connect as root on the new node: -

      -
      -
      ssh root@genepi-33.grenoble.grid5000.fr
      -
      -
    • -
    • -Install Simgrid: -

      -
      -
      wget https://github.com/simgrid/simgrid/archive/c8db21208f3436c35d3fdf5a875a0059719bff43.zip -O simgrid.zip
      -unzip simgrid.zip
      -cd simgrid-*
      -mkdir build
      -cd build
      -cmake -Denable_documentation=OFF ..
      -make -j 8
      -make install
      -
      -
    • -
    • Copy HPL on the machine, with scp.
    • -
    • Change the variable TOPDIR in the file Make.SMPI.
    • -
    • Do not forget to clean the HPL directory when copying it, otherwise the modification of the variable TOPDIR will not be applied to the sub-makefiles.
    • -
    • Success of compilation and execution of HPL with Simgrid on one Grid5000 node.
    • -
    • Strange thing: the virtual time did not change much (228 seconds, or 23.3 Gflops), although the simulation time changed a lot (50 seconds, against 15 seconds on my laptop) and I used the same value for the option running-power.
    • -
    -
    -
  2. -
  3. Script for automatic installation   SHELL G5K
    -
    -
      -
    • -A small bash script to install Simgrid and compile HPL. Store it in the file deploy.sh. It assumes that the archives for Simgrid and HPL are located in /home/tocornebize. -

      -
      -
      function abort {
      -    echo -e "\e[1;31m Error:" $1 "\e[0m"
      -    exit 1
      -}
      -
      -rm -rf hpl* simgrid*
      -cp /home/tocornebize/{hpl,simgrid}.zip . &&\
      -unzip hpl.zip &&\
      -unzip simgrid.zip
      -if [ $? -ne 0 ]
      -then
      -    abort "Could not copy or extract the archives."
      -fi
      -
      -echo ""
      -echo -e "\e[1;34m Installing Simgrid\e[0m"
      -cd simgrid* &&\
      -mkdir build &&\
      -cd build &&\
      -cmake -Denable_documentation=OFF .. &&\
      -make -j 8 &&\
      -make install &&\
      -cd ../..
      -if [ $? -ne 0 ]
      -then
      -    abort "Could not install Simgrid."
      -fi
      -
      -echo ""
      -echo -e "\e[1;34m Installing HPL\e[0m"
      -# fix the TOPdir variable (a comment must not follow a line continuation)
      -cd hpl* &&\
      -sed -ri "s|TOPdir\s+=.+|TOPdir="`pwd`"|g" Make.SMPI &&\
      -make startup -j 8 arch=SMPI &&\
      -make -j 8 arch=SMPI &&\
      -cd ..
      -if [ $? -ne 0 ]
      -then
      -    abort "Could not compile HPL."
      -fi
      -
      -echo ""
      -echo -e "\e[1;32m Everything was ok\e[0m"
      -
      -
    • -
    • -Given a node obtained with oarsub and kadeploy3, connect to it with ssh. Then, just run: -

      -
      -
      /home/tocornebize/deploy.sh
      -
      -
    • -
    -
    -
  4. -
  5. Recurrent failure in HPL with SMPI_SHARED_MALLOC   SMPI BUG HPL
    -
    -
      -
    • -The following error often happens when running HPL with SMPI_SHARED_MALLOC: -

      -
      -src/simix/smx_global.cpp:557: [simix_kernel/CRITICAL] Oops ! Deadlock or code not perfectly clean.
      -
    • -
    • It does not seem to happen without SMPI_SHARED_MALLOC.
    • -
    • It does not always happen with SMPI_SHARED_MALLOC.
    • -
    • I do not understand what is happening.
    • -
    -
    -
  6. -
  7. Another failure in HPL with SMPI_SHARED_MALLOC   SMPI BUG HPL
    -
    -
      -
    • -Similarly, the tests on the matrix at the end of HPL are never computed when we use SMPI_SHARED_MALLOC, because of an error. For instance: -

      -
      -HPL ERROR from process # 0, on line 331 of function HPL_pdtest:
      ->>> Error code returned by solve is 1021, skip <<<
      -
    • -
    • Examples of error codes: 1021, 1322, 1324, 1575… These values appear nowhere in the code.
    • -
    -
    -
  8. -
  9. Tracking the error in HPL
    -
    -
      -
    • Put some printf to track the error.
    • -
    -
    -
  10. -
-
-
-

1.2.11 2017-03-14 Tuesday

-
-
-
    -
  1. Keep tracking the error.   SMPI GIT TRACING BUG
    -
    -
      -
    • Add the option --cfg=smpi/simulate-computation:0 to have a deterministic execution.
    • -
    • -The error code is the field info of the matrix. It is modified in the execution path HPL_pdgesv → HPL_pdgesv0 → HPL_pdpanel_free, by the following line: -

      -
      -
      if( PANEL->pmat->info == 0 ) PANEL->pmat->info = *(PANEL->DINFO);
      -
      -
      -

      -Thus, we now have to track the values of the DINFO field in the panel. -

    • -
    • Strangely, the field DINFO is a pointer to a floating-point number.
    • -
    • -To track this, use this function: -

      -
      -
      void print_info(HPL_T_panel *PANEL, int line) {
      -   if(PANEL->grid->myrow == 0 && PANEL->grid->mycol == 0) {
      -        printf("info = %f, line = %d\n", *PANEL->DINFO, line);
      -   }
      -}
      -
      -
      -

      -Put some calls to it at nearly every line of the target file (when you are done with a file, remove these calls). -
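    -For instance, a hypothetical instrumentation point (any line where PANEL is in scope, e.g. in HPL_pdgesv0.c): -

     print_info(PANEL, __LINE__);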

    • -
    • Field DINFO is modified in the execution path HPL_pdgesv0 → HPL_pdfact → panel->algo->rffun. The pointer rffun is one of the functions HPL_pdrpan***. In our settings, HPL_pdrpanllT is used.
    • -
    • Field DINFO is modified by PANEL->algo->pffun, which is one of the functions HPL_pdpan***. In our settings, HPL_pdpanllT is used.
    • -
    • -Then it is modified by the first call to HPL_dlocswpT. This function directly modifies the value of DINFO with the -line: -

      -
      -
      if( *(PANEL->DINFO) == 0.0 )
      -   *(PANEL->DINFO) = (double)(PANEL->ia + JJ + 1);
      -
      -
    • -
    • If we remove this line, as expected, the message about the error code disappears. So it confirms that the error code comes from here.
    • -
    • -Looking at HPL_pdpanel_init.c, -

      -
        -
      • -DINFO is a pointer to a part of DPIV: -

        -
        -
        PANEL->DINFO = PANEL->DPIV + JB;
        -
        -
      • -
      • -DPIV is a pointer to a part of L1: -

        -
        -
        PANEL->DPIV  = PANEL->L1    + JB * JB;
        -
        -
      • -
      • -L1 is an (aligned) alias for WORK, which is itself a block of memory allocated with malloc: -

        -
        -
        PANEL->WORK = (void*) malloc((size_t)(lwork) * sizeof(double));
        -// [...]
        -PANEL->L1    = (double *)HPL_PTR( PANEL->WORK, dalign );
        -
        -
      • -
      -

      -L1 is the jb × jb upper block of the local matrix. It is used for computations. Thus, it seems that HPL expects a particular cell of this local matrix to have the value 0. This cell is not always the same. Interpretation: HPL is checking that the matrix is correctly factorized (it uses LU factorization, so it computes L and U such that A=LU, L is lower-triangular and U is upper-triangular). Since we use shared memory, it is not surprising that the correctness check does not pass anymore. What is more surprising is that this particular check was still passing when the two BLAS functions were replaced by smpi_usleep. A guess: the fact that the resulting matrices are triangular only depends on the correctness of the swapping of rows. -

    • -
    • Thus, it seems that the error code is explained. This is a normal behavior, considering what we are doing.
    • -
    • The deadlock happening in some executions is not explained however.
    • -
    -
    -
  2. -
  3. Webinar   MEETING
    -
    -
      -
    • Enabling open and reproducible research at computer system’s conferences: good, bad and ugly
    • -
    • Grigori Fursin
    • -
    • The speaker created an organization about reproducible research.
    • -
    • Artifact evaluation is about peer review of experiments.
    • -
    • How it works: papers accepted to a conference can ask for an artifact evaluation. If they pass it, they get a nice stamp on the paper. If they fail it, nobody will know. View this as a bonus for a paper. For the evaluation of the artifacts, the conference nominates several reviewers.
    • -
    • ACM conferences are also starting to use this kind of thing, with several different stamps.
    • -
    • But artifact evaluation is not easy to do. Firstly, there are a lot of artifacts to evaluate, so it is hard to scale. Some artifact evaluations require proprietary software and/or rare hardware (e.g. supercomputers). Also, it is hard to find a reviewer with suitable skills for some cases.
    • -
    • Also, it is difficult to reproduce empirical results (software and hardware keep changing). Everyone has their own scripts, so it is hard to standardize a universal workflow.
    • -
    • Other website.
    • -
    -
    -
  4. -
-
-
-

1.2.12 2017-03-15 Wednesday

-
-
-
    -
  1. Hunting the deadlock   SMPI PYTHON R TRACING BUG HPL
    -
    -
      -
    • With N=40000, P=Q=4 and the option --cfg=smpi/simulate-computation:0, it seems we always have a deadlock.
    • -
    • Let’s trace it, with option -trace -trace-file /tmp/trace --cfg=smpi/trace-call-location:1.
    • -
    • Processing the trace file:
    • -
    -
    -
    pj_dump --user-defined --ignore-incomplete-links /tmp/trace > /tmp/trace.csv
    -grep "State," /tmp/trace.csv | sed -e 's/()//' -e 's/MPI_STATE, //ig'  -e 's/State, //ig' -e 's/rank-//' -e\
    -'s/PMPI_/MPI_/' | grep MPI_  | tr 'A-Z' 'a-z' > /tmp/trace_processed.csv
    -
    -
    - -

    -Clean the paths: -

    -
    -
    import re
    -reg = re.compile('((?:[^/])*)(?:/[a-zA-Z0-9_-]*)*((?:/hpl-2.2(?:/[a-zA-Z0-9_-]*)*).*)')
    -def process(in_file, out_file):
    -    with open(in_file, 'r') as in_f:
    -        with open(out_file, 'w') as out_f:
    -            for line in in_f:
    -                match = reg.match(line)
    -                out_f.write('%s%s\n' % (match.group(1), match.group(2)))
    -process('/tmp/trace_processed.csv', '/tmp/trace_cleaned.csv')
    -
    -
    - -

    -Analysis: -

    -
    -
    trace <- read.csv("/tmp/trace_cleaned.csv", header=F, strip.white=T, sep=",");
    -names(trace) = c("rank", "start", "end", "duration", "level", "state", "Filename", "Linenumber");
    -trace$idx = 1:length(trace$rank)
    -head(trace)
    -
    -
    - -
    -  rank    start      end duration level    state
    -1    8 0.000000 0.000000 0.000000     0 mpi_init
    -2    8 0.000000 0.000202 0.000202     0 mpi_recv
    -3    8 0.000202 0.000403 0.000201     0 mpi_recv
    -4    8 0.000403 0.000806 0.000403     0 mpi_recv
    -5    8 0.000806 0.000806 0.000000     0 mpi_send
    -6    8 0.000806 0.001612 0.000806     0 mpi_recv
    -                               Filename Linenumber idx
    -1 /hpl-2.2/testing/ptest/hpl_pddriver.c        109   1
    -2        /hpl-2.2/src/grid/hpl_reduce.c        165   2
    -3        /hpl-2.2/src/grid/hpl_reduce.c        165   3
    -4        /hpl-2.2/src/grid/hpl_reduce.c        165   4
    -5        /hpl-2.2/src/grid/hpl_reduce.c        159   5
    -6     /hpl-2.2/src/grid/hpl_broadcast.c        130   6
    -
    - -
    -
    get_last_event = function(df) {
    -    result = data.frame() 
    -    for(rank in (sort(unique(df$rank)))) {
    -	tmp_trace = df[df$rank == rank,]
    -	result = rbind(result, tmp_trace[which.max(tmp_trace$idx),])
    -    }
    -    return(result)
    -}
    -get_last_event(trace)[c(1, 2, 3, 6, 7, 8)]
    -
    -
    - -
    -      rank    start      end    state                         Filename
    -18756    0 67.01313 67.10575 mpi_recv /hpl-2.2/src/pgesv/hpl_spreadt.c
    -9391     1 66.84201 67.10575 mpi_recv /hpl-2.2/src/pgesv/hpl_spreadt.c
    -7865     2 66.92821 67.10575 mpi_recv /hpl-2.2/src/pgesv/hpl_spreadt.c
    -7048     3 67.01313 67.10575 mpi_recv /hpl-2.2/src/pgesv/hpl_spreadt.c
    -6242     4 67.08334 67.10575 mpi_send   /hpl-2.2/src/pgesv/hpl_rollt.c
    -4699     5 66.93228 67.10575 mpi_wait   /hpl-2.2/src/pgesv/hpl_rollt.c
    -3174     6 67.02313 67.10575 mpi_wait   /hpl-2.2/src/pgesv/hpl_rollt.c
    -2358     7 67.08334 67.10575 mpi_send   /hpl-2.2/src/pgesv/hpl_rollt.c
    -1554     8 67.08334 67.10575 mpi_recv /hpl-2.2/src/pgesv/hpl_spreadt.c
    -17201    9 66.93228 67.10575 mpi_send /hpl-2.2/src/pgesv/hpl_spreadt.c
    -15675   10 67.02313 67.10575 mpi_send /hpl-2.2/src/pgesv/hpl_spreadt.c
    -14858   11 67.08334 67.10575 mpi_recv /hpl-2.2/src/pgesv/hpl_spreadt.c
    -14053   12 67.06093 67.10575 mpi_recv /hpl-2.2/src/pgesv/hpl_spreadt.c
    -12516   13 66.88778 67.10575 mpi_recv /hpl-2.2/src/pgesv/hpl_spreadt.c
    -10998   14 66.97831 67.10575 mpi_recv /hpl-2.2/src/pgesv/hpl_spreadt.c
    -10189   15 67.06093 67.10575 mpi_recv /hpl-2.2/src/pgesv/hpl_spreadt.c
    -      Linenumber
    -18756        321
    -9391         321
    -7865         321
    -7048         321
    -6242         235
    -4699         242
    -3174         242
    -2358         235
    -1554         321
    -17201        351
    -15675        351
    -14858        321
    -14053        321
    -12516        321
    -10998        321
    -10189        321
    -
    - -

    -If the trace is correct, the deadlock happens in functions HPL_rollT and HPL_spreadT. Some printfs confirm that the deadlock is indeed happening in these places. -

    -
    -
  2. -
  3. Found the deadlock   SMPI C BUG HPL
    -
    -
      -
    • -Let’s add some printf in files HPL_spreadT.c and HPL_rollT.c. -First, add the functions: -

      -
      -
      int local_rank_to_global(int local_rank, MPI_Comm local_communicator) {
      -    int result;
      -    MPI_Group local_group, world_group;
      -    MPI_Comm_group(local_communicator, &local_group);
      -    MPI_Comm_group(MPI_COMM_WORLD, &world_group);
      -    MPI_Group_translate_ranks(local_group, 1, &local_rank, world_group, &result);
      -    return result;
      -}
      -void print_info(int src_rank, int dst_rank, char *function, int line, char *file) {
      -    printf("src=%d dst=%d function=%s line=%d file=%s\n", src_rank, dst_rank, function,
      -line, file);
      -}
      -
      -
      -

      -Then, add a call to print_info before each of the four lines we found: -

      -
        -
      • -HPL_spreadT.c, line 321: -

        -
        -
        int local_rank = local_rank_to_global(IPMAP[SRCDIST+partner], comm);
        -print_info(my_rank, local_rank, "mpi_recv", __LINE__, __FILE__);
        -
        -
      • -
      • -HPL_spreadT.c, line 351: -

        -
        -
        int local_rank = local_rank_to_global(IPMAP[SRCDIST+partner], comm);
        -print_info(my_rank, local_rank, "mpi_send", __LINE__, __FILE__);
        -
        -
      • -
      • -HPL_rollT.c, line 235: -

        -
        -
        int local_rank = local_rank_to_global(partner, comm);
        -print_info(my_rank, local_rank, "mpi_send", __LINE__, __FILE__);
        -
        -
      • -
      • -HPL_rollT.c, line 242: -

        -
        -
        int local_rank = local_rank_to_global(partner, comm);
        -print_info(my_rank, local_rank, "mpi_wait", __LINE__, __FILE__);
        -
        -
      • -
    • -
    • Then, run HPL with stdout redirected to a file /tmp/output.
    • -
    • -For each rank, look for the last time this rank was the caller of a blocking MPI primitive. For instance, for rank 15: -

      -
      -
      RANK="15 " && grep "src="$RANK /tmp/output | tail -n 1
      -
      -
      -

      -Observe the destination and the function. -With P=Q=4, we had these dependencies: -

      -
      -         12
      -          |
      -mpi_recv  |
      -          |
      -          v     mpi_recv
      -          4 <——————————————+
      -          |                |
      -          |                |
      -mpi_wait  |                |
      -          |                |
      -          v                |
      -          8 —————————————> 0
      -               mpi_send
      -
      -

      -There is the same pattern for {1, 5, 9, 13}, {2, 6, 10, 14} and {3, 7, 11, 15}. -

    • -
    • This exact deadlock has been reproduced on Grid 5000, with the same parameters.
    • -
    -
    -
  4. -
-
-
-

1.2.13 2017-03-16 Thursday

-
-
-
    -
  1. Still looking for the deadlock   SMPI BUG HPL
    -
    -
      -
    • When HPL is run with smpi_usleep but without SMPI_SHARED_{MALLOC,FREE}, there is no deadlock, even with the same parameters (N=40000, P=Q=4). Warning: testing with N=40000 requires a lot of memory, about 12GB.
    • -
    • -When HPL is run with SMPI_SHARED_{MALLOC,FREE} but without smpi_usleep, there is a deadlock. Note that we still use the option --cfg=smpi/simulate-computation:0. It happens in the same location, but the deadlock is different. Now, it is like this (and is located only in HPL_spreadT): -

      -
      -          4
      -          |
      -mpi_recv  |
      -          |
      -          v     mpi_recv
      -          0 <——————————————+
      -          |                |
      -          |                |
      -mpi_send  |                |
      -          |                |
      -          v                |
      -         12 —————————————> 8
      -               mpi_send
      -
      -

      -There is the same pattern for {1, 5, 9, 13}, {2, 6, 10, 14} and {3, 7, 11, 15}. -

    • -
    -
    -
  2. -
  3. Understanding HPL code   SMPI TRACING BUG HPL
    -
    -
      -
    • In file HPL_spreadT.c.
    • -
    • -In our settings, the following if statement is never taken: -

      -
      -
      if(SIDE == HplLeft)
      -
      -
    • -
    • In the else part, there is a big do while loop. Some initializations happen before this loop.
    • -
    • npm1: initialized to nprow - SRCDIST - 1, not modified during the loop.
    • -
    • ip2: initialized to the largest power of 2 smaller than or equal to npm1. Divided by 2 at each step. The loop stops when ip2 is 0.
    • -
    • mask: initialized to ip2*2-1 (ip2 is a single bit set to 1 followed by a bunch of 0s; mask is the same bit set to 1 followed by a bunch of 1s). At the beginning of each step, the first 1 of mask is flipped, so mask is ip2-1 after this statement.
    • -
    • IPMAP: mapping of the processes.
    • -
    • IPMAPM1: inverse mapping (IPMAPM1[IPMAP[i]] is equal to i).
    • -
    • mydist: initialized to IPMAPM1[myrow], not modified after.
    • -
    • partner: at each step, set to mydist^ip2, i.e. we flip exactly one bit of mydist.
    • -
    • We do the communications only when mydist & mask is 0 and when lbuf > 0 (a standalone sketch of this loop is given after this list). -
        -
      • If mydist & ip2 is not 0, we receive.
      • -
      • If mydist & ip2 is 0, we send.
      • -
    • -
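    -To make the control flow concrete, here is a standalone sketch (not the real HPL code) that replays this loop for nprow = 4, SRCDIST = 0 and an identity IPMAP, and prints the resulting communication pattern; the lbuf > 0 test is left out: -

     #include <stdio.h>

     int main(void) {
         const int nprow = 4, SRCDIST = 0;
         int IPMAP[4] = {0, 1, 2, 3};               /* identity mapping, for illustration */
         for(int mydist = 0; mydist < nprow; mydist++) {
             int npm1 = nprow - SRCDIST - 1;
             int ip2 = 1;
             while(ip2 * 2 <= npm1) ip2 *= 2;       /* largest power of 2 <= npm1 */
             int mask = ip2 * 2 - 1;
             do {
                 mask ^= ip2;                       /* flip the leading bit: mask becomes ip2-1 */
                 if((mydist & mask) == 0) {
                     int partner = mydist ^ ip2;    /* flip exactly one bit of mydist */
                     if(mydist & ip2)
                         printf("step ip2=%d: %d receives from %d\n", ip2,
                                IPMAP[SRCDIST + mydist], IPMAP[SRCDIST + partner]);
                     else
                         printf("step ip2=%d: %d sends to %d\n", ip2,
                                IPMAP[SRCDIST + mydist], IPMAP[SRCDIST + partner]);
                 }
                 ip2 >>= 1;
             } while(ip2 > 0);
         }
         return 0;
     }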
    • -Print the content of IPMAP. Add the following line before the do while: -

      -
      -
      printf("IPMAP: my_rank=%d, %d %d %d %d \n", my_rank,
      -  local_rank_to_global(IPMAP[0], comm), local_rank_to_global(IPMAP[1], comm),
      -  local_rank_to_global(IPMAP[2], comm), local_rank_to_global(IPMAP[3], comm));
      -
      -
      -

      -We get this output: -

      -
      -IPMAP: my_rank=0, 0 4 12 8
      -IPMAP: my_rank=12, 0 4 12 8
      -IPMAP: my_rank=8, 0 4 8 12
      -IPMAP: my_rank=4, 0 4 12 8
      -IPMAP: my_rank=0, 0 4 12 8
      -IPMAP: my_rank=4, 0 4 12 8
      -IPMAP: my_rank=1, 1 5 13 9
      -IPMAP: my_rank=5, 1 5 13 9
      -IPMAP: my_rank=13, 1 5 13 9
      -IPMAP: my_rank=9, 1 5 9 13
      -IPMAP: my_rank=5, 1 5 13 9
      -IPMAP: my_rank=1, 1 5 13 9
      -IPMAP: my_rank=2, 2 6 14 10
      -IPMAP: my_rank=3, 3 7 15 11
      -IPMAP: my_rank=6, 2 6 14 10
      -IPMAP: my_rank=7, 3 7 15 11
      -IPMAP: my_rank=10, 2 6 10 14
      -IPMAP: my_rank=11, 3 7 11 15
      -IPMAP: my_rank=14, 2 6 14 10
      -IPMAP: my_rank=15, 3 7 15 11
      -IPMAP: my_rank=6, 2 6 14 10
      -IPMAP: my_rank=2, 2 6 14 10
      -IPMAP: my_rank=7, 3 7 15 11
      -IPMAP: my_rank=3, 3 7 15 11
      -
      -

      -Recall that our communicators are {n, n+4, n+8, n+12} for n in {0, 1, 2, 3}. We see a pattern here: when processes have a local rank in {0, 1, 3}, their IPMAP is {0, 1, 3, 2} (local ranks), but when the local rank is 2, then IPMAP is {0, 1, 2, 3}. -

    • -
    • -Now, let’s print the other parameters. Add the following line just after the modification of mask at the beginning of the do while: -

      -
      -
      printf("### my_rank=%d (%d) id_func=%d mask=%d ip2=%d mydist=%d", my_rank,
      -  my_local_rank, id_func, mask, ip2, mydist);
      -
      -
      -

      -Here, id_func is a static variable initialized to -1 and incremented at the beginning of every function call. Later in the code, add these: -

      -
      -
      printf(" partner=%d", partner);
      -
      -
      -

      -and -

      -
      -
      printf(" mpi_recv(%d)\n", IPMAP[SRCDIST+partner]);
      -
      -
      -

      -or -

      -
      -
      printf(" mpi_send(%d)\n", IPMAP[SRCDIST+partner]);
      -
      -
      -

      -(depending on whether we do a send or a receive). We have this output for {0, 4, 8, 12} (this is similar for the other communicators): -

      -
      -
      grep "my_rank=0 " output | grep "###"
      -### my_rank=0 (0) id_func=0 mask=1 ip2=2 mydist=0 partner=2 mpi_send(3)
      -### my_rank=0 (0) id_func=0 mask=0 ip2=1 mydist=0 partner=1 mpi_send(1)
      -### my_rank=0 (0) id_func=1 mask=1 ip2=2 mydist=0 partner=2 mpi_send(3)
      -grep "my_rank=4 " output | grep "###"
      -### my_rank=4 (1) id_func=0 mask=1 ip2=2 mydist=1
      -### my_rank=4 (1) id_func=0 mask=0 ip2=1 mydist=1 partner=0 mpi_recv(0)
      -### my_rank=4 (1) id_func=1 mask=1 ip2=2 mydist=1
      -### my_rank=4 (1) id_func=1 mask=0 ip2=1 mydist=1 partner=0 mpi_recv(0)
      -grep "my_rank=8 " output | grep "###"
      -### my_rank=8 (2) id_func=0 mask=1 ip2=2 mydist=2 partner=0 mpi_recv(0)
      -grep "my_rank=12 " output | grep "###"
      -### my_rank=12 (3) id_func=0 mask=1 ip2=2 mydist=2 partner=0 mpi_recv(0)
      -### my_rank=12 (3) id_func=0 mask=0 ip2=1 mydist=2 partner=3 mpi_send(2)
      -
      -
      -

      -We see that the pattern of communication looks like a binary tree. At each function call, in the first step 0 sends to 12, in the second step 0 sends to 4 and 12 sends to 8. The problem is that all the mpi_recv calls match the mpi_send calls, except for node 8. This node calls mpi_recv with node 0 as the source, but we would expect it to have 12 as the source. The same pattern is observed for the other communicators. -

    • -
    • We saw that the nodes with local rank 2 call MPI_Recv with an unexpected source. These nodes also have a different -IPMAP. Hypothesis: these different IPMAP are a bug.
    • -
    • Doing the same experiment without SMPI_SHARED_{MALLOC,FREE} (the case where we do not have a deadlock). Here, we -observe that the values of IPMAP are the same in all processes. Also, there is a matching MPI_Recv for every MPI_Send, -as expected.
    • -
    • Thus, to fix the deadlock, we should search where IPMAP is defined.
    • -
    -
    -
  4. -
  5. Seminar   MEETING
    -
    -
      -
    • Taking advantage of application structure for visual performance analysis
    • -
    • Lucas Mello Schnorr
    • -
    • Context: two models. Explicit programming (e.g. MPI) or task-based programming (e.g. Cilk).
    • -
    • In task-based programming, no clear phases (contrarily to things like MPI, where we have communication phases and -computation phases). Thus, hard to understand the performances when visualizing a trace.
    • -
    • The scheduler has to assign tasks, anticipate the critical path and minimize data movements. The difficulty is that it -does not know the whole DAG at the beginning.
    • -
    • Workflow based on several tools: pj_dump, R, tidyverse, ggplot2, plotly. Everything can be done in org-mode. Agile -workflow, fail fast if the idea is not working, easily share experiments with colleagues.
    • -
    -
    -
  6. -
-
-
-

1.2.14 2017-03-17 Friday

-
-
-
    -
  1. Let’s look at IPMAP   SMPI C TRACING BUG HPL
    -
    -
      -
    • IPMAP is given as an argument to HPL_spreadT.
    • -
    • The function HPL_spreadT is used in HPL_pdlaswp01T and HPL_equil.
    • -
    • In our settings, all processes begin with a call to HPL_pdlaswp01T. Then, all processes with local ranks 0 and 1 do a call to HPL_equil (local ranks 2 and 3 are already deadlocked). The values of IPMAP are the same between the two different calls. We thus have to look at HPL_pdlaswp01T.
    • -
    • -IPMAP is defined in this function with other variables. They are all a contiguous block in PANEL->IWORK: -

      -
      -
      iflag  = PANEL->IWORK;
      -// [...]
      -k = (int)((unsigned int)(jb) << 1);  ipl = iflag + 1; ipID = ipl + 1;
      -ipA     = ipID + ((unsigned int)(k) << 1); lindxA = ipA + 1;
      -lindxAU = lindxA + k; iplen = lindxAU + k; ipmap = iplen + nprow + 1;
      -ipmapm1 = ipmap + nprow; permU = ipmapm1 + nprow; iwork = permU + jb;
      -
      -
    • -
    • PANEL->IWORK is allocated in HPL_pdpanel_init with a simple malloc. So the bug does not come from here.
    • -
    • The content of IPMAP is defined in the function HPL_plindx10.
    • -
    • Function HPL_plindx10 first computes the content of the array IPLEN, then calls function HPL_logsort to compute IPMAP (the content of IPMAP depends on the content of IPLEN).
    • -
    • -Printing the content of IPLEN just after its initialization. -Add this code just before the call to HPL_logsort: -

      -
      -
      int my_rank;
      -MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
      -printf(">> my_rank=%d, icurrow=%d, IPLEN =", my_rank, icurrow);
      -for(i = 0; i <= nprow; i++) {
      -     printf(" %d", IPLEN[i]);
      -}
      -printf("\n");
      -
      -
      -

      -Here are the contents of IPLEN for ranks {0, 4, 8, 12}. -

      -
        -
      • -With SMPI_SHARED_{MALLOC,FREE}: -

        Rank  IPLEN[0]  IPLEN[1]  IPLEN[2]  IPLEN[3]  IPLEN[4]
           0         0        10        31        41         2
           4         0        10        21        51         2
           8         0        10        21        42         2
          12         0        10        21        41         3
      • -
      • -Without SMPI_SHARED_{MALLOC,FREE}: -

        Rank  IPLEN[0]  IPLEN[1]  IPLEN[2]  IPLEN[3]  IPLEN[4]
           0         0        31        24        26        39
           4         0        31        24        26        39
           8         0        31        24        26        39
          12         0        31        24        26        39
      • -
      - -

      -We can note two things. Firstly, without SMPI_SHARED_{MALLOC,FREE}, all processes have an IPLEN with the same content. This is not the case with SMPI_SHARED_{MALLOC,FREE}. Furthermore, the values in IPLEN are closer to each other in the malloc/free case. Thus, the issue is very likely to come from IPLEN. -

    • -
    -
    -
  2. -
  3. Let’s look at IPLEN and IPID   SMPI TRACING BUG HPL
    -
    -
      -
    • The content of IPLEN depends on the content of IPID.
    • -
    • Add a printf to get its content. Every element it contains is present exactly twice in the array.
    • -
    • With SHARED_{MALLOC,FREE}, -
        -
      • IPID has a size of 300 for local rank 0, 302 for the others.
      • -
      • IPID of local rank 1 is equal to IPID of local rank 0 plus twice the element 120.
      • -
      • IPID of local rank 2 is equal to IPID of local rank 0 plus twice the element 240.
      • -
      • IPID of local rank 3 is equal to IPID of local rank 0 plus twice the element 360.
      • -
    • -
    • Without SHARED_{MALLOC,FREE}, -
        -
      • IPID has a size of 478 for all ranks.
      • -
      • All IPID are equal.
      • -
    • -
    • IPID is computed in function HPL_pipid.
    • -
    • The content of IPID depends on the content of the array PANEL->DPIV. This array is made of 120 elements. These elements are of type double. The function casts them to int and does some comparisons using them, which is strange.
    • -
    • Add a printf to get its content.
    • -
    • With SHARED_{MALLOC,FREE}, -
        -
      • The DPIV of the processes having the same local rank are equal.
      • -
      • The first 30 elements of the DPIV arrays of the processes of a same communicator are equal. The following elements are different.
      • -
      • For the processes of local rank 0, these following elements are 30, 31, 32,…, 119. In other words, for i > 29, we have DPIV[i] equal to i.
      • -
      • For the processes of local rank 1, these elements are all equal to 120. For local rank 2, they are equal to 240. For local rank 3, they are equal to 360.
      • -
    • -
    • Without SHARED_{MALLOC,FREE}, -
        -
      • All the DPIV of all processes are equal.
      • -
      • All its elements are present exactly once, except 4143 which is present twice.
      • -
    • -
    -

    -Thus, it seems that the issue comes from PANEL->DPIV. -

    -
    -
  4. -
  5. Summing up   SMPI BUG HPL
    -
    -
      -
    • The values of IPMAP depend on the values of IPLEN.
    • -
    • The values of IPLEN depend on the values of IPID.
    • -
    • The values of IPID depend on the values of PANEL->DPIV.
    • -
    • For all these arrays, we can observe some strange things in the case SMPI_SHARED_{MALLOC,FREE} (comparing to the case -malloc/free): -
        -
      • The content of the arrays is not the same for different ranks.
      • -
      • The content itself looks kind of strange (e.g. DPIV has a lot of identical values).
      • -
    • -
    -

  4. So, why do we have these DPIV?   SMPI BUG HPL

    • The content of DPIV is defined at the end of the function HPL_pdmxswp, by the line:

      (PANEL->DPIV)[JJ] = WORK[2];

      With some printf, we see that DPIV is filled in order. The values are the same as the ones already
      observed in DPIV.

1.2.15 2017-03-20 Monday

  1. Write a small Python script to monitor memory usage   PYTHON

    • Based on the command smemstat.
    • Run the command every second in quiet mode with JSON output. Parse the JSON file and print the
      information on screen, nicely formatted. (An illustrative sketch of the polling idea follows this
      entry.)
    • Future work:
      • Different sampling rates passed as a command-line argument.
      • Export in CSV. This would allow plotting memory consumption over time.
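
     As announced above, a side illustration of the polling idea. The actual script is in Python and parses
     smemstat's JSON output; this minimal C sketch instead reads VmRSS directly from /proc/<pid>/status once
     per second, so the data source (and everything else in it) is an assumption of convenience, not the
     script itself:

      #include <stdio.h>
      #include <string.h>
      #include <unistd.h>

      /* Print the resident set size of a process every second, read from
       * /proc/<pid>/status (defaults to the current process). */
      int main(int argc, char *argv[]) {
          const char *pid = (argc > 1) ? argv[1] : "self";
          char path[64], line[256];
          snprintf(path, sizeof path, "/proc/%s/status", pid);
          while (1) {
              FILE *f = fopen(path, "r");
              if (!f)
                  return 1;                  /* the process has terminated */
              while (fgets(line, sizeof line, f))
                  if (strncmp(line, "VmRSS:", 6) == 0)
                      printf("%s", line);    /* e.g. "VmRSS:  1234 kB" */
              fclose(f);
              sleep(1);                      /* fixed one-second sampling rate */
          }
      }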

  2. Failed attempts for DPIV   SMPI BUG HPL

    • Tried to hard-code the values of DPIV with something like this:

      (PANEL->DPIV)[JJ] = 42;

      Got a segmentation fault.

  3. Discussion with Arnaud   SMPI BUG HPL MEETING

    • Had a look at HPL code.
    • Next steps to try to find the issue:
      • Try another block size for the global SMPI_SHARED_MALLOC.
      • Retry the local SMPI_SHARED_MALLOC.
      • Try other matrix sizes, other process grids.
      • In HPL_pdmxswp, print the values of WORK[{0,1,2,3}] before and after the execution.

  4. Looking at WORK[{0,1,2,3}]   SMPI C BUG HPL

    • Meaning of the first values of this array:
      • WORK[0]: local maximum absolute value scalar,
      • WORK[1]: corresponding local row index,
      • WORK[2]: corresponding global row index,
      • WORK[3]: coordinate of the process owning this max.
    • Just before the call to HPL_pdmxswp, these values are computed locally. Then, HPL_pdmxswp does some
      computations to get the global values.
    • Adding some printf.
    • Without SHARED_{MALLOC,FREE}, the absolute value of WORK[0] increases at each call and quickly becomes
      very large. It reaches 3.8e+302, then it is NaN. This happens regardless of whether we replace BLAS
      operations by smpi_usleep.
    • With SHARED_{MALLOC,FREE}, the absolute value of WORK[0] stays relatively small.
    • If we replace the value of WORK[0] by a (small) constant, the simulation terminates without deadlock
      (see the sketch after this entry).
    • Recall that we run the simulation with N=40000 and P=Q=4.
    • The simulation takes 197 seconds, of which 170 seconds are actual computations of the application
      (thus, there is still room for optimization).
    • The estimated performance is 27.5 Gflops. This is a bit higher than what we had before with the matrix
      of size 20000. We need to check whether this difference is due to the larger matrix size (expected and
      OK) or to our dirty hack (not OK).
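
     For clarity, the "replace WORK[0] by a (small) constant" hack amounts to something like the following;
     the exact location and the constant are assumptions, WORK and HPL_pdmxswp being the names discussed
     above:

      /* Hypothetical patch at the end of HPL_pdmxswp, once the global
       * values have been computed: override the maximum absolute value. */
      WORK[0] = 1.0;  /* arbitrary small constant */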

1.2.16 2017-03-21 Tuesday

  1. Checking the consistency of IPMAP   SMPI PYTHON TRACING BUG HPL

    • Before the modification of WORK[0], the IPMAP were not consistent across the different processes of a
      same communicator (see the entry of 16/03/2017).
    • Let's check whether this issue is fixed.
    • Add the following in HPL_spreadT.c:

      int my_rank;
      MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
      printf("my_rank=%d IPMAP=%d,%d,%d,%d\n", my_rank, IPMAP[0], IPMAP[1], IPMAP[2], IPMAP[3]);

    • Run HPL with stdout redirected to /tmp/output.
    • Check that at each step, the values of IPMAP are the same for the processes of a same communicator.
      Recall that the communicators are {0,4,8,12}, {1,5,9,13}, {2,6,10,14} and {3,7,11,15}.

    import re
    reg = re.compile('my_rank=([0-9]+) IPMAP=(.+)')

    def process(in_file):
        results = {n: [] for n in range(16)}
        with open(in_file, 'r') as in_f:
            for line in in_f:
                match = reg.match(line)
                if match is not None:
                    n = int(match.group(1))
                    ipmap = match.group(2)
                    results[n].append(ipmap)
        for comm in range(4):
            print('Number of entries for communicator %d: %d' % (comm, len(results[comm])))
            for rank in range(1, 4):
                assert results[comm] == results[comm + 4*rank]
        print('OK')

    process('/tmp/output')

    Number of entries for communicator 0: 913
    Number of entries for communicator 1: 904
    Number of entries for communicator 2: 907
    Number of entries for communicator 3: 910
    OK

    • We see here that the values of IPMAP are consistent.

  2. Comparison with previous code   SMPI HPL

    • Let's compare with the previous version of the code (without the modification of WORK[0]). We use
      N=20000, P=Q=4.

      Code     Virtual time   Gflops      Total simulation time   Time for application computations
      Before   222.27         2.400e+01   19.2529                 10.0526
      After    258.28         2.065e+01   48.2851                 41.7249

    • We find that both the virtual time and the real time are longer, due to a higher amount of time spent
      in the application.
    • Do not forget to remove the option --cfg=smpi/simulate-computation:0 when testing things like this. At
      first, I did not remove it: the real time was higher but the virtual time was unchanged.
    • It seems that the modification of WORK[0] has changed the behavior of the application, which yields
      significant differences in terms of performance.

  3. Having a look at what takes time   SMPI PYTHON R EXPERIMENTS PERFORMANCE HPL

    • Using the settings N=20000, P=Q=4. Recall that with these settings, the simulation time was nearly
      52 seconds.
    • The simulation time drops to 30 seconds if we disable the calls to HPL_dgemv (this was not the case
      before, according to the experiments of 08/03/2017).
    • There was no deadlock with N=20000, so we can compare the cases with and without the modification of
      WORK[0].
    • Modify the definition of HPL_dgemv in HPL_blas.h for both cases:

      #define HPL_dgemv(Order, TransA, M, N, alpha, A, lda, X, incX, beta, Y, incY) ({\
          int my_rank;\
          MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);\
          struct timeval before = {};\
          struct timeval after = {};\
          gettimeofday(&before, NULL);\
          cblas_dgemv(Order, TransA, M, N, alpha, A, lda, X, incX, beta, Y, incY);\
          gettimeofday(&after, NULL);\
          double time_before = (double)(before.tv_sec) + (double)(before.tv_usec)*1e-6;\
          double time_after  = (double)(after.tv_sec)  + (double)(after.tv_usec)*1e-6;\
          double real_time = time_after - time_before;\
          printf("file=%s line=%d rank=%d m=%d n=%d lead_A=%d inc_X=%d inc_Y=%d real_time=%f\n", __FILE__, __LINE__, my_rank, M, N, lda, incX, incY, real_time);\
      })

    • Run HPL for both cases, with stdout redirected to a file (/tmp/output_before when WORK[0] is
      unmodified, /tmp/output_after when it is modified).
    • Process the outputs:

    import re
    import csv

    reg = re.compile('file=([a-zA-Z0-9/_.-]+) line=([0-9]+) rank=([0-9]+) m=([0-9]+) n=([0-9]+) lead_A=([0-9]+) inc_X=([0-9]+) inc_Y=([0-9]+) real_time=(-?[0-9]+.[0-9]+)')

    def process(in_file, out_file):
        with open(in_file, 'r') as in_f:
            with open(out_file, 'w') as out_f:
                csv_writer = csv.writer(out_f)
                csv_writer.writerow(('file', 'line', 'rank', 'm', 'n', 'lead_A', 'inc_X', 'inc_Y', 'real_time'))
                for line in in_f:
                    match = reg.match(line)
                    if match is not None:
                        result = list(match.group(i) for i in range(1, 10))
                        result[0] = result[0][result[0].index('/hpl'):].lower()
                        csv_writer.writerow(result)

    process('/tmp/output_before', '/tmp/parameters_before.csv')
    process('/tmp/output_after', '/tmp/parameters_after.csv')

    • Analysis with R:

      parameters_before <- read.csv("/tmp/parameters_before.csv")
      parameters_after <- read.csv("/tmp/parameters_after.csv")
      head(parameters_before)
      head(parameters_after)

                                     file line rank    m n lead_A inc_X inc_Y real_time
      1 /hpl-2.2/src/pfact/hpl_pdpanllt.c  207    4 5040 1   5040   120     1  0.000034
      2 /hpl-2.2/src/pfact/hpl_pdpanllt.c  207   12 4920 1   4920   120     1  0.000034
      3 /hpl-2.2/src/pfact/hpl_pdpanllt.c  207    0 5039 1   5040   120     1  0.000030
      4 /hpl-2.2/src/pfact/hpl_pdpanllt.c  207    8 5000 1   5000   120     1  0.000156
      5 /hpl-2.2/src/pfact/hpl_pdpanllt.c  207   12 4920 1   4920   120     1  0.000043
      6 /hpl-2.2/src/pfact/hpl_pdpanllt.c  207    4 5040 1   5040   120     1  0.000031

                                     file line rank    m n lead_A inc_X inc_Y real_time
      1 /hpl-2.2/src/pfact/hpl_pdpanllt.c  207    4 5040 1   5040   120     1  0.000026
      2 /hpl-2.2/src/pfact/hpl_pdpanllt.c  207   12 4920 1   4920   120     1  0.000030
      3 /hpl-2.2/src/pfact/hpl_pdpanllt.c  207    0 5039 1   5040   120     1  0.000030
      4 /hpl-2.2/src/pfact/hpl_pdpanllt.c  207    8 5000 1   5000   120     1  0.000123
      5 /hpl-2.2/src/pfact/hpl_pdpanllt.c  207   12 4920 1   4920   120     1  0.000030
      6 /hpl-2.2/src/pfact/hpl_pdpanllt.c  207    8 5000 1   5000   120     1  0.000028

      sum(parameters_before$real_time)
      sum(parameters_after$real_time)

      [1] 0.127207
      [1] 2.61086

    • There is a clear difference between the two cases. When WORK[0] is modified, the time spent in the
      function HPL_dgemv is 20 times higher. However, this makes a difference of about 2.5 seconds, whereas a
      difference of 20 seconds was observed when removing HPL_dgemv.
    • Therefore, it seems that removing the calls to HPL_dgemv triggers a change in the behavior of the
      application, resulting in a lower time; it is not this function itself which takes time.
    • Note that this was not the case for the functions HPL_dgemm and HPL_dtrsm: it was the calls to these
      functions which took time, not a consequence of the calls (just tested: taking the sum of all the times
      gives a total of about 75 seconds for HPL_dtrsm and about 2896 seconds for HPL_dgemm).
    • In the experiments of 08/03/2017, removing HPL_dgemv only resulted in a drop of 1 second in the
      execution time.
    • Thus, it seems that modifying WORK[0] has increased the execution time, an increase which is cancelled
      if we then remove HPL_dgemv. Therefore, we should not treat this function like HPL_dgemm and HPL_dtrsm
      (replacing it by a smpi_usleep); we should simply remove it.
    • If we remove it, we get a virtual time of 226 seconds, i.e. 23.6 Gflops. This is much closer to what we
      used to have. The simulation time is now 26 seconds, which is worse than what we used to have, but
      still better than what we had after the modification of WORK[0].

1.2.17 2017-03-22 Wednesday

  1. Better usability of HPL   SMPI C HPL

    • Before, HPL code had to be changed by hand to enable or disable the SMPI optimizations
      (SHARED_{MALLOC,FREE} and smpi_usleep) and to enable or disable the tracing of BLAS function calls.
    • Now, thanks to some preprocessor macros, these different settings can be configured on the command line
      when compiling (a sketch of the underlying switch follows this entry):

      # Compile vanilla HPL for SMPI
      make arch=SMPI
      # Compile HPL for SMPI with the tracing of BLAS function calls
      make SMPI_OPTS=-DSMPI_MEASURE
      # Compile HPL for SMPI with the SMPI optimizations (shared malloc/free, smpi_usleep)
      make SMPI_OPTS=-DSMPI_OPTIMIZATION

    • Next step: automate the computation of the linear regression coefficients, to also pass these
      coefficients as preprocessor variables.
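
     As announced above, a minimal sketch of the kind of preprocessor switch this relies on.
     SMPI_SHARED_MALLOC and SMPI_SHARED_FREE are SimGrid's macros; the HPL_ALLOC/HPL_DEALLOC names are made
     up for illustration and are not the actual HPL patch:

      /* Hypothetical sketch: route HPL's allocations through a macro so that
       * -DSMPI_OPTIMIZATION swaps them for SimGrid's shared variants. */
      #ifdef SMPI_OPTIMIZATION
      #  define HPL_ALLOC(size)  SMPI_SHARED_MALLOC(size)
      #  define HPL_DEALLOC(ptr) SMPI_SHARED_FREE(ptr)
      #else
      #  define HPL_ALLOC(size)  malloc(size)
      #  define HPL_DEALLOC(ptr) free(ptr)
      #endif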

  2. Script to parse the output file and do the linear regression   SMPI PYTHON EXPERIMENTS TRACING HPL

    • Everything is done in Python (linear regression included) to simplify the procedure for the user.
    • Given an output file /tmp/output as produced by a call to HPL (compiled with the -DSMPI_MEASURE
      option), call the script:

      python3 ../scripts/linear_regression.py /tmp/output

      -DSMPI_DGEMM_COEFF=1.097757e-09 -DSMPI_DTRSM_COEFF=9.134754e-08

      It outputs the list of the coefficients found by the linear regressions for the relevant BLAS
      functions. This list should then be passed to the variable SMPI_OPTS when compiling with
      -DSMPI_OPTIMIZATION.

  3. Discussion with Arnaud   SMPI ORGMODE PERFORMANCE HPL MEETING

    • Possible trip to Bordeaux on the week of [2017-04-10 Mon]-[2017-04-14 Fri]. The goal is to discuss with
      contributors of Simgrid.
    • Found the different times we get when we modify WORK[0] very strange, especially because it is
      computation time (it would be more understandable if it were communication time, since the
      communication patterns for the pivot exchange are very likely to be impacted). Should do a profiling.
    • Some tips regarding org-mode (tags).

  4. DONE Profile HPL

    • Use Valgrind with Callgrind and Kcachegrind, or Gprof.
    • Do the profiling on unmodified HPL and modified HPL, to see if there is any obvious difference.

1.2.18 2017-03-23 Thursday

  1. Profiling vanilla HPL   EXPERIMENTS PERFORMANCE PROFILING VALGRIND SMPI HPL

    • Profiling with Valgrind of vanilla HPL (no time measurements nor SMPI optimizations). Add the option -g
      in the Makefile.
    • HPL commit: 4494976bc0dd67e04e54abec2520fd468792527a.
    • Settings: N=5000, P=Q=4.
    • Compile with the command:

      make -j 4 arch=SMPI

    • Run with the command:

      smpirun -wrapper "valgrind --tool=callgrind" --cfg=smpi/bcast:mpich --cfg=smpi/running-power:6217956542.969 \
          --cfg=smpi/display-timing:yes --cfg=smpi/privatize-global-variables:yes -np 16 -hostfile ./hostfile_64.txt \
          -platform ./cluster_fat_tree_64.xml ./xhpl

    • At first, the package libatlas3-base was used for the BLAS functions. The names of these functions were
      not shown in Kcachegrind.
    • Then, removed this package and installed libatlas-cpp-0.6-2-dbg.
    • So now we have the names of the BLAS functions, but the layout is very different.
    • Also, the executions with this library take more time, especially with Valgrind. It also impacts the
      virtual time and the Gflops.
    • What we observe with this new library seems to be consistent with what we observed previously: dgemm is
      the most time-consuming function (by far), dtrsm comes after. So maybe this library is good enough to
      understand what happens, and then we could switch back to the previous library to have good
      performances.

  2. Profiling modified HPL   EXPERIMENTS PERFORMANCE PROFILING VALGRIND SMPI HPL

    • Profiling with Valgrind of modified HPL. Add the option -g in the Makefile.
    • HPL commit: 4494976bc0dd67e04e54abec2520fd468792527a. Then for each case, a small piece of the code has
      been modified.
    • Settings: N=5000, P=Q=4.
    • Compile with the command:

      make SMPI_OPTS=-DSMPI_OPTIMIZATION -j 4 arch=SMPI

    • Run with the command:

      smpirun -wrapper "valgrind --tool=callgrind" --cfg=smpi/bcast:mpich --cfg=smpi/running-power:6217956542.969 \
          --cfg=smpi/display-timing:yes --cfg=smpi/privatize-global-variables:yes -np 16 -hostfile ./hostfile_64.txt \
          -platform ./cluster_fat_tree_64.xml ./xhpl

    • Using the library from libatlas-cpp-0.6-2-dbg.
    • First experiment: the call to HPL_dgemv is a no-op and WORK[0] is set to a constant.
    • Second experiment: the call to HPL_dgemv is aliased to cblas_dgemv and WORK[0] is set to a constant.
    • Third experiment: the call to HPL_dgemv is aliased to cblas_dgemv and WORK[0] is not modified.
    • It is clear that we can shrink the simulation even further by removing the code that initializes the
      matrices (this is the code that calls the function HPL_rand).
    • There is no explanation for the differences observed with HPL_dgemv and WORK[0], the figures look
      similar. However, the differences observed between the three cases are quite small (in terms of
      execution time or Gflops).

  3. Comparison of the code   SMPI HPL

    • Let's compare again the different versions of the code, this time with the new CBLAS library (package
      libatlas-cpp-0.6-2-dbg). We use N=20000, P=Q=4.

      Code                             Virtual time   Gflops      Total simulation time   Time for application computations
      WORK[0] unmodified, real dgemv   223.81         2.383e+01   15.5049                 9.5045
      WORK[0] modified, real dgemv     223.74         2.384e+01   25.9935                 20.0480
      WORK[0] modified, no-op dgemv    226.28         2.357e+01   26.3907                 20.3201

      Remark: for the first version of the code, the experiment had to be run twice, since the first run
      ended in a deadlock.

    • The first two rows correspond to the two rows of the table of [2017-03-21 Tue].
    • There is no significant difference in the virtual time and the Gflops.
    • There is a significant difference in the total simulation time and the time spent in application
      computations, but it is less important than what we previously observed.
    • It is strange that this difference in the computation time does not appear in the virtual time. Note
      that the option --cfg=smpi/simulate-computation:0 was not used, so it does not come from there.

  4. Seminar   MEETING

    • Decaf: Decoupled Dataflows for In Situ High-Performance Workflows
    • Mathieu Dreher
    • They do some physics experiments (with a particle collider). Then, they analyze the results and build a
      model. Thus, the whole process has three major steps.
    • In current systems, the bottleneck is the I/O. It will be even worse for future systems (computation
      speed will be increased, not I/O speed). This is why we should have in-situ workflows (less data to
      move).
    • In the "classical workflow", we compute all the iterations, then we analyze them.
    • In the "in situ workflow", two things are possible. Time partitioning: we compute one iteration, then
      analyze it, then go back to the computation. Space partitioning: the analysis is done in parallel on
      other nodes.

  5. Profiling modified HPL, bigger matrices   EXPERIMENTS PERFORMANCE PROFILING VALGRIND SMPI HPL

    • Profiling with Valgrind of modified HPL. Add the option -g in the Makefile.
    • HPL commit: 4494976bc0dd67e04e54abec2520fd468792527a. Then for each case, a small piece of the code has
      been modified.
    • Settings: N=20000, P=Q=4.
    • Compile with the command:

      make SMPI_OPTS=-DSMPI_OPTIMIZATION -j 4 arch=SMPI

    • Run with the command:

      smpirun -wrapper "valgrind --tool=callgrind" --cfg=smpi/bcast:mpich --cfg=smpi/running-power:6217956542.969 \
          --cfg=smpi/display-timing:yes --cfg=smpi/privatize-global-variables:yes -np 16 -hostfile ./hostfile_64.txt \
          -platform ./cluster_fat_tree_64.xml ./xhpl

    • Using the library from libatlas-cpp-0.6-2-dbg.
    • First experiment: the call to HPL_dgemv is a no-op and WORK[0] is set to a constant.
    • Second experiment: the call to HPL_dgemv is aliased to cblas_dgemv and WORK[0] is set to a constant.
    • Third experiment: the call to HPL_dgemv is aliased to cblas_dgemv and WORK[0] is not modified.
    • The three figures have roughly the same pattern.
    • However, some numbers of the first two figures are twice as large as the corresponding numbers of the
      third figure. For instance, the biggest HPL_rand has 70803540000 in the third figure and 141607080000
      in the first two.
    • The reason for this is that, in the first two cases, HPL_pdmatgen is called 32 times, whereas in the
      last case it is called only 16 times. In our settings, we would expect this function to be called 16
      times, since we simulate 16 processes.
    • It is very strange that WORK[0] has an impact on the behavior of matrix generation.

1.2.19 2017-03-24 Friday

  1. Why WORK[0] impacts the number of calls to HPL_pdmatgen   SMPI HPL

    • Everything happens in the file HPL_pdtest.c. This is related to the error code issue discussed on
      [2017-03-14 Tue].
    • When we use the SMPI optimizations (smpi_usleep and SMPI_SHARED_MALLOC) without modifying WORK[0], HPL
      detects an error in the data of the matrices and returns an error code. If we also fix WORK[0] to some
      constant, HPL does not detect this error.
    • After doing the factorization, if no error code has been returned, HPL does some additional tests on
      the values of the matrix. These tests are quite long, and imply re-generating the matrix by calling
      HPL_pdmatgen.
    • This explains why WORK[0] has an impact on the simulation time and the number of times HPL_pdmatgen is
      called.
    • This does not explain the difference in terms of virtual time observed on [2017-03-21 Tue], since that
      is only the time needed for the factorization, not the time for the initialization and the checks.
    • This difference in virtual time was not re-observed on [2017-03-23 Thu]. Note that another BLAS library
      was used.

  2. Comparison of the code   SMPI HPL

    • Let's compare again the different versions of the code, with the "old" version of the CBLAS library
      (package libatlas3-base). We use N=20000, P=Q=4.
    • This is the same experiment as [2017-03-23 Thu], except for the BLAS library.
    • The first two rows correspond to the two rows of the table of [2017-03-21 Tue].

      Code                             Virtual time   Gflops      Total simulation time   Time for application computations
      WORK[0] unmodified, real dgemv   223.68         2.385e+01   15.8909                 9.5658
      WORK[0] modified, real dgemv     257.79         2.069e+01   47.9488                 41.5125
      WORK[0] modified, no-op dgemv    225.91         2.361e+01   26.2768                 20.1776

    • The experiment of [2017-03-21 Tue] is replicated: the first two rows look similar.
    • There is still the big gap in terms of both simulation time and virtual time. The former could be
      explained by the checks done at the end of HPL, but not the latter (see the previous entry of the
      journal).
    • Interestingly, the first and last rows look very similar to the first and last rows of the
      [2017-03-23 Thu] experiment, although the BLAS library has changed. The middle row however is very
      different.
    • These gaps are not replicated when using Valgrind. For all simulations, we have a virtual time of about
      262 to 263 seconds, which is about 2.03e+01 Gflops.
    -
  4. -
  5. Removing initialization and checks   SMPI PERFORMANCE PROFILING HPL
    -
    -
      -
    • The previous experiments demonstrated that the initialization (done after the factorization) and the checks (done -after) take a significant amount of time. They do not account for the estimation of the Gflops, so we can safely -remove them.
    • -
    • Quick experiment, with HPL at commit cb54a92b8304e0cd2f1728b887cc4cc615334c2d, N=20000 and P=Q=4, using library from -package libatlas3-base.
    • -
    • We get a virtual time of 227.35, which is 2.346e+01 Gflops. It confirms that the initialization and the checks are not -accounted in this measure.
    • -
    • The simulation time is now 9.63 seconds, with 3.53 seconds spent for actual computations of the application.
    • -
    • We see here that the simulation is already well optimized, there is not much room for additional gains.
    • -
    • Profiling with Valgrind: -
    • -
    • We see here that a large part of the time is spent in functions called by Simgrid (e.g. memcpy).
    • -
    -
    -

  4. Work on the experiment script   PYTHON

    • Add three features:
      • Dump simulation and application times in the CSV.
      • Dump physical and virtual memory in the CSV.
      • Experiments with random sizes and numbers of processors.
    • Example of usage:

      ./run_measures.py --global_csv /tmp/bla.csv --nb_runs 10 --size 1000:2000,4000:5000,20000:21000 \
          --nb_proc 1:4,8,16,32,64 --fat_tree "2;8,8;1,1:4;1,1" --experiment HPL

      This will run 10 times, in a random order, all combinations of the parameters:

      • Matrix size in [1000,2000]∪[4000,5000]∪[20000,21000]
      • Number of processes in {1,2,3,4,8,16,32,64}
      • Fat trees 2;8,8;1,1;1,1 and 2;8,8;1,2;1,1 and 2;8,8;1,3;1,1 and 2;8,8;1,4;1,1.

      The results are dumped in a CSV file. For each experiment, we store all the parameters (topology, size,
      number of processes) as well as the interesting metrics (virtual time, Gflops, simulation time, time
      spent in the application, peak physical and virtual memory used).

1.2.20 2017-03-25 Saturday

  1. Time and memory efficiency of HPL simulation   SMPI R EXPERIMENTS PERFORMANCE HPL

    • HPL commit: cb54a92b8304e0cd2f1728b887cc4cc615334c2d
    • Script commit: 8af35470776a0b6f2041cf8e0121739f94fdc34d
    • Command line to run the experiment:

      ./run_measures.py --global_csv hpl2.csv --nb_runs 3 --size 100,5000,10000,15000,20000,25000,30000,35000,40000 \
          --nb_proc 1,8,16,24,32,40,48,56,64 --fat_tree "2;8,8;1,8;1,1" --experiment HPL

    • Plots:

      library(ggplot2)
      do_plot <- function(my_plot, title) {
          return(my_plot +
              stat_summary(geom="line", fun.y=mean) +
              stat_summary(fun.data = mean_sdl) +
              ggtitle(title)
          )
      }
      results <- read.csv('hpl_analysis/hpl.csv')
      head(results)

             topology nb_roots nb_proc  size    time Gflops simulation_time application_time       uss         rss
      1 2;8,8;1,8;1,1        8      48 40000  593.10 71.940        60.75480         14.47840 701091840 13509701632
      2 2;8,8;1,8;1,1        8      40 20000  144.88 36.820        24.53460          6.44959 327905280  3533713408
      3 2;8,8;1,8;1,1        8       8 30000 1290.99 13.940        13.39820          6.14242 217612288  7422472192
      4 2;8,8;1,8;1,1        8      56 10000   37.93 17.580        12.92780          2.55716 211193856  1016156160
      5 2;8,8;1,8;1,1        8       1 30000 9609.94  1.873         3.67895          3.58312   5619712  7209476096
      6 2;8,8;1,8;1,1        8      64 10000   27.20 24.510         9.96141          2.10660 179879936   984698880

      do_plot(ggplot(results, aes(x=size, y=simulation_time, group=nb_proc, color=nb_proc)),
          "Simulation time vs size")

      [Figure 1.png: Simulation time vs size]

      do_plot(ggplot(results, aes(x=nb_proc, y=simulation_time, group=size, color=size)),
          "Simulation time vs number of processes")

      [Figure 2.png: Simulation time vs number of processes]

      do_plot(ggplot(results, aes(x=size, y=uss, group=nb_proc, color=nb_proc)),
          "Physical memory consumption vs size")

      [Figure 3.png: Physical memory consumption vs size]

      do_plot(ggplot(results, aes(x=nb_proc, y=uss, group=size, color=size)),
          "Physical memory consumption vs number of processes")

      [Figure 4.png: Physical memory consumption vs number of processes]

    • We see here that despite all the optimizations:
      • The simulation time seems to be quadratic in the matrix size.
      • The simulation time seems to be (roughly) linear in the number of processes.
      • The memory consumption seems to be linear in the matrix size.
      • The memory consumption seems to be (roughly) linear in the number of processes.
    • There are some irregularities in the time and memory as functions of the number of processes. A
      hypothesis is that this is due to different virtual topologies. In this experiment, the numbers of
      processes are multiples of 8; some of these numbers are square numbers, others are not. It seems that
      we achieve the best performances when the number of processes is a square. To generate P and Q, the
      sizes of the process grid, we try to find two divisors of the number of processes that are reasonably
      close (if possible). Thus, when the number of processes is a square, we have P=Q (see the sketch after
      this entry).
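
     As announced above, a minimal C sketch of this divisor-pair heuristic. The function name is made up for
     illustration; the actual script computes this in Python:

      #include <stdio.h>

      /* Hypothetical helper: pick P and Q such that P*Q == n and P and Q are
       * as close as possible (P <= Q). When n is a perfect square, P == Q. */
      static void closest_grid(int n, int *P, int *Q) {
          int p = 1;
          for (int d = 1; d * d <= n; d++)   /* largest divisor <= sqrt(n) */
              if (n % d == 0)
                  p = d;
          *P = p;
          *Q = n / p;
      }

      int main(void) {
          int sizes[] = {8, 16, 24, 32, 40, 48, 56, 64};
          for (int i = 0; i < 8; i++) {
              int P, Q;
              closest_grid(sizes[i], &P, &Q);
              printf("n=%2d -> P=%d, Q=%d\n", sizes[i], P, Q);
          }
          return 0;
      }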

1.2.21 2017-03-27 Monday

  1. DONE Remaining work on HPL (following discussion with Arnaud) [3/3]   SMPI HPL MEETING

    • [X] Do not look further into the WORK[0] anomaly.
    • [X] Do careful experiments to validate the optimizations.
    • [X] Currently, the simulation will not scale in memory. Track the sizes of the mallocs in
      HPL_pdpanel_init.

  2. More detailed analysis of malloc   R TRACING PERFORMANCE HPL

    • We saw that the memory consumption is still too high; we need to reduce it.
    • Let's take back the results from [2017-03-17 Fri]. The corresponding CSV file has been copied in the
      repository hpl_malloc.
    • Recall that this is a trace of all the mallocs, with N=20000 and P=Q=4.
    • We will focus on the file HPL_pdpanel_init.c since we suppose that these are the biggest allocations
      (after the allocation of the matrix).

      results <- read.csv("hpl_malloc/malloc.csv")
      results <- results[results$file == "/hpl-2.2/src/panel/hpl_pdpanel_init.c",]
      results$idx <- 0:(length(results$size)-1)
      head(results)

                                           file line rank    size idx
      99  /hpl-2.2/src/panel/hpl_pdpanel_init.c  245    0 4839432   0
      100 /hpl-2.2/src/panel/hpl_pdpanel_init.c  339    0    5344   1
      101 /hpl-2.2/src/panel/hpl_pdpanel_init.c  245    0 4839432   2
      102 /hpl-2.2/src/panel/hpl_pdpanel_init.c  339    0    5344   3
      106 /hpl-2.2/src/panel/hpl_pdpanel_init.c  245    2 9640392   4
      107 /hpl-2.2/src/panel/hpl_pdpanel_init.c  339    2    5344   5

      library(ggplot2)
      ggplot(results, aes(x=idx, y=size, color=factor(line))) + geom_point(alpha=.2) +
          ggtitle("Sizes of malloc in HPL_pdpanel_init (N=20000, P=Q=4)")

      [Figure 1.png: Sizes of malloc in HPL_pdpanel_init (N=20000, P=Q=4)]

    • Now that we have removed the matrix allocation, the panel allocation is clearly the one responsible for
      the high memory consumption. Here, for 16 processes and a matrix of size 20000, this allocation is
      responsible for 160MB of memory.
    • The malloc of line 245 is the one that is a concern. It is made for the WORK attribute.
    • The malloc of line 339 is not a concern. It is made for the IWORK attribute.
    • It is strange that all these allocations are made. Why not allocate the panel once, and then reuse it
      until the end?
    • It may be difficult to split the panel in two parts (one SHARED_MALLOC and one classical malloc). In
      HPL_pdpanel_init.c, we can find this comment:

      * L1:    JB x JB in all processes
      * DPIV:  JB      in all processes
      * DINFO: 1       in all processes
      * We make sure that those three arrays are contiguous in memory for the
      * later panel broadcast.  We  also  choose  to put this amount of space
      * right  after  L2 (when it exist) so that one can receive a contiguous
      * buffer.

  3. Validation of the optimizations   SMPI R EXPERIMENTS HPL

    • Let's compare vanilla HPL with optimized HPL, to see if the simulation is still faithful.
    • Results for optimized HPL are those of [2017-03-25 Sat].
    • Results for vanilla HPL have been freshly generated:
      • Using HPL commit 6cc643a5c2a123fa549d02a764bea408b5ad6114
      • Using script commit 7a9e467f9446c65a9dbc2a76c4dab7a3d8209148
      • Command:

        ./run_measures.py --global_csv hpl_vanilla.csv --nb_runs 1 --size 100,5000,10000,15000,20000 \
            --nb_proc 1,8,16,24,32,40,48,56,64 --fat_tree "2;8,8;1,8;1,1" --experiment HPL

    • Analysis:

      library(ggplot2)
      optimized_results <- read.csv('hpl_analysis/hpl.csv')
      vanilla_results <- read.csv('hpl_analysis/hpl_vanilla.csv')
      optimized_results$hpl = 'optimized_hpl'
      vanilla_results$hpl = 'vanilla_hpl'
      results = rbind(optimized_results, vanilla_results)
      head(results)

             topology nb_roots nb_proc  size    time Gflops simulation_time application_time       uss         rss           hpl
      1 2;8,8;1,8;1,1        8      48 40000  593.10 71.940        60.75480         14.47840 701091840 13509701632 optimized_hpl
      2 2;8,8;1,8;1,1        8      40 20000  144.88 36.820        24.53460          6.44959 327905280  3533713408 optimized_hpl
      3 2;8,8;1,8;1,1        8       8 30000 1290.99 13.940        13.39820          6.14242 217612288  7422472192 optimized_hpl
      4 2;8,8;1,8;1,1        8      56 10000   37.93 17.580        12.92780          2.55716 211193856  1016156160 optimized_hpl
      5 2;8,8;1,8;1,1        8       1 30000 9609.94  1.873         3.67895          3.58312   5619712  7209476096 optimized_hpl
      6 2;8,8;1,8;1,1        8      64 10000   27.20 24.510         9.96141          2.10660 179879936   984698880 optimized_hpl

      plot_results <- function(nb_proc) {
          ggplot(results[results$nb_proc==nb_proc,], aes(x=size, y=Gflops, color=hpl)) +
              geom_point() + geom_line() +
              expand_limits(x=0, y=0) +
              ggtitle(paste("Gflops vs size, nb_proc = ", nb_proc))
      }

      plot_results(1)

      [Figure 5.png: Gflops vs size, nb_proc = 1]

      plot_results(8)

      [Figure 6.png: Gflops vs size, nb_proc = 8]

      plot_results(16)

      [Figure 7.png: Gflops vs size, nb_proc = 16]

      plot_results(24)

      [Figure 8.png: Gflops vs size, nb_proc = 24]

      plot_results(32)

      [Figure 9.png: Gflops vs size, nb_proc = 32]

      plot_results(40)

      [Figure 10.png: Gflops vs size, nb_proc = 40]

      plot_results(48)

      [Figure 11.png: Gflops vs size, nb_proc = 48]

      plot_results(56)

      [Figure 12.png: Gflops vs size, nb_proc = 56]

      plot_results(64)

      [Figure 13.png: Gflops vs size, nb_proc = 64]

    • From the above plots, it seems that optimized HPL is always too optimistic in terms of performance.
      However, the difference is not so important.

      merged_results = merge(x=vanilla_results, y=optimized_results, by=c("nb_proc", "size"))
      merged_results$error = abs((merged_results$Gflops.x - merged_results$Gflops.y)/merged_results$Gflops.y)
      ggplot(merged_results, aes(x=factor(size), y=error)) +
          geom_boxplot() + geom_jitter(aes(color=nb_proc)) +
          ggtitle("Error vs size")

      [Figure 14.png: Error vs size]

      ggplot(merged_results, aes(x=factor(nb_proc), y=error)) +
          geom_boxplot() + geom_jitter(aes(color=size)) +
          ggtitle("Error vs nb_proc")

      [Figure 15.png: Error vs nb_proc]

    • We see here that the biggest errors made by the simulation are for a size of 100 and 1 process. For
      larger sizes and numbers of processes, the error never goes above 10%. On average, it is lower than 5%.

      ggplot(results[results$nb_proc==64,], aes(x=size, y=simulation_time, color=hpl)) +
          geom_point() + geom_line() +
          expand_limits(x=0, y=0) +
          ggtitle("Simulation time vs size, P=Q=8")

      [Figure 16.png: Simulation time vs size, P=Q=8]

      ggplot(results[results$nb_proc==64,], aes(x=size, y=uss, color=hpl)) +
          geom_point() + geom_line() +
          expand_limits(x=0, y=0) +
          ggtitle("Real memory vs size, P=Q=8")

      [Figure 17.png: Real memory vs size, P=Q=8]

    • There is a very important gain in terms of memory consumption and simulation time.

1.2.22 2017-03-28 Tuesday

  1. Booked the plane tickets for Bordeaux

  2. Attempt at an allocation hack in HPL_pdpanel_init   SMPI C PERFORMANCE HPL

    • Greatly inspired from what is done for the global SMPI_SHARED_MALLOC.
    • The idea is to reserve a large block of virtual addresses. The first part is mapped to a (short) buffer
      in a cyclic way. The second part is kept private. (A sketch of the trick is given after this entry.)
    • Currently some bugs (invalid writes, leading to a segmentation fault).
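
     For the record, a minimal sketch of the cyclic-mapping trick under the assumptions above (error handling
     omitted; this is an illustration, not the actual HPL patch):

      #include <stdlib.h>
      #include <sys/mman.h>
      #include <unistd.h>

      #define PHYS_SIZE 0x10000  /* size of the short physical buffer (one block) */

      /* Reserve `total` bytes of virtual addresses; remap the first `shared`
       * bytes cyclically onto a single PHYS_SIZE file-backed buffer, and keep
       * the rest of the range private. */
      void *cyclic_alloc(size_t total, size_t shared) {
          char path[] = "/tmp/shmalloc-XXXXXX";
          int fd = mkstemp(path);
          unlink(path);                      /* the file lives only as long as fd */
          ftruncate(fd, PHYS_SIZE);

          /* Reserve the whole range first so all blocks end up contiguous. */
          char *base = mmap(NULL, total, PROT_READ | PROT_WRITE,
                            MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

          /* Map every block of the shared part onto the same physical pages.
           * MAP_FIXED overwrites the anonymous reservation in place. */
          for (size_t off = 0; off + PHYS_SIZE <= shared; off += PHYS_SIZE)
              mmap(base + off, PHYS_SIZE, PROT_READ | PROT_WRITE,
                   MAP_SHARED | MAP_FIXED, fd, 0);

          close(fd);
          return base;
      }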

1.2.23 2017-03-29 Wednesday

  1. Keep trying to use some shared memory for PANEL->WORK   SMPI C PERFORMANCE BUG HPL

    • The invalid writes of yesterday were on accesses to the WORK buffer. I forgot the space needed for the
      buffer U at the end of WORK. Now fixed.
    • Add some printf to see the start and end addresses of the different buffers. Everything seems fine.
    • Add a check: we fill the private zone (DPIV and DINFO) with 0, then we fill the shared zone with
      garbage, and finally we check that the private zone is still full of 0 (a sketch is given after this
      entry).
    • Now, there is an invalid write of 4 bytes, by HPL_plindx1, located just after the buffer IWORK (the
      allocation of this buffer did not change).
    • Test for N=5000, P=Q=4. Found that in the file HPL_plindx1.c, the variable ipU reaches 120 in the buggy
      case, but only 119 in the normal case. So it is likely that the array is not too short, but rather that
      this variable is incremented too much.
    • In the for loop where this happens, ipU is incremented when some conditions are fulfilled. One of these
      conditions is the combination of these two if:

      if( srcrow == icurrow ) {
          if( ( dstrow == icurrow ) && ( dst - ia < jb ) )
      // [...]

      When ipU reaches 120, the illegal write is:

      iwork[ipU] = IPLEN[il];

      When this happens, the variable dst is 0 and thus the condition dst-ia<jb is met. But intuitively, this
      condition should not be met like this (jb is always positive). A bit earlier in the loop, dst is set
      with:

      dst = IPID[i+1];

      Printing this array in the buggy case and in the normal case, we see that the last element of the array
      is sometimes 0 in the buggy case, but never in the normal case. Thus, it seems that there is an issue
      with IPID.

    • Note that we also had issues with IPID when using SHARED_MALLOC.
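
     As announced above, a minimal sketch of the canary check on the private zone; the buffer names and
     sizes are placeholders, not the actual HPL variables:

      #include <assert.h>
      #include <string.h>

      /* Hypothetical check: `priv` is the private zone (e.g. DPIV and DINFO)
       * and `shared` the shared zone of the panel. Poison the shared zone
       * and make sure the private zone is untouched. */
      void check_private_zone(char *priv, size_t priv_size,
                              char *shared, size_t shared_size) {
          memset(priv, 0, priv_size);          /* fill the private zone with 0 */
          memset(shared, 0xAA, shared_size);   /* fill the shared zone with garbage */
          for (size_t i = 0; i < priv_size; i++)
              assert(priv[i] == 0);            /* private zone must still be full of 0 */
      }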

  2. Looking at PANEL->DPIV (again)   SMPI BUG HPL

    • Add a printf in HPL_pipid.c (the function which computes IPID, using DPIV) to see the content of DPIV.
    • In the buggy case, the array DPIV is sometimes full of 0. This does not happen in the normal case. If
      we put something else in DPIV when it is allocated, then this is shown instead of the zeroes (e.g. if
      we put 0, 1, 2…). Thus, in these cases, DPIV is never filled after its initialization.
    • Hypothesis: when the panels are sent with MPI, the size is too short and DPIV is not sent.

  3. Discussion with Arnaud and Augustin   MEETING

    • Instead of putting an empty space between the shared block and the private block (for alignment), make
      them really contiguous (and do not share the last page of the "shared" block).

  4. Reimplement the shared allocation   SMPI C PERFORMANCE HPL

    • The code was a mess; let's restart something better, using Augustin's idea.
    • The interface is as follows (a usage sketch is given after this entry):

      void *allocate_shared(int size, int start_private, int stop_private);

      It allocates a contiguous block of virtual addresses of the given size that all fit in a small block of
      physical memory, except for a contiguous block located between the indices start_private (included) and
      stop_private (excluded). Calling allocate_shared(size, size, size) is (semantically) equivalent to
      calling SMPI_SHARED_MALLOC(size). Calling allocate_shared(size, 0, size) is (semantically) equivalent
      to calling malloc(size).

    • Similarly to SHARED_MALLOC, we map the shared zones by blocks, on a same range of addresses. The "best"
      block size is open to discussion.
    • Since every call to mmap is a syscall, we should try not to use a too low block size. Used 0x1000 at
      the beginning; the performances were terrible.
    • Still for performance reasons, if the size is too low, we should simply do a malloc (and thus not have
      any shared zone).
    • Valgrind does not report any error (it was the case with the previous implementation). There are some
      small memory leaks however.
    • Performances are good. Tested with N=40000, P=Q=8. The simulation time increased from 85 seconds to
      112 seconds. The memory consumption decreased from 675 MB to 95 MB. The virtual time and the Gflops
      were not impacted.
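
     To make the semantics concrete, a usage sketch of the interface described above (the layout numbers are
     illustrative):

      void *allocate_shared(int size, int start_private, int stop_private);

      void example(void) {
          /* 1000-byte panel where bytes [200, 300) (say, DPIV and DINFO) must
           * stay private per process; [0, 200) and [300, 1000) end up shared. */
          char *panel = allocate_shared(1000, 200, 300);

          char *fully_shared  = allocate_shared(1000, 1000, 1000); /* ~ SMPI_SHARED_MALLOC(1000) */
          char *fully_private = allocate_shared(1000, 0, 1000);    /* ~ malloc(1000) */
      }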

  5. DONE Remaining work for shared allocation [4/4]

    • [X] Track the memory leaks (unclosed file?).
    • [X] Clean the block size definition. Put it somewhere accessible by both HPL_pdpanel_init and
      HPL_pdpanel_free. Maybe use two different values for the block size and the condition to switch to a
      simple malloc.
    • [X] Find the best value(s) for the block size (and maybe the malloc condition).
    • [X] Contribute this function to Simgrid.

1.2.24 2017-03-30 Thursday

  1. Quick work on shared allocations   SMPI C HPL

    • Clean the size definitions.
      • Use a separate file that is imported in HPL_pdpanel_init.c and HPL_pdpanel_free.c.
      • Use two different sizes: the block size, and the size at which we switch to malloc.
    • Quick look at the possibilities for the sizes:
      • Some quick experiments with N=40000, P=Q=8.
      • With BLOCK_SIZE and MALLOC_MAX_SIZE equal to 0x10000:
        • Simulation time: 112 seconds
        • Physical memory: 96 MB
      • With BLOCK_SIZE equal to 0x10000 and MALLOC_MAX_SIZE equal to 0 (never do a malloc):
        • Simulation time: also 112 seconds
        • Physical memory: also 96 MB
      • With BLOCK_SIZE equal to 0x10000 and MALLOC_MAX_SIZE equal to 0x40000 (4 times larger):
        • Simulation time: 137 seconds
        • Physical memory: 93 MB
      • Thus, it seems that the gain of using malloc is not so important. Worse: it can yield a significant
        loss. Let's remove it.
      • With BLOCK_SIZE equal to 0x100000 and malloc removed: execution cancelled, all the physical memory
        was used.
    • Stop using malloc. Also move the size definition back into HPL_pdpanel_init.c.
    • The code is simpler like this, and the malloc trick did not give better performances.
    • Do not bother with the memory leak. It was already here before the shared allocation.
    • Warning: calling munmap with a size of 0 gives a huge memory consumption. It should be called with the
      correct size.

  2. Implement the partial shared_malloc in Simgrid

    • Even more generic implementation than the one done in HPL. Now, we give a list of offsets of the blocks
      that should be shared. Thus, we can have an arbitrary mix of shared zones and private zones inside an
      allocated block (see the sketch after this entry).
    • Tests currently fail. To run a single test and see its output, run:

      ctest --verbose -R tesh-smpi-macro-shared-thread

      I suspect (but did not check) that this is because we currently share only blocks aligned on the block
      size. It would be better to share blocks aligned on the page size (need to fix it). But this does not
      change the fact that some parts will not be shared. This is expected; we should modify the tests.
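
     A hedged sketch of what such a call can look like; the function name and signature are assumptions made
     for illustration, not necessarily the final Simgrid API:

      #include <stddef.h>

      /* Hypothetical generalization of allocate_shared: each pair
       * (offsets[2i], offsets[2i+1]) delimits a shared byte range
       * [start, end); everything outside these ranges stays private. */
      void *partial_shared_malloc(size_t size, const size_t *offsets, int nb_shared_blocks);

      void example(void) {
          /* Bytes [0, 200) and [300, 1000) are shared, [200, 300) is private. */
          size_t offsets[] = {0, 200, 300, 1000};
          void *buf = partial_shared_malloc(1000, offsets, 2);
      }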

  3. Time and memory efficiency of the partial shared_malloc   SMPI R EXPERIMENTS PERFORMANCE HPL

    • We switch back to the implementation of partial shared_malloc done in HPL, to measure its performances.
    • Simgrid commit: c8db21208f3436c35d3fdf5a875a0059719bff43 (the same commit as for the previous
      performance analysis)
    • HPL commit: 7af9eb0ec54418bf1521c5eafa9acda1b150446f
    • Script commit: 7a9e467f9446c65a9dbc2a76c4dab7a3d8209148
    • Command line to run the experiment:

      ./run_measures.py --global_csv hpl_partial_shared.csv --nb_runs 1 --size 100,5000,10000,15000,20000,25000,30000,35000,40000 \
          --nb_proc 1,8,16,24,32,40,48,56,64 --fat_tree "2;8,8;1,8;1,1" --experiment HPL

    • Analysis:

      library(ggplot2)
      partial_shared_results <- read.csv('hpl_analysis/hpl_partial_shared.csv')
      optimized_results <- read.csv('hpl_analysis/hpl.csv')
      vanilla_results <- read.csv('hpl_analysis/hpl_vanilla.csv')
      partial_shared_results$hpl = 'partial_shared_hpl'
      optimized_results$hpl = 'optimized_hpl'
      vanilla_results$hpl = 'vanilla_hpl'
      results = rbind(partial_shared_results, optimized_results, vanilla_results)
      head(results)

             topology nb_roots nb_proc  size     time    Gflops simulation_time application_time      uss         rss                hpl
      1 2;8,8;1,8;1,1        8      24 25000   319.37 32.620000      25.8119000        8.0867100 55365632  5274730496 partial_shared_hpl
      2 2;8,8;1,8;1,1        8      24  5000    13.03  6.399000       2.7273300        0.6131710 14643200   257220608 partial_shared_hpl
      3 2;8,8;1,8;1,1        8      24 35000   781.76 36.570000      49.3234000       16.0733000 74350592 10180751360 partial_shared_hpl
      4 2;8,8;1,8;1,1        8      40   100     0.23  0.003028       0.0779319        0.0196671        0           0 partial_shared_hpl
      5 2;8,8;1,8;1,1        8       1 35000 15257.68  1.873000       5.8686300        5.7156200  4775936  9809465344 partial_shared_hpl
      6 2;8,8;1,8;1,1        8      64 40000   488.99 87.260000     111.7290000       29.3046000 95391744 13475909632 partial_shared_hpl

      plot_results <- function(nb_proc) {
          ggplot(results[results$nb_proc==nb_proc,], aes(x=size, y=Gflops, color=hpl)) +
              geom_point() + geom_line() +
              expand_limits(x=0, y=0) +
              ggtitle(paste("Gflops vs size, nb_proc = ", nb_proc))
      }

      plot_results(32)

      [Figure 18.png: Gflops vs size, nb_proc = 32]

      plot_results(64)

      [Figure 19.png: Gflops vs size, nb_proc = 64]

    • It seems that this new optimization did not change the accuracy of the simulation. Let's have a look at
      the time and memory.

      ggplot(results[results$nb_proc==64,], aes(x=size, y=simulation_time, color=hpl)) +
          geom_point() + geom_line() +
          expand_limits(x=0, y=0) +
          ggtitle("Simulation time vs size, P=Q=8")

      [Figure 20.png: Simulation time vs size, P=Q=8]

      ggplot(results[results$nb_proc==64,], aes(x=size, y=uss, color=hpl)) +
          geom_point() + geom_line() +
          expand_limits(x=0, y=0) +
          ggtitle("Real memory vs size, P=Q=8")

      [Figure 21.png: Real memory vs size, P=Q=8]

    • We see here that sharing some parts of the PANEL->WORK buffer has two effects: the simulation time is a
      bit larger, but the memory consumption is much lower.
    • Let's have a look in more detail at this version of HPL.

      do_plot <- function(my_plot, title) {
          return(my_plot +
              geom_point() + geom_line() +
              ggtitle(title)
          )
      }

      do_plot(ggplot(partial_shared_results, aes(x=size, y=simulation_time, group=nb_proc, color=nb_proc)),
          "Simulation time vs size")

      [Figure 22.png: Simulation time vs size]

      do_plot(ggplot(partial_shared_results, aes(x=nb_proc, y=simulation_time, group=size, color=size)),
          "Simulation time vs number of processes")

      [Figure 23.png: Simulation time vs number of processes]

      do_plot(ggplot(partial_shared_results, aes(x=size, y=uss, group=nb_proc, color=nb_proc)),
          "Physical memory consumption vs size")

      [Figure 24.png: Physical memory consumption vs size]

      do_plot(ggplot(partial_shared_results, aes(x=nb_proc, y=uss, group=size, color=size)),
          "Physical memory consumption vs number of processes")

      [Figure 25.png: Physical memory consumption vs number of processes]

    • The trend for the simulation time looks similar to what we got previously.
    • The memory consumption still looks linear in the size and in the number of processes. However, it is
      almost flat in the number of processes.

  4. Regression of time and memory efficiency of the partial shared_malloc (Arnaud)   SMPI R EXPERIMENTS PERFORMANCE HPL

      results$hpl = factor(results$hpl)
      data = results[results$hpl=="partial_shared_hpl" &
                     results$nb_proc > 1 & results$size > 1000, # get rid of particularly small values
                     c("nb_proc","size","Gflops","simulation_time","uss")]
      head(data)

        nb_proc  size Gflops simulation_time      uss
      1      24 25000 32.620        25.81190 55365632
      2      24  5000  6.399         2.72733 14643200
      3      24 35000 36.570        49.32340 74350592
      6      64 40000 87.260       111.72900 95391744
      7      24 10000 16.600         6.22743 26472448
      8      40 40000 55.990       100.31300 91209728

      plot(data)

      [Figure 26.png: pairs plot of data]

      reg_rss = lm(data=data, uss ~ size+nb_proc) # Interactions do not bring much
      summary(reg_rss)

      Call:
      lm(formula = uss ~ size + nb_proc, data = data)

      Residuals:
           Min       1Q   Median       3Q      Max 
      -6941093 -1573650  -348763  1611008  8790400 

      Coefficients:
                   Estimate Std. Error t value Pr(>|t|)    
      (Intercept) 7.827e+05  1.030e+06   0.760     0.45    
      size        2.054e+03  3.045e+01  67.449  < 2e-16 ***
      nb_proc     1.717e+05  1.903e+04   9.022 7.85e-13 ***
      ---
      Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

      Residual standard error: 2791000 on 61 degrees of freedom
      Multiple R-squared:  0.987,	Adjusted R-squared:  0.9866 
      F-statistic:  2315 on 2 and 61 DF,  p-value: < 2.2e-16

      par(mfrow=c(2,3))
      plot(data=data, uss~size)
      plot(data=data, uss~nb_proc)
      plot(reg_rss)
      par(mfrow=c(1,1))

      [Figure 27.png: uss vs size, uss vs nb_proc, and diagnostic plots of reg_rss]

      The Stampede HPL output indicates:

      The following parameter values will be used:

      N        : 3875000
      NB       :    1024
      PMAP     : Column-major process mapping
      P        :      77
      Q        :      78
      PFACT    :   Right
      NBMIN    :       4
      NDIV     :       2
      RFACT    :   Crout
      BCAST    :  BlongM
      DEPTH    :       0
      SWAP     : Binary-exchange
      L1       : no-transposed form
      U        : no-transposed form
      EQUIL    : no
      ALIGN    :    8 double precision words

      We aim at size=3875000 and nb_proc=77*78.

      data[data$nb_proc==64 & data$size==40000,]
      data[data$nb_proc==64 & data$size==40000,]$uss/1E6 # in MB
      example = data.frame(size=c(3875000,40000), nb_proc=c(77*78,64))
      predict(reg_rss, example, interval="prediction", level=0.95)/1E6

        nb_proc  size Gflops simulation_time      uss
      6      64 40000  87.26         111.729 95391744
      [1] 95.39174
               fit        lwr        upr
      1 8991.32610 8664.69163 9317.96056
      2   93.93216   88.10931   99.75501

      So we should need around 8 to 9 GB. Good.

      reg_time = lm(data=data, simulation_time ~ poly(size,3)*poly(nb_proc,2)) # Interactions do not bring much
      summary(reg_time)
      reg_time = lm(data=data, simulation_time ~ poly(size,3)+poly(nb_proc,2)+I(size*nb_proc))
      summary(reg_time)
      reg_time = lm(data=data, simulation_time ~ poly(size,2)+poly(nb_proc,1)+I(size*nb_proc))
      summary(reg_time)

      Call:
      lm(formula = simulation_time ~ poly(size, 3) * poly(nb_proc, 2), data = data)

      Residuals:
           Min       1Q   Median       3Q      Max 
      -14.6972  -2.8188   0.1211   1.4618  23.6037 

      Coefficients:
                                       Estimate Std. Error t value Pr(>|t|)    
      (Intercept)                       34.3882     0.8715  39.458  < 2e-16 ***
      poly(size, 3)1                   200.7402     6.9721  28.792  < 2e-16 ***
      poly(size, 3)2                    37.6113     6.9721   5.395 1.71e-06 ***
      poly(size, 3)3                     0.9386     6.9721   0.135   0.8934    
      poly(nb_proc, 2)1                110.2551     6.9721  15.814  < 2e-16 ***
      poly(nb_proc, 2)2                 -9.0383     6.9721  -1.296   0.2006    
      poly(size, 3)1:poly(nb_proc, 2)1 619.6089    55.7771  11.109 2.43e-15 ***
      poly(size, 3)2:poly(nb_proc, 2)1 101.1174    55.7771   1.813   0.0756 .  
      poly(size, 3)3:poly(nb_proc, 2)1  -2.3618    55.7771  -0.042   0.9664    
      poly(size, 3)1:poly(nb_proc, 2)2 -54.5865    55.7771  -0.979   0.3323    
      poly(size, 3)2:poly(nb_proc, 2)2 -13.4280    55.7771  -0.241   0.8107    
      poly(size, 3)3:poly(nb_proc, 2)2  -6.7984    55.7771  -0.122   0.9035    
      ---
      Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

      Residual standard error: 6.972 on 52 degrees of freedom
      Multiple R-squared:  0.9597,	Adjusted R-squared:  0.9511 
      F-statistic: 112.5 on 11 and 52 DF,  p-value: < 2.2e-16

      Call:
      lm(formula = simulation_time ~ poly(size, 3) + poly(nb_proc, 2) + I(size * nb_proc), data = data)

      Residuals:
           Min       1Q   Median       3Q      Max 
      -11.9992  -3.5157   0.0224   2.7090  25.8055 

      Coefficients:
                          Estimate Std. Error t value Pr(>|t|)    
      (Intercept)       -2.954e+00  3.452e+00  -0.856  0.39567    
      poly(size, 3)1     4.863e+01  1.527e+01   3.184  0.00236 ** 
      poly(size, 3)2     3.761e+01  6.930e+00   5.427 1.22e-06 ***
      poly(size, 3)3     9.386e-01  6.930e+00   0.135  0.89275    
      poly(nb_proc, 2)1 -4.186e+01  1.527e+01  -2.740  0.00818 ** 
      poly(nb_proc, 2)2 -9.038e+00  6.930e+00  -1.304  0.19742    
      I(size * nb_proc)  4.610e-05  4.125e-06  11.176 5.47e-16 ***
      ---
      Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

      Residual standard error: 6.93 on 57 degrees of freedom
      Multiple R-squared:  0.9563,	Adjusted R-squared:  0.9517 
      F-statistic:   208 on 6 and 57 DF,  p-value: < 2.2e-16

      Call:
      lm(formula = simulation_time ~ poly(size, 2) + poly(nb_proc, 1) + I(size * nb_proc), data = data)

      Residuals:
           Min       1Q   Median       3Q      Max 
      -11.8123  -3.6614   0.2628   2.4029  25.7019 

      Coefficients:
                          Estimate Std. Error t value Pr(>|t|)    
      (Intercept)       -2.954e+00  3.444e+00  -0.858  0.39442    
      poly(size, 2)1     4.863e+01  1.524e+01   3.191  0.00227 ** 
      poly(size, 2)2     3.761e+01  6.914e+00   5.440 1.07e-06 ***
      poly(nb_proc, 1)  -4.186e+01  1.524e+01  -2.747  0.00797 ** 
      I(size * nb_proc)  4.610e-05  4.115e-06  11.202 3.08e-16 ***
      ---
      Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

      Residual standard error: 6.914 on 59 degrees of freedom
      Multiple R-squared:  0.955,	Adjusted R-squared:  0.952 
      F-statistic: 313.1 on 4 and 59 DF,  p-value: < 2.2e-16
par(mfrow=c(2,3))
plot(data=data, simulation_time~size)
plot(data=data, simulation_time~nb_proc)
plot(reg_time)
par(mfrow=c(1,1))

[Figure 28.png: simulation_time vs. size, simulation_time vs. nb_proc, and the lm diagnostic plots for reg_time]
data[data$nb_proc==64 & data$size==40000,]
predict(reg_time, example, interval="prediction", level=0.95)/3600 # in hours
  nb_proc  size Gflops simulation_time      uss
6      64 40000  87.26         111.729 95391744
           fit          lwr          upr
1 467.31578577 385.82615026 548.80542127
2   0.03431702   0.03008967   0.03854438

Ouch. At roughly 467 hours, this would be a three-week simulation. :( We need to speed things up.


1.2.25 2017-03-31 Friday

  1. Found a bug in the last commits of Simgrid   SMPI BUG HPL
    • Issue reported on Github.
    • Bug fixed.
    • There are still some problems with HPL: some uninitialized values are used in comparisons, as the following Valgrind log shows:

      -
      -==3320== Memcheck, a memory error detector
      -==3320== Copyright (C) 2002-2015, and GNU GPL'd, by Julian Seward et al.
      -==3320== Using Valgrind-3.11.0 and LibVEX; rerun with -h for copyright info
      -==3320== Command: ./xhpl --cfg=surf/precision:1e-9 --cfg=network/model:SMPI --cfg=network/TCP-gamma:4194304 --cfg=smpi/bcast:mpich --cfg=smpi/running-power:6217956542.969 --cfg=smpi/display-timing:yes --cfg=smpi/privatize-global-variables:yes --cfg=smpi/shared-malloc:local --cfg=smpi/privatize-global-variables:1 ./cluster_fat_tree_64.xml smpitmp-apprXPdW8
      -==3320== 
      -[0.000000] [xbt_cfg/INFO] Configuration change: Set 'surf/precision' to '1e-9'
      -[0.000000] [xbt_cfg/INFO] Configuration change: Set 'network/model' to 'SMPI'
      -[0.000000] [xbt_cfg/INFO] Configuration change: Set 'network/TCP-gamma' to '4194304'
      -[0.000000] [xbt_cfg/INFO] Configuration change: Set 'smpi/bcast' to 'mpich'
      -[0.000000] [xbt_cfg/INFO] Configuration change: Set 'smpi/running-power' to '6217956542.969'
      -[0.000000] [xbt_cfg/INFO] Option smpi/running-power has been renamed to smpi/host-speed. Consider switching.
      -[0.000000] [xbt_cfg/INFO] Configuration change: Set 'smpi/display-timing' to 'yes'
      -[0.000000] [xbt_cfg/INFO] Configuration change: Set 'smpi/privatize-global-variables' to 'yes'
      -[0.000000] [xbt_cfg/INFO] Configuration change: Set 'smpi/shared-malloc' to 'local'
      -[0.000000] [xbt_cfg/INFO] Configuration change: Set 'smpi/privatize-global-variables' to '1'
      -[0.000000] [smpi_coll/INFO] Switch to algorithm mpich for collective bcast
      -================================================================================
      -HPLinpack 2.2  --  High-Performance Linpack benchmark  --   February 24, 2016
      -Written by A. Petitet and R. Clint Whaley,  Innovative Computing Laboratory, UTK
      -Modified by Piotr Luszczek, Innovative Computing Laboratory, UTK
      -Modified by Julien Langou, University of Colorado Denver
      -================================================================================
      -
      -An explanation of the input/output parameters follows:
      -T/V    : Wall time / encoded variant.
      -N      : The order of the coefficient matrix A.
      -NB     : The partitioning blocking factor.
      -P      : The number of process rows.
      -Q      : The number of process columns.
      -Time   : Time in seconds to solve the linear system.
      -Gflops : Rate of execution for solving the linear system.
      -
      -The following parameter values will be used:
      -
      -N      :      29       30       34       35 
      -NB     :       1        2        3        4 
      -PMAP   : Row-major process mapping
      -P      :       2        1        4 
      -Q      :       2        4        1 
      -PFACT  :    Left    Crout    Right 
      -NBMIN  :       2        4 
      -NDIV   :       2 
      -RFACT  :    Left    Crout    Right 
      -BCAST  :   1ring 
      -DEPTH  :       0 
      -SWAP   : Mix (threshold = 64)
      -L1     : transposed form
      -U      : transposed form
      -EQUIL  : yes
      -ALIGN  : 8 double precision words
      -
      ---------------------------------------------------------------------------------
      -
      -- The matrix A is randomly generated for each test.
      -- The following scaled residual check will be computed:
      -      ||Ax-b||_oo / ( eps * ( || x ||_oo * || A ||_oo + || b ||_oo ) * N )
      -- The relative machine precision (eps) is taken to be               1.110223e-16
      -- Computational tests pass if scaled residuals are less than                16.0
      -
      -==3320== Conditional jump or move depends on uninitialised value(s)
      -==3320==    at 0x42447D: HPL_pipid (HPL_pipid.c:144)
      -==3320==    by 0x418ED8: HPL_pdlaswp00T (HPL_pdlaswp00T.c:171)
      -==3320==    by 0x40E878: HPL_pdupdateTT (HPL_pdupdateTT.c:271)
      -==3320==    by 0x41AF9F: HPL_pdgesv0 (HPL_pdgesv0.c:152)
      -==3320==    by 0x40EFC4: HPL_pdgesv (HPL_pdgesv.c:103)
      -==3320==    by 0x406F64: HPL_pdtest (HPL_pdtest.c:197)
      -==3320==    by 0x401D38: smpi_simulated_main_ (HPL_pddriver.c:223)
      -==3320==    by 0x525BCDA: smpi_main_wrapper (smpi_global.cpp:366)
      -==3320==    by 0x5129B8D: operator() (functional.hpp:48)
      -==3320==    by 0x5129B8D: std::_Function_handler<void (), simgrid::xbt::MainFunction<int (*)(int, char**)> >::_M_invoke(std::_Any_data const&) (functional:1740)
      -==3320==    by 0x5151BB1: operator() (functional:2136)
      -==3320==    by 0x5151BB1: operator() (Context.hpp:92)
      -==3320==    by 0x5151BB1: simgrid::kernel::context::RawContext::wrapper(void*) (ContextRaw.cpp:303)
      -==3320== 
      -==3320== Conditional jump or move depends on uninitialised value(s)
      -==3320==    at 0x42476D: HPL_plindx0 (HPL_plindx0.c:246)
      -==3320==    by 0x418EF6: HPL_pdlaswp00T (HPL_pdlaswp00T.c:172)
      -==3320==    by 0x40E878: HPL_pdupdateTT (HPL_pdupdateTT.c:271)
      -==3320==    by 0x41AF9F: HPL_pdgesv0 (HPL_pdgesv0.c:152)
      -==3320==    by 0x40EFC4: HPL_pdgesv (HPL_pdgesv.c:103)
      -==3320==    by 0x406F64: HPL_pdtest (HPL_pdtest.c:197)
      -==3320==    by 0x401D38: smpi_simulated_main_ (HPL_pddriver.c:223)
      -==3320==    by 0x525BCDA: smpi_main_wrapper (smpi_global.cpp:366)
      -==3320==    by 0x5129B8D: operator() (functional.hpp:48)
      -==3320==    by 0x5129B8D: std::_Function_handler<void (), simgrid::xbt::MainFunction<int (*)(int, char**)> >::_M_invoke(std::_Any_data const&) (functional:1740)
      -==3320==    by 0x5151BB1: operator() (functional:2136)
      -==3320==    by 0x5151BB1: operator() (Context.hpp:92)
      -==3320==    by 0x5151BB1: simgrid::kernel::context::RawContext::wrapper(void*) (ContextRaw.cpp:303)
      -==3320== 
      -==3320== Conditional jump or move depends on uninitialised value(s)
      -==3320==    at 0x4247A9: HPL_plindx0 (HPL_plindx0.c:250)
      -==3320==    by 0x418EF6: HPL_pdlaswp00T (HPL_pdlaswp00T.c:172)
      -==3320==    by 0x40E878: HPL_pdupdateTT (HPL_pdupdateTT.c:271)
      -==3320==    by 0x41AF9F: HPL_pdgesv0 (HPL_pdgesv0.c:152)
      -==3320==    by 0x40EFC4: HPL_pdgesv (HPL_pdgesv.c:103)
      -==3320==    by 0x406F64: HPL_pdtest (HPL_pdtest.c:197)
      -==3320==    by 0x401D38: smpi_simulated_main_ (HPL_pddriver.c:223)
      -==3320==    by 0x525BCDA: smpi_main_wrapper (smpi_global.cpp:366)
      -==3320==    by 0x5129B8D: operator() (functional.hpp:48)
      -==3320==    by 0x5129B8D: std::_Function_handler<void (), simgrid::xbt::MainFunction<int (*)(int, char**)> >::_M_invoke(std::_Any_data const&) (functional:1740)
      -==3320==    by 0x5151BB1: operator() (functional:2136)
      -==3320==    by 0x5151BB1: operator() (Context.hpp:92)
      -==3320==    by 0x5151BB1: simgrid::kernel::context::RawContext::wrapper(void*) (ContextRaw.cpp:303)
      -==3320== 
      -==3320== Use of uninitialised value of size 8
      -==3320==    at 0x420413: HPL_dlaswp01T (HPL_dlaswp01T.c:240)
      -==3320==    by 0x418BDD: HPL_pdlaswp00T (HPL_pdlaswp00T.c:194)
      -==3320==    by 0x40E878: HPL_pdupdateTT (HPL_pdupdateTT.c:271)
      -==3320==    by 0x41AF9F: HPL_pdgesv0 (HPL_pdgesv0.c:152)
      -==3320==    by 0x40EFC4: HPL_pdgesv (HPL_pdgesv.c:103)
      -==3320==    by 0x406F64: HPL_pdtest (HPL_pdtest.c:197)
      -==3320==    by 0x401D38: smpi_simulated_main_ (HPL_pddriver.c:223)
      -==3320==    by 0x525BCDA: smpi_main_wrapper (smpi_global.cpp:366)
      -==3320==    by 0x5129B8D: operator() (functional.hpp:48)
      -==3320==    by 0x5129B8D: std::_Function_handler<void (), simgrid::xbt::MainFunction<int (*)(int, char**)> >::_M_invoke(std::_Any_data const&) (functional:1740)
      -==3320==    by 0x5151BB1: operator() (functional:2136)
      -==3320==    by 0x5151BB1: operator() (Context.hpp:92)
      -==3320==    by 0x5151BB1: simgrid::kernel::context::RawContext::wrapper(void*) (ContextRaw.cpp:303)
      -==3320== 
      -==3320== Conditional jump or move depends on uninitialised value(s)
      -==3320==    at 0x4E779CC: idamax_ (in /usr/lib/libblas/libblas.so.3.6.0)
      -==3320==    by 0x4E779FA: idamaxsub_ (in /usr/lib/libblas/libblas.so.3.6.0)
      -==3320==    by 0x4E4796F: cblas_idamax (in /usr/lib/libblas/libblas.so.3.6.0)
      -==3320==    by 0x4134F0: HPL_dlocmax (HPL_dlocmax.c:125)
      -==3320==    by 0x40B277: HPL_pdpanllT (HPL_pdpanllT.c:167)
      -==3320==    by 0x4243C8: HPL_pdfact (HPL_pdfact.c:129)
      -==3320==    by 0x41AF61: HPL_pdgesv0 (HPL_pdgesv0.c:146)
      -==3320==    by 0x40EFC4: HPL_pdgesv (HPL_pdgesv.c:103)
      -==3320==    by 0x406F64: HPL_pdtest (HPL_pdtest.c:197)
      -==3320==    by 0x401D38: smpi_simulated_main_ (HPL_pddriver.c:223)
      -==3320==    by 0x525BCDA: smpi_main_wrapper (smpi_global.cpp:366)
      -==3320==    by 0x5129B8D: operator() (functional.hpp:48)
      -==3320==    by 0x5129B8D: std::_Function_handler<void (), simgrid::xbt::MainFunction<int (*)(int, char**)> >::_M_invoke(std::_Any_data const&) (functional:1740)
      -==3320== 
      -==3320== Conditional jump or move depends on uninitialised value(s)
      -==3320==    at 0x417083: HPL_pdmxswp (HPL_pdmxswp.c:238)
      -==3320==    by 0x40B4C2: HPL_pdpanllT (HPL_pdpanllT.c:221)
      -==3320==    by 0x4243C8: HPL_pdfact (HPL_pdfact.c:129)
      -==3320==    by 0x41AF61: HPL_pdgesv0 (HPL_pdgesv0.c:146)
      -==3320==    by 0x40EFC4: HPL_pdgesv (HPL_pdgesv.c:103)
      -==3320==    by 0x406F64: HPL_pdtest (HPL_pdtest.c:197)
      -==3320==    by 0x401D38: smpi_simulated_main_ (HPL_pddriver.c:223)
      -==3320==    by 0x525BCDA: smpi_main_wrapper (smpi_global.cpp:366)
      -==3320==    by 0x5129B8D: operator() (functional.hpp:48)
      -==3320==    by 0x5129B8D: std::_Function_handler<void (), simgrid::xbt::MainFunction<int (*)(int, char**)> >::_M_invoke(std::_Any_data const&) (functional:1740)
      -==3320==    by 0x5151BB1: operator() (functional:2136)
      -==3320==    by 0x5151BB1: operator() (Context.hpp:92)
      -==3320==    by 0x5151BB1: simgrid::kernel::context::RawContext::wrapper(void*) (ContextRaw.cpp:303)
      -==3320== 
      -==3320== Conditional jump or move depends on uninitialised value(s)
      -==3320==    at 0x417098: HPL_pdmxswp (HPL_pdmxswp.c:238)
      -==3320==    by 0x40B4C2: HPL_pdpanllT (HPL_pdpanllT.c:221)
      -==3320==    by 0x4243C8: HPL_pdfact (HPL_pdfact.c:129)
      -==3320==    by 0x41AF61: HPL_pdgesv0 (HPL_pdgesv0.c:146)
      -==3320==    by 0x40EFC4: HPL_pdgesv (HPL_pdgesv.c:103)
      -==3320==    by 0x406F64: HPL_pdtest (HPL_pdtest.c:197)
      -==3320==    by 0x401D38: smpi_simulated_main_ (HPL_pddriver.c:223)
      -==3320==    by 0x525BCDA: smpi_main_wrapper (smpi_global.cpp:366)
      -==3320==    by 0x5129B8D: operator() (functional.hpp:48)
      -==3320==    by 0x5129B8D: std::_Function_handler<void (), simgrid::xbt::MainFunction<int (*)(int, char**)> >::_M_invoke(std::_Any_data const&) (functional:1740)
      -==3320==    by 0x5151BB1: operator() (functional:2136)
      -==3320==    by 0x5151BB1: operator() (Context.hpp:92)
      -==3320==    by 0x5151BB1: simgrid::kernel::context::RawContext::wrapper(void*) (ContextRaw.cpp:303)
      -==3320== 
      -==3320== Conditional jump or move depends on uninitialised value(s)
      -==3320==    at 0x4170A2: HPL_pdmxswp (HPL_pdmxswp.c:239)
      -==3320==    by 0x40B4C2: HPL_pdpanllT (HPL_pdpanllT.c:221)
      -==3320==    by 0x4243C8: HPL_pdfact (HPL_pdfact.c:129)
      -==3320==    by 0x41AF61: HPL_pdgesv0 (HPL_pdgesv0.c:146)
      -==3320==    by 0x40EFC4: HPL_pdgesv (HPL_pdgesv.c:103)
      -==3320==    by 0x406F64: HPL_pdtest (HPL_pdtest.c:197)
      -==3320==    by 0x401D38: smpi_simulated_main_ (HPL_pddriver.c:223)
      -==3320==    by 0x525BCDA: smpi_main_wrapper (smpi_global.cpp:366)
      -==3320==    by 0x5129B8D: operator() (functional.hpp:48)
      -==3320==    by 0x5129B8D: std::_Function_handler<void (), simgrid::xbt::MainFunction<int (*)(int, char**)> >::_M_invoke(std::_Any_data const&) (functional:1740)
      -==3320==    by 0x5151BB1: operator() (functional:2136)
      -==3320==    by 0x5151BB1: operator() (Context.hpp:92)
      -==3320==    by 0x5151BB1: simgrid::kernel::context::RawContext::wrapper(void*) (ContextRaw.cpp:303)
      -==3320== 
      -==3320== Conditional jump or move depends on uninitialised value(s)
      -==3320==    at 0x4170A4: HPL_pdmxswp (HPL_pdmxswp.c:239)
      -==3320==    by 0x40B4C2: HPL_pdpanllT (HPL_pdpanllT.c:221)
      -==3320==    by 0x4243C8: HPL_pdfact (HPL_pdfact.c:129)
      -==3320==    by 0x41AF61: HPL_pdgesv0 (HPL_pdgesv0.c:146)
      -==3320==    by 0x40EFC4: HPL_pdgesv (HPL_pdgesv.c:103)
      -==3320==    by 0x406F64: HPL_pdtest (HPL_pdtest.c:197)
      -==3320==    by 0x401D38: smpi_simulated_main_ (HPL_pddriver.c:223)
      -==3320==    by 0x525BCDA: smpi_main_wrapper (smpi_global.cpp:366)
      -==3320==    by 0x5129B8D: operator() (functional.hpp:48)
      -==3320==    by 0x5129B8D: std::_Function_handler<void (), simgrid::xbt::MainFunction<int (*)(int, char**)> >::_M_invoke(std::_Any_data const&) (functional:1740)
      -==3320==    by 0x5151BB1: operator() (functional:2136)
      -==3320==    by 0x5151BB1: operator() (Context.hpp:92)
      -==3320==    by 0x5151BB1: simgrid::kernel::context::RawContext::wrapper(void*) (ContextRaw.cpp:303)
      -==3320== 
      -==3320== Conditional jump or move depends on uninitialised value(s)
      -==3320==    at 0x4170A6: HPL_pdmxswp (HPL_pdmxswp.c:239)
      -==3320==    by 0x40B4C2: HPL_pdpanllT (HPL_pdpanllT.c:221)
      -==3320==    by 0x4243C8: HPL_pdfact (HPL_pdfact.c:129)
      -==3320==    by 0x41AF61: HPL_pdgesv0 (HPL_pdgesv0.c:146)
      -==3320==    by 0x40EFC4: HPL_pdgesv (HPL_pdgesv.c:103)
      -==3320==    by 0x406F64: HPL_pdtest (HPL_pdtest.c:197)
      -==3320==    by 0x401D38: smpi_simulated_main_ (HPL_pddriver.c:223)
      -==3320==    by 0x525BCDA: smpi_main_wrapper (smpi_global.cpp:366)
      -==3320==    by 0x5129B8D: operator() (functional.hpp:48)
      -==3320==    by 0x5129B8D: std::_Function_handler<void (), simgrid::xbt::MainFunction<int (*)(int, char**)> >::_M_invoke(std::_Any_data const&) (functional:1740)
      -==3320==    by 0x5151BB1: operator() (functional:2136)
      -==3320==    by 0x5151BB1: operator() (Context.hpp:92)
      -==3320==    by 0x5151BB1: simgrid::kernel::context::RawContext::wrapper(void*) (ContextRaw.cpp:303)
      -==3320== 
      -==3320== Conditional jump or move depends on uninitialised value(s)
      -==3320==    at 0x4150D5: HPL_dlocswpT (HPL_dlocswpT.c:134)
      -==3320==    by 0x40B4D2: HPL_pdpanllT (HPL_pdpanllT.c:222)
      -==3320==    by 0x4243C8: HPL_pdfact (HPL_pdfact.c:129)
      -==3320==    by 0x41AF61: HPL_pdgesv0 (HPL_pdgesv0.c:146)
      -==3320==    by 0x40EFC4: HPL_pdgesv (HPL_pdgesv.c:103)
      -==3320==    by 0x406F64: HPL_pdtest (HPL_pdtest.c:197)
      -==3320==    by 0x401D38: smpi_simulated_main_ (HPL_pddriver.c:223)
      -==3320==    by 0x525BCDA: smpi_main_wrapper (smpi_global.cpp:366)
      -==3320==    by 0x5129B8D: operator() (functional.hpp:48)
      -==3320==    by 0x5129B8D: std::_Function_handler<void (), simgrid::xbt::MainFunction<int (*)(int, char**)> >::_M_invoke(std::_Any_data const&) (functional:1740)
      -==3320==    by 0x5151BB1: operator() (functional:2136)
      -==3320==    by 0x5151BB1: operator() (Context.hpp:92)
      -==3320==    by 0x5151BB1: simgrid::kernel::context::RawContext::wrapper(void*) (ContextRaw.cpp:303)
      -==3320== 
      -==3320== Conditional jump or move depends on uninitialised value(s)
      -==3320==    at 0x4150D7: HPL_dlocswpT (HPL_dlocswpT.c:134)
      -==3320==    by 0x40B4D2: HPL_pdpanllT (HPL_pdpanllT.c:222)
      -==3320==    by 0x4243C8: HPL_pdfact (HPL_pdfact.c:129)
      -==3320==    by 0x41AF61: HPL_pdgesv0 (HPL_pdgesv0.c:146)
      -==3320==    by 0x40EFC4: HPL_pdgesv (HPL_pdgesv.c:103)
      -==3320==    by 0x406F64: HPL_pdtest (HPL_pdtest.c:197)
      -==3320==    by 0x401D38: smpi_simulated_main_ (HPL_pddriver.c:223)
      -==3320==    by 0x525BCDA: smpi_main_wrapper (smpi_global.cpp:366)
      -==3320==    by 0x5129B8D: operator() (functional.hpp:48)
      -==3320==    by 0x5129B8D: std::_Function_handler<void (), simgrid::xbt::MainFunction<int (*)(int, char**)> >::_M_invoke(std::_Any_data const&) (functional:1740)
      -==3320==    by 0x5151BB1: operator() (functional:2136)
      -==3320==    by 0x5151BB1: operator() (Context.hpp:92)
      -==3320==    by 0x5151BB1: simgrid::kernel::context::RawContext::wrapper(void*) (ContextRaw.cpp:303)
      -==3320== 
      -==3320== Conditional jump or move depends on uninitialised value(s)
      -==3320==    at 0x40B4DF: HPL_pdpanllT (HPL_pdpanllT.c:223)
      -==3320==    by 0x4243C8: HPL_pdfact (HPL_pdfact.c:129)
      -==3320==    by 0x41AF61: HPL_pdgesv0 (HPL_pdgesv0.c:146)
      -==3320==    by 0x40EFC4: HPL_pdgesv (HPL_pdgesv.c:103)
      -==3320==    by 0x406F64: HPL_pdtest (HPL_pdtest.c:197)
      -==3320==    by 0x401D38: smpi_simulated_main_ (HPL_pddriver.c:223)
      -==3320==    by 0x525BCDA: smpi_main_wrapper (smpi_global.cpp:366)
      -==3320==    by 0x5129B8D: operator() (functional.hpp:48)
      -==3320==    by 0x5129B8D: std::_Function_handler<void (), simgrid::xbt::MainFunction<int (*)(int, char**)> >::_M_invoke(std::_Any_data const&) (functional:1740)
      -==3320==    by 0x5151BB1: operator() (functional:2136)
      -==3320==    by 0x5151BB1: operator() (Context.hpp:92)
      -==3320==    by 0x5151BB1: simgrid::kernel::context::RawContext::wrapper(void*) (ContextRaw.cpp:303)
      -==3320== 
      -==3320== Conditional jump or move depends on uninitialised value(s)
      -==3320==    at 0x40B4E1: HPL_pdpanllT (HPL_pdpanllT.c:223)
      -==3320==    by 0x4243C8: HPL_pdfact (HPL_pdfact.c:129)
      -==3320==    by 0x41AF61: HPL_pdgesv0 (HPL_pdgesv0.c:146)
      -==3320==    by 0x40EFC4: HPL_pdgesv (HPL_pdgesv.c:103)
      -==3320==    by 0x406F64: HPL_pdtest (HPL_pdtest.c:197)
      -==3320==    by 0x401D38: smpi_simulated_main_ (HPL_pddriver.c:223)
      -==3320==    by 0x525BCDA: smpi_main_wrapper (smpi_global.cpp:366)
      -==3320==    by 0x5129B8D: operator() (functional.hpp:48)
      -==3320==    by 0x5129B8D: std::_Function_handler<void (), simgrid::xbt::MainFunction<int (*)(int, char**)> >::_M_invoke(std::_Any_data const&) (functional:1740)
      -==3320==    by 0x5151BB1: operator() (functional:2136)
      -==3320==    by 0x5151BB1: operator() (Context.hpp:92)
      -==3320==    by 0x5151BB1: simgrid::kernel::context::RawContext::wrapper(void*) (ContextRaw.cpp:303)
      -==3320== 
      -==3320== Conditional jump or move depends on uninitialised value(s)
      -==3320==    at 0x42483B: HPL_plindx0 (HPL_plindx0.c:255)
      -==3320==    by 0x418EF6: HPL_pdlaswp00T (HPL_pdlaswp00T.c:172)
      -==3320==    by 0x40E878: HPL_pdupdateTT (HPL_pdupdateTT.c:271)
      -==3320==    by 0x41AF9F: HPL_pdgesv0 (HPL_pdgesv0.c:152)
      -==3320==    by 0x40EFC4: HPL_pdgesv (HPL_pdgesv.c:103)
      -==3320==    by 0x406F64: HPL_pdtest (HPL_pdtest.c:197)
      -==3320==    by 0x401D38: smpi_simulated_main_ (HPL_pddriver.c:223)
      -==3320==    by 0x525BCDA: smpi_main_wrapper (smpi_global.cpp:366)
      -==3320==    by 0x5129B8D: operator() (functional.hpp:48)
      -==3320==    by 0x5129B8D: std::_Function_handler<void (), simgrid::xbt::MainFunction<int (*)(int, char**)> >::_M_invoke(std::_Any_data const&) (functional:1740)
      -==3320==    by 0x5151BB1: operator() (functional:2136)
      -==3320==    by 0x5151BB1: operator() (Context.hpp:92)
      -==3320==    by 0x5151BB1: simgrid::kernel::context::RawContext::wrapper(void*) (ContextRaw.cpp:303)
      -==3320== 
      -==3320== Conditional jump or move depends on uninitialised value(s)
      -==3320==    at 0x424877: HPL_plindx0 (HPL_plindx0.c:269)
      -==3320==    by 0x418EF6: HPL_pdlaswp00T (HPL_pdlaswp00T.c:172)
      -==3320==    by 0x40E878: HPL_pdupdateTT (HPL_pdupdateTT.c:271)
      -==3320==    by 0x41AF9F: HPL_pdgesv0 (HPL_pdgesv0.c:152)
      -==3320==    by 0x40EFC4: HPL_pdgesv (HPL_pdgesv.c:103)
      -==3320==    by 0x406F64: HPL_pdtest (HPL_pdtest.c:197)
      -==3320==    by 0x401D38: smpi_simulated_main_ (HPL_pddriver.c:223)
      -==3320==    by 0x525BCDA: smpi_main_wrapper (smpi_global.cpp:366)
      -==3320==    by 0x5129B8D: operator() (functional.hpp:48)
      -==3320==    by 0x5129B8D: std::_Function_handler<void (), simgrid::xbt::MainFunction<int (*)(int, char**)> >::_M_invoke(std::_Any_data const&) (functional:1740)
      -==3320==    by 0x5151BB1: operator() (functional:2136)
      -==3320==    by 0x5151BB1: operator() (Context.hpp:92)
      -==3320==    by 0x5151BB1: simgrid::kernel::context::RawContext::wrapper(void*) (ContextRaw.cpp:303)
      -==3320== 
      -==3320== Use of uninitialised value of size 8
      -==3320==    at 0x420B90: HPL_dlaswp02N (HPL_dlaswp02N.c:199)
      -==3320==    by 0x418570: HPL_pdlaswp00T (HPL_pdlaswp00T.c:198)
      -==3320==    by 0x40E878: HPL_pdupdateTT (HPL_pdupdateTT.c:271)
      -==3320==    by 0x41AF9F: HPL_pdgesv0 (HPL_pdgesv0.c:152)
      -==3320==    by 0x40EFC4: HPL_pdgesv (HPL_pdgesv.c:103)
      -==3320==    by 0x406F64: HPL_pdtest (HPL_pdtest.c:197)
      -==3320==    by 0x401D38: smpi_simulated_main_ (HPL_pddriver.c:223)
      -==3320==    by 0x525BCDA: smpi_main_wrapper (smpi_global.cpp:366)
      -==3320==    by 0x5129B8D: operator() (functional.hpp:48)
      -==3320==    by 0x5129B8D: std::_Function_handler<void (), simgrid::xbt::MainFunction<int (*)(int, char**)> >::_M_invoke(std::_Any_data const&) (functional:1740)
      -==3320==    by 0x5151BB1: operator() (functional:2136)
      -==3320==    by 0x5151BB1: operator() (Context.hpp:92)
      -==3320==    by 0x5151BB1: simgrid::kernel::context::RawContext::wrapper(void*) (ContextRaw.cpp:303)
      -==3320== 
      -==3320== Use of uninitialised value of size 8
      -==3320==    at 0x422901: HPL_dlaswp04T (HPL_dlaswp04T.c:259)
      -==3320==    by 0x418CC3: HPL_pdlaswp00T (HPL_pdlaswp00T.c:329)
      -==3320==    by 0x40E878: HPL_pdupdateTT (HPL_pdupdateTT.c:271)
      -==3320==    by 0x41AF9F: HPL_pdgesv0 (HPL_pdgesv0.c:152)
      -==3320==    by 0x40EFC4: HPL_pdgesv (HPL_pdgesv.c:103)
      -==3320==    by 0x406F64: HPL_pdtest (HPL_pdtest.c:197)
      -==3320==    by 0x401D38: smpi_simulated_main_ (HPL_pddriver.c:223)
      -==3320==    by 0x525BCDA: smpi_main_wrapper (smpi_global.cpp:366)
      -==3320==    by 0x5129B8D: operator() (functional.hpp:48)
      -==3320==    by 0x5129B8D: std::_Function_handler<void (), simgrid::xbt::MainFunction<int (*)(int, char**)> >::_M_invoke(std::_Any_data const&) (functional:1740)
      -==3320==    by 0x5151BB1: operator() (functional:2136)
      -==3320==    by 0x5151BB1: operator() (Context.hpp:92)
      -==3320==    by 0x5151BB1: simgrid::kernel::context::RawContext::wrapper(void*) (ContextRaw.cpp:303)
      -==3320== 
      -==3320== Conditional jump or move depends on uninitialised value(s)
      -==3320==    at 0x41F06D: HPL_pdpanel_free (HPL_pdpanel_free.c:79)
      -==3320==    by 0x41AF31: HPL_pdgesv0 (HPL_pdgesv0.c:141)
      -==3320==    by 0x40EFC4: HPL_pdgesv (HPL_pdgesv.c:103)
      -==3320==    by 0x406F64: HPL_pdtest (HPL_pdtest.c:197)
      -==3320==    by 0x401D38: smpi_simulated_main_ (HPL_pddriver.c:223)
      -==3320==    by 0x525BCDA: smpi_main_wrapper (smpi_global.cpp:366)
      -==3320==    by 0x5129B8D: operator() (functional.hpp:48)
      -==3320==    by 0x5129B8D: std::_Function_handler<void (), simgrid::xbt::MainFunction<int (*)(int, char**)> >::_M_invoke(std::_Any_data const&) (functional:1740)
      -==3320==    by 0x5151BB1: operator() (functional:2136)
      -==3320==    by 0x5151BB1: operator() (Context.hpp:92)
      -==3320==    by 0x5151BB1: simgrid::kernel::context::RawContext::wrapper(void*) (ContextRaw.cpp:303)
      -==3320== 
      -==3320== Conditional jump or move depends on uninitialised value(s)
      -==3320==    at 0x4248A5: HPL_plindx0 (HPL_plindx0.c:258)
      -==3320==    by 0x418EF6: HPL_pdlaswp00T (HPL_pdlaswp00T.c:172)
      -==3320==    by 0x40E878: HPL_pdupdateTT (HPL_pdupdateTT.c:271)
      -==3320==    by 0x41AF9F: HPL_pdgesv0 (HPL_pdgesv0.c:152)
      -==3320==    by 0x40EFC4: HPL_pdgesv (HPL_pdgesv.c:103)
      -==3320==    by 0x406F64: HPL_pdtest (HPL_pdtest.c:197)
      -==3320==    by 0x401D38: smpi_simulated_main_ (HPL_pddriver.c:223)
      -==3320==    by 0x525BCDA: smpi_main_wrapper (smpi_global.cpp:366)
      -==3320==    by 0x5129B8D: operator() (functional.hpp:48)
      -==3320==    by 0x5129B8D: std::_Function_handler<void (), simgrid::xbt::MainFunction<int (*)(int, char**)> >::_M_invoke(std::_Any_data const&) (functional:1740)
      -==3320==    by 0x5151BB1: operator() (functional:2136)
      -==3320==    by 0x5151BB1: operator() (Context.hpp:92)
      -==3320==    by 0x5151BB1: simgrid::kernel::context::RawContext::wrapper(void*) (ContextRaw.cpp:303)
      -==3320== 
      -==3320== Conditional jump or move depends on uninitialised value(s)
      -==3320==    at 0x4203FF: HPL_dlaswp01T (HPL_dlaswp01T.c:237)
      -==3320==    by 0x418BDD: HPL_pdlaswp00T (HPL_pdlaswp00T.c:194)
      -==3320==    by 0x40E878: HPL_pdupdateTT (HPL_pdupdateTT.c:271)
      -==3320==    by 0x41AF9F: HPL_pdgesv0 (HPL_pdgesv0.c:152)
      -==3320==    by 0x40EFC4: HPL_pdgesv (HPL_pdgesv.c:103)
      -==3320==    by 0x406F64: HPL_pdtest (HPL_pdtest.c:197)
      -==3320==    by 0x401D38: smpi_simulated_main_ (HPL_pddriver.c:223)
      -==3320==    by 0x525BCDA: smpi_main_wrapper (smpi_global.cpp:366)
      -==3320==    by 0x5129B8D: operator() (functional.hpp:48)
      -==3320==    by 0x5129B8D: std::_Function_handler<void (), simgrid::xbt::MainFunction<int (*)(int, char**)> >::_M_invoke(std::_Any_data const&) (functional:1740)
      -==3320==    by 0x5151BB1: operator() (functional:2136)
      -==3320==    by 0x5151BB1: operator() (Context.hpp:92)
      -==3320==    by 0x5151BB1: simgrid::kernel::context::RawContext::wrapper(void*) (ContextRaw.cpp:303)
      -==3320== 
      -==3320== Use of uninitialised value of size 8
      -==3320==    at 0x4205A0: HPL_dlaswp01T (HPL_dlaswp01T.c:245)
      -==3320==    by 0x418BDD: HPL_pdlaswp00T (HPL_pdlaswp00T.c:194)
      -==3320==    by 0x40E878: HPL_pdupdateTT (HPL_pdupdateTT.c:271)
      -==3320==    by 0x41AF9F: HPL_pdgesv0 (HPL_pdgesv0.c:152)
      -==3320==    by 0x40EFC4: HPL_pdgesv (HPL_pdgesv.c:103)
      -==3320==    by 0x406F64: HPL_pdtest (HPL_pdtest.c:197)
      -==3320==    by 0x401D38: smpi_simulated_main_ (HPL_pddriver.c:223)
      -==3320==    by 0x525BCDA: smpi_main_wrapper (smpi_global.cpp:366)
      -==3320==    by 0x5129B8D: operator() (functional.hpp:48)
      -==3320==    by 0x5129B8D: std::_Function_handler<void (), simgrid::xbt::MainFunction<int (*)(int, char**)> >::_M_invoke(std::_Any_data const&) (functional:1740)
      -==3320==    by 0x5151BB1: operator() (functional:2136)
      -==3320==    by 0x5151BB1: operator() (Context.hpp:92)
      -==3320==    by 0x5151BB1: simgrid::kernel::context::RawContext::wrapper(void*) (ContextRaw.cpp:303)
      -==3320== 
      -==3320== Conditional jump or move depends on uninitialised value(s)
      -==3320==    at 0x4170B5: HPL_pdmxswp (HPL_pdmxswp.c:240)
      -==3320==    by 0x40B4C2: HPL_pdpanllT (HPL_pdpanllT.c:221)
      -==3320==    by 0x4243C8: HPL_pdfact (HPL_pdfact.c:129)
      -==3320==    by 0x41AF61: HPL_pdgesv0 (HPL_pdgesv0.c:146)
      -==3320==    by 0x40EFC4: HPL_pdgesv (HPL_pdgesv.c:103)
      -==3320==    by 0x406F64: HPL_pdtest (HPL_pdtest.c:197)
      -==3320==    by 0x401D38: smpi_simulated_main_ (HPL_pddriver.c:223)
      -==3320==    by 0x525BCDA: smpi_main_wrapper (smpi_global.cpp:366)
      -==3320==    by 0x5129B8D: operator() (functional.hpp:48)
      -==3320==    by 0x5129B8D: std::_Function_handler<void (), simgrid::xbt::MainFunction<int (*)(int, char**)> >::_M_invoke(std::_Any_data const&) (functional:1740)
      -==3320==    by 0x5151BB1: operator() (functional:2136)
      -==3320==    by 0x5151BB1: operator() (Context.hpp:92)
      -==3320==    by 0x5151BB1: simgrid::kernel::context::RawContext::wrapper(void*) (ContextRaw.cpp:303)
      -==3320== 
      -==3320== Conditional jump or move depends on uninitialised value(s)
      -==3320==    at 0x41F06D: HPL_pdpanel_free (HPL_pdpanel_free.c:79)
      -==3320==    by 0x41F040: HPL_pdpanel_disp (HPL_pdpanel_disp.c:89)
      -==3320==    by 0x41AFCD: HPL_pdgesv0 (HPL_pdgesv0.c:161)
      -==3320==    by 0x40EFC4: HPL_pdgesv (HPL_pdgesv.c:103)
      -==3320==    by 0x406F64: HPL_pdtest (HPL_pdtest.c:197)
      -==3320==    by 0x401D38: smpi_simulated_main_ (HPL_pddriver.c:223)
      -==3320==    by 0x525BCDA: smpi_main_wrapper (smpi_global.cpp:366)
      -==3320==    by 0x5129B8D: operator() (functional.hpp:48)
      -==3320==    by 0x5129B8D: std::_Function_handler<void (), simgrid::xbt::MainFunction<int (*)(int, char**)> >::_M_invoke(std::_Any_data const&) (functional:1740)
      -==3320==    by 0x5151BB1: operator() (functional:2136)
      -==3320==    by 0x5151BB1: operator() (Context.hpp:92)
      -==3320==    by 0x5151BB1: simgrid::kernel::context::RawContext::wrapper(void*) (ContextRaw.cpp:303)
      -==3320== 
      -[0.884470] /home/degomme/simgrid/src/simix/smx_global.cpp:567: [simix_kernel/CRITICAL] Oops ! Deadlock or code not perfectly clean.
      -[0.884470] [simix_kernel/INFO] 16 processes are still running, waiting for something.
      -[0.884470] [simix_kernel/INFO] Legend of the following listing: "Process <pid> (<name>@<host>): <status>"
      -[0.884470] [simix_kernel/INFO] Process 1 (0@host-0.hawaii.edu): waiting for communication synchro 0xfb4beb0 () in state 0 to finish
      -[0.884470] [simix_kernel/INFO] Process 2 (1@host-1.hawaii.edu): waiting for communication synchro 0xfb4b0c0 () in state 0 to finish
      -[0.884470] [simix_kernel/INFO] Process 3 (2@host-2.hawaii.edu): waiting for communication synchro 0xfb49760 () in state 0 to finish
      -[0.884470] [simix_kernel/INFO] Process 4 (3@host-3.hawaii.edu): waiting for communication synchro 0xfb47590 () in state 0 to finish
      -[0.884470] [simix_kernel/INFO] Process 5 (4@host-4.hawaii.edu): waiting for synchronization synchro 0xf8a1ae0 () in state 0 to finish
      -[0.884470] [simix_kernel/INFO] Process 6 (5@host-5.hawaii.edu): waiting for synchronization synchro 0xf8a1f10 () in state 0 to finish
      -[0.884470] [simix_kernel/INFO] Process 7 (6@host-6.hawaii.edu): waiting for synchronization synchro 0xf897500 () in state 0 to finish
      -[0.884470] [simix_kernel/INFO] Process 8 (7@host-7.hawaii.edu): waiting for synchronization synchro 0xf89b190 () in state 0 to finish
      -[0.884470] [simix_kernel/INFO] Process 9 (8@host-8.hawaii.edu): waiting for synchronization synchro 0xf8a3680 () in state 0 to finish
      -[0.884470] [simix_kernel/INFO] Process 10 (9@host-9.hawaii.edu): waiting for synchronization synchro 0xf896280 () in state 0 to finish
      -[0.884470] [simix_kernel/INFO] Process 11 (10@host-10.hawaii.edu): waiting for synchronization synchro 0xf8970d0 () in state 0 to finish
      -[0.884470] [simix_kernel/INFO] Process 12 (11@host-11.hawaii.edu): waiting for synchronization synchro 0xf89b5c0 () in state 0 to finish
      -[0.884470] [simix_kernel/INFO] Process 13 (12@host-12.hawaii.edu): waiting for synchronization synchro 0xf89ce30 () in state 0 to finish
      -[0.884470] [simix_kernel/INFO] Process 14 (13@host-13.hawaii.edu): waiting for synchronization synchro 0xf89f530 () in state 0 to finish
      -[0.884470] [simix_kernel/INFO] Process 15 (14@host-14.hawaii.edu): waiting for synchronization synchro 0xf89f100 () in state 0 to finish
      -[0.884470] [simix_kernel/INFO] Process 16 (15@host-15.hawaii.edu): waiting for synchronization synchro 0xf8a0ca0 () in state 0 to finish
      -==3320== 
      -==3320== Process terminating with default action of signal 6 (SIGABRT)
      -==3320==    at 0x5619428: raise (raise.c:54)
      -==3320==    by 0x561B029: abort (abort.c:89)
      -==3320==    by 0x52347B8: xbt_abort (xbt_main.cpp:167)
      -==3320==    by 0x52F4768: SIMIX_run.part.110 (smx_global.cpp:569)
      -==3320==    by 0x52F6204: SIMIX_run (stl_algobase.h:224)
      -==3320==    by 0x5263E66: smpi_main (smpi_global.cpp:474)
      -==3320==    by 0x560482F: (below main) (libc-start.c:291)
      -==3320== 
      -==3320== HEAP SUMMARY:
      -==3320==     in use at exit: 136,159,788 bytes in 7,560 blocks
      -==3320==   total heap usage: 39,378 allocs, 31,818 frees, 140,230,437 bytes allocated
      -==3320== 
      -==3320== LEAK SUMMARY:
      -==3320==    definitely lost: 321 bytes in 4 blocks
      -==3320==    indirectly lost: 0 bytes in 0 blocks
      -==3320==      possibly lost: 134,294,280 bytes in 96 blocks
      -==3320==    still reachable: 1,865,187 bytes in 7,460 blocks
      -==3320==         suppressed: 0 bytes in 0 blocks
      -==3320== Rerun with --leak-check=full to see details of leaked memory
      -==3320== 
      -==3320== For counts of detected and suppressed errors, rerun with: -v
      -==3320== Use --track-origins=yes to see where uninitialised values come from
      -==3320== ERROR SUMMARY: 1147 errors from 24 contexts (suppressed: 0 from 0)
      -valgrind --track-origins:yes ./xhpl --cfg=surf/precision:1e-9 --cfg=network/model:SMPI --cfg=network/TCP-gamma:4194304 --cfg=smpi/bcast:mpich --cfg=smpi/running-power:6217956542.969 --cfg=smpi/display-timing:yes --cfg=smpi/privatize-global-variables:yes --cfg=smpi/shared-malloc:local --cfg=smpi/privatize-global-variables:1 ./cluster_fat_tree_64.xml smpitmp-apprXPdW8
      -Execution failed with code 134.
      -
    • Note that this log has been obtained with a nearly-vanilla HPL (see the Github issue): no smpi_usleep, and shared malloc only for the matrix (no partial shared malloc for PANEL->WORK). It is therefore quite strange to see such errors.
    • The first error (HPL_pipid.c:144) happens because PANEL->ia is uninitialized (checked by modifying the two operands one after the other to see whether the error persists).

1.3 2017-04 April


1.4 2017-05 May


1.5 2017-06 June


1.5.1 2017-06-01 Thursday

  1. Redo validation of huge pages   SMPI EXPERIMENTS HPL REPORT
    • Simgrid commit: 9a8e2f5bce8c6758d4367d21a66466a497d136fe
    • HPL commit: 41774905395aebcb73650defaa7e2aa462e6e1a3
    • Script commit: eb071f09d822e1031ea0776949058bf2f55cb94a
    • Compilation and execution for optimized HPL (done on nova-10 without huge pages and on nova-11 with huge pages):

      make SMPI_OPTS="-DSMPI_OPTIMIZATION_LEVEL=4 -DSMPI_DGEMM_COEFFICIENT=1.742435e-10 -DSMPI_DTRSM_COEFFICIENT=8.897459e-11" arch=SMPI

      sysctl -w vm.overcommit_memory=1 && sysctl -w vm.max_map_count=40000000
      mount none /root/huge -t hugetlbfs -o rw,mode=0777 && echo 1 >> /proc/sys/vm/nr_hugepages

      ./run_measures.py --global_csv result_size.csv --nb_runs 3 --size 50000,100000,150000,200000,250000,300000 --nb_proc 64 --topo "2;16,32;1,16;1,1" --experiment HPL --running_power 5004882812.500 --nb_cpu 8
      ./run_measures.py --global_csv result_size.csv --nb_runs 3 --size 50000,100000,150000,200000,250000,300000 --nb_proc 64 --topo "2;16,32;1,16;1,1" --experiment HPL --running_power 5004882812.500 --nb_cpu 8 --hugepage /root/huge
    • Analysis:

      library(ggplot2)
      library(gridExtra)
      old <- rbind(read.csv("validation/result_size_L4_big_nohugepage.csv"), read.csv("validation/result_size_L4_big_nohugepage_2.csv"))
      new <- read.csv("validation/result_size_L4_big_hugepage.csv")
      old$hugepage = FALSE
      new$hugepage = TRUE
      results = rbind(old, new)
      do_plot(results, "size", "simulation_time", "hugepage", "Huge page", 64)

      [Figure validation/hugepage/1.pdf: simulation time vs. matrix size, with and without huge pages]
      do_plot(results, "size", "memory_size", "hugepage", "Huge page", 64)

      [Figure validation/hugepage/3.pdf: memory consumption vs. matrix size, with and without huge pages]
      do_plot(results, "size", "Gflops", "hugepage", "Huge page", 64)

      [Figure validation/hugepage/5.pdf: Gflops vs. matrix size, with and without huge pages]
      grid_arrange_shared_legend(
          do_plot(results, "size", "simulation_time", "hugepage", "Huge page", 64),
          do_plot(results, "size", "memory_size", "hugepage", "Huge page", 64),
          nrow=1, ncol=2
      )

      [Figure validation/hugepage/report_plot.pdf: the two previous plots side by side with a shared legend]
      plot1 = generic_do_plot(ggplot(results, aes(x=size, y=cpu_utilization, color=hugepage))) +
          ggtitle("CPU utilization for different matrix sizes\nUsing 64 MPI processes")
      plot2 = generic_do_plot(ggplot(results, aes(x=size, y=minor_page_fault, color=hugepage))) +
          ggtitle("Number of page faults for different matrix sizes\nUsing 64 MPI processes")
      grid.arrange(plot1, plot2, ncol=2)

      [Figure 2.png: CPU utilization and minor page faults vs. matrix size, with and without huge pages]
    library(data.table)
    aggregate_results <- function(results) {
        x = data.table(results)
        x = as.data.frame(x[, list(simulation_time=mean(simulation_time), Gflops=mean(Gflops), application_time=mean(application_time)), by=c("size", "nb_proc")])
        return(x[with(x, order(size, nb_proc)),])
    }
    aggr_old = aggregate_results(old)
    aggr_new = aggregate_results(new)
    aggr_new$Gflops_error = (aggr_new$Gflops - aggr_old$Gflops)/aggr_new$Gflops
    generic_do_plot(ggplot(aggr_new, aes(x=size, y=Gflops_error)))

    [Figure 3.png: relative Gflops error between the huge-page and regular runs, vs. matrix size]
    • The Gflops error is negligible (see the sketch below for a way to quantify it).
    • The gain from using huge pages is substantial, for both the simulation time and the memory consumption.
    • The CPU utilization shows a very large variability; something weird has happened there.
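    A minimal sketch to put a number on “negligible”, assuming the aggr_new frame computed above:

      max(abs(aggr_new$Gflops_error))  # worst-case relative error between huge-page and regular runs
      summary(aggr_new$Gflops_error)   # distribution of the signed relative error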
  2. Scalability test   SMPI EXPERIMENTS HPL REPORT
    • Simgrid commit: 9a8e2f5bce8c6758d4367d21a66466a497d136fe
    • HPL commit: 41774905395aebcb73650defaa7e2aa462e6e1a3
    • Script commit: 8cfd8d16787f39a29342b64599cf02166af6d632
    • Compilation and execution for optimized HPL (done on nova-10 and nova-11):

      make SMPI_OPTS="-DSMPI_OPTIMIZATION_LEVEL=4 -DSMPI_DGEMM_COEFFICIENT=1.742435e-10 -DSMPI_DTRSM_COEFFICIENT=8.897459e-11" arch=SMPI

      sysctl -w vm.overcommit_memory=1 && sysctl -w vm.max_map_count=40000000
      mount none /root/huge -t hugetlbfs -o rw,mode=0777 && echo 1 >> /proc/sys/vm/nr_hugepages

      ./run_measures.py --global_csv result_size_1000000_512.csv --nb_runs 1 --size 1000000 --nb_proc 512 --topo "2;16,32;1,16;1,1" --experiment HPL --running_power 5004882812.500 --nb_cpu 8 --hugepage /root/huge
      ./run_measures.py --global_csv result_size_1000000_1024.csv --nb_runs 1 --size 1000000 --nb_proc 1024 --topo "2;16,32;1,16;1,1" --experiment HPL --running_power 5004882812.500 --nb_cpu 8 --hugepage /root/huge
      ./run_measures.py --global_csv result_size_1000000_2048.csv --nb_runs 1 --size 1000000 --nb_proc 2048 --topo "2;16,32;1,16;1,1" --experiment HPL --running_power 5004882812.500 --nb_cpu 8 --hugepage /root/huge
      ./run_measures.py --global_csv result_size_2000000_512.csv --nb_runs 1 --size 2000000 --nb_proc 512 --topo "2;16,32;1,16;1,1" --experiment HPL --running_power 5004882812.500 --nb_cpu 8 --hugepage /root/huge
      ./run_measures.py --global_csv result_size_2000000_1024.csv --nb_runs 1 --size 2000000 --nb_proc 1024 --topo "2;16,32;1,16;1,1" --experiment HPL --running_power 5004882812.500 --nb_cpu 8 --hugepage /root/huge
      ./run_measures.py --global_csv result_size_2000000_2048.csv --nb_runs 1 --size 2000000 --nb_proc 2048 --topo "2;16,32;1,16;1,1" --experiment HPL --running_power 5004882812.500 --nb_cpu 8 --hugepage /root/huge
    • Results:

      rbind(
          read.csv('scalability/result_1000000_512.csv'),
          read.csv('scalability/result_1000000_1024.csv'),
          read.csv('scalability/result_1000000_2048.csv'),
          read.csv('scalability/result_2000000_512.csv'),
          read.csv('scalability/result_2000000_1024.csv'),
          read.csv('scalability/result_2000000_2048.csv')
      )
          topology nb_roots nb_proc    size full_time      time Gflops
1 2;16,32;1,16;1,1       16     512 1000000    716521  716521.0  930.4
2 2;16,32;1,16;1,1       16    1024 1000000    363201  363201.0 1836.0
3 2;16,32;1,16;1,1       16    2048 1000000    186496  186495.7 3575.0
4 2;16,32;1,16;1,1       16     512 2000000   5685080 5685077.7  938.1
5 2;16,32;1,16;1,1       16    1024 2000000   2861010 2861012.5 1864.0
6 2;16,32;1,16;1,1       16    2048 2000000   1448900 1448899.1 3681.0
  simulation_time application_time user_time system_time major_page_fault
1         2635.10           500.97   2367.19      259.91                0
2         6037.89          1036.96   5515.36      515.05                0
3        12391.90          2092.95  11389.36      995.39                0
4         6934.86          1169.66   6193.80      683.73                0
5        15198.30          2551.10  13714.01     1430.93                0
6        32263.60          5236.56  29357.92     2844.89                0
  minor_page_fault cpu_utilization        uss        rss page_table_size
1          1916208            0.99  153665536 2317279232        10600000
2          2002989            0.99  369676288 4837175296        21252000
3          2154982            0.99 1010696192 7774138368        42908000
4          3801905            0.99  150765568 2758770688        10604000
5          3872820            0.99  365555712 5273034752        21220000
6          4038099            0.99 1009606656 7415914496        42884000
  memory_size
1   894443520
2  1055309824
3  1581170688
4  3338420224
5  3497111552
6  4027408384
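    The simulated scaling itself looks close to ideal; a quick sketch, with the Gflops read from the table above and normalized by a perfect doubling from the 512-process run:

      g1 = c(930.4, 1836, 3575)  # size=1000000, nb_proc = 512, 1024, 2048
      g2 = c(938.1, 1864, 3681)  # size=2000000
      round(g1 / (g1[1] * c(1, 2, 4)), 2)  # parallel efficiency ≈ 1.00 0.99 0.96
      round(g2 / (g2[1] * c(1, 2, 4)), 2)  # parallel efficiency ≈ 1.00 0.99 0.98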
  3. Add the Stampede output file in the repository   HPL

1.5.2 2017-06-02 Friday

  1. DONE New scalability tests to run [6/6]   SMPI HPL
    • [X] N=1000000, nb_proc=4096, expected time ≈ 206min × 2.2 ≈ 7.5h
    • [X] N=2000000, nb_proc=4096, expected time ≈ 537min × 2.2 ≈ 19.7h
    • [X] N=4000000, nb_proc=512, expected time ≈ 115min × 2.6 ≈ 5h
    • [X] N=4000000, nb_proc=1024, expected time ≈ 253min × 2.6 ≈ 11h
    • [X] N=4000000, nb_proc=2048, expected time ≈ 537min × 2.6 ≈ 23.3h
    • [X] N=4000000, nb_proc=4096, expected time ≈ 537min × 2.6 × 2.2 ≈ 51h
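    The ≈ values are simply the base durations scaled by the expected slowdown factors; for the record, a one-liner to recompute them, in hours:

      c(206*2.2, 537*2.2, 115*2.6, 253*2.6, 537*2.6, 537*2.6*2.2) / 60
      # 7.55 19.69 4.98 10.96 23.27 51.19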
  2. Cannot connect anymore to G5K nodes in Lyon   BUG G5K
    • Reserved a job and made a deployment in Lyon. Then, could not connect to the node (neither as tocornebize nor as root).
    • Reserved a job and made a deployment in Grenoble. Then, could connect to the node (both as tocornebize and as root).
    • Looked at the .ssh directories of Grenoble and Lyon; they look the same.
    • Can ssh from Lyon to Grenoble (or any other site) but cannot ssh from Grenoble (or any other site) to Lyon.
    • Fixed by replacing the .ssh folder from Lyon with the .ssh folder from Grenoble (might have messed something up…).
  3. First capacity planning test   SMPI EXPERIMENTS HPL REPORT
    • Simgrid commit: 9a8e2f5bce8c6758d4367d21a66466a497d136fe
    • HPL commit: 41774905395aebcb73650defaa7e2aa462e6e1a3
    • Script commit: 4ff3ccbcbb77e126e454a16dea0535493ff1ff0b
    • Compilation and execution (on nova-6 and nova-8):

      make SMPI_OPTS="-DSMPI_OPTIMIZATION_LEVEL=4 -DSMPI_DGEMM_COEFFICIENT=1.742435e-10 -DSMPI_DTRSM_COEFFICIENT=8.897459e-11" arch=SMPI

      sysctl -w vm.overcommit_memory=1 && sysctl -w vm.max_map_count=40000000
      mount none /root/huge -t hugetlbfs -o rw,mode=0777 && echo 1 >> /proc/sys/vm/nr_hugepages

      ./run_measures.py --global_csv result_capacity_50000.csv --nb_runs 1 --size 50000 --nb_proc 512 --topo "2;16,32;1,1:16;1,1" --experiment HPL --running_power 5004882812.500 --hugepage /root/huge
      ./run_measures.py --global_csv result_capacity_100000.csv --nb_runs 1 --size 100000 --nb_proc 512 --topo "2;16,32;1,1:16;1,1" --experiment HPL --running_power 5004882812.500 --hugepage /root/huge
    • Results:

      library(ggplot2)
      results <- rbind(read.csv("capacity_planning/result_capacity_50000.csv"), read.csv("capacity_planning/result_capacity_100000.csv"))
      ggplot(results, aes(x=nb_roots, y=Gflops, color=size, group=size)) +
          stat_summary(fun.y = mean, geom="line") +
          stat_summary(fun.y = mean, geom="point") +
          expand_limits(x=0, y=0) +
          ggtitle("Gflops estimation for different number of root switches and matrix sizes\nUsing 512 MPI processes")

      [Figure 1.png: estimated Gflops vs. number of root switches, for the two matrix sizes]
    • In this experiment, we use a fat-tree with a total of 512 nodes, each having only one core. We use 512 processes, one per node. We change the number of up-ports of the L1 switches and therefore the number of L2 switches.
    • Strangely, there is apparently no impact on the performance of HPL: we get the same performance with only one L2 switch as with 16 L2 switches.
    • Maybe we could try with a bigger matrix, to induce some network contention? But the experiment might take some time.
    • We could also try with a randomly shuffled hostfile, to get a worse process-to-node mapping and thus more traffic going through the L2 switches.
    • We could also try a “taller” and less “wide” fat-tree: add a third layer of switches but decrease the number of ports to keep the same number of nodes, for instance 3;8,8,8;1,8,16;1,1,1 instead of 2;16,32;1,16;1,1 (both have 512 nodes). But it is a bit artificial; such a topology would certainly never occur in “real life”.

1.5.3 2017-06-03 Saturday

  1. New scalability tests   SMPI EXPERIMENTS HPL REPORT
    • Simgrid commit: 9a8e2f5bce8c6758d4367d21a66466a497d136fe
    • HPL commit: 41774905395aebcb73650defaa7e2aa462e6e1a3
    • Script commit: 4ff3ccbcbb77e126e454a16dea0535493ff1ff0b
    • Compilation and execution (made on nova-5, nova-11, nova-13, nova-14):

      make SMPI_OPTS="-DSMPI_OPTIMIZATION_LEVEL=4 -DSMPI_DGEMM_COEFFICIENT=1.742435e-10 -DSMPI_DTRSM_COEFFICIENT=8.897459e-11" arch=SMPI

      sysctl -w vm.overcommit_memory=1 && sysctl -w vm.max_map_count=2000000000
      mount none /root/huge -t hugetlbfs -o rw,mode=0777 && echo 1 >> /proc/sys/vm/nr_hugepages

      ./run_measures.py --global_csv result_size_1000000_4096.csv --nb_runs 1 --size 1000000 --nb_proc 4096 --topo "2;16,32;1,16;1,1;8" --experiment HPL --running_power 5004882812.500 --hugepage /root/huge
      ./run_measures.py --global_csv result_size_4000000_512.csv --nb_runs 1 --size 4000000 --nb_proc 512 --topo "2;16,32;1,16;1,1;8" --experiment HPL --running_power 5004882812.500 --hugepage /root/huge
      ./run_measures.py --global_csv result_size_4000000_1024.csv --nb_runs 1 --size 4000000 --nb_proc 1024 --topo "2;16,32;1,16;1,1;8" --experiment HPL --running_power 5004882812.500 --hugepage /root/huge
      ./run_measures.py --global_csv result_size_2000000_4096.csv --nb_runs 1 --size 2000000 --nb_proc 4096 --topo "2;16,32;1,16;1,1;8" --experiment HPL --running_power 5004882812.500 --hugepage /root/huge
      ./run_measures.py --global_csv result_size_4000000_2048.csv --nb_runs 1 --size 4000000 --nb_proc 2048 --topo "2;16,32;1,16;1,1;8" --experiment HPL --running_power 5004882812.500 --hugepage /root/huge
      ./run_measures.py --global_csv result_size_4000000_4096.csv --nb_runs 1 --size 4000000 --nb_proc 4096 --topo "2;16,32;1,16;1,1;8" --experiment HPL --running_power 5004882812.500 --hugepage /root/huge
      rbind(
          read.csv('scalability/result_500000_512.csv'),
          read.csv('scalability/result_500000_1024.csv'),
          read.csv('scalability/result_500000_2048.csv'),
          read.csv('scalability/result_500000_4096.csv'),
          read.csv('scalability/result_1000000_4096.csv'),
          read.csv('scalability/result_2000000_4096.csv'),
          read.csv('scalability/result_4000000_512.csv'),
          read.csv('scalability/result_4000000_1024.csv'),
          read.csv('scalability/result_4000000_2048.csv'),
          read.csv('scalability/result_4000000_4096.csv')
      )
                     topology nb_roots nb_proc    size  full_time        time Gflops
        1  2;16,32;1,16;1,1;8       16     512  500000    91246.1    91246.02  913.3
        2  2;16,32;1,16;1,1;8       16    1024  500000    46990.1    46990.02 1773.0
        3  2;16,32;1,16;1,1;8       16    2048  500000    24795.5    24795.50 3361.0
        4  2;16,32;1,16;1,1;8       16    4096  500000    13561.0    13561.01 6145.0
        5  2;16,32;1,16;1,1;8       16    4096 1000000    97836.6    97836.54 6814.0
        6  2;16,32;1,16;1,1;8       16    4096 2000000   742691.0   742690.59 7181.0
        7  2;16,32;1,16;1,1;8       16     512 4000000 45305100.0 45305083.56  941.8
        8  2;16,32;1,16;1,1;8       16    1024 4000000 22723800.0 22723820.45 1878.0
        9  2;16,32;1,16;1,1;8       16    2048 4000000 11432900.0 11432938.62 3732.0
        10 2;16,32;1,16;1,1;8       16    4096 4000000  5787160.0  5787164.09 7373.0
           simulation_time application_time user_time system_time major_page_fault
        1          1191.99          204.992   1098.25       93.12                0
        2          2482.28          441.897   2296.51      184.70                0
        3          5091.97          872.425   4741.26      349.79                0
        4         11321.60         1947.320  10640.63      679.53                0
        5         26052.50         4362.660  24082.38     1966.10                0
        6         64856.30        10643.600  59444.40     5402.24                0
        7         17336.50         3030.400  15090.31     1945.23                0
        8         38380.90         6435.870  34249.71     3827.36                0
        9         83535.20        13080.500  75523.95     7684.52                0
        10       169659.00        26745.400 154314.76    15085.08                0
           minor_page_fault cpu_utilization        uss         rss page_table_size
        1            960072            0.99  155148288  2055086080        10604000
        2           1054062            0.99  369696768  4383203328        21240000
        3           1282294            0.99 1012477952  9367576576        42912000
        4           1852119            0.99 3103875072 15318568960        87740000
        5           2768705            0.99 3103895552 16934834176        87748000
        6           4704339            0.99 3102445568 19464646656        87748000
        7           7663911            0.98  151576576  2056916992        10604000
        8           7725625            0.99  369872896  4120702976        21212000
        9           7917525            0.99 1012191232  9221050368        42880000
        10          8550745            0.99 3113381888 20408209408        87808000
           memory_size
        1    282558464
        2    429948928
        3    962826240
        4   2814042112
        5   3425406976
        6   5910134784
        7  13079060480
        8  13275557888
        9  13825183744
        10 15763668992

     • Memory measurement failed for the experiments with 4096 nodes: smpimain took too long to start, so its PID
       was not found by run_measure.py at the beginning, which therefore assumed the process had already
       terminated… We really need something more robust here (see the sketch below, after the results).

     • For the record, ran this command on the nodes (the same command used in the script to estimate the memory
       consumption):

        python3 -c "import psutil; print(psutil.virtual_memory().available)"

     • Results:
       • For size=2000000 and nbproc=4096: 60468817920
       • For size=4000000 and nbproc=1024: 53105373184
       • For size=4000000 and nbproc=2048: 52539293696
       • For size=4000000 and nbproc=4096: 50614239232

     • On a freshly deployed node, the same command returns 66365100032.
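
     A sketch of a more robust approach, polling for the process with psutil instead of doing a single PID lookup.
     The process name, the timeout and the helper itself are assumptions, not the current run_measure.py code:

        # hypothetical helper for run_measure.py: wait until a process with the
        # given name shows up, instead of assuming that "not found" means
        # "already terminated"
        import time
        import psutil

        def wait_for_process(name='smpimain', timeout=60.0, poll_interval=0.5):
            """Return the first psutil.Process matching `name`, or None on timeout."""
            deadline = time.monotonic() + timeout
            while time.monotonic() < deadline:
                for proc in psutil.process_iter(attrs=['name']):
                    if proc.info['name'] == name:
                        return proc
                time.sleep(poll_interval)
            return None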

1.5.4 2017-06-04 Sunday

  1. Investigate capacity planning: small test program   SMPI HPL

     • As mentioned in [2017-06-02 Fri], the duration of HPL does not seem to be impacted by the topology, which
       is strange.

     • Implemented a small test program, called network_test. It takes as arguments a size and a number of
       iterations. Every process sends the given number of messages, each having the given size, to the next
       process (and thus receives from the previous one). A minimal sketch of this ring pattern is given after
       this list.

     • Tested with the following topology (only changing the fat-tree description):

        <?xml version='1.0' encoding='ASCII'?>
        <!DOCTYPE platform SYSTEM "http://simgrid.gforge.inria.fr/simgrid/simgrid.dtd">
        <!--2-level fat-tree with 16 nodes-->
        <platform version="4">
          <AS id="AS0" routing="Full">
            <cluster id="cluster0" prefix="host-" suffix=".hawaii.edu" radical="0-15" speed="1Gf" bw="10Gbps" lat="2.4E-5s" loopback_bw="5120MiBps" loopback_lat="1.5E-9s" core="1" topology="FAT_TREE" topo_parameters="2;4,4;1,4;1,1"/>
          </AS>
        </platform>

     • Results for one iteration:
       • With a size of 200000000 and the fat-tree 2;4,4;1,4;1,1, takes a time of 1.28 seconds.
       • With a size of 200000000 and the fat-tree 2;4,4;1,1;1,1, takes a time of 2.69 seconds.
       • With a size of 200000 and the fat-tree 2;4,4;1,4;1,1, takes a time of 0.0025 seconds.
       • With a size of 200000 and the fat-tree 2;4,4;1,1;1,1, takes a time of 0.0040 seconds.
       • With a size of 2000 and the fat-tree 2;4,4;1,4;1,1, takes a time of 0.0004 seconds.
       • With a size of 2000 and the fat-tree 2;4,4;1,1;1,1, takes a time of 0.0004 seconds.

     • Thus, for a large enough size, the difference is very clear: the topology does have a high impact. For
       small messages, however, this is not the case.

     • It does not seem to change over several iterations.
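
     A minimal sketch of the ring exchange described above, written with mpi4py for brevity (the real
     network_test is a separate program, presumably C/MPI, so the names and defaults here are purely
     illustrative):

        # ring_test_sketch.py -- hypothetical analogue of network_test: every
        # process sends `iterations` messages of `size` bytes to the next
        # process and receives from the previous one.
        # Run with e.g.: mpiexec -n 16 python ring_test_sketch.py
        from mpi4py import MPI
        import numpy as np

        comm = MPI.COMM_WORLD
        rank = comm.Get_rank()
        nproc = comm.Get_size()

        size = 200000000  # message size in bytes, as in the experiments above
        iterations = 1

        sendbuf = np.zeros(size, dtype=np.uint8)
        recvbuf = np.empty(size, dtype=np.uint8)

        comm.Barrier()
        start = MPI.Wtime()
        for _ in range(iterations):
            # Sendrecv avoids the deadlock a naive blocking send/recv ring could cause
            comm.Sendrecv(sendbuf, dest=(rank + 1) % nproc,
                          recvbuf=recvbuf, source=(rank - 1) % nproc)
        comm.Barrier()
        if rank == 0:
            print('time:', MPI.Wtime() - start)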

  2. TODO Check what the sizes of the messages in HPL are.   SMPI HPL

  3. Investigate capacity planning: odd networks   SMPI HPL

     • Simgrid commit: 9a8e2f5bce8c6758d4367d21a66466a497d136fe

     • HPL commit: 41774905395aebcb73650defaa7e2aa462e6e1a3

     • Script commit: 4ff3ccbcbb77e126e454a16dea0535493ff1ff0b

     • Try several topologies for HPL with absurdly good or bad networks (e.g. high/null bandwidth and/or
       high/null latency).

     • The idea is that if doing so has little impact on performance, then it is hopeless to expect any impact
       from adding/removing switches.

     • Quick and dirty experiments: do not add any option to the script, just modify the values in topology.py
       (lines 161-164).

     • Note that in the previous experiments, where nearly no impact was observed, the values were:

        bw = '10Gbps'
        lat = '2.4E-5s'
        loopback_bw = '5120MiBps'
        loopback_lat = '1.5E-9s'

     • Run this command, which outputs the Gflops:

        ./run_measures.py --global_csv /tmp/bla.csv --nb_runs 1 --size 10000 --nb_proc 16 --topo "2;4,4;1,4;1,1" \
            --experiment HPL --running_power 6217956542.969 && tail -n 1 /tmp/bla.csv | cut -f10 -d','

     • Result for the same network characteristics: 21.96

     • Results with other characteristics:
       • Very high bandwidth: 22.15

            bw = '1000000Gbps'
            lat = '2.4E-5s'
            loopback_bw = '1000000GBps'
            loopback_lat = '1.5E-9s'

       • Very low bandwidth: 1.505

            bw = '10Mbps'
            lat = '2.4E-5s'
            loopback_bw = '10Mbps'
            loopback_lat = '1.5E-9s'

       • Low bandwidth: 19.95

            bw = '1Gbps'
            lat = '2.4E-5s'
            loopback_bw = '512MiBps'
            loopback_lat = '1.5E-9s'

       • Very low latency: 25.95

            bw = '10Gbps'
            lat = '0s'
            loopback_bw = '5120MiBps'
            loopback_lat = '0s'

       • Very high latency: 0.1534

            bw = '10Gbps'
            lat = '2.4E-2s'
            loopback_bw = '5120MiBps'
            loopback_lat = '1.5E-5s'

       • High latency: 9.477

            bw = '10Gbps'
            lat = '2.4E-4s'
            loopback_bw = '5120MiBps'
            loopback_lat = '1.5E-8s'

     • Improving the network performance has a limited impact. Using a nearly infinite bandwidth increases the
       Gflops by less than 1%. Using a null latency has more impact, but it is still limited: it increases the
       Gflops by 18%.

     • Degrading the network performance has more impact. Using a bandwidth 1000 times lower divides the Gflops
       by 15, but using a bandwidth 10 times lower decreases the Gflops by only 9%. Both the very high latency and
       the high latency have a great impact.

     • To sum up, the latency seems to have a higher impact on HPL performance than the bandwidth.

     • It is not clear whether the contention created by using fewer switches will only decrease the bandwidth, or
       also increase the latency. It depends on whether the switches have one queue per port, or one queue for all
       the ports (in the former case, contention will have a much lower impact on the latency than in the latter
       case).

     • Hypothesis: with a one-queue-per-port model, removing switches will not increase the latency too much and
       will therefore have a very limited impact on HPL performance.

  4. TODO Ask what model is used in Simgrid's switches   SIMGRID

     • Is it one queue per port, or one single queue for all the ports?

  5. More thoughts on capacity planning   SMPI HPL

     • The plot of the Gflops as a function of the bandwidth (resp. the inverse of the latency) seems to look like
       the plot of the Gflops as a function of the number of processes or the size: a concave function converging
       to some finite limit (a toy illustration of this is given after this entry).

     • In the settings currently used for HPL, the bandwidth of 10Gbps seems to be already very close to the
       limit (since using a bandwidth thousands of times larger has little to no impact). This is why decreasing
       the bandwidth a bit has very little impact. If we want to observe something when we remove switches, we
       should use lower bandwidths.

     • Quick test, using the same command as in the previous section and with these values:

        bw = '10Mbps'
        lat = '2.4E-5s'
        loopback_bw = '5120MiBps'
        loopback_lat = '1.5E-9s'

       • With 2;4,4;1,4;1,1: 1.505 Gflops.
       • With 2;4,4;1,1;1,1: 1.025 Gflops.
       • With 2;4,4;1,4;1,1 and a random mapping: 1.268 Gflops.
       • With 2;4,4;1,1;1,1 and a random mapping: 0.6464 Gflops.

     • The hypothesis seems to be confirmed. With a lower bandwidth, a difference of bandwidth has much more
       impact. Thus, removing a switch and/or using a random mapping also has much more impact.
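
     A toy numerical illustration of the saturation effect mentioned above. The functional form
     Gflops(bw) = G_inf * bw / (bw + c) and both constants are made up, chosen only to mimic the orders of
     magnitude measured in these experiments:

        # toy saturating model: Gflops grows concavely with the bandwidth and
        # converges to a finite limit G_inf; both constants are illustrative guesses
        G_inf = 22.5  # asymptotic Gflops (roughly the 10Gbps regime above)
        c = 0.14      # bandwidth (in Gbps) at which half of G_inf is reached

        for bw in [0.01, 0.1, 1.0, 10.0, 1e6]:  # Gbps
            print(f'bw = {bw:>9} Gbps -> {G_inf * bw / (bw + c):6.2f} Gflops')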

1.5.5 2017-06-05 Monday

  1. Comparison with real Taurus experiment   SMPI EXPERIMENTS HPL REPORT
     • File hpl_analysis/taurus/real.csv holds real experiment data. It has been created manually, from the
       energy paper repository.

        library(ggplot2)
        library(reshape2)
        library(gridExtra)

        get_results <- function(nb_proc) {
            result <- read.csv(paste('hpl_analysis/taurus/hpl_paper_', nb_proc, '.csv', sep=''))
            result$full_time = max(result$time)
            result$total_energy = sum(result$power_consumption)

            result = result[with(result, order(-power_consumption)),] # sort by power consumption
            result$used_energy = sum(head(result, nb_proc/12)$power_consumption)
            result$nb_proc = nb_proc
            return(unique(result[c('nb_proc', 'full_time', 'total_energy', 'used_energy')]))
        }
        simulation_vanilla_results = data.frame()
        #  for(i in (c(1,4,8,12,48,96,144))) {
        for(i in (c(12,48,96,144))) {
          simulation_vanilla_results = rbind(simulation_vanilla_results, get_results(i))
        }
        simulation_vanilla_results$type = 'Vanilla simulation'
        simulation_vanilla_results$time = -1 # do not have it
        simulation_vanilla_results$Gflops = -1 # do not have it

        real_results = read.csv('hpl_analysis/taurus/real.csv')
        real_results$type = 'Real execution'
        real_results$used_energy = real_results$used_energy * 1e3 # kJ -> J
        sim_results <- read.csv('hpl_analysis/taurus/hpl2.csv')
        sim_results$type = 'Optimized simulation'
        results = rbind(real_results[c('nb_proc', 'full_time', 'time', 'Gflops', 'used_energy', 'type')],
                        sim_results[c('nb_proc', 'full_time', 'time', 'Gflops', 'used_energy', 'type')],
                        simulation_vanilla_results[c('nb_proc', 'full_time', 'time', 'Gflops', 'used_energy', 'type')])
        results$type <- factor(results$type, levels = c('Optimized simulation', 'Vanilla simulation', 'Real execution'))

        p1 = generic_do_plot(ggplot(results, aes(x=nb_proc, y=full_time, color=type, shape=type)), fixed_shape=FALSE) +
             xlab("Number of processes")+
             ylab("Duration (seconds)")+
             scale_shape_manual(values = c(0, 1, 2))+
             labs(colour="Experiment type")+
             labs(shape="Experiment type")+
             ggtitle("HPL duration for different numbers of processes\nMatrix size: 20,000")
        p2 = generic_do_plot(ggplot(results, aes(x=nb_proc, y=used_energy, color=type, shape=type)), fixed_shape=FALSE) +
             xlab("Number of processes")+
             ylab("Energy consumption (joules)")+
             scale_shape_manual(values = c(0, 1, 2))+
             labs(colour="Experiment type")+
             labs(shape="Experiment type")+
             ggtitle("HPL energy consumption for different numbers of processes\nMatrix size: 20,000")
        grid_arrange_shared_legend(p1, p2, nrow=1, ncol=2)

        hpl_analysis/taurus/validation.pdf

        tmp_results = results[results$type != "Vanilla simulation",]
        grid_arrange_shared_legend(
            generic_do_plot(ggplot(tmp_results, aes(x=nb_proc, y=time, color=type))) +
              xlab("Number of processes")+
              ylab("Duration (seconds)")+
              labs(colour="Simulated")+
              ggtitle("HPL “short” duration for different numbers of processes\nMatrix size: 20,000"),
            generic_do_plot(ggplot(tmp_results, aes(x=nb_proc, y=Gflops, color=type))) +
              xlab("Number of processes")+
              ylab("Performance (Gflops)")+
              labs(colour="Simulated")+
              ggtitle("HPL performances for different numbers of processes\nMatrix size: 20,000"),
            nrow=1, ncol=2
        )

        hpl_analysis/taurus/validation2.pdf

        library(data.table)
        aggregate_results <- function(results) {
            x = data.table(results)
            x = x[x$nb_proc %in% c(12, 48, 96, 144)]
            x = as.data.frame(x[, list(time=mean(full_time), energy=mean(used_energy)), by=c("nb_proc")])
            return(x[with(x, order(nb_proc)),])
        }
        aggr_real = aggregate_results(real_results)
        aggr_sim = aggregate_results(sim_results)
        aggr_vanilla = aggregate_results(simulation_vanilla_results)
        aggr_sim$time_error = (aggr_sim$time - aggr_real$time)/aggr_real$time * 100
        aggr_sim$energy_error = (aggr_sim$energy - aggr_real$energy)/aggr_real$energy * 100
        aggr_sim$optimized = TRUE
        aggr_vanilla$time_error = (aggr_vanilla$time - aggr_real$time)/aggr_real$time * 100
        aggr_vanilla$energy_error = (aggr_vanilla$energy - aggr_real$energy)/aggr_real$energy * 100
        aggr_vanilla$optimized = FALSE
        aggr_results = rbind(aggr_vanilla, aggr_sim)
        aggr_results$optimized <- factor(aggr_results$optimized, levels = c(TRUE, FALSE))

     • Get the three colors used for the previous plots, to reuse the ones corresponding to vanilla and optimized:

        x = unique(ggplot_build(p1)$data[[1]]$colour)
        x
        colors = x[c(1, 2)]
        colors

        [1] "#F8766D" "#00BA38" "#619CFF"
        [1] "#F8766D" "#00BA38"

        grid_arrange_shared_legend(
            generic_do_plot(ggplot(aggr_results, aes(x=nb_proc, y=time_error, color=optimized))) +
              geom_hline(yintercept=0) +
              scale_color_manual(values=colors) +
              xlab("Number of processes")+
              ylab("Relative error (percent)")+
              labs(colour="Optimized simulation")+
              ggtitle("Error on the duration prediction")+
              expand_limits(y=15)+
              expand_limits(y=-15),
            generic_do_plot(ggplot(aggr_results, aes(x=nb_proc, y=energy_error, color=optimized))) +
              geom_hline(yintercept=0) +
              scale_color_manual(values=colors) +
              xlab("Number of processes")+
              ylab("Relative error (percent)")+
              labs(colour="Optimized simulation")+
              ggtitle("Error on the energy consumption prediction")+
              expand_limits(y=15)+
              expand_limits(y=-15),
            nrow=1, ncol=2
        )

        hpl_analysis/taurus/errors.pdf


     • The plots are funny. The shapes of the error plots for optimized and vanilla look similar, but shifted.
       They both reach some high errors (~10%), but not for the same number of processes. Also, the optimized
       version is always above 0, while the vanilla one is below 0 for some points.

     • There are some mismatches between the time prediction and the energy prediction. For instance, optimized
       has a large error for the time prediction with 144 processes, but nearly no error for the energy
       prediction. Similarly, vanilla over-estimates the duration for 48 processes but under-estimates the energy
       consumption, which seems odd.

  2. Plots for scalability test   SMPI EXPERIMENTS HPL REPORT

        library(ggplot2)
        library(ggrepel)
        library(reshape2)
        library(gridExtra)
        results = rbind(
            read.csv('scalability/result_500000_512.csv'),
            read.csv('scalability/result_500000_1024.csv'),
            read.csv('scalability/result_500000_2048.csv'),
            read.csv('scalability/result_500000_4096.csv'),
            read.csv('scalability/result_1000000_512.csv'),
            read.csv('scalability/result_1000000_1024.csv'),
            read.csv('scalability/result_1000000_2048.csv'),
            read.csv('scalability/result_1000000_4096.csv'),
            read.csv('scalability/result_2000000_512.csv'),
            read.csv('scalability/result_2000000_1024.csv'),
            read.csv('scalability/result_2000000_2048.csv'),
            read.csv('scalability/result_2000000_4096.csv'),
            read.csv('scalability/result_4000000_512.csv'),
            read.csv('scalability/result_4000000_1024.csv'),
            read.csv('scalability/result_4000000_2048.csv'),
            read.csv('scalability/result_4000000_4096.csv')
        )
        results$simulation_time = results$simulation_time/3600 # seconds -> hours
        results$memory_size = results$memory_size * 1e-9       # bytes -> gigabytes
        number_verb <- function(n) {
            return(format(n,big.mark=",",scientific=FALSE))
        }
        results$size_verb = factor(unlist(lapply(results$size, number_verb)), levels = c('500,000','1,000,000','2,000,000','4,000,000'))
        results$nb_proc_verb = factor(unlist(lapply(results$nb_proc, number_verb)), levels = c('512', '1,024', '2,048', '4,096'))
        results

                     topology nb_roots nb_proc    size  full_time        time Gflops
        1  2;16,32;1,16;1,1;8       16     512  500000    91246.1    91246.02  913.3
        2  2;16,32;1,16;1,1;8       16    1024  500000    46990.1    46990.02 1773.0
        3  2;16,32;1,16;1,1;8       16    2048  500000    24795.5    24795.50 3361.0
        4  2;16,32;1,16;1,1;8       16    4096  500000    13561.0    13561.01 6145.0
        5    2;16,32;1,16;1,1       16     512 1000000   716521.0   716521.00  930.4
        6    2;16,32;1,16;1,1       16    1024 1000000   363201.0   363201.04 1836.0
        7    2;16,32;1,16;1,1       16    2048 1000000   186496.0   186495.70 3575.0
        8  2;16,32;1,16;1,1;8       16    4096 1000000    97836.6    97836.54 6814.0
        9    2;16,32;1,16;1,1       16     512 2000000  5685080.0  5685077.72  938.1
        10   2;16,32;1,16;1,1       16    1024 2000000  2861010.0  2861012.55 1864.0
        11   2;16,32;1,16;1,1       16    2048 2000000  1448900.0  1448899.09 3681.0
        12 2;16,32;1,16;1,1;8       16    4096 2000000   742691.0   742690.59 7181.0
        13 2;16,32;1,16;1,1;8       16     512 4000000 45305100.0 45305083.56  941.8
        14 2;16,32;1,16;1,1;8       16    1024 4000000 22723800.0 22723820.45 1878.0
        15 2;16,32;1,16;1,1;8       16    2048 4000000 11432900.0 11432938.62 3732.0
        16 2;16,32;1,16;1,1;8       16    4096 4000000  5787160.0  5787164.09 7373.0
           simulation_time application_time user_time system_time major_page_fault
        1        0.3311083          204.992   1098.25       93.12                0
        2        0.6895222          441.897   2296.51      184.70                0
        3        1.4144361          872.425   4741.26      349.79                0
        4        3.1448889         1947.320  10640.63      679.53                0
        5        0.7319722          500.970   2367.19      259.91                0
        6        1.6771917         1036.960   5515.36      515.05                0
        7        3.4421944         2092.950  11389.36      995.39                0
        8        7.2368056         4362.660  24082.38     1966.10                0
        9        1.9263500         1169.660   6193.80      683.73                0
        10       4.2217500         2551.100  13714.01     1430.93                0
        11       8.9621111         5236.560  29357.92     2844.89                0
        12      18.0156389        10643.600  59444.40     5402.24                0
        13       4.8156944         3030.400  15090.31     1945.23                0
        14      10.6613611         6435.870  34249.71     3827.36                0
        15      23.2042222        13080.500  75523.95     7684.52                0
        16      47.1275000        26745.400 154314.76    15085.08                0
           minor_page_fault cpu_utilization        uss         rss page_table_size
        1            960072            0.99  155148288  2055086080        10604000
        2           1054062            0.99  369696768  4383203328        21240000
        3           1282294            0.99 1012477952  9367576576        42912000
        4           1852119            0.99 3103875072 15318568960        87740000
        5           1916208            0.99  153665536  2317279232        10600000
        6           2002989            0.99  369676288  4837175296        21252000
        7           2154982            0.99 1010696192  7774138368        42908000
        8           2768705            0.99 3103895552 16934834176        87748000
        9           3801905            0.99  150765568  2758770688        10604000
        10          3872820            0.99  365555712  5273034752        21220000
        11          4038099            0.99 1009606656  7415914496        42884000
        12          4704339            0.99 3102445568 19464646656        87748000
        13          7663911            0.98  151576576  2056916992        10604000
        14          7725625            0.99  369872896  4120702976        21212000
        15          7917525            0.99 1012191232  9221050368        42880000
        16          8550745            0.99 3113381888 20408209408        87808000
           memory_size size_verb nb_proc_verb
        1    0.2825585   500,000          512
        2    0.4299489   500,000        1,024
        3    0.9628262   500,000        2,048
        4    2.8140421   500,000        4,096
        5    0.8944435 1,000,000          512
        6    1.0553098 1,000,000        1,024
        7    1.5811707 1,000,000        2,048
        8    3.4254070 1,000,000        4,096
        9    3.3384202 2,000,000          512
        10   3.4971116 2,000,000        1,024
        11   4.0274084 2,000,000        2,048
        12   5.9101348 2,000,000        4,096
        13  13.0790605 4,000,000          512
        14  13.2755579 4,000,000        1,024
        15  13.8251837 4,000,000        2,048
        16  15.7636690 4,000,000        4,096

        size_time = generic_do_plot(ggplot(results, aes(x=size, y=simulation_time, color=nb_proc_verb))) +
            xlab("Matrix size") +
            ylab("Simulation time (hours)") +
            labs(colour="Number of processes")+
            ggtitle("Simulation time for different matrix sizes")+
            theme(legend.position = "none")+
            geom_text_repel(
                data = subset(results, size == max(size)),
                aes(label = nb_proc_verb),
                nudge_x = 45,
                segment.color = NA,
                show.legend = FALSE
              )
        size_time

        scalability/1.pdf

        nbproc_time = generic_do_plot(ggplot(results, aes(x=nb_proc, y=simulation_time, color=size_verb))) +
            xlab("Number of processes") +
            ylab("Simulation time (hours)") +
            labs(colour="Matrix size")+
            ggtitle("Simulation time for different numbers of processes")+
            theme(legend.position = "none")+
            geom_text_repel(
                data = subset(results, nb_proc == max(nb_proc)),
                aes(label = size_verb),
                nudge_x = 45,
                segment.color = NA,
                show.legend = FALSE
              )
        nbproc_time

        scalability/2.pdf

        size_mem = generic_do_plot(ggplot(results, aes(x=size, y=memory_size, color=nb_proc_verb))) +
            xlab("Matrix size") +
            ylab("Memory consumption (gigabytes)") +
            labs(colour="Number of processes")+
            ggtitle("Memory consumption for different matrix sizes")+
            theme(legend.position = "none")+
            geom_text_repel(
                data = subset(results, size == max(size)),
                aes(label = nb_proc_verb),
                nudge_x = 45,
                segment.color = NA,
                show.legend = FALSE
              )
        size_mem

        scalability/3.pdf

        nbproc_mem = generic_do_plot(ggplot(results, aes(x=nb_proc, y=memory_size, color=size_verb))) +
            xlab("Number of processes") +
            ylab("Memory consumption (gigabytes)") +
            labs(colour="Matrix size")+
            ggtitle("Memory consumption for different numbers of processes")+
            theme(legend.position = "none")+
            geom_text_repel(
                data = subset(results, nb_proc == max(nb_proc)),
                aes(label = size_verb),
                nudge_x = 45,
                segment.color = NA,
                show.legend = FALSE
            )
        nbproc_mem

        scalability/4.pdf

        grid_arrange_shared_legend(size_time, size_mem, nrow=1, ncol=2)

        scalability/plot_size.pdf

        grid_arrange_shared_legend(nbproc_time, nbproc_mem, nrow=1, ncol=2)

        scalability/plot_nbproc.pdf


1.5.6 2017-06-06 Tuesday

  1. Discussion about the report   MEETING REPORT

     1. State of the art

        1. Important features

           • offline vs. online, especially for HPL (probes for the communication pipeline)

           • if online: language scope, and whether the code needs to be modified to make it work

           • models: notion of topology and accounting for contention (very important a priori), accounting for
             the specifics of MPI communications (synchronization semantics, different performance ranges,
             probably not too serious in the case of HPL), collectives (irrelevant in the case of HPL)
             • the classical model in this context is LogP*, but it accounts poorly for contention (at the node
               level, and not at all at the topology level)
             • two main approaches: packet level and flow level

           • scalability: this motivates the use of parallel DES and of somewhat "system-level" MPI application
             emulation techniques

        2. Projects:

           • Dimemas (Barcelona Supercomputing Center), offline (extrae/paraver), "performance debugging"
             (sensitivity analysis, what-if, performance prediction)

           • LogoPsim (Torsten Hoefler), offline (dag, GOAL), collective algorithms @ scale

           • SST macro, online/offline (DUMPI), MPI only, skeletonization/templating, more robust but more
             specialized (C++)

           • BigSIM (?), offline, possibly PDES, dead project. Source-to-source transformation for privatization
             for CHARM++/AMPI

           • xSim, online, PDES with underlying models of highly questionable validity, but scalable;
             privatization by copying the data segment rather than using mmap

           • CODES, offline, PDES, new kid on the block

     2. Validation and capacity planning

        • For the comparison with a real execution (Taurus), get the data for the real experiment by executing
          the org-file. Long (~5 minutes).

        • On capacity planning, it is expected that removing switches has little to no impact: computation is in
          O(n^3) while communication is in O(n^2) (and most of the communications are asynchronous, so they happen
          during computations).

  2. Webinar   MEETING


1.5.7 2017-06-07 Wednesday

  1. DONE Some text is displayed in the pdf but not in the printed version   REPORT

     • It seems that text entered between = signs (translated to \texttt in LaTeX) does not appear in the printed
       version of the report, while it is displayed correctly in the PDF. Fix this.

     • Reprinted the first page of the same file; it is fixed now. The difference is that I printed it over the
       network rather than by plugging a USB stick into the printer.

  2. Network printer setup   TOOLS

     • Make sure the right package is installed:

        sudo aptitude install cups-browsed

     • Add these lines to the file /etc/cups/cups-browsed.conf:

        BrowseRemoteProtocols cups
        BrowsePoll print.imag.fr:631

     • Enable the service:

        sudo systemctl enable cups-browsed

     • Restart the service:

        sudo service cups-browsed restart


1.5.8 2017-06-08 Thursday

  1. Capacity planning: components   SMPI EXPERIMENTS HPL REPORT

     • Simgrid commit: 9a8e2f5bce8c6758d4367d21a66466a497d136fe

     • HPL commit: 41774905395aebcb73650defaa7e2aa462e6e1a3

     • Script commit: c2d1d734c80f084157ad70d702e8c669772fb2e4

     • Command (used on nova-21, configured as in the above experiments):

        bash run_capacity_planning.sh 100000 512

        bash run_capacity_planning.sh 50000 512

     • Results:

        library(ggplot2)
        library(reshape2)
        library(gridExtra)

        get_results <- function(directory, name) {
            result <- read.csv(paste('capacity_planning/', directory, '/', name, '.csv', sep=''))
            result$name = name
            return(result)
        }
        get_all_results <- function(directory) {
            results = data.frame()
            for(type in c('bandwidth', 'latency', 'speed')) {
                for(subtype in c('high', 'low')) {
                    name = paste(type, subtype, sep='_')
                    tmp = get_results(directory, name)
                    tmp$type = type
                    # for the latency, "high" is bad and "low" is good;
                    # for the bandwidth and the speed it is the other way around
                    if(type == 'latency'){
                        if(subtype == 'high')
                            tmp$subtype = 'bad'
                        else
                            tmp$subtype = 'good'
                    }
                    else {
                        if(subtype == 'high')
                            tmp$subtype = 'good'
                        else
                            tmp$subtype = 'bad'
                    }
                    results = rbind(results, tmp)
                }
                default = get_results(directory, 'default')
                default$type = type
                default$subtype = 'default'
                results = rbind(results, default)
            }
            return(results[c('size', 'Gflops', 'type', 'subtype')])
        }
        results_1E5 = get_all_results('exp_100000_512')
        results_5E4 = get_all_results('exp_50000_512')
        results_1E5
        results_5E4

            size  Gflops      type subtype
        1 100000  710.40 bandwidth    good
        2 100000  702.20 bandwidth     bad
        3 100000  722.70 bandwidth default
        4 100000  349.10   latency     bad
        5 100000  823.70   latency    good
        6 100000  722.70   latency default
        7 100000 3419.00     speed    good
        8 100000   83.94     speed     bad
        9 100000  722.70     speed default
           size  Gflops      type subtype
        1 50000  458.80 bandwidth    good
        2 50000  477.00 bandwidth     bad
        3 50000  475.20 bandwidth default
        4 50000  127.30   latency     bad
        5 50000  697.60   latency    good
        6 50000  475.20   latency default
        7 50000 1346.00     speed    good
        8 50000   71.95     speed     bad
        9 50000  475.20     speed default

        do_plot <- function(results, type) {
            tmp = results[results$type == type,]
            title = paste('HPL performance estimation for different components\nMatrix size of',
                format(unique(results$size),big.mark=",",scientific=FALSE))
            plot = ggplot(results, aes(x=type, y=Gflops, color=subtype, shape=subtype)) +
                geom_point(size=4, stroke=1) +
                scale_shape_manual(values = c(0, 1, 2))+
                theme_bw()+
                expand_limits(x=0, y=0)+
                ggtitle(title)+
                xlab('Component')+
                ylab('Performance estimation (Gflops)')+
                labs(colour='Metric')+
                labs(shape='Metric')
            return(plot)
        }

        grid_arrange_shared_legend(
            do_plot(results_5E4, 'bandwidth') + expand_limits(x=0, y=max(results_1E5$Gflops)),
            do_plot(results_1E5, 'bandwidth') + expand_limits(x=0, y=max(results_1E5$Gflops)),
            nrow=1,
            ncol=2
        )

        capacity_planning/components_perf.pdf


  2. Capacity planning: topology   SMPI EXPERIMENTS HPL REPORT

     • Simgrid commit: 9a8e2f5bce8c6758d4367d21a66466a497d136fe

     • HPL commit: 41774905395aebcb73650defaa7e2aa462e6e1a3

     • Script commit: c2d1d734c80f084157ad70d702e8c669772fb2e4

     • Four series of experiments:
       • Bandwidth of 10Gbps, sequential mapping of the processes
       • Bandwidth of 10Gbps, random mapping of the processes
       • Bandwidth of 10Mbps, sequential mapping of the processes
       • Bandwidth of 10Mbps, random mapping of the processes

     • For the series with a bandwidth of 10Mbps, the file topology.py has been locally modified to use a
       bandwidth 1000 times lower:

        $ git diff
        diff --git a/topology.py b/topology.py
        index 2d7d76c..1a3cd67 100644
        --- a/topology.py
        +++ b/topology.py
        @@ -158,7 +158,7 @@ class FatTree:
             prefix = 'host-'
             suffix = '.hawaii.edu'
             speed = '1Gf'
        -    bw = '10Gbps'
        +    bw = '10Mbps'
             lat = '2.4E-5s'
             loopback_bw = '5120MiBps'
             loopback_lat = '1.5E-9s'

     • Command (used on nova-2, nova-8, nova-15 and nova-16, configured as in the above experiments):
       • For sequential mapping:

            ./run_measures.py --global_csv result_capacity_50000.csv --nb_runs 3 --size 50000 --nb_proc 512 \
                --topo "2;16,32;1,1:16;1,1" --experiment HPL --running_power 5004882812.500 --hugepage /root/huge

       • For random mapping:

            ./run_measures.py --global_csv result_capacity_50000.csv --nb_runs 3 --size 50000 --nb_proc 512 \
                --topo "2;16,32;1,1:16;1,1" --experiment HPL --running_power 5004882812.500 --hugepage /root/huge \
                --shuffle_hosts

     • For the random mapping with 10Mbps bandwidth, more runs have been done (8 instead of 3) to limit any bias.

     • Results:

        library(ggplot2)
        results_highbw_sequential <- read.csv("capacity_planning/exp_topo_50000_512/result_capacity_50000.csv")
        results_highbw_random <- read.csv("capacity_planning/exp_topo_50000_512/result_capacity_50000_shuffled.csv")
        results_lowbw_sequential <- read.csv("capacity_planning/exp_topo_50000_512/result_capacity_50000_lowbw.csv")
        results_lowbw_random <- read.csv("capacity_planning/exp_topo_50000_512/result_capacity_50000_lowbw_shuffled.csv")
        results_highbw_sequential$mapping = "Sequential"
        results_highbw_random$mapping = "Random"
        results_lowbw_sequential$mapping = "Sequential"
        results_lowbw_random$mapping = "Random"
        results_highbw = rbind(results_highbw_sequential, results_highbw_random)
        results_highbw$bandwidth = '10Gbps'
        results_lowbw = rbind(results_lowbw_sequential, results_lowbw_random)
        results_lowbw$bandwidth = '10Mbps'

        do_plot <- function(results) {
            title = paste('HPL performance estimation for different topologies\nBandwidth of', unique(results$bandwidth))
            plot = generic_do_plot(ggplot(results, aes(x=nb_roots, y=Gflops, color=mapping, shape=mapping)), fixed_shape=FALSE) +
                ggtitle(title)+
                xlab('Number of L2 switches')+
                ylab('Performance estimation (Gflops)')+
                scale_shape_manual(values = c(1, 2))+
                labs(colour='Mapping')+
                labs(shape='Mapping')
            return(plot)
        }

        grid_arrange_shared_legend(
            do_plot(results_lowbw),
            do_plot(results_highbw),
            nrow=1, ncol=2
        )

        capacity_planning/topology.pdf


     • The results for 10Mbps are somewhat expected. Removing switches degrades the performance, and using a
       random mapping of the processes makes things even worse. We can also observe performance peaks for 4, 8 and
       16 root switches. Maybe this is due to the D-mod-K algorithm (TODO: check that it is indeed this
       algorithm). For instance, 16 divides 512 but 15 does not, so the load should be spread more uniformly over
       the root switches with 16 of them than with 15 (see the quick check after this entry).

     • For 10Gbps, however, this is stranger. The number of switches has no impact, but this had already been
       observed in previous experiments (see [2017-06-02 Fri]). What is more surprising is that the random mapping
       yields better performance than the sequential mapping. Is it a bug?

        tmp = rbind(results_lowbw, results_highbw)
        tmp$bandwidth <- factor(tmp$bandwidth, levels = c('10Mbps', '10Gbps'))
        generic_do_plot(ggplot(tmp, aes(x=nb_roots, y=simulation_time, color=mapping, shape=mapping, linetype=bandwidth)), fixed_shape=FALSE)+
            ggtitle('Simulation time for different networks')+
            xlab('Number of L2 switches')+
            ylab('Simulation time (seconds)')+
            scale_shape_manual(values = c(1, 2))+
            labs(colour='Mapping')+
            labs(shape='Mapping')+
            labs(linetype='Bandwidth')


        capacity_planning/topology_sim_time.pdf

        results_lowbw$simgrid_time = results_lowbw$simulation_time - results_lowbw$application_time
        generic_do_plot(ggplot(results_lowbw, aes(x=nb_roots, y=simgrid_time, color=mapping)))

        2.png

        library(data.table)
        aggregate_results <- function(results) {
            x = data.table(results)
            x = as.data.frame(x[, list(Gflops=mean(Gflops)), by=c("nb_roots")])
            return(x[with(x, order(nb_roots)),])
        }
        aggr_seq = aggregate_results(results_lowbw_sequential)
        aggr_rand = aggregate_results(results_lowbw_random)
        aggr_rand$gflops_ratio = aggr_seq$Gflops / aggr_rand$Gflops

        generic_do_plot(ggplot(aggr_rand, aes(x=nb_roots, y=gflops_ratio)))

        3.png

     • There are huge differences (a factor of 10) in the simulation time depending on the mapping and the number
       of root switches. This time is spent in Simgrid itself. It most likely comes from more complex
       communication behaviors (congestion), which give much more work to the network part of Simgrid.
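
     A quick check of the divisibility argument above, assuming the root switch is chosen with a round-robin
     "destination mod k" rule (whether D-mod-K routing does exactly this is precisely the TODO above):

        # distribute 512 destinations over k root switches with a "dest mod k"
        # rule and compare the per-switch load for k = 15 and k = 16
        from collections import Counter

        for k in (15, 16):
            load = Counter(dest % k for dest in range(512))
            vals = sorted(load.values())
            print(f'k = {k}: min load = {vals[0]}, max load = {vals[-1]}')
        # k = 15: loads of 34 and 35 (slightly imbalanced); k = 16: every switch gets 32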

1.5.9 2017-06-09 Friday

  1. TODO Work on run_measure.py script [0/5]   PYTHON

     • [ ] Clean the code. In particular, remove the stuff related to the small matrix product test.

     • [ ] Write some unit tests.

     • [ ] Add options, e.g. to set the bandwidth or the latency without modifying the code.

     • [ ] Add flexibility in the way the series of experiments are described. Maybe describe them with Python
       code in a separate file? Or a JSON file?

     • [ ] Parallelism: allow experiments to be launched on remote machines through ssh.


1.5.10 2017-06-12 Monday

  1. Add The LINPACK Benchmark: Past, Present and Future   PAPER

     Bibtex: Dongarra03thelinpack


1.5.11 2017-06-14 Wednesday

  1. Add Versatile, Scalable and Accurate Simulation of Distributed Applications and Platforms   PAPER

     Bibtex: casanova:hal-01017319

  2. Add LogP: Towards a Realistic Model of Parallel Computation   PAPER

     Bibtex: Culler1993

  3. Finally found a grammar checker \o/   TOOLS

1.5.12 2017-06-19 Monday

  1. Discussion about the slides   MEETING

     • A large part on the context (~10 min).
       • Top500 supercomputers (including Stampede), with their topology. Also Piz Daint (dragonfly, in
         Switzerland). Show the variability of the topologies. Photos of the supercomputers and a diagram of the
         topology.
       • Routing, workload, process placement.
       • HPL, HPL on Stampede, the fight against n^3, and also against p (p*n^2 in the complexity).

     • No state of the art (or only online vs. offline).

     • Drawings: inkscape or xfig.

     • Contribution: not too long (~7 min).

     • Validation: final goal, compare with Stampede.

     • Perspectives: capacity planning, topology study.

1.5.13 2017-06-20 Tuesday

  1. Tom's pre-defense rehearsal   MEETING

     • Number the slides (1/18).

     • Try to include the general diagram of the modifications.

     • Distill information about the type of gain.


     Slides:

     1. Rank?
        • We cannot speed up processors anymore?
        • Information about the scale, the topology, the diversity:

          As an answer to the power and heat challenges, processor constructors have increased the amount of
          computing units (or cores) per processor. Modern High Performance Computing (HPC) systems comprise
          thousands of nodes, each of them holding several multi-core processors. For example, one of the world's
          fastest computers, the IBM Sequoia system at Lawrence Livermore National Laboratory (USA), contains 96
          racks holding 98,304 nodes interconnected through a 5-dimensional torus and comprising 16 cores each,
          for a total of 1,572,864 cores. The Cray Titan system at Oak Ridge National Laboratory is made of 18,688
          AMD Opteron (16-core CPUs) and 18,688 Nvidia Tesla K20X GPUs interconnected through a Gemini
          three-dimensional torus. Another recent Cray machine, Piz Daint at the Swiss National Supercomputing
          Centre, comprises 5,272 nodes (with 8 cores and a Nvidia Tesla K20X GPU each) interconnected through a
          custom Aries dragonfly topology. More recently, the Tianhe-2 was built with 32,000 Intel Xeon (12 cores)
          and 48,000 Xeon Phi 31S1P interconnected through a TH-Express fat tree. Finally, the Sunway TaihuLight
          (Jiangsu, China), which is currently the fastest supercomputer in the world, is made of 40,950 nodes
          interconnected through a custom five-level hierarchy of cabinets, each comprising 260 custom RISC cores,
          for a total of 10,649,600 cores.

     3. HPL:
        • where N is the rank of the matrix
        • it works like Gaussian elimination:
          • search for the maximum, small factorization, broadcast, update, and start over
          • in the code, this is somewhat interleaved in order to properly overlap computations and communications

     5. Link with slide 1…
        • Transitions a bit clumsy. Explain that this is a very active field.

     7. SimGrid simulation of HPC applications
        • Trace.
          • Two problems (size needed to obtain the trace, dynamic applications: make the link with HPL)
          • SimGrid-style emulation: mutual exclusion. Advantage = emulation without modification, but it does not
            scale. Hybrid approaches are needed.
        • Plenty of projects, mostly offline. SimGrid allows both.

     9. 10:36. Why Stampede? We do not really care; we are here to give an order of magnitude.
        • 500 days of computation, without even counting the simulation of the application itself.

    10. • Laboratory notebook and scripts
        • Modified HPL
        • Modifications to SimGrid

    12. Integrate the "To sum up" into this series of slides.
        • Tdgemm = .4354*M*N*L
        • Tdtrsm = .234234*M*N^2

        Gain = ??? order of magnitude. Illustrate on a given configuration (a small one and a large one?).

    13. • Negligible, but a significant gain?
        • Amount of modifications to HPL?
        • At this point, we hardly do any computation anymore, but the memory consumption remains significant.

    14. • The application accesses these zones from time to time, so we cannot simply remove these allocations…

    16. Panel = information exchanged between the processes during the execution.

    18. 10:44
        • This allocation is a "problem".
        • Modification of HPL?

    19. • Consequences = observation at large scale.

    24. 10:50. A difficult case: the error is mostly on one node and decreases afterwards; systematic
        under-estimation.
        • Small-scale experiment
        • Systematic under-estimation
        • Factor of 2 on the outliers?

        Optimistic after dgemm.

    26. Conclusion
        • Light modifications of HPL
        • New features in SimGrid
        • Demonstrated that we could simulate at this scale while taking into account the fine characteristics of
          the topology, the placement, …

    28. Add capacity planning?

  2. Last remarks from Arnaud   MEETING

     • Various functions → swap, max, …

     • Simulation of HPC applications → talk about SimGrid

     • Slide 7: write that it is very optimistic (replace ≈ with ≥)

     • Slide 18: add a word about the failure and energy aspects

     • Slide 16 → systematically

     • Slide 1: add the name, rank, number of nodes and cores, topology

1.5.14 2017-06-21 Wednesday

  1. Tom's pre-defense rehearsal, V2   MEETING

     Intro: large-scale MPI simulation… and capacity planning?

     2. Explain the algorithm before the animation
        • mention the overlapping

     4. There are questions… There are several levers we can act on to make things go faster.
        • There are "recipes". People say "I want this", but it is based on their experience/opinion and the
          argumentation is limited.

     5. • adaptive applications; this is in fact the case of HPL
        • advantages/drawbacks of the emulation approach?

     7. • several optimizations (some quite logical, others less obvious)

    10. • why not simply remove the mallocs?

    13. Well, now we have removed all the computations and all the allocations: almost nothing but the control
        remains. And yet, at large scale, it still does not go through.

    15. • You see, the quadratic effect in N and in P is still there, and that is what was hard.

    16. • Explain the curve! It is very small.
        • No outliers. Rather say: no variability, and therefore no outliers. Problem: this has consequences
          because of the synchronizations.
        • Optimistic model (no injected variability, perfect bandwidth sharing)

1.5.15 2017-06-23 Friday

  1. Trying to understand the low CPU utilization for large allocations   C EXPERIMENTS

     • According to Olivier, the low CPU utilization when doing large allocations (without huge pages) is not
       expected. Let's investigate.

     • Script commit: 80c6cd6f0853821a08da3994ce89572c9996b5ea

     • Command (the size corresponds to the allocation of a matrix of size at most 600,000):

        ./cpu_utilization.py 8 2880000000000 /tmp/cpu_exp.csv

     • Analysis:

        library(ggplot2)
        results <- read.csv('cpu_utilization/cpu_exp.csv')

        ggplot(results, aes(x=size, y=cpu_utilization)) +
            geom_point() + geom_line()

     • So we reproduce this behavior outside of HPL and Simgrid.

  2. DONE Draw a flame graph with this small program and a large allocation.

  3. Flame graph for the CPU utilization   C EXPERIMENTS

     • Script commit: 80c6cd6f0853821a08da3994ce89572c9996b5ea

     • Command (the size corresponds to the allocation of a matrix of size 600,000):

        sudo perf record -F1000 --call-graph dwarf ./page_faults 1 2880000000000 1

        sudo perf script | ~/Documents/FlameGraph/stackcollapse-perf.pl --kernel | ~/Documents/FlameGraph/flamegraph.pl > /tmp/flame_2880000000000.svg

     • Kernel version:

        uname -r

     • Result: the flame graph flame_2880000000000.svg (not reproduced here).

     • This flame graph is very interesting, although incomplete. First, note that the function main accounts for
       less than 40% of the samples, which is approximately equal to the CPU utilization. It means that this
       approach also captures what happens while the process is not running.

     • Most of the time spent in the function main is spent in a function do_page_fault.

     • The remaining 60% of the whole execution time is spent in two functions: one unknown, and one called
       native_irq_return_iret.

     • It is also strange to see the very large function page_faults located below the functions main (and
       _start) rather than beside them, although these functions are (a priori) not called by the function
       page_faults. Maybe a bug in perf?

  4. TODO Next steps in the investigation of the low CPU utilization [4/7]

     • [X] Plot the CPU utilization for different numbers of calls to memset (including 0).

     • [ ] Draw the flame graph with more calls to memset.

     • [ ] Draw the flame graph with no call to memset.

     • [X] Try other flags for the mmap, e.g. adding the flag MAP_POPULATE.

     • [X] Try with another kernel version.

     • [X] Try with huge pages, to see the difference.

     • [ ] Speak with someone (Olivier? Samuel? Vincent? Stack Overflow?).

1.5.16 2017-06-24 Saturday

  1. Small test: several calls to memset   C EXPERIMENTS

     • Script commit: b8a110e9a57c821b37a3843738b97bc0affb52f6

     • No call to memset:

        /usr/bin/time ./page_faults 1 2880000000000 0

        2.00202
        0.04user 1.95system 0:02.00elapsed 99%CPU (0avgtext+0avgdata 5108maxresident)k
        0inputs+4096outputs (0major+521minor)pagefaults 0swaps

     • One call to memset:

        /usr/bin/time ./page_faults 1 2880000000000 1

        2013.29
        158.71user 604.73system 33:33.29elapsed 37%CPU (0avgtext+0avgdata 2812501956maxresident)k
        0inputs+102400outputs (0major+703125270minor)pagefaults 0swaps

     • Ten calls to memset:

        /usr/bin/time ./page_faults 1 2880000000000 10

        23344.3
        1622.97user 5224.14system 6:29:04elapsed 29%CPU (0avgtext+0avgdata 2812502520maxresident)k
        0inputs+958464outputs (0major+7031250411minor)pagefaults 0swaps

     • No call to memset, but using the flag MAP_POPULATE:

        /usr/bin/time ./page_faults 1 2880000000000 0

        136.016
        0.04user 103.22system 2:16.01elapsed 75%CPU (0avgtext+0avgdata 2812501680maxresident)k
        0inputs+4096outputs (0major+43946592minor)pagefaults 0swaps

     • When no accesses are made and the flag MAP_POPULATE is not used, the execution is very fast, there are
       nearly no page faults and the CPU utilization is high.

     • With one access, we get the very low CPU utilization and the very large time.

     • With ten accesses, the CPU utilization is even lower, the number of page faults is ten times higher, and
       both the user time and the system time are also about ten times higher. This is very strange.

     • With no access but with the flag MAP_POPULATE, the time and the number of page faults are much larger, but
       still about ten times lower than with one access and no MAP_POPULATE.
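
     For reference, the same first-touch effect can be reproduced in a few lines of Python (a rough analogue only:
     the real page_faults program is written in C, and mmap.MAP_POPULATE requires Linux and Python >= 3.10):

        # touch a large anonymous mapping once and time it; OR mmap.MAP_POPULATE
        # into `flags` (Python >= 3.10) to pre-fault the pages as in the runs above
        import mmap
        import time

        size = 1 << 30  # 1 GiB, much smaller than the 2880000000000 bytes used above
        flags = mmap.MAP_PRIVATE | mmap.MAP_ANONYMOUS
        buf = mmap.mmap(-1, size, flags=flags)

        start = time.time()
        chunk = b'\x00' * (1 << 20)
        for _ in range(size // len(chunk)):  # one full pass, like one call to memset
            buf.write(chunk)
        print('first pass over the buffer:', time.time() - start, 'seconds')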

1.5.17 2017-06-25 Sunday

  1. More experiments about the low CPU utilization for large allocations   C R EXPERIMENTS

     • Script commit: b8a110e9a57c821b37a3843738b97bc0affb52f6, modified to have between 0 and 3 calls to memset.

     • In these results, the flag MAP_POPULATE is not used when calling mmap.

     • Command (the size corresponds to the allocation of a matrix of size at most 300,000):

        ./cpu_utilization.py 100 720000000000 cpu_exp2.csv

     • Run on nova-17 with kernel 4.9.0-2-amd64.

     • Analysis:

        library(gridExtra)
        library(ggplot2)
        results <- read.csv('cpu_utilization/cpu_exp2.csv')

        p1 = ggplot(results, aes(x=size, y=cpu_utilization, color=factor(mem_access))) +
            geom_point() + geom_line()
        p2 = ggplot(results, aes(x=size, y=total_time, color=factor(mem_access))) +
            geom_point() + geom_line()
        grid.arrange(p1, p2, ncol=2)

        p1 = ggplot(results, aes(x=size, y=user_time, color=factor(mem_access))) +
            geom_point() + geom_line()
        p2 = ggplot(results, aes(x=size, y=system_time, color=factor(mem_access))) +
            geom_point() + geom_line()
        grid.arrange(p1, p2, ncol=2)

        p1 = ggplot(results, aes(x=size, y=memory_size, color=factor(mem_access))) +
            geom_point() + geom_line()
        p2 = ggplot(results, aes(x=size, y=nb_page_faults, color=factor(mem_access))) +
            geom_point() + geom_line()
        grid.arrange(p1, p2, ncol=2)

     • Finally, the number of accesses to the buffer does not seem to impact the CPU utilization. The difference
       we observed on [2017-06-24 Sat] was probably only noise.

     • The user time seems to be proportional to both the allocation size and the number of calls to memset. This
       is expected.

     • The system time also seems to be proportional to them. The impact of the allocation size is expected, but
       the impact of the number of accesses is not obvious. It seems to come from the number of page faults (which
       is also proportional to both the allocation size and the number of accesses). But the plot of the number of
       page faults is hard to understand: why would more accesses cause more page faults, when the page table is
       already initialized?

     • Another strange thing is that the memory consumption is lower with only one access than with two or three.
       They should all have the same page table size and thus the same memory consumption.

1.5.18 2017-06-26 Monday

  1. Small test: several calls to memset with huge pages   C EXPERIMENTS
    -
    -
      -
    • Script commit: 005461dad4c06a2e2463d54eec228e65c07b1015
    • Compilation:
      gcc -DHUGEPAGE -std=gnu11 -ggdb3 -O3 -o page_faults page_faults.c -Wall
    • So, same experiment as [2017-06-24 Sat], except that huge pages and the MAP_POPULATE flag are used.
    • No call to memset, but using the flag MAP_POPULATE:
      3.34278
      0.04user 3.29system 0:03.34elapsed 99%CPU (0avgtext+0avgdata 1476maxresident)k
      0inputs+0outputs (0major+65minor)pagefaults 0swaps
      Much lower number of page faults and system time. Higher CPU utilization.

    • One call to memset:
      /usr/bin/time ./page_faults 1 2880000000000 1
      102.2
      98.77user 3.26system 1:42.20elapsed 99%CPU (0avgtext+0avgdata 1492maxresident)k
      0inputs+0outputs (0major+67minor)pagefaults 0swaps
      In comparison with the case where no huge pages are used, the number of page faults and the time are much lower. Also, the system time and the number of page faults are the same as in the previous test, where no memset was done; only the user time increased. It is strange that the number of page faults is so low: with such an allocation size, we have about 1.3M huge pages.

    • Ten calls to memset:
      /usr/bin/time ./page_faults 1 2880000000000 10
      988.682
      984.74user 3.45system 16:28.68elapsed 99%CPU (0avgtext+0avgdata 1488maxresident)k
      0inputs+0outputs (0major+66minor)pagefaults 0swaps
      Same system time and number of page faults as with only one call to memset; only the user time increases. This is the expected behavior.

    • Let’s try without the MAP_POPULATE flag.
    • One call to memset:
      /usr/bin/time ./page_faults 1 2880000000000 1
      102.302
      99.10user 3.18system 1:42.30elapsed 99%CPU (0avgtext+0avgdata 1520maxresident)k
      0inputs+0outputs (0major+1373356minor)pagefaults 0swaps
      The number of page faults is now as expected, but this did not change the system time.

    • Ten calls to memset:
      /usr/bin/time ./page_faults 1 2880000000000 10
      1001.42
      997.40user 3.30system 16:41.41elapsed 99%CPU (0avgtext+0avgdata 1572maxresident)k
      0inputs+0outputs (0major+1373359minor)pagefaults 0swaps
      We observe the same behavior as with the flag MAP_POPULATE: going from 1 call to memset to 10 does not impact the number of page faults or the system time, it only changes the user time.
    1. Conclusion

      Using classical pages or huge pages does not only change the page size (and thus the page table size): it actually changes the behavior of the OS. With classical pages, the system time and the number of page faults are proportional to both the allocation size and the number of accesses, whereas with huge pages they are only proportional to the allocation size.
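      The -DHUGEPAGE compilation flag above presumably switches the backing pages along these lines (a sketch of the flag mechanism only, ignoring the block folding; MAP_HUGETLB is the standard Linux way to request huge pages for an anonymous mapping and needs reserved hugepages):

      #include <sys/mman.h>

      /* Sketch: how a -DHUGEPAGE compile flag could select the page size.
       * With 2 MiB pages, a 2.88 TB range holds ~1.37M pages, matching the
       * ~1.37M minor faults observed above without MAP_POPULATE. */
      #ifdef HUGEPAGE
      #  define PAGE_FLAGS MAP_HUGETLB   /* 2 MiB pages */
      #else
      #  define PAGE_FLAGS 0             /* default 4 KiB pages */
      #endif

      static void *allocate(size_t size)
      {
          return mmap(NULL, size, PROT_READ | PROT_WRITE,
                      MAP_PRIVATE | MAP_ANONYMOUS | PAGE_FLAGS, -1, 0);
      }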
  2. Flame graph for the CPU utilization   C EXPERIMENTS
    • Script commit: 005461dad4c06a2e2463d54eec228e65c07b1015 (the file has been modified to remove the flag MAP_POPULATE).
    • Command (the size corresponds to an allocation of a matrix of order 600,000):
      sudo perf record -F1000 --call-graph dwarf ./page_faults 1 2880000000000 1
      sudo perf script | ~/Documents/FlameGraph/stackcollapse-perf.pl --kernel | ~/Documents/FlameGraph/flamegraph.pl > /tmp/flame_2880000000000_hugepage.svg
    • Kernel version:
      uname -r
    • Result: flame graph written to /tmp/flame_2880000000000_hugepage.svg (SVG not reproduced here).
    • This flame graph is hard to relate to the previous results.
    • We saw that there was a high CPU utilization (99%) and that most of the time was spent in user mode. But the graph shows that a very large part of the time is spent in some other function, outside of the program scope. My guess would be that such a function should not be accounted for in the program execution time and that we should therefore have a very low CPU utilization.
  3. Segmented regression   R
    • Wikipedia page
    • Example on StackExchange
    • Let’s try with dummy data.
      NB = 100
      A1 = 2 # coeff for first part
      A2 = 1 # coeff for second part
      B1 = 0 # intercept for first part
      B2 = 100 # intercept for second part
      df = data.frame(n=1:NB)
      df$n = sample(500, size=NB, replace=TRUE)
      df$noise = sample(20, size=NB, replace=TRUE)-10
      my_func <- function(n, noise) {
          if(n < 100) {
              return(A1*n+B1 + noise)
          }
          else {
              return(A2*n+B2 + noise)
          }
      }
      df$fn = mapply(my_func, df$n, df$noise)
      library(ggplot2)
      ggplot(df, aes(x=n, y=fn)) + geom_point()
    • The two modes are clearly visible, let’s try some regressions.
      library(segmented)
      model_segmented = segmented(lm(fn~n, data=df), seg.Z = ~ n)
      summary(model_segmented)
              ***Regression Model with Segmented Relationship(s)***

      Call: 
      segmented.lm(obj = lm(fn ~ n, data = df), seg.Z = ~n)

      Estimated Break-Point(s):
         Est. St.Err 
      99.197  3.361 

      Meaningful coefficients of the linear terms:
                  Estimate Std. Error t value Pr(>|t|)    
      (Intercept)  1.22041    4.02077   0.304    0.762    
      n            1.99373    0.06389  31.208   <2e-16 ***
      U1.n        -0.98928    0.06420 -15.409       NA    
      ---
      Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

      Residual standard error: 6.183 on 96 degrees of freedom
      Multiple R-Squared: 0.9985,  Adjusted R-squared: 0.9985 

      Convergence attained in 6 iterations with relative change 2.230614e-15
      plot(model_segmented)
    • Need to check, but it seems that:
      • It expects the underlying “function” to be “continuous”, which is not the case of what we have with dgemm on Stampede. If there is a discontinuity at the break point, the estimation fails.
      • The intercept value is B1.
      • The n coefficient is A1.
      • The U1.n coefficient is A2-A1.

1.5.19 2017-06-27 Tuesday

  1. Keep trying the segmented regression   R
    • Using code from stackoverflow
    • Asked a question on stackoverflow.
    • Let’s try with dummy data.
      NB = 100
      A1 = 2 # coeff for first part
      A2 = 1 # coeff for second part
      B1 = 0 # intercept for first part
      B2 = 300 # intercept for second part
      df = data.frame(n=1:NB)
      df$n = sample(500, size=NB, replace=TRUE)
      df$noise = sample(20, size=NB, replace=TRUE)-10
      my_func <- function(n, noise) {
          if(n < 100) {
              return(A1*n+B1 + noise)
          }
          else {
              return(A2*n+B2 + noise)
          }
      }
      df$fn = mapply(my_func, df$n, df$noise)
      -
      library(ggplot2)
      ggplot(df, aes(x=n, y=fn)) + geom_point()
    • First, using the segmented package.
      library(segmented)
      model_segmented = segmented(lm(fn~n, data=df), seg.Z = ~ n)
      summary(model_segmented)
              ***Regression Model with Segmented Relationship(s)***

      Call: 
      segmented.lm(obj = lm(fn ~ n, data = df), seg.Z = ~n)

      Estimated Break-Point(s):
          Est.  St.Err 
      136.566   5.677 

      Meaningful coefficients of the linear terms:
                  Estimate Std. Error t value Pr(>|t|)    
      (Intercept) -61.0463    11.7827  -5.181 1.22e-06 ***
      n             3.6374     0.1534  23.706  < 2e-16 ***
      U1.n         -2.6332     0.1593 -16.525       NA    
      ---
      Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

      Residual standard error: 33.92 on 96 degrees of freedom
      Multiple R-Squared: 0.9804,  Adjusted R-squared: 0.9798 

      Convergence attained in 4 iterations with relative change -7.90412e-16
      predict_segmented = data.frame(n = df$n, fn = broken.line(model_segmented)$fit)
      ggplot(df, aes(x = n, y = fn)) +
          geom_point() + geom_line(data = predict_segmented, color = 'blue')
    • Then, doing the segmentation by hand.
      Break<-sort(unique(df$n))
      Break<-Break[2:(length(Break)-1)]
      d<-numeric(length(Break))
      for (i in 1:length(Break)) {
          model_manual<-lm(fn~(n<Break[i])*n + (n>=Break[i])*n, data=df)
          d[i]<-summary(model_manual)[[6]]
      }
      plot(d)
      # Smallest breakpoint
      breakpoint = Break[which.min(d)]
      breakpoint
      df$group = df$n >= breakpoint
      model_manual<-lm(fn~n*group, data=df)
      summary(model_manual)
      [1] 100

      Call:
      lm(formula = fn ~ n * group, data = df)

      Residuals:
          Min      1Q  Median      3Q     Max 
      -9.6223 -5.0330 -0.5436  4.7791 10.4031 

      Coefficients:
                   Estimate Std. Error t value Pr(>|t|)    
      (Intercept)   1.02021    2.39788   0.425    0.671    
      n             1.98517    0.04128  48.090   <2e-16 ***
      groupTRUE   300.21629    3.07455  97.646   <2e-16 ***
      n:groupTRUE  -0.98826    0.04174 -23.678   <2e-16 ***
      ---
      Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

      Residual standard error: 5.984 on 96 degrees of freedom
      Multiple R-squared:  0.9994,	Adjusted R-squared:  0.9994 
      F-statistic: 5.248e+04 on 3 and 96 DF,  p-value: < 2.2e-16
      dat_pred = data.frame(n = df$n, fn = predict(model_manual, df))
      ggplot(df, aes(x = n, y = fn)) +
          geom_point() +
          geom_line(data=dat_pred[dat_pred$n < breakpoint,], color = 'blue') +
          geom_line(data=dat_pred[dat_pred$n >= breakpoint,], color = 'blue')
    • The segmented package fails when the data is discontinuous.
    • The dirty method works great.

diff --git a/module2/ressources/video_examples/paper.html b/module2/ressources/video_examples/paper.html
deleted file mode 100644

1 LaTeX Preamble   ignore

2 LaTeX IEEE title and authors   ignore

3 Abstract   ignore

The Linpack benchmark, in particular the High-Performance Linpack (HPL) implementation, has emerged as the de-facto standard benchmark to rank supercomputers in the TOP500. With a power consumption of several MW on a TOP500 machine, test-running HPL on the whole machine for hours is extremely expensive. With core counts beyond the 100,000-core threshold being common and sometimes even ranging into the millions, an optimization of HPL parameters (problem size, grid arrangement, granularity, collective operation algorithms, etc.) specifically suited to the network topology and performance is essential. Such optimization can be particularly time consuming and can hardly be done through simple mathematical performance models. In this article, we explain how we both extended SimGrid's SMPI simulator and slightly modified HPL to allow a fast emulation of HPL on a single commodity computer at the scale of a supercomputer. More precisely, we take as a motivating use case the large-scale run performed on the Stampede cluster at TACC in 2013, when it was ranked 6th in the TOP500. While this qualification run required the dedication of 6,006 computing nodes of the supercomputer and more than 120 TB of RAM for more than 2 hours, we manage to simulate a similar configuration on a commodity computer with 19 GB of RAM in about 62 hours. Allied to a careful modeling of Stampede, this simulation allows us to evaluate the performance that would have been obtained using the freely available version of HPL. This performance turns out to be much lower than the one reported, which was obtained using a closed-source version specifically designed by Intel engineers. Our simulation allows us to hint at where the main algorithmic improvements must have been made in HPL.

4 Introduction

The world's largest and fastest machines are ranked twice a year in the so-called TOP500 list. Among the benchmarks that are often used to evaluate those machines, the Linpack benchmark, in particular the High-Performance Linpack (HPL) implementation, has emerged as the de-facto standard benchmark, although other benchmarks such as HPCG and HPGMG have recently been proposed to become the new standard. Today, machines with 100,000 cores and more are common and several machines beyond the 1,000,000-core mark are already in production. This high density of computation units requires diligent optimization of application parameters, such as problem size, process organization or choice of algorithm, as these have an impact on load distribution and network utilization. Furthermore, to yield the best benchmark results, runtimes (such as Open MPI) and supporting libraries (such as BLAS) need to be fine-tuned and adapted to the underlying platform.

Alas, it typically takes several hours to run HPL on the list's number one system. This duration, combined with a power consumption that often reaches several MW for TOP500 machines, makes it financially infeasible to test-run HPL on the whole machine just to tweak parameters. Yet, performance results of an already deployed, current-generation machine typically also play a role in the funding process for future machines. Results near the optimal performance for the current machine are hence considered critical for HPC centers and vendors. These entities would benefit from being able to tune parameters without actually running the benchmark for hours.

In this article, we explain how to predict the performance of HPL through simulation with the SimGrid/SMPI simulator. We detail how we obtained faithful models for several functions (\eg DGEMM and DTRSM) and how we managed to reduce the memory consumption from more than a hundred terabytes to several gigabytes, allowing us to emulate HPL on a commonly available server node. We evaluate the effectiveness of our solution by simulating a scenario similar to the run conducted on the Stampede cluster (TACC) in 2013 for the TOP500.

This article is organized as follows: Section\ref{sec:con} presents the main characteristics of the HPL application and provides details on the run that was conducted at TACC in 2013. Section\ref{sec:relwork} discusses existing related work and explains why emulation (or online simulation) is the only relevant approach when studying an application as complex as HPL. In Section\ref{sec:smpi}, we briefly present the simulator we used for this work, SimGrid/SMPI, followed by an extensive discussion in Section\ref{sec:em} of the optimizations on all levels (\ie simulator, application, system) that were necessary to make a large-scale run tractable. The scalability of our approach is evaluated in Section\ref{sec:scalabilityevol}. The modeling of the Stampede platform and the comparison of our simulation with the 2013 execution is detailed in Section\ref{sec:science}. Lastly, Section\ref{sec:cl} concludes this article by summarizing our contributions.

5 Context

5.1 High-Performance Linpack

\label{sec:hpl}

For this work, we use the freely-available reference implementation of the High-Performance Linpack benchmark\cite{HPL}, HPL, which is used to benchmark systems for the TOP500\cite{top500} list. HPL requires MPI to be available and implements an LU decomposition, \ie a factorization of a square matrix \(A\) as the product of a lower triangular matrix \(L\) and an upper triangular matrix \(U\). HPL checks the correctness of this factorization by solving a linear system \(A\cdot{}x=b\), but only the factorization step is benchmarked. The factorization is based on a right-looking variant of the LU factorization with row partial pivoting and allows multiple look-ahead depths. The working principle of the factorization is depicted in Figure\ref{fig:hpl_overview} and consists of a series of panel factorizations followed by an update of the trailing sub-matrix. HPL uses a two-dimensional block-cyclic data distribution of \(A\) and implements several custom collective communication algorithms to efficiently overlap communication with computation. The main parameters of HPL are listed subsequently:
  • \(N\) is the order of the square matrix \(A\).
  • NB is the ``blocking factor'', \ie the granularity at which HPL operates when panels are distributed or worked on.
  • \(P\) and \(Q\) denote the number of process rows and the number of process columns, respectively.
  • RFACT determines the panel factorization algorithm. Possible values are Crout, left- or right-looking.
  • SWAP specifies the swapping algorithm used while pivoting. Two algorithms are available: one based on binary exchange (along a virtual tree topology) and the other one based on a spread-and-roll (with a higher number of parallel communications). HPL also provides a panel-size threshold triggering a switch from one variant to the other.
  • BCAST sets the algorithm used to broadcast the panel of columns to the other process columns. Legacy versions of the MPI standard only supported non-blocking point-to-point communications but did not support non-blocking collective communications, which is why HPL ships with in total 6 self-implemented variants to efficiently overlap the time spent waiting for an incoming panel with updates to the trailing matrix: ring, ring-modified, 2-ring, 2-ring-modified, long, and long-modified. The modified versions guarantee that the process right after the root (\ie the process that will become the root in the next iteration) receives data first and does not participate further in the broadcast. This process can thereby start working on the panel as soon as possible. The ring and 2-ring versions correspond to the two name-giving virtual topologies, while the long version is a spread-and-roll algorithm where messages are chopped into \(Q\) pieces. This generally leads to better bandwidth exploitation. The ring and 2-ring variants rely on MPI_Iprobe, meaning they return control if no message has been fully received yet and hence facilitate partial overlap of communication with computation. In HPL 2.2 and 2.1, this capability has been deactivated for the long and long-modified algorithms. A comment in the source code states that some machines apparently get stuck when there are too many ongoing messages.
  • DEPTH controls how many iterations of the outer loop can overlap with each other.

The sequential complexity of this factorization is \(\mathrm{flop}(N) = \frac{2}{3}N^3 + 2N^2 + \O(N)\) where \(N\) is the order of the matrix to factorize. The time complexity can be approximated by \[T(N) \approx \frac{\left(\frac{2}{3}N^3 + 2N^2\right)}{P\cdot{}Q\cdot{}w} + \Theta((P+Q)\cdot{}N^2),\] where \(w\) is the flop rate of a single node and the second term corresponds to the communication overhead, which is influenced by the network capacity and by the previously listed parameters (RFACT, SWAP, BCAST, DEPTH, \ldots). After each run, HPL reports the overall flop rate \(\mathrm{flop}(N)/T(N)\) (expressed in \si{\giga\flops}) for the given configuration. See Figure\ref{fig:hpl_output} for a (shortened) example output.

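As a quick sanity check of this expression against the numbers used later in this article: for the Stampede run, \(\mathrm{flop}(3,875,000) \approx \frac{2}{3}\cdot(3.875\times10^{6})^3 \approx 3.88\times10^{19}\) flops, which at the reported \SI{5168.1}{\tera\flops} corresponds to \(3.88\times10^{19}/(5.1681\times10^{15}) \approx \SI{7500}{\sec}\), \ie the roughly two-hour duration reported for the 2013 execution.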

A large-scale execution of HPL on a real machine in order to submit to the TOP500 can therefore be quite time consuming, as all the BLAS kernels, the MPI runtime, and HPL's numerous parameters need to be tuned carefully in order to reach optimal performance.

5.2 A Typical Run on a Supercomputer

\label{sec:stampede} In June 2013, the Stampede supercomputer at TACC was ranked 6th in the TOP500 by achieving \SI{5168.1}{\tera\flops} and was still ranked 20th in June 2017. In 2017, this machine was upgraded and renamed Stampede2. The Stampede platform consisted of 6,400 Sandy Bridge nodes, each with two 8-core Xeon E5-2680 and one Intel Xeon Phi KNC MIC coprocessor. The nodes were interconnected through a \SI{56}{\giga\bit\per\second} FDR InfiniBand 2-level Clos fat-tree topology built on Mellanox switches. As can be seen in Figure\ref{fig:fat_tree_topology}, the 6,400 nodes are divided into groups of 20, with each group being connected to one of the 320 36-port switches (\SI{4}{\tera\bit\per\second} capacity), which are themselves connected to 8 648-port ``core switches'' (each with a capacity of \SI{73}{\tera\bit\per\second}). The peak performance of the 2 Xeon CPUs per node was approximately \SI{346}{\giga\flops}, while the peak performance of the KNC co-processor was about \SI{1}{\tera\flops}. The theoretical peak performance of the platform was therefore \SI{8614}{\tera\flops}. However, in the TOP500, Stampede was ranked with \SI{5168}{\tera\flops}. According to the log submitted to the TOP500 (see Figure\ref{fig:hpl_output}) that was provided to us, this execution took roughly two hours and used \(77\times78 = 6,006\) processes. The matrix of order \(N = 3,875,000\) occupied approximately \SI{120}{\tera\byte} of memory, \ie \SI{20}{\giga\byte} per node. One MPI process per node was used, and each node's computational resources (the 16 CPU cores and the Xeon Phi) must have been controlled by OpenMP and/or Intel's MKL.

5.3 Performance Evaluation Challenges

The performance achieved by Stampede, \SI{5168}{\tera\flops}, needs to be compared to the peak performance of the 6,006 nodes, \ie \SI{8084}{\tera\flops}. This difference may be attributed to the node usage (\eg the MKL), to the MPI library, to the network topology that may be unable to deal with the very intensive communication workload, to load imbalance among nodes because some node happens to be slower for some reason (defect, system noise, \ldots), to the algorithmic structure of HPL, etc. All these factors make it difficult to know precisely what performance to expect without running the application at scale.

It is clear that, due to the level of complexity of both HPL and the underlying hardware, simple performance models (analytic expressions based on \(N, P, Q\) and estimations of platform characteristics as presented in Section\ref{sec:hpl}) may be able to provide trends but can by no means predict the performance for each configuration (\ie consider the exact effect of HPL's 6 different broadcast algorithms on network contention). Additionally, these expressions do not allow engineers to improve the performance through actively identifying performance bottlenecks. For complex optimizations such as partially non-blocking collective communication algorithms intertwined with computations, a very faithful modeling of both the application and the platform is required. Given the scale of this scenario (3,785 steps on 6,006 nodes in two hours), detailed simulations quickly become intractable without significant effort.

6 Related Work

Performance prediction of MPI applications through simulation has been widely studied over the last decades, with today's literature mainly distinguishing between two approaches: offline and online simulation.

With the most common approach, offline simulation, a time-independent trace of the application is first obtained on a real platform. This trace comprises sequences of MPI operations and CPU bursts and can be given as an input to a simulator that implements performance models for the CPUs and the network to derive timings. Researchers interested in finding out how their application reacts to changes to the underlying platform can replay the trace on commodity hardware at will with different platform models. Most HPC simulators available today, notably BigSim\cite{bigsim_04}, Dimemas\cite{dimemas} and CODES\cite{CODES}, rely on this approach.

The main limitation of this approach comes from the trace acquisition requirement. Additionally, tracing an application provides only information about its behavior at the time of the run. Even light modifications (\eg to communication patterns) may make the trace inaccurate. For simple applications (\eg stencils) it is sometimes possible to extrapolate behavior from small-scale traces\cite{scalaextrap,pmac_lspp13}, but the execution is non-deterministic whenever the application relies on non-blocking communication patterns, which is unfortunately the case for HPL.

The second approach discussed in the literature is online simulation. Here, the application is executed (emulated) on top of a simulator that is responsible for determining when each process runs. This approach allows researchers to study directly the behavior of MPI applications, but only a few recent simulators such as SST Macro\cite{sstmacro}, SimGrid/SMPI\cite{simgrid} and the closed-source extreme-scale simulator xSim\cite{xsim} support it. To the best of our knowledge, only SST Macro and SimGrid/SMPI are not only mature enough to faithfully emulate HPL but also free software. For our work, we relied on SimGrid as we have an excellent knowledge of its internals, although the developments we propose would a priori also be possible with SST Macro. Emulation of HPL comes with at least two challenges:
  • Firstly, the time complexity of the algorithm is \(\Theta(N^3)\). Furthermore, \(\Theta(N^2)\) communications are performed, with \(N\) being very large. The execution on the Stampede cluster took roughly two hours on 6,006 compute nodes. Using only a single node, a naive emulation of HPL at the scale of the Stampede run would take about 500 days if perfect scaling is reached. Although the emulation could be done in parallel, we want to use as few computing resources as possible.
  • Secondly, the tremendous memory consumption and the consequent high number of RAM accesses for read/write operations need to be dealt with.
7 SimGrid/SMPI in a nutshell

SimGrid\cite{simgrid} is a flexible and open-source simulation framework that was originally designed in 2000 to study scheduling heuristics tailored to heterogeneous grid computing environments. Since then, SimGrid has also been used to study peer-to-peer systems with up to two million peers\cite{simgrid_simix2_12} as well as cloud and HPC infrastructures. To this end, SMPI, a simulator based on SimGrid, has been developed and used to faithfully simulate unmodified MPI applications written in C/C++ or FORTRAN\cite{smpi}. A main development goal for SimGrid has been to provide validated performance models, particularly for scenarios leveraging the network. Such a validation normally consists of comparing simulation predictions with results from real experiments to confirm or debunk network and application models. In\cite{heinrich:hal-01523608}, we have for instance validated SimGrid's energy module by accurately and consistently predicting within a few percent the performance and the energy consumption of HPL and some other benchmarks on small-scale clusters (up to \(12\times12\) cores in\cite{heinrich:hal-01523608} and up to \(128\times1\) cores in\cite{smpi}).

In this article, we aim to validate our approach through much larger experiments. This scale, however, comes at the cost of a much less controlled scenario for real-life experiments, since the Stampede run of HPL was done in 2013 and we only have very limited information about the setup (\eg software versions).

7.1 MPI Communication Modeling

The complex network optimizations done in real MPI implementations need to be considered when predicting the performance of MPI applications. For instance, the message size influences not only the network's latency and bandwidth factors but also the protocol used (\eg ``eager'' or ``rendez-vous''), as protocols are selected based on the message size and each comes with its own synchronization semantics. To deal with this, SMPI relies on a generalization of the LogGPS model\cite{smpi} and supports specifying synchronization and performance modes. This model needs to be instantiated once per platform through a carefully controlled series of messages (MPI_Send and MPI_Recv) between two nodes and through a set of piece-wise linear regressions.
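To illustrate what such an instantiated model looks like, here is a minimal sketch (the regime boundaries and coefficients below are invented placeholders, not SMPI's calibrated values for any real platform):

    #include <stddef.h>

    /* Piece-wise linear communication model: each protocol regime gets its
     * own latency / inverse-bandwidth pair. All values are placeholders. */
    typedef struct {
        size_t max_size;   /* upper bound (bytes) for this regime */
        double latency;    /* seconds */
        double inv_bw;     /* seconds per byte */
    } segment_t;

    static const segment_t model[] = {
        { 1 << 12,     1e-6, 2.0e-10 },  /* "eager" regime, small messages */
        { 1 << 16,     3e-6, 1.5e-10 },  /* intermediate regime */
        { (size_t)-1,  1e-5, 1.0e-10 },  /* "rendez-vous" regime, bulk messages */
    };

    double predicted_time(size_t bytes)
    {
        int i = 0;
        while (bytes > model[i].max_size)   /* find the regime for this size */
            i++;
        return model[i].latency + (double)bytes * model[i].inv_bw;
    }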


Modeling network topologies and contention is also difficult. SMPI relies on SimGrid's communication models, where each ongoing communication is represented as a whole (as opposed to single packets) by a flow. Assuming steady-state, contention between active communications can be modeled as a bandwidth sharing problem that accounts for non-trivial phenomena (\eg RTT-unfairness of TCP, cross-traffic interference or network heterogeneity\cite{Velho_TOMACS13}). Communications that start or end trigger a re-computation of the bandwidth sharing if needed. In this model, the time to simulate a message passing through the network is independent of its size, which is advantageous for large-scale applications frequently sending large messages. SimGrid does not model transient phenomena incurred by the network protocol but accounts for network topology and heterogeneity.

Finally, collective operations are also challenging, particularly since these operations often play a key role in an application's performance. Consequently, performance optimization of these operations has been studied intensively. As a result, MPI implementations now commonly have several alternatives for each collective operation and select one at runtime, depending on message size and communicator geometry. SMPI implements the collective communication algorithms and the selection logic from several MPI implementations (\eg Open MPI, MPICH), which helps to ensure that simulations are as close as possible to real executions. Although SMPI supports these facilities, they are not required in the case of HPL, as it ships with its own implementation of collective operations.

7.2 Application Behavior Modeling

In Section\ref{sec:relwork} we explained that SMPI relies on the online simulation approach. Since SimGrid is a sequential simulator, SMPI maps every MPI process of the application onto a lightweight simulation thread. These threads are then run one at a time, \ie in mutual exclusion. Every time a thread enters an MPI call, SMPI takes control, and the time that was spent computing (isolated from the other threads) since the previous MPI call can be injected into the simulator as a virtual delay.

Mapping MPI processes to threads of a single process effectively folds them into the same address space. Consequently, global variables in the MPI application are shared between threads unless these variables are privatized and the simulated MPI ranks thus isolated from each other. Several technical solutions are possible to handle this issue\cite{smpi}. The default strategy in SMPI consists of making a copy of the data segment (containing all global variables) per MPI rank at startup and, when context switching to another rank, remapping the data segment via mmap to the private copy of that rank. SMPI also implements another mechanism relying on the dlopen function that saves calls to mmap when context switching.

This causes online simulation to be expensive in terms of both simulation time and memory, since the whole parallel application is executed on a single node. To deal with this, SMPI provides two simple annotation mechanisms:
  • Kernel sampling: Control flow is in many cases independent of the computation results. This allows computation-intensive kernels (\eg BLAS kernels for HPL) to be skipped during the simulation. For this purpose, SMPI supports the annotation of regular kernels through several macros such as SMPI_SAMPLE_LOCAL and SMPI_SAMPLE_GLOBAL. The regularity allows SMPI to execute these kernels a few times, estimate their cost and skip the kernel in the future by deriving its cost from these samples, hence cutting simulation time significantly. Skipping kernels renders the content of some variables invalid, but in simulation, only the behavior of the application and not the correctness of the computation results is of concern.
  • Memory folding: SMPI provides the SMPI_SHARED_MALLOC (SMPI_SHARED_FREE) macro to replace calls to malloc (free). These macros indicate that some data structures can safely be shared between processes, that the data they contain is not critical for the execution (\eg an input matrix), and that it may even be overwritten. SMPI_SHARED_MALLOC works as follows (see Figure\ref{fig:global_shared_malloc}): a single block of physical memory (of default size \SI{1}{\mega\byte}) for the whole execution is allocated and shared by all MPI processes. A range of virtual addresses corresponding to a specified size is reserved and cyclically mapped onto the previously obtained physical address. This mechanism allows applications to obtain a nearly constant memory footprint, regardless of the size of the actual allocations. A minimal sketch of its use is given after this list.
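For concreteness, here is a minimal sketch of how the folding macro is used (our illustration, not an excerpt from HPL; the smpi/smpi.h header name is an assumption):

    #include <stddef.h>
    #include <smpi/smpi.h>   /* assumed header exposing the SMPI macros */

    /* The n-by-n input matrix is allocated through SMPI's folding macro:
     * its content is bogus (every rank sees the same folded block), but
     * the physical footprint stays nearly constant regardless of n. */
    double *allocate_input_matrix(size_t n)
    {
        return (double *)SMPI_SHARED_MALLOC(n * n * sizeof(double));
    }

    void free_input_matrix(double *a)
    {
        SMPI_SHARED_FREE(a);   /* matching macro replacing free */
    }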

8 Improving SMPI Emulation Mechanisms and Preparing HPL

We now present our changes to SimGrid and HPL that were required for a scalable and faithful simulation. We provide only a brief evaluation of our modifications and refer the reader interested in details to\cite{cornebize:hal-01544827} and our laboratory notebook.

For our experiments in this section, we used a single core from nodes of the Nova cluster provided by the Grid'5000 testbed\cite{grid5000}, with \SI{32}{\giga\byte} RAM, two 8-core Intel Xeon E5-2620 v4 CPUs at \SI{2.1}{\GHz} and Debian Stretch (kernel 4.9).

8.1 Kernel modeling

As explained in Section\ref{sec:con:diff}, faithful prediction of HPL necessitates emulation, \ie executing the code. HPL relies heavily on BLAS kernels such as dgemm (for matrix-matrix multiplication) or dtrsm (for solving an equation of the form \(Ax=b\)). An analysis of an HPL simulation with \(64\) processes and a very small matrix of order \(30,000\) showed that roughly \SI{96}{\percent} of the time is spent in these two very regular kernels. For larger matrices, these kernels consume an even bigger percentage of the computation time. Since these kernels do not influence the control flow, simulation time can be reduced by substituting dgemm and dtrsm function calls with a performance model of the respective kernel. Figure\ref{fig:macro_simple} shows an example of this macro-based mechanism that allows us to keep HPL code modifications to an absolute minimum. The (1.029e-11) value represents the inverse of the flop rate for this computation kernel and was obtained through calibration. The estimated time for the real kernel is calculated based on the parameters and eventually passed on to smpi_execute_benched, which advances the clock of the executing rank by this estimate by entering a sleep state. The effect on simulation time for a small scenario is depicted in Figure\ref{fig:kernel_sampling}. On the one hand, this modification speeds up the simulation by orders of magnitude, especially when the matrix order grows. On the other hand, this kernel model leads to an optimistic estimation of the flop rate. This may be caused by inaccuracies in our model as well as by the fact that the initial emulation is generally more sensitive to preemptions, \eg by the operating system, and is therefore more likely to be pessimistic compared to a real execution.

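The macro in Figure\ref{fig:macro_simple} is along the following lines (a sketch: smpi_execute_benched is the SMPI function named above, the argument list mirrors the BLAS dgemm signature, and the cost expression is our reading of the calibrated 1.029e-11 coefficient):

    /* Replace the real matrix-matrix product by injecting its predicted
     * duration into the simulated clock of the calling rank. */
    #define HPL_dgemm(layout, TransA, TransB, M, N, K,              \
                      alpha, A, lda, B, ldb, beta, C, ldc)          \
        smpi_execute_benched(1.029e-11 * (double)(M) * (double)(N) * (double)(K))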

8.2 Adjusting the behavior of HPL

HPL uses pseudo-randomly generated matrices that need to be set up every time HPL is executed. The time spent on this, just as the validation of the computed result, is not considered in the reported \si{\giga\flops} performance. We skip all the computations since we replaced them by a kernel model; therefore, result validation is meaningless. Since both phases have no impact on the reported performance, we can safely skip them.

In addition to the main computation kernels dgemm and dtrsm, we identified through profiling seven other BLAS functions as computationally expensive enough to justify a specific handling: dgemv, dswap, daxpy, dscal, dtrsv, dger and idamax. Similarly, a significant amount of time was spent in fifteen functions implemented in HPL: HPL_dlaswp*N, HPL_dlaswp*T, HPL_dlacpy and HPL_dlatcpy.

All of these functions are called during the LU factorization and hence impact the performance measured by HPL; however, because of the removal of the dgemm and dtrsm computations, they all operate on bogus data and hence also produce bogus data. We also determined through experiments that their impact on the performance prediction is minimal, and we hence modeled them, for the sake of simplicity, as being instantaneous.

Note that HPL implements an LU factorization with partial pivoting, hence the special treatment of the idamax function, which returns the index of the first element equaling the maximum absolute value. Although we ignored the cost of this function as well, we set its return value to an arbitrary value to make the simulation fully deterministic. We confirmed that this modification is harmless in terms of performance prediction while it speeds up the simulation by an additional factor of \(\approx3\) to \(4\) on small scenarios (\(N=30,000\)) and even more on large ones.
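A sketch of the kind of stub we mean (illustrative only; the chosen index is arbitrary by design, since the data is bogus anyway):

    /* Deterministic replacement for the pivot search: ignoring the inputs
     * and always returning the same index makes every run identical. */
    static inline int idamax_stub(const int n, const double *x, const int incx)
    {
        (void)n; (void)x; (void)incx;  /* inputs deliberately ignored */
        return 0;                      /* pretend the max is at index 0 */
    }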


8.3 Memory folding

As explained in Section\ref{sec:smpi}, when emulating an application with SMPI, all MPI processes are run within the same simulation process on a single node. The memory consumption of the simulation can therefore quickly reach several \si{\tera\byte} of RAM.

Yet, as we no longer operate on real data, storing the whole input matrix \(A\) is needless. However, since only a minimal portion of the code was modified, some functions may still read or write some parts of the matrix. It is thus not possible to simply remove the memory allocations of large data structures altogether. Instead, SMPI's SHARED_MALLOC mechanism can be used to share unimportant data structures between all ranks, minimizing the memory footprint.

The two largest allocated data structures in HPL are the input matrix A (with a size of typically several \si{\giga\byte} per process) and the panel, which contains information about the sub-matrix currently being factorized. This sub-matrix typically occupies a few hundred \si{\mega\byte} per process. Although using the default SHARED_MALLOC mechanism works flawlessly for A, a more careful strategy needs to be used for the panel. Indeed, the panel is an intricate data structure with both \texttt{int}s (accounting for matrix indices, error codes, MPI tags, and pivoting information) and \texttt{double}s (corresponding to a copy of a sub-matrix of A). To optimize data transfers, HPL flattens this structure into a single allocation of \texttt{double}s (see Figure\ref{fig:panel_structure}). Using a fully shared memory allocation for the panel therefore leads to index corruption that results in classic invalid memory accesses as well as communication deadlocks, as processes may not send to or receive from the correct process. Since \texttt{int}s and \texttt{double}s are stored in non-contiguous parts of this flat allocation, it is essential to have a mechanism that preserves the process-specific content. We have thus introduced the macro SMPI_PARTIAL_SHARED_MALLOC, which works as follows: mem = SMPI_PARTIAL_SHARED_MALLOC(500, {27,42, 100,200}, 2). In this example, 500 bytes are allocated in mem, with the elements mem[27], …, mem[41] and mem[100], …, mem[199] being shared between processes (they are therefore generally completely corrupted) while all other elements remain private. To apply this to HPL's panel data structure and partially share it between processes, we only had to modify a few lines.
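In code, the call shown above looks as follows (our rendering; passing the offsets as an array is an assumption about the macro's actual signature):

    #include <stddef.h>
    #include <smpi/smpi.h>   /* assumed header exposing the SMPI macros */

    void example(void)
    {
        /* Bytes [27, 42) and [100, 200) are folded and shared between all
         * ranks (hence corrupted); the other bytes stay private per rank. */
        size_t shared_offsets[] = {27, 42, 100, 200};
        char *mem = SMPI_PARTIAL_SHARED_MALLOC(500, shared_offsets, 2);
        (void)mem;
    }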


Designating memory explicitly as private, shared or partially shared helps with both memory management and overall performance. As SMPI is internally aware of the memory's visibility, it can avoid calling memcpy when large messages containing shared segments are sent from one MPI rank to another. For fully private or partially shared segments, SMPI identifies and copies only those parts that are process-dependent (private) into the corresponding buffers on the receiver side.

HPL simulation times were considerably improved in our experiments because the panel, as the most frequently transferred data structure, is partially shared with only a small part being private. The additional error introduced by this technique was negligible (below \SI{1}{\percent}) while the memory consumption was lowered significantly: for a matrix of order \(40,000\) and \(64\) MPI processes, the memory consumption decreased from about \SI{13.5}{\giga\byte} to less than \SI{40}{\mega\byte}.

8.4 Panel reuse

HPL \texttt{malloc}s/\texttt{free}s panels in each iteration, with the size of the panel strictly decreasing from iteration to iteration. As we explained above, the partial sharing of panels requires many calls to mmap and introduces an overhead that makes these repeated allocations/frees become a bottleneck. Since the very first allocation can fit all subsequent panels, we modified HPL to allocate only the first panel and to reuse it for subsequent iterations (see Figure\ref{fig:panel_reuse}).
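A minimal sketch of this reuse strategy (illustrative names, not HPL's actual code):

    #include <assert.h>
    #include <stdlib.h>

    /* Allocate the panel once, at its maximal (first-iteration) size, and
     * hand the same buffer back on every subsequent, smaller request. */
    static void  *panel_buf = NULL;
    static size_t panel_cap = 0;

    void *panel_alloc(size_t size)
    {
        if (panel_buf == NULL) {       /* first iteration: largest panel */
            panel_buf = malloc(size);
            panel_cap = size;
        }
        assert(size <= panel_cap);     /* panel sizes strictly decrease */
        return panel_buf;
    }

    void panel_free(void *p)
    {
        (void)p;                       /* intentionally a no-op until the end */
    }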


We consider this optimization harmless with respect to simulation accuracy, as the maximum additional error that we observed was always less than \SI{1}{\percent}. Simulation time is reduced significantly, albeit the speed-up reached is less impressive than for the previous optimizations: for a very small matrix of order \(40,000\) and \(64\) MPI processes, the simulation time decreases by four seconds, from \SI{20.5}{\sec} to \SI{16.5}{\sec}. Responsible for this is a reduction of system time, namely from \SI{5.9}{\sec} to \SI{1.7}{\sec}. The number of page faults decreased from \(2\) million to \(0.2\) million, confirming the devastating effect these allocations/deallocations would have at scale.

8.5 MPI process representation (mmap vs. dlopen)

We already explained in Section\ref{sec:appmodeling} that SMPI supports two mechanisms to keep local static and global variables private to each rank, even though the ranks run in the same process. In this section, we discuss the impact of this choice.
  • mmap: When mmap is used, SMPI copies the data segment on startup for each rank into the heap. When control is transferred from one rank to another, the data segment is mmap'ed to the location of the other rank's copy on the heap. All ranks hence have the same addresses in the virtual address space at their disposition, although mmap ensures they point to different physical addresses. This also inevitably means that caches must be flushed to ensure that no data of one rank leaks into the other rank, making mmap a rather expensive operation.
  • dlopen: With dlopen, copies of the global variables are still made, but they are stored inside the data segment as opposed to the heap. When switching from one rank to another, the starting virtual address for the storage is readjusted rather than the target of the addresses. This means that each rank has distinct addresses for global variables. The main advantage of this approach is that caches do not need to be flushed as is the case for the mmap approach, because data consistency can always be guaranteed.

\noindent Impact of the choice of mmap vs. dlopen: the choice influences the simulation time indirectly through its impact on system/user time and page faults. For example, for a matrix of order \(80,000\) and \(32\) MPI processes, the number of minor page faults drops from \num{4412047} (with mmap) to \num{6880} (with dlopen). This results in a reduction of system time from \SI{10.64}{\sec} (out of \SI{51.47}{\sec} in total) to \SI{2.12}{\sec}. Obviously, the larger the matrix and the number of processes, the larger the number of context switches during the simulation, and thus the higher the gain.

8.6 Huge pages

-
-

For larger matrix orders (\ie \(N\) larger than a few hundred thousand), the performance of the simulation quickly deteriorates as the memory consumption rises rapidly.

We explained already how we fold the memory in order to reduce the physical memory usage. The virtual memory, on the other hand, is still allocated for every process since the allocation calls are still executed. Without a reduction of allocated virtual addresses, the page table rapidly becomes too large to fit in a single node. More precisely, on a system where double-precision floating-point numbers and addresses take 8 bytes, a matrix of order \(N\) occupies \(8\cdot{}N^2\) bytes of virtual memory, so the size of the page table containing pages of size \SI{4}{\kibi\byte} can be computed as: \[PT_{size}(N) = \frac{8\cdot{}N^2}{\SI{4}{\kibi\byte}}\times\SI{8}{\byte} = \frac{N^2}{64}.\] This means that the addresses in the page table for a matrix of order \(N=4,000,000\) consume \(PT_{size}(4,000,000) = \num{2.5e11}\) bytes, \ie \SI{250}{\giga\byte}. Thankfully, the x86-64 architecture supports several page sizes, known as ``huge pages'' in Linux. Typically, these pages are around \SI{2}{\mebi\byte} (instead of \SI{4}{\kibi\byte}), although other sizes (\SIrange{2}{256}{\mebi\byte}) are possible as well. Changing the page size requires administrator (root) privileges, as the Linux kernel support for huge pages needs to be activated and a hugetlbfs file system must be mounted. After at least one huge page has been allocated, the path of the mounted file system can then be passed on to SimGrid. Setting the page size to \SI{2}{\mebi\byte} drastically reduces the page table size: for a matrix of order \(N=4,000,000\), it shrinks from \SI{250}{\giga\byte} to \SI{0.488}{\giga\byte}.


9 Scalability Evaluation

In Section\ref{sec:em} we explained the problems we encountered when trying to run a large-scale simulation on a single node and how we solved them. For the most part, we identified and eliminated bottlenecks one after another while simultaneously making sure that the accuracy of our performance prediction was not impacted. Certainly, the main goal was to reduce the complexity from \(\O(N^3) + \O(N^2\cdot{}P\cdot{}Q)\) to something more reasonable. The \(\O(N^3)\) part was removed by skipping most computations. Ideally, since there are \(N/NB\) iterations (steps), the complexity of simulating one step should be decreased to something independent of \(N\). SimGrid's fluid models, used to simulate communications, do not depend on \(N\). Therefore, the time to simulate a step of HPL should mostly depend on \(P\) and \(Q\). Yet, some memory operations on the panel that are related to pivoting are intertwined in HPL with collective communications, meaning that it is impossible to completely get rid of the \(\O(N)\) complexity without modifying HPL more profoundly.

Although our goal was to model and simulate HPL on the Stampede platform, we decided to conduct a first evaluation on a similar, albeit non-existing, platform comprising 4,096 8-core nodes interconnected through a \(\langle2;16,32;1,16;1,1\rangle\) fat-tree topology built on ideal network links with a bandwidth of \SI{50}{\giga\byte\per\sec} and a latency of \SI{5}{\micro\sec}. We ran simulations with \(512\); \(1,024\); \(2,048\) or \(4,096\) MPI processes and with matrices of orders \num{5e5}, \num{1e6}, \num{2e6} or \num{4e6}. The impact of the matrix order on total makespan and memory is illustrated in Figure\ref{fig:hpl_scalability}. With all previously described optimizations enabled, the simulation with the largest matrix took close to \(47\) hours and consumed \SI{16}{\giga\byte} of memory, whereas the smallest one took \(20\) minutes and \SI{282}{\mega\byte} of memory. One can also see that, when the matrix order \(N\) is increased, memory consumption and simulation time both grow slightly more than quadratically, as the number of matrix elements is \(N^{2}\) and the number of steps of the algorithm also grows linearly with \(N\).

Moreover, all the simulations spend less than \SI{10}{\percent} of their execution time in kernel mode, which means the number of system calls is reasonably low.

10 Modeling Stampede and Simulating HPL

10.1 Modeling Stampede

10.1.1 Computations

Each node of the Stampede cluster comprises two 8-core Intel Xeon E5-2680 8C \SI{2.7}{\GHz} CPUs and one 61-core Intel Xeon Phi SE10P (KNC) \SI{1.1}{\GHz} accelerator that is roughly three times more powerful than the two CPUs and can be used in two ways: either as a classical accelerator, \ie for offloading expensive computations from the CPU, or by compiling binaries specifically for the Xeon Phi and executing them directly on it. While the accelerator's \SI{8}{\gibi\byte} of RAM are rather small, the main advantage of the second approach is that data does not need to be transferred back and forth between the node's CPUs and the accelerator via the x16 PCIe bus.

The HPL output submitted to the TOP500 (Figure\ref{fig:hpl_output}) does not indicate how the KNC was used. However, because of the values assigned to \(P\) and \(Q\), we are certain that only a single MPI process per node was run. For this reason, it is likely that the KNC was used as an accelerator. With Intel's Math Kernel Library (MKL), this is effortless, as the MKL comes with support for automatic offloading for selected BLAS functions. Unfortunately, we do not know which MKL version was used in 2013 and therefore decided to use the default version available on Stampede in the beginning of 2017, \ie version 11.1.1. The MKL documentation states that, depending on the matrix geometry, the computation will run on either all the cores of the CPU or exclusively on the KNC. In the case of DGEMM, the computation of \(C=\alpha\cdot{}A\times{}B+\beta\cdot{}C\), with \(A, B, C\) of dimensions \(M\times{}K\), \(K\times{}N\) and \(M\times{}N\), respectively, is offloaded onto the KNC whenever \(M\) and \(N\) are both larger than \(1280\) while \(K\) is simultaneously larger than \(256\). Similarly, offloading for DTRSM is used when both \(M\) and \(N\) are larger than \(512\), which results in a better throughput but incurs a higher latency. The complexity of DGEMM is always of the order of \(M\cdot{}N\cdot{}K\) (\(M\cdot{}N^2\) for DTRSM), but the model that describes the time it takes to run DGEMM (DTRSM) is very different for small and large matrices. The table in Figure\ref{fig:macro_real} indicates the parameters of the linear regression for the four scenarios (DGEMM or DTRSM, on CPU or Phi). The measured performance was close to the peak performance: \eg DGEMM on the Phi reached \(2/\num{1.981e-12} = \SI{1.009}{\tera\flops}\). Since the granularity used in HPL (see Figure\ref{fig:hpl_output}) is 1024, all calls (except maybe for the very last iterations) are offloaded to the KNC. In any case, this behavior can easily be accounted for by replacing the macro in Figure\ref{fig:macro_simple} by the one in Figure\ref{fig:macro_real}.
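To make the refined kernel model concrete, here is the shape of the prediction function we mean (only the \num{1.981e-12} Phi DGEMM coefficient above comes from our calibration; the other constants below are placeholders standing in for the regression parameters of Figure\ref{fig:macro_real}):

    /* Predicted DGEMM duration (seconds): two linear models, selected by
     * the same geometry thresholds the MKL uses for automatic offloading.
     * PHI_LATENCY and CPU_COEF are placeholders, not calibrated values. */
    #define PHI_LATENCY 1e-3        /* placeholder offloading latency */
    #define CPU_COEF    6e-12       /* placeholder CPU coefficient */

    double dgemm_duration(double M, double N, double K)
    {
        if (M > 1280 && N > 1280 && K > 256)     /* offloaded to the KNC */
            return PHI_LATENCY + 1.981e-12 * M * N * K;
        else                                     /* stays on the two CPUs */
            return CPU_COEF * M * N * K;
    }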


10.1.2 Communications

We unfortunately do not know for sure which version of Intel MPI was used in 2013, so we decided to use the default one available on Stampede in May 2017, \ie version 3.1.4. As explained in Section\ref{sec:smpi}, SMPI's communication model is a hybrid model between the LogP family and a fluid model. For each message, the send mode (\eg fully asynchronous, detached or eager) is determined solely by the message size. It is hence possible to model the resulting performance of communication operations through a piece-wise linear model, as depicted in Figure\ref{fig:stampede_calibration}. For a thorough discussion of the calibration techniques used to obtain this model, see\cite{smpi}. As illustrated, the results for MPI_Send are quite stable and piece-wise regular, but the behavior of MPI_Recv is surprising: for small messages with a size of less than \SI{17420}{\byte} (represented by purple, blue and red dots), one can observe two modes, namely ``slow'' and ``fast'' communications. ``Slow'' operations take twice as long and are much more common than the ``fast'' ones. We observed this behavior in several experiments, even though both MPI processes that were used in the calibration were connected through the same local switch. When observed, this ``perturbation'' was present throughout the execution of that calibration. Having taken into consideration that small messages are scarce in HPL, we eventually decided to ignore this phenomenon and opted to use the more favorable scenario (fast communications) for small messages. We believe that the impact of our choice on the simulation accuracy is minimal, as primarily large, bulk messages are sent, which make use of the rendez-vous mode (depicted in dark green).

Furthermore, we configured SMPI to use Stampede's network topology, \ie Mellanox FDR InfiniBand technology with \SI{56}{\giga\bit\per\second}, set up in a fat-tree topology (see Figure\ref{fig:fat_tree_topology}). We assumed the routing was done through D-mod-K\cite{dmodk}, as it is commonly used on this topology.

10.1.3 Summary of modeling uncertainties

For the compiler, Intel MPI and the MKL, we were unable to determine which versions were used in 2013, but decided to go for rather optimistic choices. The models for the MKL and for Intel MPI are close to the peak performance. It is plausible that the compiler managed to optimize computations in HPL. While it is true that most of these computations are executed in our simulations, they are not accounted for. This allows us to obtain fully deterministic simulations without harming the outcome of the simulation, as these parts only represent a tiny fraction of the total execution time of HPL. A few HPL compilation flags (\eg HPL_NO_MPI_DATATYPE and HPL_COPY_L, which control whether MPI datatypes should be used and how, respectively) could not be deduced from HPL's original output on Stampede, but we believe their impact to be minimal. Finally, the HPL output reports the use of HPL v2.1, but the main difference between v2.1 and v2.2 is the option to continuously report factorization progress. We hence decided to apply our modifications to the later version of HPL.

With all these modifications in place, we expected the prediction of our simulations to be optimistic, yet close to the results obtained by a real-life execution.

10.2 Simulating HPL

10.2.1 Performance Prediction

Figure\ref{fig:stampede_prediction} compares two simulation scenarios with the original result from 2013. The solid red line represents the HPL performance prediction as obtained with SMPI with the Stampede model that we described in the previous section. Although we expected SMPI to be optimistic, the prediction was surprisingly much lower than the TOP500 result. We verified that no part of HPL was left unmodeled and decided to investigate whether a flaw in our network model resulting in too much congestion could explain the performance. Alas, even a congestion-free network model (represented by the dashed blue line in Figure\ref{fig:stampede_prediction}) only results in minor improvements. In our experiments to model DGEMM and DTRSM, either the CPU or the KNC seemed to be used at any one time, and a specifically optimized version of the MKL may have been used in 2013. However, removing the offloading latency and modeling each node as a single \SI{1.2}{\tera\flops} node does not sufficiently explain the divide between our results and reality.

10.2.2 Performance Gap Investigation

-
-

-In this section, we describe our investigation and give possible reasons for the aforementioned mismatch (apparent in Figure \ref{fig:stampede_prediction}). With SMPI, it is simple to trace the first iterations of HPL to get an idea of what could be improved: the trace for the first five iterations can be obtained in about 609 seconds on a commodity computer and is about \SI{175}{\mega\byte} large once compressed. Figure \ref{fig:hpl_gantt} illustrates the very synchronous and iterative nature of the first iterations: one can identify first a factorization of the panel, then a broadcast to all the nodes, and finally an update of the trailing matrix. More than one fifth of each iteration is spent communicating (although the first iterations are the ones with the lowest communication-to-computation ratio), which prevents HPL from reaching the TOP500 performance. Overlapping these heavy communication phases with computation would improve performance significantly. The fact that this overlap barely happens can be explained by the look-ahead DEPTH parameter, which was supposedly set to 0 (see Figure \ref{fig:hpl_output}). This is quite surprising, as even the tuning section of the HPL documentation indicates that a depth of 1 is supposed to yield the best results, even though a large problem size may be needed to see some performance gain. We discussed this surprising behavior with the Stampede team and were informed that the run in 2013 was executed with an HPL binary provided by Intel and probably specifically modified for Stampede. We believe that some configuration values were hardcoded to enforce an overlap between consecutive iterations. Indeed, the shortened part (marked ``[…]'') in Figure \ref{fig:hpl_output} provides information about the progress of HPL throughout the iterations and, for the panel-owning process, statistics about the time spent in the most important parts. According to these statistics, the total time spent in the Update section was \SI{9390}{\sec} whereas the total execution time was \SI{7505}{\sec}, which is impossible unless iterations overlapped.

- -

-The broadcast and swapping algorithms use very heavy communication patterns. This is not at all surprising since, for a matrix of this order, several hundred megabytes need to be broadcast. Although the output states that the blongM algorithm was used, another algorithm may in fact have been employed. We tried the five other broadcast algorithms HPL comes with but did not achieve significantly better overall performance. An analysis of the symbols in the Intel binary revealed that an additional broadcast algorithm named HPL_bcast_bpush was available. Unlike the others, this algorithm relies on non-blocking sends, which could contribute to the performance obtained in 2013. Likewise, the swapping algorithm that was used (SWAP=Binary-exchange) involves rather long communications organized in trees, which is surprising as the spread-roll algorithm is recommended for large matrices.
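-For reference, a ``push''-style broadcast built on non-blocking sends could be organized as in the following sketch. This is a deliberately simplified, hypothetical illustration; the actual HPL_bcast_bpush implementation is closed source and unavailable to us.
-
-#include <mpi.h>
-
-/* Hypothetical sketch of a ring-style ``push'' broadcast built on
-   non-blocking sends: each process forwards the panel to its
-   successor with MPI_Isend and polls for completion, leaving room to
-   progress the trailing-matrix update in the meantime.  This is NOT
-   Intel's HPL_bcast_bpush, whose source is unavailable to us. */
-static void bcast_push_sketch(void *buf, int count, MPI_Datatype type,
-                              int root, MPI_Comm comm)
-{
-  int rank, size, done = 0;
-  MPI_Request req;
-  MPI_Comm_rank(comm, &rank);
-  MPI_Comm_size(comm, &size);
-  if (rank != root)  /* wait for the panel from the predecessor */
-    MPI_Recv(buf, count, type, (rank - 1 + size) % size, 0, comm,
-             MPI_STATUS_IGNORE);
-  if ((rank + 1) % size != root) {  /* push it to the successor */
-    MPI_Isend(buf, count, type, (rank + 1) % size, 0, comm, &req);
-    while (!done) {
-      MPI_Test(&req, &done, MPI_STATUS_IGNORE);
-      /* ...overlap: update part of the trailing matrix here... */
-    }
-  }
-}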

- -

-We do not aim to reverse engineer the Intel HPL code. We can, however, already draw two conclusions from our simple analysis: 1) many optimizations were clearly made on the communication side, and 2) the reported parameters are very likely not the ones used in the real execution, probably because these values were hardcoded and the configuration output file was not updated accordingly.

-
-
-
-
- -
-

11 Conclusions

-
-

-Studying HPC applications at scale can be very time- and resource-consuming. Simulation is often an effective approach in this context, and SMPI has previously been successfully validated in several small-scale studies with standard HPC applications \cite{smpi,heinrich:hal-01523608}. In this article, we proposed and evaluated extensions to the SimGrid/SMPI framework that allowed us to emulate HPL at the scale of a supercomputer. Our application of choice, HPL, is particularly challenging in terms of simulation, as it implements its own set of non-blocking collective operations that rely on MPI_Iprobe to overlap communication with computation.
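-The following sketch illustrates the MPI_Iprobe pattern on which such hand-written collectives rest (a simplified illustration, not HPL's actual code): the call polls for a pending message and returns immediately when none has arrived, so the caller can keep computing and probe again later.
-
-#include <mpi.h>
-
-/* Simplified illustration of HPL-style polling: consume a pending
-   message if one has arrived, otherwise return immediately so the
-   caller can continue computing and probe again later. */
-static int try_recv(void *buf, int count, MPI_Datatype type,
-                    int src, int tag, MPI_Comm comm)
-{
-  int arrived = 0;
-  MPI_Iprobe(src, tag, comm, &arrived, MPI_STATUS_IGNORE);
-  if (arrived)
-    MPI_Recv(buf, count, type, src, tag, comm, MPI_STATUS_IGNORE);
-  return arrived;  /* 1 if a message was consumed, 0 otherwise */
-}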

- -

-More specifically, we tried to reproduce the execution of HPL on the Stampede supercomputer conducted in 2013 for the TOP500, which involved a \SI{120}{\tera\byte} matrix and took two hours on 6,006 nodes. Our emulation of a similar configuration ran on a single machine for about 62 hours and required less than \SI{19}{\giga\byte} of RAM. This emulation employed several non-trivial operating-system-level optimizations (memory mapping, dynamic library loading, huge pages) that have since been integrated into the latest version of SimGrid/SMPI.
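-As an illustration of the memory-mapping idea, the following hedged sketch shows one way of ``folding'' memory: a single small physical segment backs a huge virtual range, so that a matrix far larger than the available RAM can be allocated as long as its actual content does not matter. This is illustrative only; SimGrid/SMPI's actual implementation differs.
-
-#include <fcntl.h>
-#include <stddef.h>
-#include <sys/mman.h>
-#include <unistd.h>
-
-/* Hedged sketch of memory folding: reserve a huge virtual range and
-   map the same `chunk'-byte file segment over all of it.  Error
-   handling is omitted; `total' must be a multiple of `chunk', and
-   `chunk' a multiple of the page size.  Illustrative only. */
-static void *fold_alloc(size_t total, size_t chunk)
-{
-  size_t off;
-  int fd = open("/tmp/fold", O_RDWR | O_CREAT, 0600);
-  ftruncate(fd, chunk);                      /* one physical chunk */
-  char *base = mmap(NULL, total, PROT_NONE,  /* reserve the range  */
-                    MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
-  for (off = 0; off < total; off += chunk)   /* fold the chunk over it */
-    mmap(base + off, chunk, PROT_READ | PROT_WRITE,
-         MAP_SHARED | MAP_FIXED, fd, 0);
-  close(fd);
-  return base;
-}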

- -

-The downside of scaling this high is a less well-controlled scenario. The reference run of HPL on Stampede was done several years ago, we only have very limited information about the setup (\eg software versions and configuration), and a reservation and re-execution on the whole machine was impossible for us. We nevertheless modeled Stampede carefully, which allowed us to predict the performance that would have been obtained with an unmodified, freely available version of HPL. Unfortunately, despite all our efforts, the predicted performance was much lower than what was reported in 2013. We determined that this discrepancy comes from the fact that a modified, closed-source version of HPL supplied by Intel was used in 2013. We believe that some of the HPL configuration parameters were hardcoded and therefore misreported in the output. A quick analysis of the optimized HPL binary confirmed that algorithmic differences are likely the reason for the performance differences.

- -

-We conclude that a large-scale (in)validation is unfortunately not possible, as the modified source code is unavailable to us. We claim, however, that the modifications we made are minor and applicable to that optimized version: while HPL comprises 16K lines of ANSI C spread over 149 files, our modifications only touch 14 files, with 286 line insertions and 18 deletions.

- -

-We believe that the capability to precisely predict an application's performance on a given platform will become invaluable in the future, helping compute centers decide whether a new machine (and which technology) will work best for a given application, or whether an upgrade of the current machine should be considered. As future work, we intend to conduct similar studies with other HPC benchmarks (\eg HPCG or HPGMG) and with other TOP500 machines. From our experience, we believe that a faithful and public reporting of the experimental conditions (compiler options, library versions, HPL output, etc.) is invaluable, as it allows researchers to better understand how these platforms actually behave.

-
-
- -
-

12 Acknowledgements

-
-

-Experiments presented in this paper were carried out using the Grid'5000 testbed, supported by a scientific interest group hosted by Inria and including CNRS, RENATER and several universities as well as other organizations (see https://www.grid5000.fr). We warmly thank our TACC colleagues for their support in this study and for providing us with as much information as they could.

-
-
-

12.1 References

-
-
-
-
-
-
-


-
- - diff --git a/module2/ressources/video_examples/technical_report.html b/module2/ressources/video_examples/technical_report.html deleted file mode 100644 index b4ff531eb77b950d9e8adf6556a4670a47da60dc..0000000000000000000000000000000000000000 --- a/module2/ressources/video_examples/technical_report.html +++ /dev/null @@ -1,814 +0,0 @@ - - - - - - -A reproducible comparison between GNU MPFR and machine double-precision - - - - - - - - - - - - - -
-

A reproducible comparison between GNU MPFR and machine double-precision

-
-


- -
-

-Several authors claim that GNU MPFR [1] is \(x\) times slower than double-precision floating-point numbers, for various values of \(x\), without any way for the reader to reproduce their claim. For example, in [2], Joris van der Hoeven writes “the MPFR library for arbitrary precision and IEEE-style standardized floating-point arithmetic is typically about a factor 100 slower than double precision machine arithmetic”. Such a claim typically (i) does not say which version of MPFR was used (nor which version of GMP: since MPFR is based on GMP, its efficiency also depends on GMP); (ii) does not detail the environment used (processor, compiler, operating system); (iii) does not explain which application was used for the comparison. It therefore cannot be reproduced by the reader, who can thus have no confidence in the claimed factor of 100. In this short note we provide reproducible figures that can be checked by the reader.

-
-

1 Reproducible Experimental Setup

-
-

-We use the programs in the appendix to multiply two \(1000 × 1000\) matrices. The matrix \(A\) has coefficients \(1/(i + j + 1)\) for \(0 ≤ i, j < 1000\), and the matrix \(B\) has coefficients \(1/(ij + 1)\). Both programs print the time for the matrix product (not counting the time to initialize the matrices) and the sum of the coefficients of the product matrix (used as a simple checksum between both programs).

- -

-We used MPFR version 3.1.5, configured with GMP 6.1.2 (both are the latest releases as of the date of this document).

- -

-As test machine we used gcc12.fsffrance.org, a machine from the GCC Compile Farm, a set of machines available to developers of free software. The compiler used was GCC 4.5.1, which is installed in /opt/cfarm/release/4.5.1 on this machine, with optimization level -O3. Both GMP and MPFR were also compiled with this compiler, and the GMP and MPFR libraries were linked statically with the application programs (given in the appendix).

-
-
- -
-

2 Experimental Results From Arnaud Legrand

-
-
-
-

2.1 Code

-
-

-The program (a.c) using the C double-precision type is the following. It takes the matrix dimension as a command-line argument.

-
-
#include <stdio.h>
-#include <stdlib.h>
-#include <sys/types.h>
-#include <sys/resource.h>
-
-/* Return the CPU time consumed by this process, in milliseconds. */
-static int cputime()
-{
-  struct rusage rus;
-  getrusage(0, &rus);
-  return rus.ru_utime.tv_sec * 1000 + rus.ru_utime.tv_usec / 1000;
-}
-
-int main(int argc, char *argv[])
-{
-  double **a;
-  double **b;
-  double **c;
-  double t = 0.0;
-  int i, j, k, st;
-  int N = atoi(argv[1]);  /* matrix dimension, from the command line */
-  /* Allocate and initialize a[i][j] = 1/(i+j+1), b[i][j] = 1/(ij+1). */
-  a = malloc(N * sizeof(double *));
-  b = malloc(N * sizeof(double *));
-  c = malloc(N * sizeof(double *));
-  for (i = 0; i < N; i++) {
-    a[i] = malloc(N * sizeof(double));
-    b[i] = malloc(N * sizeof(double));
-    c[i] = malloc(N * sizeof(double));
-    for (j = 0; j < N; j++) {
-      a[i][j] = 1.0 / (1.0 + i + j);
-      b[i][j] = 1.0 / (1.0 + i * j);
-    }
-  }
-  /* Time only the matrix product, not the initialization above. */
-  st = cputime();
-  for (i = 0; i < N; i++)
-    for (j = 0; j < N; j++)
-      c[i][j] = 0.0;
-  for (i = 0; i < N; i++)
-    for (k = 0; k < N; k++)
-      for (j = 0; j < N; j++)
-        c[i][j] += a[i][k] * b[k][j];
-  /* Sum all coefficients of c as a simple checksum. */
-  for (i = 0; i < N; i++)
-    for (j = 0; j < N; j++)
-      t += c[i][j];
-  printf("matrix product took %dms\n", cputime() - st);
-  printf("t=%f\n", t);
-  for (i = 0; i < N; i++) {
-    free(a[i]);
-    free(b[i]);
-    free(c[i]);
-  }
-  free(a);
-  free(b);
-  free(c);
-  return 0;
-}
-
-
- -

-The program (d.c) using GNU MPFR is the following. It takes the matrix dimension and the MPFR precision (in bits) as command-line arguments.

- -
-
#include <stdio.h>
-#include <stdlib.h>
-#include <sys/types.h>
-#include <sys/resource.h>
-#include <mpfr.h>
-
-/* Return the CPU time consumed by this process, in milliseconds. */
-static int cputime()
-{
-  struct rusage rus;
-  getrusage(0, &rus);
-  return rus.ru_utime.tv_sec * 1000 + rus.ru_utime.tv_usec / 1000;
-}
-
-int main(int argc, char *argv[])
-{
-  mpfr_t **a;
-  mpfr_t **b;
-  mpfr_t **c;
-  mpfr_t s;                 /* temporary for one product a[i][k]*b[k][j] */
-  double t = 0.0;
-  int i, j, k, st;
-  int N = atoi(argv[1]);    /* matrix dimension */
-  int prec = atoi(argv[2]); /* MPFR precision, in bits */
-  printf("MPFR library: %-12s\nMPFR header: %s (based on %d.%d.%d)\n",
-         mpfr_get_version(), MPFR_VERSION_STRING, MPFR_VERSION_MAJOR,
-         MPFR_VERSION_MINOR, MPFR_VERSION_PATCHLEVEL);
-  /* Allocate and initialize the same matrices as in a.c, with prec-bit
-     MPFR coefficients, rounding to nearest. */
-  a = malloc(N * sizeof(mpfr_t *));
-  b = malloc(N * sizeof(mpfr_t *));
-  c = malloc(N * sizeof(mpfr_t *));
-  mpfr_init2(s, prec);
-  for (i = 0; i < N; i++) {
-    a[i] = malloc(N * sizeof(mpfr_t));
-    b[i] = malloc(N * sizeof(mpfr_t));
-    c[i] = malloc(N * sizeof(mpfr_t));
-    for (j = 0; j < N; j++) {
-      mpfr_init2(a[i][j], prec);
-      mpfr_init2(b[i][j], prec);
-      mpfr_init2(c[i][j], prec);
-      mpfr_set_ui(a[i][j], 1, MPFR_RNDN);
-      mpfr_div_ui(a[i][j], a[i][j], i + j + 1, MPFR_RNDN);
-      mpfr_set_ui(b[i][j], 1, MPFR_RNDN);
-      mpfr_div_ui(b[i][j], b[i][j], i * j + 1, MPFR_RNDN);
-    }
-  }
-  /* Time only the matrix product, as in a.c. */
-  st = cputime();
-  for (i = 0; i < N; i++)
-    for (j = 0; j < N; j++)
-      mpfr_set_ui(c[i][j], 0, MPFR_RNDN);
-  for (i = 0; i < N; i++)
-    for (k = 0; k < N; k++)
-      for (j = 0; j < N; j++) {
-        mpfr_mul(s, a[i][k], b[k][j], MPFR_RNDN);
-        mpfr_add(c[i][j], c[i][j], s, MPFR_RNDN);
-      }
-  /* Checksum: convert each coefficient back to double and sum. */
-  for (i = 0; i < N; i++)
-    for (j = 0; j < N; j++)
-      t += mpfr_get_d(c[i][j], MPFR_RNDN);
-  printf("matrix product took %dms\n", cputime() - st);
-  printf("t=%f\n", t);
-  /* Clear all MPFR variables and free the arrays. */
-  for (i = 0; i < N; i++) {
-    for (j = 0; j < N; j++) {
-      mpfr_clear(a[i][j]);
-      mpfr_clear(b[i][j]);
-      mpfr_clear(c[i][j]);
-    }
-    free(a[i]);
-    free(b[i]);
-    free(c[i]);
-  }
-  mpfr_clear(s);
-  free(a);
-  free(b);
-  free(c);
-  return 0;
-}
-
-
-
-
- -
-

2.2 Setup

-
-
    -
  • -Name of the machine and OS version: -

    -
    -Linux sama 4.2.0-1-amd64 #1 SMP Debian 4.2.6-1 (2015-11-10) x86_64 GNU/Linux
    -
  • - -
  • -CPU/architecture information: -

    -
    -
    cat /proc/cpuinfo
    -
    -
    - -
    -processor	: 0
    -vendor_id	: GenuineIntel
    -cpu family	: 6
    -model		: 58
    -model name	: Intel(R) Core(TM) i7-3687U CPU @ 2.10GHz
    -stepping	: 9
    -microcode	: 0x15
    -cpu MHz		: 2165.617
    -cache size	: 4096 KB
    -physical id	: 0
    -siblings	: 4
    -core id		: 0
    -cpu cores	: 2
    -apicid		: 0
    -initial apicid	: 0
    -fpu		: yes
    -fpu_exception	: yes
    -cpuid level	: 13
    -wp		: yes
    -flags		: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx rdtscp lm constant_tsc arch_perfmon pebs bts rep_good nopl xtopology nonstop_tsc aperfmperf eagerfpu pni pclmulqdq dtes64 monitor ds_cpl vmx smx est tm2 ssse3 cx16 xtpr pdcm pcid sse4_1 sse4_2 x2apic popcnt tsc_deadline_timer aes xsave avx f16c rdrand lahf_lm ida arat epb pln pts dtherm tpr_shadow vnmi flexpriority ept vpid fsgsbase smep erms xsaveopt
    -bugs		:
    -bogomips	: 5182.68
    -clflush size	: 64
    -cache_alignment	: 64
    -address sizes	: 36 bits physical, 48 bits virtual
    -power management:
    -
    -processor	: 1
    -vendor_id	: GenuineIntel
    -cpu family	: 6
    -model		: 58
    -model name	: Intel(R) Core(TM) i7-3687U CPU @ 2.10GHz
    -stepping	: 9
    -microcode	: 0x15
    -cpu MHz		: 3140.515
    -cache size	: 4096 KB
    -physical id	: 0
    -siblings	: 4
    -core id		: 1
    -cpu cores	: 2
    -apicid		: 2
    -initial apicid	: 2
    -fpu		: yes
    -fpu_exception	: yes
    -cpuid level	: 13
    -wp		: yes
    -flags		: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx rdtscp lm constant_tsc arch_perfmon pebs bts rep_good nopl xtopology nonstop_tsc aperfmperf eagerfpu pni pclmulqdq dtes64 monitor ds_cpl vmx smx est tm2 ssse3 cx16 xtpr pdcm pcid sse4_1 sse4_2 x2apic popcnt tsc_deadline_timer aes xsave avx f16c rdrand lahf_lm ida arat epb pln pts dtherm tpr_shadow vnmi flexpriority ept vpid fsgsbase smep erms xsaveopt
    -bugs		:
    -bogomips	: 5182.68
    -clflush size	: 64
    -cache_alignment	: 64
    -address sizes	: 36 bits physical, 48 bits virtual
    -power management:
    -
    -processor	: 2
    -vendor_id	: GenuineIntel
    -cpu family	: 6
    -model		: 58
    -model name	: Intel(R) Core(TM) i7-3687U CPU @ 2.10GHz
    -stepping	: 9
    -microcode	: 0x15
    -cpu MHz		: 2860.000
    -cache size	: 4096 KB
    -physical id	: 0
    -siblings	: 4
    -core id		: 0
    -cpu cores	: 2
    -apicid		: 1
    -initial apicid	: 1
    -fpu		: yes
    -fpu_exception	: yes
    -cpuid level	: 13
    -wp		: yes
    -flags		: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx rdtscp lm constant_tsc arch_perfmon pebs bts rep_good nopl xtopology nonstop_tsc aperfmperf eagerfpu pni pclmulqdq dtes64 monitor ds_cpl vmx smx est tm2 ssse3 cx16 xtpr pdcm pcid sse4_1 sse4_2 x2apic popcnt tsc_deadline_timer aes xsave avx f16c rdrand lahf_lm ida arat epb pln pts dtherm tpr_shadow vnmi flexpriority ept vpid fsgsbase smep erms xsaveopt
    -bugs		:
    -bogomips	: 5182.68
    -clflush size	: 64
    -cache_alignment	: 64
    -address sizes	: 36 bits physical, 48 bits virtual
    -power management:
    -
    -processor	: 3
    -vendor_id	: GenuineIntel
    -cpu family	: 6
    -model		: 58
    -model name	: Intel(R) Core(TM) i7-3687U CPU @ 2.10GHz
    -stepping	: 9
    -microcode	: 0x15
    -cpu MHz		: 2813.585
    -cache size	: 4096 KB
    -physical id	: 0
    -siblings	: 4
    -core id		: 1
    -cpu cores	: 2
    -apicid		: 3
    -initial apicid	: 3
    -fpu		: yes
    -fpu_exception	: yes
    -cpuid level	: 13
    -wp		: yes
    -flags		: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx rdtscp lm constant_tsc arch_perfmon pebs bts rep_good nopl xtopology nonstop_tsc aperfmperf eagerfpu pni pclmulqdq dtes64 monitor ds_cpl vmx smx est tm2 ssse3 cx16 xtpr pdcm pcid sse4_1 sse4_2 x2apic popcnt tsc_deadline_timer aes xsave avx f16c rdrand lahf_lm ida arat epb pln pts dtherm tpr_shadow vnmi flexpriority ept vpid fsgsbase smep erms xsaveopt
    -bugs		:
    -bogomips	: 5182.68
    -clflush size	: 64
    -cache_alignment	: 64
    -address sizes	: 36 bits physical, 48 bits virtual
    -power management:
    -
    -
  • - -
  • -Compiler version -

    -
    -
    gcc --version
    -
    -
    - -
    -gcc (Debian 5.3.1-6) 5.3.1 20160114
    -Copyright (C) 2015 Free Software Foundation, Inc.
    -This is free software; see the source for copying conditions.  There is NO
    -warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
    -
    -
  • - -
  • -libmpfr version: -

    -
    -
    apt-cache show libmpfr-dev  
    -
    -
    - -
    -Package: libmpfr-dev
    -Source: mpfr4
    -Version: 3.1.5-1
    -Installed-Size: 1029
    -Maintainer: Debian GCC Maintainers <debian-gcc@lists.debian.org>
    -Architecture: amd64
    -Replaces: libgmp3-dev (<< 4.1.4-3)
    -Depends: libgmp-dev, libmpfr4 (= 3.1.5-1)
    -Suggests: libmpfr-doc
    -Breaks: libgmp3-dev (<< 4.1.4-3)
    -Description-en: multiple precision floating-point computation developers tools
    - This development package provides the header files and the symbolic
    - links to allow compilation and linking of programs that use the libraries
    - provided in the libmpfr4 package.
    - .
    - MPFR provides a library for multiple-precision floating-point computation
    - with correct rounding.  The computation is both efficient and has a
    - well-defined semantics. It copies the good ideas from the
    - ANSI/IEEE-754 standard for double-precision floating-point arithmetic
    - (53-bit mantissa).
    -Description-md5: a2580b68a7c6f1fcadeefc6b17102b32
    -Multi-Arch: same
    -Homepage: http://www.mpfr.org/
    -Tag: devel::lang:c, devel::library, implemented-in::c, role::devel-lib,
    - suite::gnu
    -Section: libdevel
    -Priority: optional
    -Filename: pool/main/m/mpfr4/libmpfr-dev_3.1.5-1_amd64.deb
    -Size: 207200
    -MD5sum: e5c7872461f263e27312c9ef4f4218b9
    -SHA256: 279970e210c7db4e2550f5a3b7abb2674d01e9f0afd2a4857f1589a6947e0cbd
    -
    -
  • -
-
-
- -
-

2.3 A first measurement

-
-
-
cd /tmp/
-gcc -O3 a.c -o a
-./a 1000
-
-
- -
-matrix product took 680ms
-t=9062.368470
-
- - -
-
cd /tmp/
-gcc -O3 d.c -o d -lmpfr
-./d 1000 53
-
-
- -
-MPFR library: 3.1.5
-MPFR header: 3.1.5 (based on 3.1.5)
-matrix product took 74460ms
-t=9062.368470
-
- - -

-So, on my machine, the ratio is more like -

-
-
74460/844
-
-
- -
-[1] 88.22275
-
-
-
- -
-

2.4 A second measurement

-
-

-That said, if I re-execute these two programs: -

- -
-
cd /tmp/
-gcc -O3 a.c -o a
-./a 1000
-
-
- -
-matrix product took 676ms
-t=9062.368470
-
- - -
-
cd /tmp/
-gcc -O3 d.c -o d -lmpfr
-./d 1000 53
-
-
- -
-MPFR library: 3.1.5
-MPFR header: 3.1.5 (based on 3.1.5)
-matrix product took 68732ms
-t=9062.368470
-
- - -

-I obtain a rather different value, which this time would give me a ratio of -

-
-
68732/676
-
-
- -
-[1] 101.6746
-
- - -

-that is, "closer" to what is claimed in [2], but this is sheer luck: I could just as well have obtained 120! In short, this is not the same setup as yours, but statistically speaking, there must also be something to be done here, right? -

-
-
-
- -
-

3 References

-
-

-[1] Fousse, L., Hanrot, G., Lefèvre, V., Pélissier, P., and Zimmermann, P. MPFR: A multiple-precision binary floating-point library with correct rounding. ACM Trans. Math. Softw. 33, 2 (2007), article 13.

- -

-[2] van der Hoeven, J. Multiple precision floating-point arithmetic on SIMD processors. In Proceedings of Arith'24 (2017), IEEE, pp. 2–9.

- -

-Entered on [2017-09-01 Fri 17:12] -

-
-
-
-
-


-
- - diff --git a/module4/ressources/exo1.html b/module4/ressources/exo1.html deleted file mode 100644 index 2af1ef51ec842a638a796f2b275db50a42f4fd4c..0000000000000000000000000000000000000000 --- a/module4/ressources/exo1.html +++ /dev/null @@ -1,222 +0,0 @@ -
-

Exercice 1 : Ré-exécuter n'est pas répliquer…

-
-

-Même si la terminologie peut varier d'un auteur ou d'une communauté à l'autre, il est important de comprendre que l'on peut distinguer différents niveaux de "réplication" selon que l'on s'est contenté de vérifier que l'on pouvait ré-exécuter le code et obtenir exactement les mêmes résultats, ou bien que l'on est arrivé à reproduire des résultats similaires en suivant une approche similaire (éventuellement avec un autre langage, une autre méthode de calcul, etc.). À ce sujet, vous pourrez par exemple lire https://arxiv.org/abs/1708.08205.

- -

-Le diable se cache souvent dans des endroits auxquels on n'aurait jamais pensé et nous sommes nous-mêmes allés de surprise en surprise en préparant ce MOOC, notamment avec l'exercice du module 2 sur Challenger. C'est pourquoi nous vous proposons dans cet exercice de refaire une partie de l'analyse des données de Challenger, comme l'ont fait Siddhartha Dalal et ses co-auteurs il y a presque 30 ans dans leur article Risk Analysis of the Space Shuttle: Pre-Challenger Prediction of Failure, publié dans le Journal of the American Statistical Association (Vol. 84, No. 408, déc. 1989), mais dans un autre langage de votre choix (Python, R, Julia, SAS…).

- -

-Nous savons d'expérience que si les estimations de pente et d'intercept sont généralement les mêmes, on peut avoir des différences lorsque l'on regarde les estimateurs de variance et le R2 un peu plus dans les détails. Il peut également y avoir des surprises dans le graphique final selon les versions des bibliothèques utilisées.

- -

-L'ensemble des calculs à effectuer est décrit ici, avec les indications sur comment contribuer : https://app-learninglab.inria.fr/gitlab/moocrr-session1/moocrr-reproducibility-study/

- -

-Vous y trouverez notre réplication des calculs de Dalal et al. (en R), une mise en œuvre en Python et une en R (très similaires à ce que vous avez pu utiliser dans le module 2). Cet exercice peut donc se faire à deux niveaux :

-
    -
  1. un niveau facile pour ceux qui repartiront du code dans le langage qu'ils n'auront initialement pas utilisé et se contenteront de le ré-exécuter. Pour cela, nul besoin de maîtriser la régression logistique : il suffit de bien inspecter les sorties produites et de vérifier qu'elles correspondent bien aux valeurs attendues. Pour ceux qui ré-exécuteraient le notebook Python dans l'environnement Jupyter du MOOC, n'hésitez pas à consulter les ressources de la section 4A du module 2, qui expliquent comment y importer un notebook.
  2. un niveau plus difficile pour ceux qui souhaiteront le réécrire complètement (éventuellement dans un autre langage que R ou Python ; l'expérience peut être d'autant plus intéressante que nous n'avons pas testé ces variations). Là, si les fonctions de calcul d'une régression logistique ne sont pas présentes, il y a par contre intérêt à en savoir un minimum pour pouvoir les implémenter. L'exercice en est d'autant plus instructif.
- -

-Vous pourrez alors discuter sur le forum des succès et des échecs que vous aurez pu rencontrer. Pour cela :

- - -

-Ne vous inquiétez pas si ces consignes vous semblent peu claires sur l'instant : elles sont rappelées en haut du tableau et vous vous rendrez vite compte s'il vous manque quelque chose quand vous essaierez de remplir ce tableau.

- -

-Nous effectuerons une synthèse illustrant les principales divergences observées et nous vous l'enverrons à la fin du MOOC.

-
-
- -
-

Re-execution is not replication…

-
-

-Unfortunately, terminology varies a lot between authors and communities, but it is important to understand the distinction between different levels of "replication". You can be satisfied with re-running the code and getting exactly the same results, but you can also try to obtain similar results using a similar approach, changing for example the programming language, the computational method, etc. An article we recommend on this topic is https://arxiv.org/abs/1708.08205.

- -

-Often the devil is in details one would never have thought about, and we have had our share of surprises while preparing this MOOC, in particular with the exercise on the Challenger catastrophe from module 2. We therefore propose in this exercise that you re-do part of this analysis, following the example of Siddhartha Dalal and co-authors almost 30 years ago in their article Risk Analysis of the Space Shuttle: Pre-Challenger Prediction of Failure, published in the Journal of the American Statistical Association (Vol. 84, No. 408, Dec. 1989), but using a different language of your choosing (Python, R, Julia, SAS…).

- -

-Our experience shows that the estimations of slope and intercept are generally the same, but there can be differences when looking at variance estimators and R2 in more detail. Another source of surprises is the final graphical presentation, depending on the versions of the libraries that are used.

- -

-The computations to be done are described at https://app-learninglab.inria.fr/gitlab/moocrr-session1/moocrr-reproducibility-study/ together with instructions for contributing.

- -

-You will find there our replication of the computations by Dalal et al. (in R), as well as an implementation in Python and one in R (very similar to what you have used in module 2). This exercise can therefore be done at two levels:

- -
    -
  1. an easy level at which you start from the code in the language that you did not use initially, and content yourself with re-executing it. This doesn't require mastering logistic regression; it is sufficient to inspect the outputs produced and check that they correspond to the expected values. For those who want to re-execute the Python notebook in our MOOC's Jupyter environment, check the resources for sequence 4A of module 2 that explain how to import a notebook.
  2. a more difficult level at which you rewrite the analysis completely, possibly in a different language than Python or R, which makes the exercise more interesting because we have not tested such variants. If logistic regression is not already implemented for your language, you will need a good understanding of it in order to write the code yourself, which of course makes the exercise even more instructive.
- -

-You can discuss your successes or failures on the forum, after following these instructions: -

- - -

-Don't worry if these instructions seem confusing: they are reproduced above the table, and you will quickly notice if something is missing when you try to add your data.

- -

-We will compile a synthesis of the principal divergences observed and make it available at the end of the MOOC.

-
-
- diff --git a/module4/ressources/exo2.html b/module4/ressources/exo2.html deleted file mode 100644 index f1e467a25a3be3ac34bb880874f4f599c6ae87d8..0000000000000000000000000000000000000000 --- a/module4/ressources/exo2.html +++ /dev/null @@ -1,51 +0,0 @@ -

Exercice 2 : L'importance de l'environnement

-
-

-Dans cet exercice, nous vous proposons de reprendre l'exercice précédent mais en mettant à jour l'environnement de calcul. En effet, nous avons rencontré des surprises en préparant ce MOOC puisqu'il nous est arrivé d'avoir des résultats différents entre nos machines et l'environnement Jupyter que nous avions mis en place pour le MOOC. Ça sera peut-être également votre cas !

- -
    -
  1. Pour ceux qui ont suivi le parcours Jupyter, recréez l'environnement du MOOC sur votre propre machine en suivant les instructions données dans les ressources de la section 4A du module 2.
  2. Vérifiez si vous obtenez bien les mêmes résultats que ceux attendus.
  3. Mettez à jour (vers le haut ou vers le bas) cet environnement et vérifiez si vous obtenez les mêmes résultats.
- -

-Comme précédemment, vous mettrez à jour le tableau et vous discuterez sur le forum des succès et des échecs que vous aurez rencontrés.

-
- - -
-

The importance of the environment

-
-

-In this exercise, we ask you to redo the preceding exercise after updating the computational environment. When preparing this MOOC, we had a few surprises because results sometimes differed between our own computers and the Jupyter environment that we had set up for the MOOC. Maybe that will happen to you as well!

- -
    -
  1. For those who followed the Jupyter path, re-create the MOOC's Jupyter environment on your own computer by following the instructions given in the resource section of sequence 4A of module 2.
  2. Check if you get the same results as in the MOOC environment.
  3. Update this environment, increasing or decreasing the version numbers of some packages, and check if the results are still the same.
- -

-As before, you can add your observations to the table and discuss your successes and failures on the forum. -

-
diff --git a/module4/ressources/exo3.html b/module4/ressources/exo3.html deleted file mode 100644 index 61191d8b449476293a7cff8ef964555600d63d9f..0000000000000000000000000000000000000000 --- a/module4/ressources/exo3.html +++ /dev/null @@ -1,47 +0,0 @@ -
-

Exercice 3 : Répliquer un papier de ReScience

-
-

-ReScience (http://rescience.github.io/) est un journal de sciences computationnelles entièrement ouvert dont l'objectif est d'encourager la réplication de travaux déjà publiés en s'assurant que l'ensemble du code et des données soit disponible. Pour chacun des articles publiés dans ReScience, nous avons la garantie qu'au moins deux chercheurs indépendants ont réussi à suivre les indications, à ré-exécuter le code et à ré-obtenir les mêmes résultats que ceux décrits par les auteurs. Cela ne veut pas dire que cela soit parfaitement automatique pour autant et il peut être intéressant de voir comment ils ont procédé.

- -

-Nous vous proposons donc de choisir l'un de ces articles (celui avec lequel vous avez le plus d'affinité) et d'essayer de ré-exécuter les codes et les calculs décrits dans l'article. N'hésitez pas à indiquer vos difficultés éventuelles sur le forum, où nous répondrons à vos questions.

-
-
- -
-

Replicate a paper from ReScience

-
-

-ReScience (http://rescience.github.io/) is a scientific journal for computational science that is completely open and has the goal of encouraging the replication of already published work while providing a complete set of code and data. For each article published in ReScience, we know that at least two independent researchers (the reviewers) have been able to follow the instructions, re-execute the code, and obtain the same results as those described by the authors. This doesn't mean that the process is fully automatic, and therefore it is of interest to see how they have proceeded.

- -

-We ask you to choose one of the articles (the one that you like most) and to try to re-execute the code as described in the article. Don't hesitate to indicate any difficulties you might encounter in the forum, where we will reply to your questions…

-