# -*- coding: utf-8 -*-
# -*- mode: org -*-
#+TITLE: A reproducible comparison between @@latex:\\@@ GNU MPFR and machine double-precision
#+AUTHOR: Paul Zimmermann (reproduction with org-mode by Arnaud Legrand)
#+STARTUP: overview indent inlineimages logdrawer
#+LANGUAGE: en
#+LATEX_CLASS: IEEEtran
#+LaTeX_CLASS_OPTIONS: [onecolumn]
# #+HTML_HEAD:
#+HTML_HEAD:
#+HTML_HEAD:
#+HTML_HEAD:
#+HTML_HEAD:
#+HTML_HEAD:
#+HTML_HEAD:
#+PROPERTY: header-args :eval never-export
Several authors claim that GNU MPFR [1] is $x$ times slower than
double-precision floating-point numbers, for various values of $x$,
without any way for the reader to reproduce their claim. For example
in [2], Joris van der Hoeven writes “the MPFR library for arbitrary
precision and IEEE-style standardized floating-point arithmetic is
typically about a factor 100 slower than double precision machine
arithmetic”. Such a claim typically: (i) does not say which version of
MPFR was used (and which version of GMP, since MPFR being based on
GMP, its efficiency also depends on GMP); (ii) does not detail the
environment used (processor, compiler, operating system); (iii) does
not explain which application was used for the comparison. Therefore
it cannot be reproduced by the reader, which could thus have no
confidence in the claimed factor of 100. In this short note we provide
reproducible figures that can be checked by the reader.
** Reproducible Experimental Setup
We use the programs in appendix to multiply two $1000 × 1000$
matrices. The matrix $A$ has coefficients $1/(i + j + 1)$ for $0 ≤ i,
j < 1000$, and matrix $b$ has coefficients $1/(ij + 1)$. Both programs
print the time for the matrix product (not counting the time to
initialize the matrix), and the sum of coefficients of the product
matrix (used as a simple checksum between both programs).
We used MFPR version 3.1.5, configured with GMP 6.1.2 (both are the
latest releases as of the date of this document).
We used as test processor =gcc12.fsffrance.org=, which is a machine from
the GCC Compile Farm, a set of machines available for developers of
free software. The compiler used was GCC 4.5.1, which is installed in
~/opt/cfarm/release/4.5.1~ on this machine, with optimization level
~-O3~. Both GMP and MPFR were also compiled with this compiler, and the
GMP and MPFR libraries were linked statically with the application
programs (given in appendix).
** Experimental Results From Arnaud Legrand
*** Code
The program (=a.c=) using the C double-precision type is the
following. It takes as command-line argument the matrix dimension.
#+BEGIN_SRC C :tangle /tmp/a.c
#include
#include
#include
#include
static int cputime()
{
struct rusage rus;
getrusage(0, &rus);
return rus.ru_utime.tv_sec * 1000 + rus.ru_utime.tv_usec / 1000;
}
int main(int argc, char *argv[])
{
double **a;
double **b;
double **c;
double t = 0.0;
int i, j, k, st;
int N = atoi(argv[1]);
st = cputime();
a = malloc(N * sizeof(double *));
b = malloc(N * sizeof(double *));
c = malloc(N * sizeof(double *));
for (i = 0; i < N; i++) {
a[i] = malloc(N * sizeof(double));
b[i] = malloc(N * sizeof(double));
c[i] = malloc(N * sizeof(double));
for (j = 0; j < N; j++) {
a[i][j] = 1.0 / (1.0 + i + j);
b[i][j] = 1.0 / (1.0 + i * j);
}
}
st = cputime();
for (i = 0; i < N; i++)
for (j = 0; j < N; j++)
c[i][j] = 0.0;
for (i = 0; i < N; i++)
for (k = 0; k < N; k++)
for (j = 0; j < N; j++)
c[i][j] += a[i][k] * b[k][j];
for (i = 0; i < N; i++)
for (j = 0; j < N; j++)
t += c[i][j];
printf("matrix product took %dms\n", cputime() - st);
printf("t=%f\n", t);
for (i = 0; i < N; i++) {
free(a[i]);
free(b[i]);
free(c[i]);
}
free(a);
free(b);
free(c);
return 0;
}
#+END_SRC
The program (=d.c=) using GNU MPFR is the following. It takes as
command-line argument the matrix dimension and the MPFR precision (in
bits).
#+BEGIN_SRC C :tangle /tmp/d.c
#include
#include
#include
#include
#include
static int cputime()
{
struct rusage rus;
getrusage(0, &rus);
return rus.ru_utime.tv_sec * 1000 + rus.ru_utime.tv_usec / 1000;
}
int main(int argc, char *argv[])
{
mpfr_t **a;
mpfr_t **b;
mpfr_t **c;
mpfr_t s;
double t = 0.0;
int i, j, k, st;
int N = atoi(argv[1]);
int prec = atoi(argv[2]);
printf("MPFR library: %-12s\nMPFR header: %s (based on %d.%d.%d)\n",
mpfr_get_version(), MPFR_VERSION_STRING, MPFR_VERSION_MAJOR,
MPFR_VERSION_MINOR, MPFR_VERSION_PATCHLEVEL);
st = cputime();
a = malloc(N * sizeof(mpfr_t *));
b = malloc(N * sizeof(mpfr_t *));
c = malloc(N * sizeof(mpfr_t *));
mpfr_init2(s, prec);
for (i = 0; i < N; i++) {
a[i] = malloc(N * sizeof(mpfr_t));
b[i] = malloc(N * sizeof(mpfr_t));
c[i] = malloc(N * sizeof(mpfr_t));
for (j = 0; j < N; j++) {
mpfr_init2(a[i][j], prec);
mpfr_init2(b[i][j], prec);
mpfr_init2(c[i][j], prec);
mpfr_set_ui(a[i][j], 1, MPFR_RNDN);
mpfr_div_ui(a[i][j], a[i][j], i + j + 1, MPFR_RNDN);
mpfr_set_ui(b[i][j], 1, MPFR_RNDN);
mpfr_div_ui(b[i][j], b[i][j], i * j + 1, MPFR_RNDN);
}
}
st = cputime();
for (i = 0; i < N; i++)
for (j = 0; j < N; j++)
mpfr_set_ui(c[i][j], 0, MPFR_RNDN);
for (i = 0; i < N; i++)
for (k = 0; k < N; k++)
for (j = 0; j < N; j++) {
mpfr_mul(s, a[i][k], b[k][j], MPFR_RNDN);
mpfr_add(c[i][j], c[i][j], s, MPFR_RNDN);
}
for (i = 0; i < N; i++)
for (j = 0; j < N; j++)
t += mpfr_get_d(c[i][j], MPFR_RNDN);
printf("matrix product took %dms\n", cputime() - st);
printf("t=%f\n", t);
for (i = 0; i < N; i++) {
for (j = 0; j < N; j++) {
mpfr_clear(a[i][j]);
mpfr_clear(b[i][j]);
mpfr_clear(c[i][j]);
}
free(a[i]);
free(b[i]);
free(c[i]);
}
mpfr_clear(s);
free(a);
free(b);
free(c);
return 0;
}
#+END_SRC
*** Setup
- Name of the machine and OS version:
#+begin_src shell :results output :exports results :tangle get_info.sh
uname -a
#+end_src
#+RESULTS:
: Linux sama 4.2.0-1-amd64 #1 SMP Debian 4.2.6-1 (2015-11-10) x86_64 GNU/Linux
- CPU/architecture information:
#+begin_src shell :results output :exports both :tangle get_info.sh
cat /proc/cpuinfo
#+end_src
#+RESULTS:
#+begin_example
processor : 0
vendor_id : GenuineIntel
cpu family : 6
model : 58
model name : Intel(R) Core(TM) i7-3687U CPU @ 2.10GHz
stepping : 9
microcode : 0x15
cpu MHz : 2165.617
cache size : 4096 KB
physical id : 0
siblings : 4
core id : 0
cpu cores : 2
apicid : 0
initial apicid : 0
fpu : yes
fpu_exception : yes
cpuid level : 13
wp : yes
flags : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx rdtscp lm constant_tsc arch_perfmon pebs bts rep_good nopl xtopology nonstop_tsc aperfmperf eagerfpu pni pclmulqdq dtes64 monitor ds_cpl vmx smx est tm2 ssse3 cx16 xtpr pdcm pcid sse4_1 sse4_2 x2apic popcnt tsc_deadline_timer aes xsave avx f16c rdrand lahf_lm ida arat epb pln pts dtherm tpr_shadow vnmi flexpriority ept vpid fsgsbase smep erms xsaveopt
bugs :
bogomips : 5182.68
clflush size : 64
cache_alignment : 64
address sizes : 36 bits physical, 48 bits virtual
power management:
processor : 1
vendor_id : GenuineIntel
cpu family : 6
model : 58
model name : Intel(R) Core(TM) i7-3687U CPU @ 2.10GHz
stepping : 9
microcode : 0x15
cpu MHz : 3140.515
cache size : 4096 KB
physical id : 0
siblings : 4
core id : 1
cpu cores : 2
apicid : 2
initial apicid : 2
fpu : yes
fpu_exception : yes
cpuid level : 13
wp : yes
flags : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx rdtscp lm constant_tsc arch_perfmon pebs bts rep_good nopl xtopology nonstop_tsc aperfmperf eagerfpu pni pclmulqdq dtes64 monitor ds_cpl vmx smx est tm2 ssse3 cx16 xtpr pdcm pcid sse4_1 sse4_2 x2apic popcnt tsc_deadline_timer aes xsave avx f16c rdrand lahf_lm ida arat epb pln pts dtherm tpr_shadow vnmi flexpriority ept vpid fsgsbase smep erms xsaveopt
bugs :
bogomips : 5182.68
clflush size : 64
cache_alignment : 64
address sizes : 36 bits physical, 48 bits virtual
power management:
processor : 2
vendor_id : GenuineIntel
cpu family : 6
model : 58
model name : Intel(R) Core(TM) i7-3687U CPU @ 2.10GHz
stepping : 9
microcode : 0x15
cpu MHz : 2860.000
cache size : 4096 KB
physical id : 0
siblings : 4
core id : 0
cpu cores : 2
apicid : 1
initial apicid : 1
fpu : yes
fpu_exception : yes
cpuid level : 13
wp : yes
flags : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx rdtscp lm constant_tsc arch_perfmon pebs bts rep_good nopl xtopology nonstop_tsc aperfmperf eagerfpu pni pclmulqdq dtes64 monitor ds_cpl vmx smx est tm2 ssse3 cx16 xtpr pdcm pcid sse4_1 sse4_2 x2apic popcnt tsc_deadline_timer aes xsave avx f16c rdrand lahf_lm ida arat epb pln pts dtherm tpr_shadow vnmi flexpriority ept vpid fsgsbase smep erms xsaveopt
bugs :
bogomips : 5182.68
clflush size : 64
cache_alignment : 64
address sizes : 36 bits physical, 48 bits virtual
power management:
processor : 3
vendor_id : GenuineIntel
cpu family : 6
model : 58
model name : Intel(R) Core(TM) i7-3687U CPU @ 2.10GHz
stepping : 9
microcode : 0x15
cpu MHz : 2813.585
cache size : 4096 KB
physical id : 0
siblings : 4
core id : 1
cpu cores : 2
apicid : 3
initial apicid : 3
fpu : yes
fpu_exception : yes
cpuid level : 13
wp : yes
flags : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx rdtscp lm constant_tsc arch_perfmon pebs bts rep_good nopl xtopology nonstop_tsc aperfmperf eagerfpu pni pclmulqdq dtes64 monitor ds_cpl vmx smx est tm2 ssse3 cx16 xtpr pdcm pcid sse4_1 sse4_2 x2apic popcnt tsc_deadline_timer aes xsave avx f16c rdrand lahf_lm ida arat epb pln pts dtherm tpr_shadow vnmi flexpriority ept vpid fsgsbase smep erms xsaveopt
bugs :
bogomips : 5182.68
clflush size : 64
cache_alignment : 64
address sizes : 36 bits physical, 48 bits virtual
power management:
#+end_example
- Compiler version
#+begin_src shell :results output :exports both :tangle get_info.sh
gcc --version
#+end_src
#+RESULTS:
: gcc (Debian 5.3.1-6) 5.3.1 20160114
: Copyright (C) 2015 Free Software Foundation, Inc.
: This is free software; see the source for copying conditions. There is NO
: warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
:
- Libpmfr version:
#+begin_src shell :results output :exports both :tangle get_info.sh
apt-cache show libmpfr-dev
#+end_src
#+RESULTS:
#+begin_example
Package: libmpfr-dev
Source: mpfr4
Version: 3.1.5-1
Installed-Size: 1029
Maintainer: Debian GCC Maintainers
Architecture: amd64
Replaces: libgmp3-dev (<< 4.1.4-3)
Depends: libgmp-dev, libmpfr4 (= 3.1.5-1)
Suggests: libmpfr-doc
Breaks: libgmp3-dev (<< 4.1.4-3)
Description-en: multiple precision floating-point computation developers tools
This development package provides the header files and the symbolic
links to allow compilation and linking of programs that use the libraries
provided in the libmpfr4 package.
.
MPFR provides a library for multiple-precision floating-point computation
with correct rounding. The computation is both efficient and has a
well-defined semantics. It copies the good ideas from the
ANSI/IEEE-754 standard for double-precision floating-point arithmetic
(53-bit mantissa).
Description-md5: a2580b68a7c6f1fcadeefc6b17102b32
Multi-Arch: same
Homepage: http://www.mpfr.org/
Tag: devel::lang:c, devel::library, implemented-in::c, role::devel-lib,
suite::gnu
Section: libdevel
Priority: optional
Filename: pool/main/m/mpfr4/libmpfr-dev_3.1.5-1_amd64.deb
Size: 207200
MD5sum: e5c7872461f263e27312c9ef4f4218b9
SHA256: 279970e210c7db4e2550f5a3b7abb2674d01e9f0afd2a4857f1589a6947e0cbd
#+end_example
*** A first measurement
#+begin_src shell :results output :exports both :tangle measure.sh
cd /tmp/
gcc -O3 a.c -o a
./a 1000
#+end_src
#+RESULTS:
: matrix product took 680ms
: t=9062.368470
#+begin_src shell :results output :exports both :tangle measure.sh
cd /tmp/
gcc -O3 d.c -o d -lmpfr
./d 1000 53
#+end_src
#+RESULTS:
: MPFR library: 3.1.5
: MPFR header: 3.1.5 (based on 3.1.5)
: matrix product took 74460ms
: t=9062.368470
Et donc, chez moi, le ratio est plutôt de
#+begin_src R :results output :session *R* :exports both
74460/844
#+end_src
#+RESULTS:
: [1] 88.22275
*** A second measurement
Ceci étant dit, si je reexécute ces deux codes:
#+begin_src shell :results output :exports both
cd /tmp/
gcc -O3 a.c -o a
./a 1000
#+end_src
#+RESULTS:
: matrix product took 676ms
: t=9062.368470
#+begin_src shell :results output :exports both
cd /tmp/
gcc -O3 d.c -o d -lmpfr
./d 1000 53
#+end_src
#+RESULTS:
: MPFR library: 3.1.5
: MPFR header: 3.1.5 (based on 3.1.5)
: matrix product took 68732ms
: t=9062.368470
J'obtiens une valeur assez différente qui me donnerait cette fois ci
un ratio de
#+begin_src R :results output :session *R* :exports both
68732/676
#+end_src
#+RESULTS:
: [1] 101.6746
c'est à dire "plus proche" de ce qui est annoncé dans [2] mais c'est
un coup de chance, j'aurais tout aussi bien pu obtenir 120 ! Bref,
c'est pas le même setup que vous mais statistiquement parlant, il doit
aussi y avoir quelque chose à faire là, non ?
** References
[1] Fousse, L., Hanrot, G., Lefèvre, V., Pélissier, P., and
Zimmermann, P. MPFR: A multiple-precision binary floating- point
library with correct rounding. ACM Trans. Math. Softw. 33, 2 (2007),
article 13.
[2] van der Hoeven, J. Multiple precision floating-point arithmetic on
SIMD processors. In Proceedings of Arith’24 (2017), IEEE, pp. 2–9.
Entered on [2017-09-01 ven. 17:12]
* Emacs Setup :noexport:
This document has local variables in its postembule, which should
allow Org-mode (9) to work seamlessly without any setup. If you're
uncomfortable using such variables, you can safely ignore them at
startup. Exporting may require that you copy them in your .emacs.
# Local Variables:
# eval: (require 'org-install)
# eval: (org-babel-do-load-languages 'org-babel-load-languages '((sh . t) (R . t) (perl . t) (python .t) ))
# eval: (setq org-confirm-babel-evaluate nil)
# eval: (unless (boundp 'org-latex-classes) (setq org-latex-classes nil))
# eval: (add-to-list 'org-latex-classes '("IEEEtran"
# "\\documentclass[conference, 10pt, compsocconf]{IEEEtran}\n \[NO-DEFAULT-PACKAGES]\n \[EXTRA]\n \\usepackage{graphicx}\n \\usepackage{hyperref}" ("\\section{%s}" . "\\section*{%s}") ("\\subsection{%s}" . "\\subsection*{%s}") ("\\subsubsection{%s}" . "\\subsubsection*{%s}") ("\\paragraph{%s}" . "\\paragraph*{%s}") ("\\subparagraph{%s}" . "\\subparagraph*{%s}")))
# eval: (setq org-alphabetical-lists t)
# eval: (setq org-src-fontify-natively t)
# eval: (add-to-list 'load-path ".")
# eval: (add-to-list 'org-latex-packages-alist '("" "minted"))
# eval: (setq org-latex-listings 'minted)
# eval: (setq org-latex-pdf-process '("pdflatex -shell-escape -interaction nonstopmode -output-directory %o %f" "pdflatex -shell-escape -interaction nonstopmode -output-directory %o %f" "pdflatex -shell-escape -interaction nonstopmode -output-directory %o %f"))
# End: