# -*- coding: utf-8 -*- # -*- mode: org -*- #+TITLE: A reproducible comparison between @@latex:\\@@ GNU MPFR and machine double-precision #+AUTHOR: Paul Zimmermann (reproduction with org-mode by Arnaud Legrand) #+STARTUP: overview indent inlineimages logdrawer #+LANGUAGE: en #+LATEX_CLASS: IEEEtran #+LaTeX_CLASS_OPTIONS: [onecolumn] # #+HTML_HEAD: #+HTML_HEAD: #+HTML_HEAD: #+HTML_HEAD: #+HTML_HEAD: #+HTML_HEAD: #+HTML_HEAD: #+PROPERTY: header-args :eval never-export Several authors claim that GNU MPFR [1] is $x$ times slower than double-precision floating-point numbers, for various values of $x$, without any way for the reader to reproduce their claim. For example in [2], Joris van der Hoeven writes “the MPFR library for arbitrary precision and IEEE-style standardized floating-point arithmetic is typically about a factor 100 slower than double precision machine arithmetic”. Such a claim typically: (i) does not say which version of MPFR was used (and which version of GMP, since MPFR being based on GMP, its efficiency also depends on GMP); (ii) does not detail the environment used (processor, compiler, operating system); (iii) does not explain which application was used for the comparison. Therefore it cannot be reproduced by the reader, which could thus have no confidence in the claimed factor of 100. In this short note we provide reproducible figures that can be checked by the reader. ** Reproducible Experimental Setup We use the programs in appendix to multiply two $1000 × 1000$ matrices. The matrix $A$ has coefficients $1/(i + j + 1)$ for $0 ≤ i, j < 1000$, and matrix $b$ has coefficients $1/(ij + 1)$. Both programs print the time for the matrix product (not counting the time to initialize the matrix), and the sum of coefficients of the product matrix (used as a simple checksum between both programs). We used MFPR version 3.1.5, configured with GMP 6.1.2 (both are the latest releases as of the date of this document). We used as test processor =gcc12.fsffrance.org=, which is a machine from the GCC Compile Farm, a set of machines available for developers of free software. The compiler used was GCC 4.5.1, which is installed in ~/opt/cfarm/release/4.5.1~ on this machine, with optimization level ~-O3~. Both GMP and MPFR were also compiled with this compiler, and the GMP and MPFR libraries were linked statically with the application programs (given in appendix). ** Experimental Results From Arnaud Legrand *** Code The program (=a.c=) using the C double-precision type is the following. It takes as command-line argument the matrix dimension. #+BEGIN_SRC C :tangle /tmp/a.c #include #include #include #include static int cputime() { struct rusage rus; getrusage(0, &rus); return rus.ru_utime.tv_sec * 1000 + rus.ru_utime.tv_usec / 1000; } int main(int argc, char *argv[]) { double **a; double **b; double **c; double t = 0.0; int i, j, k, st; int N = atoi(argv[1]); st = cputime(); a = malloc(N * sizeof(double *)); b = malloc(N * sizeof(double *)); c = malloc(N * sizeof(double *)); for (i = 0; i < N; i++) { a[i] = malloc(N * sizeof(double)); b[i] = malloc(N * sizeof(double)); c[i] = malloc(N * sizeof(double)); for (j = 0; j < N; j++) { a[i][j] = 1.0 / (1.0 + i + j); b[i][j] = 1.0 / (1.0 + i * j); } } st = cputime(); for (i = 0; i < N; i++) for (j = 0; j < N; j++) c[i][j] = 0.0; for (i = 0; i < N; i++) for (k = 0; k < N; k++) for (j = 0; j < N; j++) c[i][j] += a[i][k] * b[k][j]; for (i = 0; i < N; i++) for (j = 0; j < N; j++) t += c[i][j]; printf("matrix product took %dms\n", cputime() - st); printf("t=%f\n", t); for (i = 0; i < N; i++) { free(a[i]); free(b[i]); free(c[i]); } free(a); free(b); free(c); return 0; } #+END_SRC The program (=d.c=) using GNU MPFR is the following. It takes as command-line argument the matrix dimension and the MPFR precision (in bits). #+BEGIN_SRC C :tangle /tmp/d.c #include #include #include #include #include static int cputime() { struct rusage rus; getrusage(0, &rus); return rus.ru_utime.tv_sec * 1000 + rus.ru_utime.tv_usec / 1000; } int main(int argc, char *argv[]) { mpfr_t **a; mpfr_t **b; mpfr_t **c; mpfr_t s; double t = 0.0; int i, j, k, st; int N = atoi(argv[1]); int prec = atoi(argv[2]); printf("MPFR library: %-12s\nMPFR header: %s (based on %d.%d.%d)\n", mpfr_get_version(), MPFR_VERSION_STRING, MPFR_VERSION_MAJOR, MPFR_VERSION_MINOR, MPFR_VERSION_PATCHLEVEL); st = cputime(); a = malloc(N * sizeof(mpfr_t *)); b = malloc(N * sizeof(mpfr_t *)); c = malloc(N * sizeof(mpfr_t *)); mpfr_init2(s, prec); for (i = 0; i < N; i++) { a[i] = malloc(N * sizeof(mpfr_t)); b[i] = malloc(N * sizeof(mpfr_t)); c[i] = malloc(N * sizeof(mpfr_t)); for (j = 0; j < N; j++) { mpfr_init2(a[i][j], prec); mpfr_init2(b[i][j], prec); mpfr_init2(c[i][j], prec); mpfr_set_ui(a[i][j], 1, MPFR_RNDN); mpfr_div_ui(a[i][j], a[i][j], i + j + 1, MPFR_RNDN); mpfr_set_ui(b[i][j], 1, MPFR_RNDN); mpfr_div_ui(b[i][j], b[i][j], i * j + 1, MPFR_RNDN); } } st = cputime(); for (i = 0; i < N; i++) for (j = 0; j < N; j++) mpfr_set_ui(c[i][j], 0, MPFR_RNDN); for (i = 0; i < N; i++) for (k = 0; k < N; k++) for (j = 0; j < N; j++) { mpfr_mul(s, a[i][k], b[k][j], MPFR_RNDN); mpfr_add(c[i][j], c[i][j], s, MPFR_RNDN); } for (i = 0; i < N; i++) for (j = 0; j < N; j++) t += mpfr_get_d(c[i][j], MPFR_RNDN); printf("matrix product took %dms\n", cputime() - st); printf("t=%f\n", t); for (i = 0; i < N; i++) { for (j = 0; j < N; j++) { mpfr_clear(a[i][j]); mpfr_clear(b[i][j]); mpfr_clear(c[i][j]); } free(a[i]); free(b[i]); free(c[i]); } mpfr_clear(s); free(a); free(b); free(c); return 0; } #+END_SRC *** Setup - Name of the machine and OS version: #+begin_src shell :results output :exports results :tangle get_info.sh uname -a #+end_src #+RESULTS: : Linux sama 4.2.0-1-amd64 #1 SMP Debian 4.2.6-1 (2015-11-10) x86_64 GNU/Linux - CPU/architecture information: #+begin_src shell :results output :exports both :tangle get_info.sh cat /proc/cpuinfo #+end_src #+RESULTS: #+begin_example processor : 0 vendor_id : GenuineIntel cpu family : 6 model : 58 model name : Intel(R) Core(TM) i7-3687U CPU @ 2.10GHz stepping : 9 microcode : 0x15 cpu MHz : 2165.617 cache size : 4096 KB physical id : 0 siblings : 4 core id : 0 cpu cores : 2 apicid : 0 initial apicid : 0 fpu : yes fpu_exception : yes cpuid level : 13 wp : yes flags : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx rdtscp lm constant_tsc arch_perfmon pebs bts rep_good nopl xtopology nonstop_tsc aperfmperf eagerfpu pni pclmulqdq dtes64 monitor ds_cpl vmx smx est tm2 ssse3 cx16 xtpr pdcm pcid sse4_1 sse4_2 x2apic popcnt tsc_deadline_timer aes xsave avx f16c rdrand lahf_lm ida arat epb pln pts dtherm tpr_shadow vnmi flexpriority ept vpid fsgsbase smep erms xsaveopt bugs : bogomips : 5182.68 clflush size : 64 cache_alignment : 64 address sizes : 36 bits physical, 48 bits virtual power management: processor : 1 vendor_id : GenuineIntel cpu family : 6 model : 58 model name : Intel(R) Core(TM) i7-3687U CPU @ 2.10GHz stepping : 9 microcode : 0x15 cpu MHz : 3140.515 cache size : 4096 KB physical id : 0 siblings : 4 core id : 1 cpu cores : 2 apicid : 2 initial apicid : 2 fpu : yes fpu_exception : yes cpuid level : 13 wp : yes flags : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx rdtscp lm constant_tsc arch_perfmon pebs bts rep_good nopl xtopology nonstop_tsc aperfmperf eagerfpu pni pclmulqdq dtes64 monitor ds_cpl vmx smx est tm2 ssse3 cx16 xtpr pdcm pcid sse4_1 sse4_2 x2apic popcnt tsc_deadline_timer aes xsave avx f16c rdrand lahf_lm ida arat epb pln pts dtherm tpr_shadow vnmi flexpriority ept vpid fsgsbase smep erms xsaveopt bugs : bogomips : 5182.68 clflush size : 64 cache_alignment : 64 address sizes : 36 bits physical, 48 bits virtual power management: processor : 2 vendor_id : GenuineIntel cpu family : 6 model : 58 model name : Intel(R) Core(TM) i7-3687U CPU @ 2.10GHz stepping : 9 microcode : 0x15 cpu MHz : 2860.000 cache size : 4096 KB physical id : 0 siblings : 4 core id : 0 cpu cores : 2 apicid : 1 initial apicid : 1 fpu : yes fpu_exception : yes cpuid level : 13 wp : yes flags : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx rdtscp lm constant_tsc arch_perfmon pebs bts rep_good nopl xtopology nonstop_tsc aperfmperf eagerfpu pni pclmulqdq dtes64 monitor ds_cpl vmx smx est tm2 ssse3 cx16 xtpr pdcm pcid sse4_1 sse4_2 x2apic popcnt tsc_deadline_timer aes xsave avx f16c rdrand lahf_lm ida arat epb pln pts dtherm tpr_shadow vnmi flexpriority ept vpid fsgsbase smep erms xsaveopt bugs : bogomips : 5182.68 clflush size : 64 cache_alignment : 64 address sizes : 36 bits physical, 48 bits virtual power management: processor : 3 vendor_id : GenuineIntel cpu family : 6 model : 58 model name : Intel(R) Core(TM) i7-3687U CPU @ 2.10GHz stepping : 9 microcode : 0x15 cpu MHz : 2813.585 cache size : 4096 KB physical id : 0 siblings : 4 core id : 1 cpu cores : 2 apicid : 3 initial apicid : 3 fpu : yes fpu_exception : yes cpuid level : 13 wp : yes flags : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx rdtscp lm constant_tsc arch_perfmon pebs bts rep_good nopl xtopology nonstop_tsc aperfmperf eagerfpu pni pclmulqdq dtes64 monitor ds_cpl vmx smx est tm2 ssse3 cx16 xtpr pdcm pcid sse4_1 sse4_2 x2apic popcnt tsc_deadline_timer aes xsave avx f16c rdrand lahf_lm ida arat epb pln pts dtherm tpr_shadow vnmi flexpriority ept vpid fsgsbase smep erms xsaveopt bugs : bogomips : 5182.68 clflush size : 64 cache_alignment : 64 address sizes : 36 bits physical, 48 bits virtual power management: #+end_example - Compiler version #+begin_src shell :results output :exports both :tangle get_info.sh gcc --version #+end_src #+RESULTS: : gcc (Debian 5.3.1-6) 5.3.1 20160114 : Copyright (C) 2015 Free Software Foundation, Inc. : This is free software; see the source for copying conditions. There is NO : warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. : - Libpmfr version: #+begin_src shell :results output :exports both :tangle get_info.sh apt-cache show libmpfr-dev #+end_src #+RESULTS: #+begin_example Package: libmpfr-dev Source: mpfr4 Version: 3.1.5-1 Installed-Size: 1029 Maintainer: Debian GCC Maintainers Architecture: amd64 Replaces: libgmp3-dev (<< 4.1.4-3) Depends: libgmp-dev, libmpfr4 (= 3.1.5-1) Suggests: libmpfr-doc Breaks: libgmp3-dev (<< 4.1.4-3) Description-en: multiple precision floating-point computation developers tools This development package provides the header files and the symbolic links to allow compilation and linking of programs that use the libraries provided in the libmpfr4 package. . MPFR provides a library for multiple-precision floating-point computation with correct rounding. The computation is both efficient and has a well-defined semantics. It copies the good ideas from the ANSI/IEEE-754 standard for double-precision floating-point arithmetic (53-bit mantissa). Description-md5: a2580b68a7c6f1fcadeefc6b17102b32 Multi-Arch: same Homepage: http://www.mpfr.org/ Tag: devel::lang:c, devel::library, implemented-in::c, role::devel-lib, suite::gnu Section: libdevel Priority: optional Filename: pool/main/m/mpfr4/libmpfr-dev_3.1.5-1_amd64.deb Size: 207200 MD5sum: e5c7872461f263e27312c9ef4f4218b9 SHA256: 279970e210c7db4e2550f5a3b7abb2674d01e9f0afd2a4857f1589a6947e0cbd #+end_example *** A first measurement #+begin_src shell :results output :exports both :tangle measure.sh cd /tmp/ gcc -O3 a.c -o a ./a 1000 #+end_src #+RESULTS: : matrix product took 680ms : t=9062.368470 #+begin_src shell :results output :exports both :tangle measure.sh cd /tmp/ gcc -O3 d.c -o d -lmpfr ./d 1000 53 #+end_src #+RESULTS: : MPFR library: 3.1.5 : MPFR header: 3.1.5 (based on 3.1.5) : matrix product took 74460ms : t=9062.368470 Et donc, chez moi, le ratio est plutôt de #+begin_src R :results output :session *R* :exports both 74460/844 #+end_src #+RESULTS: : [1] 88.22275 *** A second measurement Ceci étant dit, si je reexécute ces deux codes: #+begin_src shell :results output :exports both cd /tmp/ gcc -O3 a.c -o a ./a 1000 #+end_src #+RESULTS: : matrix product took 676ms : t=9062.368470 #+begin_src shell :results output :exports both cd /tmp/ gcc -O3 d.c -o d -lmpfr ./d 1000 53 #+end_src #+RESULTS: : MPFR library: 3.1.5 : MPFR header: 3.1.5 (based on 3.1.5) : matrix product took 68732ms : t=9062.368470 J'obtiens une valeur assez différente qui me donnerait cette fois ci un ratio de #+begin_src R :results output :session *R* :exports both 68732/676 #+end_src #+RESULTS: : [1] 101.6746 c'est à dire "plus proche" de ce qui est annoncé dans [2] mais c'est un coup de chance, j'aurais tout aussi bien pu obtenir 120 ! Bref, c'est pas le même setup que vous mais statistiquement parlant, il doit aussi y avoir quelque chose à faire là, non ? ** References [1] Fousse, L., Hanrot, G., Lefèvre, V., Pélissier, P., and Zimmermann, P. MPFR: A multiple-precision binary floating- point library with correct rounding. ACM Trans. Math. Softw. 33, 2 (2007), article 13. [2] van der Hoeven, J. Multiple precision floating-point arithmetic on SIMD processors. In Proceedings of Arith’24 (2017), IEEE, pp. 2–9. Entered on [2017-09-01 ven. 17:12] * Emacs Setup :noexport: This document has local variables in its postembule, which should allow Org-mode (9) to work seamlessly without any setup. If you're uncomfortable using such variables, you can safely ignore them at startup. Exporting may require that you copy them in your .emacs. # Local Variables: # eval: (require 'org-install) # eval: (org-babel-do-load-languages 'org-babel-load-languages '((sh . t) (R . t) (perl . t) (python .t) )) # eval: (setq org-confirm-babel-evaluate nil) # eval: (unless (boundp 'org-latex-classes) (setq org-latex-classes nil)) # eval: (add-to-list 'org-latex-classes '("IEEEtran" # "\\documentclass[conference, 10pt, compsocconf]{IEEEtran}\n \[NO-DEFAULT-PACKAGES]\n \[EXTRA]\n \\usepackage{graphicx}\n \\usepackage{hyperref}" ("\\section{%s}" . "\\section*{%s}") ("\\subsection{%s}" . "\\subsection*{%s}") ("\\subsubsection{%s}" . "\\subsubsection*{%s}") ("\\paragraph{%s}" . "\\paragraph*{%s}") ("\\subparagraph{%s}" . "\\subparagraph*{%s}"))) # eval: (setq org-alphabetical-lists t) # eval: (setq org-src-fontify-natively t) # eval: (add-to-list 'load-path ".") # eval: (add-to-list 'org-latex-packages-alist '("" "minted")) # eval: (setq org-latex-listings 'minted) # eval: (setq org-latex-pdf-process '("pdflatex -shell-escape -interaction nonstopmode -output-directory %o %f" "pdflatex -shell-escape -interaction nonstopmode -output-directory %o %f" "pdflatex -shell-escape -interaction nonstopmode -output-directory %o %f")) # End: