digitalmars.D.ldc - floor operation problem

bearophile (267/267) Dec 18 2013 While I was debugging a performance problem, I have found the

Marco Leise (5/306) Dec 19 2013 but... fast-math isn't kosher
Marco Leise (14/14) Dec 19 2013 I cannot reproduce this on 64-bit Linux.

bearophile (7/11) Dec 19 2013 Practice of programming shows that there are many situations

David Nadlinger (5/10) Dec 19 2013 I don't think Marco is building his C executable with -ffast-math.

bearophile (7/11) Dec 19 2013 Showing the asm is a good way to understand what's going on in

Marco Leise (14/31) Dec 19 2013 At first my gcc executable had the same speed as on your

"bearophile" <bearophileHUGS lycos.com> writes:

While I was debugging a performance problem, I have found the 
cause is the floor operation. Below there is a small benchmark to 
show it.

I have compiled the code with:

gcc -Ofast -std=c99 -s -flto -mfpmath=sse -ffast-math -msse3 
test1.c -o test1
ldmd2 -O -release -inline -noboundscheck test2.d
ldmd2 -O -release -inline -noboundscheck test3.d

32 bit system

gcc version 4.8.0
LDC 0.12.1 (based on DMD v2.063.2 and LLVM 3.3.1)

Run-time, seconds:
test1.c: 1.01
test2.d: 4.14
test3.d: 6.62

---------------------

// test1.c
#include <stdio.h>
#include <math.h>
#include <stdlib.h>

// Returns floorf(x) + floorf(y): each argument is rounded down to an integral
// value and the two results are added in single precision.
// Declared static inline, so it is private to this translation unit.
static inline float foo(const float x, const float y) {
     return floorf(x) + floorf(y);
}

// Benchmark driver: calls foo() 1000 * 256 * 256 times, accumulates every
// result into one float and prints the sum. Always returns 0.
int main() {
     // Running single-precision sum of all foo() results.
     float total = 0.0f;

     // i only repeats the 256 x 256 sweep 1000 times; it is not passed to
     // foo(). The arguments are the x and y indices scaled by 0.1f.
     for (int i = 0; i < 1000; i++)
         for (int y = 0; y < 256; y++)
             for (int x = 0; x < 256; x++)
                 total += foo(x * 0.1f, y * 0.1f);

     // Print the accumulated sum.
     printf("%f\n", total);
     return 0;
}

---------------------

// test2.d
import core.stdc.stdio, core.stdc.math;

// Returns floorf(x) + floorf(y). floorf comes from the core.stdc.math import
// at the top of this listing (the D binding to the C math library).
// Marked nothrow.
float foo(const float x, const float y) nothrow {
     return floorf(x) + floorf(y);
}

// Benchmark driver: calls foo() 1000 * 256 * 256 times, accumulates every
// result into one float and prints the sum. Always returns 0.
int main() {
     // Running single-precision sum of all foo() results.
     float total = 0.0f;

     // i only repeats the 256 x 256 sweep 1000 times; it is not passed to
     // foo(). The arguments are the x and y indices scaled by 0.1f.
     for (int i = 0; i < 1000; i++)
         for (int y = 0; y < 256; y++)
             for (int x = 0; x < 256; x++)
                 total += foo(x * 0.1f, y * 0.1f);

     // Print the accumulated sum via the C printf from core.stdc.stdio.
     printf("%f\n", total);
     return 0;
}

---------------------

// test3.d
import core.stdc.stdio, std.math;

// Returns floor(x) + floor(y), with floor taken from std.math (imported at the
// top of this listing). In the compiled output shown for test3.d the call
// target is __D3std4math5floorFNbNeeZe; by the D name-mangling rules the
// parameter and return type code 'e' is real, i.e. real floor(real). The float
// arguments are therefore widened to real for each call and the sum is
// narrowed back to float on return.
// Marked nothrow.
float foo(const float x, const float y) nothrow {
     return floor(x) + floor(y);
}

// Benchmark driver: calls foo() 1000 * 256 * 256 times, accumulates every
// result into one float and prints the sum. Always returns 0.
int main() {
     // Running single-precision sum of all foo() results.
     float total = 0.0f;

     // i only repeats the 256 x 256 sweep 1000 times; it is not passed to
     // foo(). The arguments are the x and y indices scaled by 0.1f.
     for (int i = 0; i < 1000; i++)
         for (int y = 0; y < 256; y++)
             for (int x = 0; x < 256; x++)
                 total += foo(x * 0.1f, y * 0.1f);

     // Print the accumulated sum via the C printf from core.stdc.stdio.
     printf("%f\n", total);
     return 0;
}

---------------------

test1.c asm:

# gcc output for main() of test1.c (32-bit x86, AT&T operand order: src, dst).
# The only calls in this listing are ___main and _printf; there is no call to
# floorf -- both floor computations are expanded inline in the loop bodies.
# Register roles, as used below:
#   %ebx  = outer repetitions remaining (starts at 1000, counts down)
#   %ecx  = y, %eax = x (inside L7)
#   %xmm1 = running total
#   %xmm0 = term for the current x, %xmm3 = term for the current y
# LC1..LC6 are defined outside this listing; the roles given for them below
# are inferred from how they are used -- TODO confirm against the constant data.
_main:
     # Prologue: set up the %ebp frame and save %ebx (restored from -4(%ebp)
     # just before returning).
     pushl   %ebp
     movl    %esp, %ebp
     pushl   %ebx
     # %ebx = 1000: outer repetition counter, decremented at L5.
     movl    $1000, %ebx
     # Round %esp down to a 16-byte boundary and reserve 16 bytes; (%esp) and
     # 4(%esp) are filled with the printf arguments at the end.
     andl    $-16, %esp
     subl    $16, %esp
     call    ___main
     # total = 0.0
     xorps   %xmm1, %xmm1
     # %xmm5 = LC3, used only as an AND mask on the x product in L7
     # (presumably the sign-clearing mask, i.e. fabs -- TODO confirm).
     movss   LC3, %xmm5
L2:
     # Start of one outer repetition: %xmm6 = LC1 (the multiplier, presumably
     # 0.1f), y term = 0.0, y = 0.
     movss   LC1, %xmm6
     xorps   %xmm3, %xmm3
     xorl    %ecx, %ecx
     .p2align 4,,7
L9:
     # Start of one row: %xmm4 = LC2 (the magnitude threshold compared below),
     # x term = 0.0, x = 0.
     movss   LC2, %xmm4
     xorps   %xmm0, %xmm0
     xorl    %eax, %eax
     .p2align 4,,7
L7:
     # Inner loop. The x term and the y term are added to the total as two
     # separate additions; x is advanced and the row ends when x reaches 256.
     addss   %xmm0, %xmm1
     addl    $1, %eax
     cmpl    $256, %eax
     addss   %xmm3, %xmm1
     je  L12
     # %xmm0 = (float)x * LC1
     cvtsi2ss    %eax, %xmm0
     mulss   %xmm6, %xmm0
     # If LC2 <= (%xmm0 & LC3), or the compare is unordered, loop again with
     # %xmm0 unchanged: no rounding is applied to such values.
     movaps  %xmm0, %xmm2
     andps   %xmm5, %xmm2
     ucomiss %xmm2, %xmm4
     jbe L7
     # Inline floor: t = (float)(int)%xmm0, converted with truncation, then
     # LC4 (presumably 1.0f) is subtracted only when t > %xmm0. cmpnless
     # yields an all-ones mask for "not (t <= value)", and andps uses that
     # mask to select either LC4 or zero.
     cvttss2si   %xmm0, %edx
     cvtsi2ss    %edx, %xmm2
     movaps  %xmm2, %xmm7
     cmpnless    %xmm0, %xmm7
     movaps  %xmm7, %xmm0
     movss   LC4, %xmm7
     andps   %xmm7, %xmm0
     subss   %xmm0, %xmm2
     # %xmm0 = adjusted result, i.e. the x term for the next addition.
     movaps  %xmm2, %xmm0
     jmp L7
     .p2align 4,,7
L12:
     # End of a row: advance y; after 256 rows this repetition is finished.
     addl    $1, %ecx
     cmpl    $256, %ecx
     je  L5
     # %xmm3 = (float)y * LC1, followed by the same threshold test as in L7,
     # here with LC6 as the mask and LC2 loaded into %xmm2.
     cvtsi2ss    %ecx, %xmm3
     movss   LC6, %xmm0
     movss   LC2, %xmm2
     mulss   LC1, %xmm3
     andps   %xmm3, %xmm0
     ucomiss %xmm0, %xmm2
     jbe L9
     # Same truncate-and-adjust sequence as in L7, leaving the y term in
     # %xmm3. %eax is only scratch here; L9 zeroes it again.
     cvttss2si   %xmm3, %eax
     cvtsi2ss    %eax, %xmm0
     movaps  %xmm0, %xmm2
     cmpnless    %xmm3, %xmm2
     movss   LC4, %xmm3
     andps   %xmm3, %xmm2
     movaps  %xmm0, %xmm3
     subss   %xmm2, %xmm3
     jmp L9
L5:
     # Next outer repetition until %ebx reaches 0.
     subl    $1, %ebx
     jne L2
     # Convert the total to double and call printf: address LC5 (presumably
     # the format string) goes to (%esp), the double to 4(%esp).
     unpcklps    %xmm1, %xmm1
     movl    $LC5, (%esp)
     cvtps2pd    %xmm1, %xmm5
     movsd   %xmm5, 4(%esp)
     call    _printf
     # Return 0, restore %ebx and tear down the frame.
     xorl    %eax, %eax
     movl    -4(%ebp), %ebx
     leave
     ret

---------------------

test2.d asm:

# LDC output for main() of test2.d (32-bit x86, AT&T operand order: src, dst).
# There is no call to foo in this listing (its body appears inline), but
# floorf itself is not expanded: every inner iteration makes two calls to
# _floorf.
# Register roles, as used below:
#   %esi  = i, %edi = y, %ebx = x
#   %xmm0 = running total (spilled to 16(%esp) around the calls)
#   %xmm1 = LCPI1_0, the multiplier (presumably 0.1f; defined outside this
#           listing -- TODO confirm)
__Dmain:
	# Save the three registers used as loop counters and reserve 28 bytes of
	# stack for spills and outgoing arguments.
	pushl	%ebx
	pushl	%edi
	pushl	%esi
	subl	$28, %esp
	# total = 0.0, i = 0, %xmm1 = multiplier.
	xorps	%xmm0, %xmm0
	xorl	%esi, %esi
	movss	LCPI1_0, %xmm1
	.align	16, 0x90
LBB1_1:
	# y = 0
	xorl	%edi, %edi
	.align	16, 0x90
LBB1_2:
	# 12(%esp) = (float)y * multiplier, computed once per row; then x = 0.
	xorps	%xmm2, %xmm2
	cvtsi2ssl	%edi, %xmm2
	mulss	%xmm1, %xmm2
	movss	%xmm2, 12(%esp)
	xorl	%ebx, %ebx
	.align	16, 0x90
LBB1_3:
	# Spill the total, then pass (float)x * multiplier to floorf on the stack.
	movss	%xmm0, 16(%esp)
	xorps	%xmm0, %xmm0
	cvtsi2ssl	%ebx, %xmm0
	mulss	%xmm1, %xmm0
	movss	%xmm0, (%esp)
	calll	_floorf
	# Place the saved y product in the argument slot, and pop the first
	# result off the x87 stack into 24(%esp).
	movss	12(%esp), %xmm0
	movss	%xmm0, (%esp)
	fstps	24(%esp)
	# floorf of the y product is called again on every inner iteration, even
	# though the product only changes once per row.
	calll	_floorf
	# Reload the multiplier and pop the second result into 20(%esp).
	movss	LCPI1_0, %xmm1
	fstps	20(%esp)
	# %xmm0 = first result + second result, both reloaded from memory.
	movss	24(%esp), %xmm0
	addss	20(%esp), %xmm0
	# Add that sum to the spilled total; the new total is stored to
	# 16(%esp) and immediately reloaded into %xmm0.
	movss	16(%esp), %xmm2
	addss	%xmm0, %xmm2
	movss	%xmm2, 16(%esp)
	movss	16(%esp), %xmm0
	# x++ ; repeat while x != 256.
	incl	%ebx
	cmpl	$256, %ebx
	jne	LBB1_3
	# y++ ; repeat while y != 256.
	incl	%edi
	cmpl	$256, %edi
	jne	LBB1_2
	# i++ ; repeat while i != 1000.
	incl	%esi
	cmpl	$1000, %esi
	jne	LBB1_1
	# Convert the total to double and call printf: address _.str (presumably
	# the format string) goes to (%esp), the double to 4(%esp).
	cvtss2sd	%xmm0, %xmm0
	movsd	%xmm0, 4(%esp)
	movl	$_.str, (%esp)
	calll	___mingw_printf
	# Return 0, release the stack area and restore the saved registers.
	xorl	%eax, %eax
	addl	$28, %esp
	popl	%esi
	popl	%edi
	popl	%ebx
	ret

---------------------

test3.d asm:

# LDC output for main() of test3.d (32-bit x86, AT&T operand order: src, dst).
# There is no call to foo in this listing (its body appears inline); every
# inner iteration makes two calls to __D3std4math5floorFNbNeeZe
# (std.math.floor). Arguments and results are moved as 80-bit x87 values
# (fstpt/fldt), so each float is widened before the call and the sum is
# narrowed back to float after the x87 add.
# Register roles, as used below:
#   %esi  = i, %edi = y, %ebx = x
#   %xmm1 = total between rows, %xmm0 = total inside the inner loop
#   %xmm2 = LCPI1_0, the multiplier (presumably 0.1f; defined outside this
#           listing -- TODO confirm)
__Dmain:
     # Save the three registers used as loop counters and reserve 52 bytes of
     # stack for spills and outgoing arguments.
     pushl   %ebx
     pushl   %edi
     pushl   %esi
     subl    $52, %esp
     # total = 0.0, i = 0, %xmm2 = multiplier.
     xorps   %xmm1, %xmm1
     xorl    %esi, %esi
     movss   LCPI1_0, %xmm2
     .align  16, 0x90
LBB1_1:
     # y = 0
     xorl    %edi, %edi
     .align  16, 0x90
LBB1_2:
     # Once per row: (float)y * multiplier is written to 48(%esp), loaded onto
     # the x87 stack and stored as an 80-bit value at 12(%esp). Then x = 0 and
     # the total is copied into %xmm0 for the inner loop.
     xorps   %xmm0, %xmm0
     cvtsi2ssl   %edi, %xmm0
     mulss   %xmm2, %xmm0
     movss   %xmm0, 48(%esp)
     xorl    %ebx, %ebx
     flds    48(%esp)
     fstpt   12(%esp)
     movaps  %xmm1, %xmm0
     .align  16, 0x90
LBB1_3:
     # Spill the total to 36(%esp); compute (float)x * multiplier, bounce it
     # through 44(%esp) onto the x87 stack and store it as the 80-bit
     # argument at (%esp).
     movss   %xmm0, 36(%esp)
     xorps   %xmm0, %xmm0
     cvtsi2ssl   %ebx, %xmm0
     mulss   %xmm2, %xmm0
     movss   %xmm0, 44(%esp)
     flds    44(%esp)
     fstpt   (%esp)
     calll   __D3std4math5floorFNbNeeZe
     # %esp is lowered by 12 right after the call -- presumably the callee
     # removed its 12-byte argument on return (TODO confirm against the D
     # calling convention). The first result is then saved as 80 bits at
     # 24(%esp).
     subl    $12, %esp
     fstpt   24(%esp)
     # Copy the saved 80-bit y product into the argument slot and call floor
     # again; this happens on every inner iteration.
     fldt    12(%esp)
     fstpt   (%esp)
     calll   __D3std4math5floorFNbNeeZe
     subl    $12, %esp
     # Reload the spilled total and the multiplier.
     movss   36(%esp), %xmm0
     movss   LCPI1_0, %xmm2
     # Add the two results on the x87 stack, store the sum as a float at
     # 40(%esp), then add it to the total in %xmm0.
     fldt    24(%esp)
     faddp   %st(1)
     fstps   40(%esp)
     addss   40(%esp), %xmm0
     # x++ ; repeat while x != 256.
     incl    %ebx
     cmpl    $256, %ebx
     jne LBB1_3
     # Row done: copy the total back to %xmm1; y++ ; repeat while y != 256.
     movaps  %xmm0, %xmm1
     incl    %edi
     cmpl    $256, %edi
     jne LBB1_2
     # i++ ; repeat while i != 1000.
     incl    %esi
     cmpl    $1000, %esi
     jne LBB1_1
     # Convert the total to double and call printf: address _.str (presumably
     # the format string) goes to (%esp), the double to 4(%esp).
     xorps   %xmm0, %xmm0
     cvtss2sd    %xmm1, %xmm0
     movsd   %xmm0, 4(%esp)
     movl    $_.str, (%esp)
     calll   ___mingw_printf
     # Return 0, release the stack area and restore the saved registers.
     xorl    %eax, %eax
     addl    $52, %esp
     popl    %esi
     popl    %edi
     popl    %ebx
     ret

---------------------

Bye,
bearophile

Dec 18 2013

Marco Leise <Marco.Leise gmx.de> writes:

Am Thu, 19 Dec 2013 01:15:27 +0100
schrieb "bearophile" <bearophileHUGS lycos.com>:

 While I was debugging a performance problem, I have found the 
 cause is the floor operation. Below there is a small benchmark to 
 show it.
 
 I have compiled the code with:
 
 gcc -Ofast -std=c99 -s -flto -mfpmath=sse -ffast-math -msse3 
 test1.c -o test1
 ldmd2 -O -release -inline -noboundscheck test2.d
 ldmd2 -O -release -inline -noboundscheck test3.d
 
 32 bit system
 
 gcc version 4.8.0
 LDC 0.12.1 (based on DMD v2.063.2 and LLVM 3.3.1)
 
 Run-time, seconds:
 test1.c: 1.01
 test2.d: 4.14
 test3.d: 6.62
 
 ---------------------
 
 // test1.c
 #include <stdio.h>
 #include <math.h>
 #include <stdlib.h>
 
 static inline float foo(const float x, const float y) {
      return floorf(x) + floorf(y);
 }
 
 int main() {
      float total = 0.0f;
 
      for (int i = 0; i < 1000; i++)
          for (int y = 0; y < 256; y++)
              for (int x = 0; x < 256; x++)
                  total += foo(x * 0.1f, y * 0.1f);
 
      printf("%f\n", total);
      return 0;
 }
 
 ---------------------
 
 // test2.d
 import core.stdc.stdio, core.stdc.math;
 
 float foo(const float x, const float y) nothrow {
      return floorf(x) + floorf(y);
 }
 
 int main() {
      float total = 0.0f;
 
      for (int i = 0; i < 1000; i++)
          for (int y = 0; y < 256; y++)
              for (int x = 0; x < 256; x++)
                  total += foo(x * 0.1f, y * 0.1f);
 
      printf("%f\n", total);
      return 0;
 }
 
 ---------------------
 
 // test3.d
 import core.stdc.stdio, std.math;
 
 float foo(const float x, const float y) nothrow {
      return floor(x) + floor(y);
 }
 
 int main() {
      float total = 0.0f;
 
      for (int i = 0; i < 1000; i++)
          for (int y = 0; y < 256; y++)
              for (int x = 0; x < 256; x++)
                  total += foo(x * 0.1f, y * 0.1f);
 
      printf("%f\n", total);
      return 0;
 }
 
 ---------------------
 
 test1.c asm:
 
 _main:
      pushl   %ebp
      movl    %esp, %ebp
      pushl   %ebx
      movl    $1000, %ebx
      andl    $-16, %esp
      subl    $16, %esp
      call    ___main
      xorps   %xmm1, %xmm1
      movss   LC3, %xmm5
 L2:
      movss   LC1, %xmm6
      xorps   %xmm3, %xmm3
      xorl    %ecx, %ecx
      .p2align 4,,7
 L9:
      movss   LC2, %xmm4
      xorps   %xmm0, %xmm0
      xorl    %eax, %eax
      .p2align 4,,7
 L7:
      addss   %xmm0, %xmm1
      addl    $1, %eax
      cmpl    $256, %eax
      addss   %xmm3, %xmm1
      je  L12
      cvtsi2ss    %eax, %xmm0
      mulss   %xmm6, %xmm0
      movaps  %xmm0, %xmm2
      andps   %xmm5, %xmm2
      ucomiss %xmm2, %xmm4
      jbe L7
      cvttss2si   %xmm0, %edx
      cvtsi2ss    %edx, %xmm2
      movaps  %xmm2, %xmm7
      cmpnless    %xmm0, %xmm7
      movaps  %xmm7, %xmm0
      movss   LC4, %xmm7
      andps   %xmm7, %xmm0
      subss   %xmm0, %xmm2
      movaps  %xmm2, %xmm0
      jmp L7
      .p2align 4,,7
 L12:
      addl    $1, %ecx
      cmpl    $256, %ecx
      je  L5
      cvtsi2ss    %ecx, %xmm3
      movss   LC6, %xmm0
      movss   LC2, %xmm2
      mulss   LC1, %xmm3
      andps   %xmm3, %xmm0
      ucomiss %xmm0, %xmm2
      jbe L9
      cvttss2si   %xmm3, %eax
      cvtsi2ss    %eax, %xmm0
      movaps  %xmm0, %xmm2
      cmpnless    %xmm3, %xmm2
      movss   LC4, %xmm3
      andps   %xmm3, %xmm2
      movaps  %xmm0, %xmm3
      subss   %xmm2, %xmm3
      jmp L9
 L5:
      subl    $1, %ebx
      jne L2
      unpcklps    %xmm1, %xmm1
      movl    $LC5, (%esp)
      cvtps2pd    %xmm1, %xmm5
      movsd   %xmm5, 4(%esp)
      call    _printf
      xorl    %eax, %eax
      movl    -4(%ebp), %ebx
      leave
      ret
 
 ---------------------
 
 test2.d asm:
 
 __Dmain:
 	pushl	%ebx
 	pushl	%edi
 	pushl	%esi
 	subl	$28, %esp
 	xorps	%xmm0, %xmm0
 	xorl	%esi, %esi
 	movss	LCPI1_0, %xmm1
 	.align	16, 0x90
 LBB1_1:
 	xorl	%edi, %edi
 	.align	16, 0x90
 LBB1_2:
 	xorps	%xmm2, %xmm2
 	cvtsi2ssl	%edi, %xmm2
 	mulss	%xmm1, %xmm2
 	movss	%xmm2, 12(%esp)
 	xorl	%ebx, %ebx
 	.align	16, 0x90
 LBB1_3:
 	movss	%xmm0, 16(%esp)
 	xorps	%xmm0, %xmm0
 	cvtsi2ssl	%ebx, %xmm0
 	mulss	%xmm1, %xmm0
 	movss	%xmm0, (%esp)
 	calll	_floorf
 	movss	12(%esp), %xmm0
 	movss	%xmm0, (%esp)
 	fstps	24(%esp)
 	calll	_floorf
 	movss	LCPI1_0, %xmm1
 	fstps	20(%esp)
 	movss	24(%esp), %xmm0
 	addss	20(%esp), %xmm0
 	movss	16(%esp), %xmm2
 	addss	%xmm0, %xmm2
 	movss	%xmm2, 16(%esp)
 	movss	16(%esp), %xmm0
 	incl	%ebx
 	cmpl	$256, %ebx
 	jne	LBB1_3
 	incl	%edi
 	cmpl	$256, %edi
 	jne	LBB1_2
 	incl	%esi
 	cmpl	$1000, %esi
 	jne	LBB1_1
 	cvtss2sd	%xmm0, %xmm0
 	movsd	%xmm0, 4(%esp)
 	movl	$_.str, (%esp)
 	calll	___mingw_printf
 	xorl	%eax, %eax
 	addl	$28, %esp
 	popl	%esi
 	popl	%edi
 	popl	%ebx
 	ret
 
 ---------------------
 
 test3.d asm:
 
 __Dmain:
      pushl   %ebx
      pushl   %edi
      pushl   %esi
      subl    $52, %esp
      xorps   %xmm1, %xmm1
      xorl    %esi, %esi
      movss   LCPI1_0, %xmm2
      .align  16, 0x90
 LBB1_1:
      xorl    %edi, %edi
      .align  16, 0x90
 LBB1_2:
      xorps   %xmm0, %xmm0
      cvtsi2ssl   %edi, %xmm0
      mulss   %xmm2, %xmm0
      movss   %xmm0, 48(%esp)
      xorl    %ebx, %ebx
      flds    48(%esp)
      fstpt   12(%esp)
      movaps  %xmm1, %xmm0
      .align  16, 0x90
 LBB1_3:
      movss   %xmm0, 36(%esp)
      xorps   %xmm0, %xmm0
      cvtsi2ssl   %ebx, %xmm0
      mulss   %xmm2, %xmm0
      movss   %xmm0, 44(%esp)
      flds    44(%esp)
      fstpt   (%esp)
      calll   __D3std4math5floorFNbNeeZe
      subl    $12, %esp
      fstpt   24(%esp)
      fldt    12(%esp)
      fstpt   (%esp)
      calll   __D3std4math5floorFNbNeeZe
      subl    $12, %esp
      movss   36(%esp), %xmm0
      movss   LCPI1_0, %xmm2
      fldt    24(%esp)
      faddp   %st(1)
      fstps   40(%esp)
      addss   40(%esp), %xmm0
      incl    %ebx
      cmpl    $256, %ebx
      jne LBB1_3
      movaps  %xmm0, %xmm1
      incl    %edi
      cmpl    $256, %edi
      jne LBB1_2
      incl    %esi
      cmpl    $1000, %esi
      jne LBB1_1
      xorps   %xmm0, %xmm0
      cvtss2sd    %xmm1, %xmm0
      movsd   %xmm0, 4(%esp)
      movl    $_.str, (%esp)
      calll   ___mingw_printf
      xorl    %eax, %eax
      addl    $52, %esp
      popl    %esi
      popl    %edi
      popl    %ebx
      ret
 
 ---------------------
 
 Bye,
 bearophile

but... fast-math isn't kosher

-- 
Marco

Dec 19 2013

Marco Leise <Marco.Leise gmx.de> writes:

I cannot reproduce this on 64-bit Linux.

Compiled the C version with:
gcc -std=c99 -march=native -O3 -s -flto test1.c -o test1 -Wl,-lm

and the D version with:
ldc2 -release -O3 test2.d -of=test2 -ffunction-sections
-fdata-sections -L--gc-sections -vectorize-slp
-vectorize-loops -unit-at-a-time -L-O1 -L--as-needed -L-lrt
-L-znorelro -L--no-copy-dt-needed-entries -L--relax
-L--sort-common -L--export-dynamic
strip test2 -R .comment -R .note.ABI-tag -R .gnu.version
-R .jcr -R .got

Runtimes for both executables are around 0.8s.

-- 
Marco

Dec 19 2013

"bearophile" <bearophileHUGS lycos.com> writes:

Marco Leise:

 but... fast-math isn't kosher

Practice of programming shows that there are many situations 
where today fast-math is strictly necessary to allow the compiler 
to perform some important optimizations.


 I cannot reproduce this on 64-bit Linux.
...
 Runtimes for both executables are around 0.8s.

Oh, good, so is it a 32 bit problem?

Bye,
bearophile

Dec 19 2013

"David Nadlinger" <code klickverbot.at> writes:

On 19 Dec 2013, at 12:18, bearophile wrote:
 Marco Leise:
 I cannot reproduce this on 64-bit Linux.
 ...
 Runtimes for both executables are around 0.8s.

 Oh, good, so is it a 32 bit problem?

I don't think Marco is building his C executable with -ffast-math.

We should definitely be able to provide a switch to enable the same 
unsafe/wrong FP optimizations in LDC as well.

David

Dec 19 2013

"bearophile" <bearophileHUGS lycos.com> writes:

David Nadlinger:

 I don't think Marco is building his C executable with 
 -ffast-math.

Showing the asm is a good way to understand what's going on in 
those 64 bit builds.


 We should definitely be able to provide a switch to enable the 
 same unsafe/wrong FP optimizations in LDC as well.

So is this floor problem caused by those (missing) FP 
optimizations? :-)

Bye,
bearophile

Dec 19 2013

Marco Leise <Marco.Leise gmx.de> writes:

Am Thu, 19 Dec 2013 14:54:56 +0100
schrieb "bearophile" <bearophileHUGS lycos.com>:

 David Nadlinger:
 
 I don't think Marco is building his C executable with 
 -ffast-math.

 
 Showing the asm is a good way to understand what's going on in 
 those 64 bit builds.
 
 
 We should definitely be able to provide a switch to enable the 
 same unsafe/wrong FP optimizations in LDC as well.

 
 So is this floor problem caused by those (missing) FP 
 optimizations? :-)
 
 Bye,
 bearophile

At first my gcc executable had the same speed as on your
computer, bearophile (~1s). Then I removed some of the flags
including -ffast-math from the gcc command-line and it became
25% faster -> 0.8s. I didn't try your original ldmd2
command-line, but one from a generic "omg-uber-optimize"
Makefile I often use for stuff like this and immediately had
the same performance in D as for C. I cannot show you the
disassembly though, I'm in the middle of upgrading to Gnome
3 and most programs don't work, like e.g. copy&paste from
terminals.

-- 
Marco

Dec 19 2013

D Programming

C/C++ Programming

Other

digitalmars.D.ldc - floor operation problem