
D.gnu - Optimisation question

reply "John Colvin" <john.loughran.colvin gmail.com> writes:
void mul(float[] a, float v)
{
   if ((cast(size_t)a.ptr) % 32 == 0
     && a.length == 16)
   {
     foreach (ref el; a)
       el *= v;
   }
}
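
(Aside: for the aligned branch to be taken at all, the buffer really must be 32-byte aligned. A minimal harness that guarantees this -- hypothetical, not part of the question -- over-allocates with malloc and rounds the pointer up:

import core.stdc.stdlib : malloc, free;

void main()
{
    enum n = 16;
    // Over-allocate, then round the pointer up to a 32-byte boundary
    // so the aligned fast path in mul() is actually reachable.
    void* raw = malloc(n * float.sizeof + 31);
    auto p = cast(float*)((cast(size_t)raw + 31) & ~cast(size_t)31);
    auto a = p[0 .. n];
    a[] = 2.0f;
    mul(a, 3.0f);
    assert(a[0] == 6.0f && a[$ - 1] == 6.0f);
    free(raw);
}
)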

with
-Ofast -march=broadwell -frelease
becomes

void example.mul(float[], float):
	movq	%rsi, %rax
	andl	$31, %eax
	jne	.L44
	cmpq	$16, %rdi
	jne	.L44
	shrq	$2, %rax
	negq	%rax
	andl	$7, %eax
	je	.L10
	vmulss	(%rsi), %xmm0, %xmm1
	vmovss	%xmm1, (%rsi)
	cmpq	$1, %rax
	je	.L11
	vmulss	4(%rsi), %xmm0, %xmm1
	vmovss	%xmm1, 4(%rsi)
	cmpq	$2, %rax
	je	.L12
	vmulss	8(%rsi), %xmm0, %xmm1
	vmovss	%xmm1, 8(%rsi)
	cmpq	$3, %rax
	je	.L13
	vmulss	12(%rsi), %xmm0, %xmm1
	vmovss	%xmm1, 12(%rsi)
	cmpq	$4, %rax
	je	.L14
	vmulss	16(%rsi), %xmm0, %xmm1
	vmovss	%xmm1, 16(%rsi)
	cmpq	$5, %rax
	je	.L15
	vmulss	20(%rsi), %xmm0, %xmm1
	vmovss	%xmm1, 20(%rsi)
	cmpq	$6, %rax
	je	.L16
	vmulss	24(%rsi), %xmm0, %xmm1
	movl	$9, %edx
	movl	$7, %r9d
	vmovss	%xmm1, 24(%rsi)
.L5:
	movl	$16, %edi
	movl	$8, %r8d
	movl	$1, %r10d
	subq	%rax, %rdi
.L4:
	leaq	(%rsi,%rax,4), %rcx
	vbroadcastss	%xmm0, %ymm1
	vmulps	(%rcx), %ymm1, %ymm2
	vmovaps	%ymm2, (%rcx)
	cmpq	$1, %r10
	je	.L6
	vmulps	32(%rcx), %ymm1, %ymm1
	vmovaps	%ymm1, 32(%rcx)
.L6:
	leaq	(%r9,%r8), %rax
	subq	%r8, %rdx
	cmpq	%r8, %rdi
	je	.L43
	leaq	(%rsi,%rax,4), %rcx
	vmulss	(%rcx), %xmm0, %xmm1
	vmovss	%xmm1, (%rcx)
	leaq	1(%rax), %rcx
	cmpq	$1, %rdx
	je	.L43
	leaq	(%rsi,%rcx,4), %rcx
	vmulss	(%rcx), %xmm0, %xmm1
	vmovss	%xmm1, (%rcx)
	leaq	2(%rax), %rcx
	cmpq	$2, %rdx
	je	.L43
	leaq	(%rsi,%rcx,4), %rcx
	vmulss	(%rcx), %xmm0, %xmm1
	vmovss	%xmm1, (%rcx)
	leaq	3(%rax), %rcx
	cmpq	$3, %rdx
	je	.L43
	leaq	(%rsi,%rcx,4), %rcx
	vmulss	(%rcx), %xmm0, %xmm1
	vmovss	%xmm1, (%rcx)
	leaq	4(%rax), %rcx
	cmpq	$4, %rdx
	je	.L43
	leaq	(%rsi,%rcx,4), %rcx
	vmulss	(%rcx), %xmm0, %xmm1
	vmovss	%xmm1, (%rcx)
	leaq	5(%rax), %rcx
	cmpq	$5, %rdx
	je	.L43
	leaq	(%rsi,%rcx,4), %rcx
	addq	$6, %rax
	vmulss	(%rcx), %xmm0, %xmm1
	vmovss	%xmm1, (%rcx)
	cmpq	$6, %rdx
	je	.L43
	leaq	(%rsi,%rax,4), %rax
	vmulss	(%rax), %xmm0, %xmm0
	vmovss	%xmm0, (%rax)
	vzeroupper
	ret
.L43:
	vzeroupper
.L44:
	ret
.L10:
	movl	$16, %r8d
	movl	$2, %r10d
	movl	$16, %edi
	movl	$16, %edx
	xorl	%r9d, %r9d
	jmp	.L4
.L11:
	movl	$15, %edx
	movl	$1, %r9d
	jmp	.L5
.L16:
	movl	$10, %edx
	movl	$6, %r9d
	jmp	.L5
.L15:
	movl	$11, %edx
	movl	$5, %r9d
	jmp	.L5
.L14:
	movl	$12, %edx
	movl	$4, %r9d
	jmp	.L5
.L13:
	movl	$13, %edx
	movl	$3, %r9d
	jmp	.L5
.L12:
	movl	$14, %edx
	movl	$2, %r9d
	jmp	.L5

Which seems like an awful lot of code, wouldn't you say?

I was expecting something along the lines of this (untested):

void example.mul(float[], float):
	testb	$31, %sil
	jne	.L44
	cmpq	$16, %rdi
	jne	.L44
	vbroadcastss	%xmm0, %ymm2
	vmulps	(%rsi), %ymm2, %ymm0
	vmulps	32(%rsi), %ymm2, %ymm1
	vmovaps	%ymm0, (%rsi)
	vmovaps	%ymm1, 32(%rsi)
	vzeroupper
.L44:
	ret

Am I being stupid, or is the optimiser making a complete hash of 
things?
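
For the record, the arithmetic behind that expectation: 16 floats x 4 bytes = 64 bytes, which is exactly two 32-byte ymm registers, hence the two vmulps/vmovaps pairs. One way to hand the optimiser the alignment fact the guard already proves -- a sketch, assuming your gdc exposes gcc's __builtin_assume_aligned through the gcc.builtins module -- would be:

import gcc.builtins : __builtin_assume_aligned;

void mulHinted(float[] a, float v)
{
    if ((cast(size_t)a.ptr) % 32 == 0 && a.length == 16)
    {
        // Restate what the guard just proved, so the vectoriser can
        // (in principle) drop the scalar peel and remainder loops.
        auto p = cast(float*)__builtin_assume_aligned(a.ptr, 32);
        foreach (i; 0 .. 16)
            p[i] *= v;
    }
}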
Apr 10 2015
parent "Iain Buclaw via D.gnu" <d.gnu puremagic.com> writes:
On 10 April 2015 at 20:18, John Colvin via D.gnu <d.gnu puremagic.com> wrote:
 void mul(float[] a, float v)
 {
   if ((cast(size_t)a.ptr) % 32 == 0
     && a.length == 16)
   {
     foreach (ref el; a)
       el *= v;
   }
 }

 with
 -Ofast -march=broadwell -frelease
 becomes

 [...]

 Which seems like an awful lot of code, wouldn't you say?

 I was expecting something along the lines of this (untested):

 [...]

 Am I being stupid, or is the optimiser making a complete hash of things?
I fear that I cannot reproduce on gcc-5, maybe is a problem specific to your gcc version? _D6nested3mulFAffZv: testb $31, %sil jne .L8 cmpq $16, %rdi jne .L8 vbroadcastss %xmm0, %ymm0 vmulps (%rsi), %ymm0, %ymm1 vmulps 32(%rsi), %ymm0, %ymm0 vmovaps %ymm1, (%rsi) vmovaps %ymm0, 32(%rsi) vzeroupper .L8: ret Iain.
Apr 10 2015