D.gnu - Optimisation question
void mul(float[] a, float v)
{
    if ((cast(size_t)a.ptr) % 32 == 0 && a.length == 16)
    {
        foreach (ref el; a)
            el *= v;
    }
}

with -Ofast -march=broadwell -frelease becomes

void example.mul(float[], float):
        movq    %rsi, %rax
        andl    $31, %eax
        jne     .L44
        cmpq    $16, %rdi
        jne     .L44
        shrq    $2, %rax
        negq    %rax
        andl    $7, %eax
        je      .L10
        vmulss  (%rsi), %xmm0, %xmm1
        vmovss  %xmm1, (%rsi)
        cmpq    $1, %rax
        je      .L11
        vmulss  4(%rsi), %xmm0, %xmm1
        vmovss  %xmm1, 4(%rsi)
        cmpq    $2, %rax
        je      .L12
        vmulss  8(%rsi), %xmm0, %xmm1
        vmovss  %xmm1, 8(%rsi)
        cmpq    $3, %rax
        je      .L13
        vmulss  12(%rsi), %xmm0, %xmm1
        vmovss  %xmm1, 12(%rsi)
        cmpq    $4, %rax
        je      .L14
        vmulss  16(%rsi), %xmm0, %xmm1
        vmovss  %xmm1, 16(%rsi)
        cmpq    $5, %rax
        je      .L15
        vmulss  20(%rsi), %xmm0, %xmm1
        vmovss  %xmm1, 20(%rsi)
        cmpq    $6, %rax
        je      .L16
        vmulss  24(%rsi), %xmm0, %xmm1
        movl    $9, %edx
        movl    $7, %r9d
        vmovss  %xmm1, 24(%rsi)
.L5:
        movl    $16, %edi
        movl    $8, %r8d
        movl    $1, %r10d
        subq    %rax, %rdi
.L4:
        leaq    (%rsi,%rax,4), %rcx
        vbroadcastss    %xmm0, %ymm1
        vmulps  (%rcx), %ymm1, %ymm2
        vmovaps %ymm2, (%rcx)
        cmpq    $1, %r10
        je      .L6
        vmulps  32(%rcx), %ymm1, %ymm1
        vmovaps %ymm1, 32(%rcx)
.L6:
        leaq    (%r9,%r8), %rax
        subq    %r8, %rdx
        cmpq    %r8, %rdi
        je      .L43
        leaq    (%rsi,%rax,4), %rcx
        vmulss  (%rcx), %xmm0, %xmm1
        vmovss  %xmm1, (%rcx)
        leaq    1(%rax), %rcx
        cmpq    $1, %rdx
        je      .L43
        leaq    (%rsi,%rcx,4), %rcx
        vmulss  (%rcx), %xmm0, %xmm1
        vmovss  %xmm1, (%rcx)
        leaq    2(%rax), %rcx
        cmpq    $2, %rdx
        je      .L43
        leaq    (%rsi,%rcx,4), %rcx
        vmulss  (%rcx), %xmm0, %xmm1
        vmovss  %xmm1, (%rcx)
        leaq    3(%rax), %rcx
        cmpq    $3, %rdx
        je      .L43
        leaq    (%rsi,%rcx,4), %rcx
        vmulss  (%rcx), %xmm0, %xmm1
        vmovss  %xmm1, (%rcx)
        leaq    4(%rax), %rcx
        cmpq    $4, %rdx
        je      .L43
        leaq    (%rsi,%rcx,4), %rcx
        vmulss  (%rcx), %xmm0, %xmm1
        vmovss  %xmm1, (%rcx)
        leaq    5(%rax), %rcx
        cmpq    $5, %rdx
        je      .L43
        leaq    (%rsi,%rcx,4), %rcx
        addq    $6, %rax
        vmulss  (%rcx), %xmm0, %xmm1
        vmovss  %xmm1, (%rcx)
        cmpq    $6, %rdx
        je      .L43
        leaq    (%rsi,%rax,4), %rax
        vmulss  (%rax), %xmm0, %xmm0
        vmovss  %xmm0, (%rax)
        vzeroupper
        ret
.L43:
        vzeroupper
.L44:
        ret
.L10:
        movl    $16, %r8d
        movl    $2, %r10d
        movl    $16, %edi
        movl    $16, %edx
        xorl    %r9d, %r9d
        jmp     .L4
.L11:
        movl    $15, %edx
        movl    $1, %r9d
        jmp     .L5
.L16:
        movl    $10, %edx
        movl    $6, %r9d
        jmp     .L5
.L15:
        movl    $11, %edx
        movl    $5, %r9d
        jmp     .L5
.L14:
        movl    $12, %edx
        movl    $4, %r9d
        jmp     .L5
.L13:
        movl    $13, %edx
        movl    $3, %r9d
        jmp     .L5
.L12:
        movl    $14, %edx
        movl    $2, %r9d
        jmp     .L5

Which seems like an awful lot of code, wouldn't you say? I was expecting
something along the lines of this (untested):

void example.mul(float[], float):
        testb   $31, %sil
        jne     .L44
        cmpq    $16, %rdi
        jne     .L44
        vbroadcastss    %xmm0, %ymm2
        vmulps  (%rsi), %ymm2, %ymm0
        vmulps  32(%rsi), %ymm2, %ymm1
        vmovaps %ymm0, (%rsi)
        vmovaps %ymm1, 32(%rsi)
.L44:
        ret

Am I being stupid, or is the optimiser making a complete hash of things?
Apr 10 2015
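Reading the dump closely, the scaffolding never actually runs: once both guards pass, %eax is zero after the andl $7, so je .L10 is always taken; .L10 merely loads loop constants and jumps to the vector body at .L4, which executes exactly the expected two vmulps. All of the scalar peeling is dead code on the guarded path, which suggests the vectoriser is versioning the loop as though alignment and trip count were unknown instead of folding in the facts established by the if. One workaround that may be worth trying (my own sketch, untested against this GCC version) is to slice with constant bounds inside the branch, so the trip count of 16 is manifest in the loop itself rather than only in the dominating comparison:

void mul(float[] a, float v)
{
    if ((cast(size_t)a.ptr) % 32 == 0 && a.length == 16)
    {
        // Constant-bound slice: the loop now has a compile-time trip
        // count of 16, which may let the vectoriser fully unroll it
        // and skip the runtime peeling entirely.
        foreach (ref el; a[0 .. 16])
            el *= v;
    }
}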
On 10 April 2015 at 20:18, John Colvin via D.gnu <d.gnu puremagic.com> wrote:
> [...]
> Which seems like an awful lot of code, wouldn't you say?
> [...]
> Am I being stupid, or is the optimiser making a complete hash of things?

I fear that I cannot reproduce this on gcc-5; maybe it is a problem
specific to your gcc version?

_D6nested3mulFAffZv:
        testb   $31, %sil
        jne     .L8
        cmpq    $16, %rdi
        jne     .L8
        vbroadcastss    %xmm0, %ymm0
        vmulps  (%rsi), %ymm0, %ymm1
        vmulps  32(%rsi), %ymm0, %ymm0
        vmovaps %ymm1, (%rsi)
        vmovaps %ymm0, 32(%rsi)
        vzeroupper
.L8:
        ret

Iain.
Apr 10 2015
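For reproducing this across compiler versions, a minimal self-contained harness can help; the over-allocate-and-round alignment trick below is an illustration of mine, not something from the thread, but it guarantees the 32-byte-aligned, length-16 case so the guarded path is actually exercised:

import core.stdc.stdlib : malloc, free;
import std.stdio : writeln;

void mul(float[] a, float v)
{
    if ((cast(size_t)a.ptr) % 32 == 0 && a.length == 16)
    {
        foreach (ref el; a)
            el *= v;
    }
}

void main()
{
    // Over-allocate, then round the pointer up to a 32-byte boundary
    // so the alignment check in mul() is guaranteed to pass.
    void* raw = malloc(16 * float.sizeof + 31);
    auto p = cast(float*)((cast(size_t)raw + 31) & ~cast(size_t)31);
    float[] a = p[0 .. 16];

    foreach (i, ref el; a)
        el = i;
    mul(a, 2.0f);
    writeln(a); // expect [0, 2, 4, ..., 30]

    free(raw);
}

Compiling the same file with -Ofast -march=broadwell -frelease on each compiler under test and diffing the -S output makes codegen differences like the one above easy to spot.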