www.digitalmars.com         C & C++   DMDScript  

D.gnu - Regression - quality of generated x86-64 code between GDC v12.3 and v13.1

reply Cecil Ward <cecil cecilward.com> writes:
I wrote a very small procedure in D and the x86-64 asm code 
generated in GDC 12.3 was excellent whereas that from 13.1 was 
insanely bloated, totally different. Note: the badness is 
independent of the -On optimisation level (-O3 used initially.)

Here’s the D code and following it, two asm code snippets:

====



/**
 * Minimal wrapper around the x86 `cpuid` instruction for leaves that take
 * no ECX sub-leaf argument.
 *
 * Params:
 *     eax = value placed in EAX before `cpuid` executes. ECX is not set by
 *           this wrapper, so the caller must only pass values for which
 *           `is_ecx_needed( eax )` is false (checked by an `assert`).
 *
 * Returns: a `cpuid_abcd_t` whose `eax`, `ebx`, `ecx` and `edx` fields hold
 *          the four registers written by `cpuid`.
 */
public
pragma( inline, true )
cpuid_abcd_t
cpuid_insn( in uint32_t eax ) pure nothrow @nogc @trusted
	{ /* ecx arg omitted; absolutely minimal variant wrapper */
	/* since we are not providing an ecx, we had better not be needing
	   to supply one */
	assert( ! is_ecx_needed( eax ) );

	static assert( eax.sizeof * 8 == 32 );	// optional, exact
	static assert( eax.sizeof * 8 >= 32 );	// essential min

	/* really just for type-checking, and constness-assertion */
	const uint32_t in_eax = eax;
	static assert( in_eax.sizeof * 8 == 32 );

	cpuid_abcd_t ret = void;	/* undefined until the cpuid insn writes it */
	static assert(    ret.eax.sizeof * 8 == 32 && ret.ebx.sizeof * 8 == 32
	               && ret.ecx.sizeof * 8 == 32 && ret.edx.sizeof * 8 == 32 );

	/* GDC extended asm: the template is emitted between assembler syntax
	   directives; the four outputs are tied to eax/ebx/ecx/edx through
	   the "=a"/"=b"/"=c"/"=d" constraints. */
	asm pure nothrow @nogc
	    {
	    ".intel_syntax   " ~ "\n\t" ~

	    "cpuid"  	       ~ "\n\t" ~

	    ".att_syntax  	  \n"

	    : /* outputs : it is guaranteed that all bits 63…32 of
	         rax/rbx/rcx/rdx etc are zeroed in output. */
	    	/* each is an lhs ref, write-only; only bits 31…0 are significant */
	    	"=a" ( ret.eax ),
	    	"=b" ( ret.ebx ),
	    	"=c" ( ret.ecx ),
	    	"=d" ( ret.edx )
	    : 	/* inputs : */
	    	"a"  ( in_eax ) 	/* read. */
	    	/* no ecx input - this is the variant with input ecx omitted */
	    : 	/* no clobbers apart from the outputs already listed */
	        /* does cpuid set flags? - think not, so no "cc" clobber reqd */
	    ;
	    }
	return ret;
	}

/* ======== */

GDC 12.3::  -O3 -frelease -march=native

/* GDC 12.3 output: run cpuid, then pack the four 32-bit results into the
   rax:rdx pair: rax = (ebx << 32) | eax, rdx = (edx << 32) | ecx. */
push	rbx	/* cpuid writes ebx; rbx is saved here and restored by the pop below */
mov	eax, edi	/* cpuid input eax <- edi */
cpuid
mov	rsi, rdx	/* copy the edx result aside so edx can be reused below */
sal		rbx, 32	/* move the ebx result into bits 63..32 */
mov	eax, eax	/* 32-bit self-move: zero-extends eax into rax */
mov	edx, ecx	/* low half of rdx <- ecx result (upper half zeroed) */
sal		rsi, 32	/* move the edx result into bits 63..32 */
or		rax, rbx	/* rax = (ebx << 32) | eax */
pop	rbx	/* restore the saved rbx */
or		rdx, rsi	/* rdx = (edx << 32) | ecx */
ret

====
GDC 13.1 = v. bad, same switches:  -O3 -frelease -march=native

/* GDC 13.1 output: same result as the 12.3 code (rax = ebx:eax,
   rdx = edx:ecx), but the four cpuid results are first assembled into one
   128-bit vector, spilled to the stack and reloaded as two qwords.
   Listing corrected for two transcription slips: the frame push is of rbp
   (it pairs with "mov rbp, rsp" / "leave"), and vpinsrd takes a 32-bit
   source register, i.e. ebx rather than rbx. */
push		rbp				/* set up a frame pointer */
mov		eax, edi			/* cpuid input eax <- edi */
mov		rbp, rsp
push		rbx				/* cpuid writes ebx; saved copy lives at [rbp-8] */
and		rsp, -32			/* round rsp down to a 32-byte boundary */
cpuid
vmovd		xmm3, eax			/* xmm3 = { eax } */
vmovd		xmm2, ecx			/* xmm2 = { ecx } */
vpinsrd		xmm1, xmm2, edx, 1		/* xmm1 = { ecx, edx } */
vpinsrd		xmm0, xmm3, ebx, 1		/* xmm0 = { eax, ebx } */
vpunpcklqdq	xmm4, xmm0, xmm1		/* xmm4 = { eax, ebx, ecx, edx } */
vmovdqa		xmmword ptr [rsp-80], xmm4	/* spill the vector below rsp */
mov		rax, qword ptr [rsp-80]		/* rax = (ebx << 32) | eax */
mov		rdx, qword ptr [rsp-72]		/* rdx = (edx << 32) | ecx */
mov		rbx, qword ptr [rbp-8]		/* restore the saved rbx */
leave						/* mov rsp, rbp ; pop rbp */
ret
/* ======== */
Jun 07 2023
parent reply Iain Buclaw <ibuclaw gdcproject.org> writes:
On Thursday, 8 June 2023 at 04:13:30 UTC, Cecil Ward wrote:
 I wrote a very small procedure in D and the x86-64 asm code 
 generated in GDC 12.3 was excellent whereas that from 13.1 was 
 insanely bloated, totally different. Note: the badness is 
 independent of the -On optimisation level (-O3 used initially.)
So you're saying that a 4-int struct is not being treated as equivalent to an `int[4]`? https://d.godbolt.org/z/G77fW48xG If possible, a bug report would be great on this. Thanks.
Jun 08 2023
parent Iain Buclaw <ibuclaw gdcproject.org> writes:
On Thursday, 8 June 2023 at 11:39:38 UTC, Iain Buclaw wrote:
 On Thursday, 8 June 2023 at 04:13:30 UTC, Cecil Ward wrote:
 I wrote a very small procedure in D and the x86-64 asm code 
 generated in GDC 12.3 was excellent whereas that from 13.1 was 
 insanely bloated, totally different. Note: the badness is 
 independent of the -On optimisation level (-O3 used initially.)
So you're saying that a 4-int struct is not being treated as equivalent to an `int[4]`? https://d.godbolt.org/z/G77fW48xG If possible, a bug report would be great on this.
Fix has been committed to mainline and gcc-13. https://gcc.gnu.org/git/?p=gcc.git;a=commitdiff;h=f2eeda5652438fe783d4e3878139481a1b8606b6;hp=09124b7ed7709721e86556b4083ef40925d7489b
Jul 01 2023