digitalmars.D - SSE asm with functions
- Byron (92/92) Jun 15 2011 In the attached file xmm.d I have a function xnormal that takes a vector...
- bearophile (4/5) Jun 15 2011 DMD doesn't compile asm code. My suggestion is to keep reducing your cod...
- Timon Gehr (9/41) Jun 15 2011 nan, nan ] and the
- Byron (61/61) Jun 16 2011 I reduced the complexity of the problem, seems to be SSE and returning l...
- Byron (1/1) Jun 16 2011 Same problem with 64-bit dmd on ubuntu. ( change EAX to RAX )
- Walter Bright (1/1) Jun 16 2011 http://d.puremagic.com/issues/show_bug.cgi?id=6166
- Walter Bright (7/8) Jun 16 2011 What you've run into is the "named return value" optimization. 'r' is re...
In the attached file xmm.d I have a function xnormal that takes a vector ( alias float[4] ) and computes the unit vector. The SSE code seems to work fine, but it keeps returning [nan, nan, nan, nan ] and the writeln prints the same. But if I change the return from r ( output vector ) to v ( input vector ) it prints the correct normal vector, and returns the input vector. Is this my bug or a compiler bug? DMD32 v2.053 OS X const(vector) xnormal( ref const(vector) v ) { vector r; asm { mov EAX, v; movups XMM0, [EAX]; //load vector movaps XMM2, XMM0; // copy original data // find x^2 + y^2 + z^2 + w^2 mulps XMM0, XMM0; // xx, yy, zz, ww movaps XMM1, XMM0; // copy, cause we will write into X0 shufps XMM0, XMM1, 0x4e; // 0100 1110 zwxy addps XMM0, XMM1; // xyzw + zwxy movaps XMM1, XMM0; // copy, cause we will write into X0 shufps XMM0, XMM1, 0x11; // 0001 0001 (y+w)(x+z)(y+w)(x+z) addps XMM0, XMM1; // (x+z)(y+w)(z+x)(w+y) + (y+w)(x+z)(y+w)(x+z) // (x+z+y+w)(y+w+x+z)(z+x+y+w)(w+y+x+z) rsqrtps XMM0, XMM0; // 1/sqrt(XMM0) mulps XMM2, XMM0; // x/sqrt(x^2+y^2+z^2+w^2) , ... movups r, XMM2; } writeln( "Result: ", r, "\t", v ); return r; } I would like to use D for a thesis project, but won't be able to if it's still this buggy. 
-Byron begin 644 xmm.d M;6]D=6QE('AM;3L*"FEM<&]R="!S=&0N<W1D:6\L('-T9"YM871H.PH*86QI M87, 9FQO871;-%T =F5C=&]R.PH*8V]N<W0H=F5C=&]R*2!N;W)M86PH(')E M9B!C;VYS="AV96-T;W(I('8 *0I["B` ("!V96-T;W( <F5T.PH ("` 9FQO M870 ;&5N(#T ,#L M("H ;&5N.PH*("` (')E='5R;B!R970["GT*"G9E8W1O<B!N;W)M86QI>F4H M86-H*"!I.R`P+BXT("D*("` ("` ("!L96X *ST =EMI72`J('9;:5T["B` M("!L96X /2`Q+C!F+W-Q<G0H;&5N*3L*"B` ("!F;W)E86-H*"!I.R`P+BXT M("D*("` ("` ("!V6VE=("H](&QE;CL*"B` ("!R971U<FX =CL*?0H*8V]N M<W0H=F5C=&]R*2!X;F]R;6%L*"!R968 8V]N<W0H=F5C=&]R*2!V("D*>PH M("` =F5C=&]R('(["B` ("!A<VT*("` ('L*("` ("` ("!M;W8 14%8+"!V M.PH ("` ("` (&UO=G5P<R!834TP+"!;14%873L +R]L;V%D('9E8W1O< H M("` ("` (&UO=F%P<R!834TR+"!834TP.R`O+R!C;W!Y(&]R:6=I;F%L(&1A M=&$*"B` ("` ("` +R\ 9FEN9"!X7C( *R!Y7C( *R!Z7C( *R!W7C(*("` M("` ("!M=6QP<R!834TP+"!834TP.R`O+R!X>"P >7DL('IZ+"!W=PH ("` M("` (&UO=F%P<R!834TQ+"!834TP.R`O+R!C;W!Y+"!C875S92!W92!W:6QL M.R`O+R!X>7IW("L >G=X>0H*("` ("` ("!M;W9A<', 6$U-,2P 6$U-,#L M+R\ 8V]P>2P 8V%U<V4 =V4 =VEL;"!W<FET92!I;G1O(% P"B` ("` ("` M*WHI*'DK=RDH>"MZ*0H ("` ("` (&%D9'!S(%A-33`L(%A-33$[("\O("AX M*WHI*'DK=RDH>BMX*2AW*WDI("L *'DK=RDH>"MZ*2AY*W<I*' K>BD*("` M("` ("` ("` ("` ("` ("` ("` ("`O+R`H>"MZ*WDK=RDH>2MW*W K>BDH M>BMX*WDK=RDH=RMY*W K>BD*("` ("` ("`*("` ("` ("!R<W%R='!S(%A- M(%A-33`[("\O(' O<W%R="AX7C(K>5XR*WI>,BMW7C(I("P +BXN"B` ("` M("` ;6]V=7!S('(L(%A-33(["B` ("!]"B` ("!W<FET96QN*"`B4F5S=6QT M.B`B+"!R+"`B7'0B+"!V("D["B` ("!R971U<FX <CL*?0H*=F5C=&]R('AN M;W)M86QI>F4H(')E9B!V96-T;W( =B`I"GL*("` (&%S;0H ("` >PH ("` M("` (&UO=B!%05 L('8["B` ("` ("` ;6]V=7!S(%A-33`L(%M%05A=.R`O M+VQO860 =F5C=&]R"B` ("` ("` ;6]V87!S(%A-33(L(%A-33`[("\O(&-O M<'D ;W)I9VEN86P 9&%T80H*("` ("` ("`O+R!F:6YD('A>,B`K('E>,B`K M('I>,B`K('=>, H ("` ("` (&UU;'!S(%A-33`L(%A-33`[("\O('AX+"!Y M>2P >GHL('=W"B` ("` ("` ;6]V87!S(%A-33$L(%A-33`[("\O(&-O<'DL M(&-A=7-E('=E('=I;&P =W)I=&4 :6YT;R!8,`H ("` ("` ('-H=69P<R!8 M9'!S(%A-33`L(%A-33$[("\O('AY>G< *R!Z=WAY" H ("` ("` (&UO=F%P 
M<R!834TQ+"!834TP.R`O+R!C;W!Y+"!C875S92!W92!W:6QL('=R:71E(&EN M,"P 6$U-,3L +R\ *' K>BDH>2MW*2AZ*W I*'<K>2D *R`H>2MW*2AX*WHI M*'DK=RDH>"MZ*0H ("` ("` ("` ("` ("` ("` ("` ("` ("\O("AX*WHK M>2MW*2AY*W<K>"MZ*2AZ*W K>2MW*2AW*WDK>"MZ*0H ("` ("` (`H ("` M("` (')S<7)T<', 6$U-,"P 6$U-,#L +R\ ,2]S<7)T*%A-33`I"B` ("` M("` ;75L<', 6$U-,BP 6$U-,#L +R\ >"]S<7)T*'A>,BMY7C(K>EXR*W=> M,BD +"`N+BX*("` ("` ("!M;W9U<', 6T5!6%TL(%A-33(["B` ("!]"B` M("!R971U<FX =CL*?0H*=F]I9"!M86EN*"D*>PH*("` ('9E8W1O<B!V,2`] M=W)I=&5L;B (EQN4F5T=7)N(&YE=R!N;W)M86QI>F5D('9E8W1O<G,B("D[ M"B` ("!W<FET96QN*"`B4U-%.B`B+"!V,2P (EQT(BP ('AN;W)M86PH('8Q M("DI.PH ("` =W)I=&5L;B (F9O<F5A8V Z("(L('8R+"`B7'0B+"!N;W)M M86PH('8R("DI.PH*("` ('=R:71E;&XH(")<;DYO<FUA;&EZ92!I;B!P;&%C M92( *3L*("` ('=R:71E;&XH(")34T4Z("(L('8Q+"`B7'0B+"` >&YO<FUA M;&EZ92 =C$ *2D["B` ("!W<FET96QN*"`B9F]R96%C:#H (BP =C(L(")< 9="(L(&YO<FUA;&EZ92 =C( *2D["GT*" `` ` end
Jun 15 2011
Byron: "Is this my bug or a compiler bug?" DMD doesn't compile asm code. My suggestion is to keep reducing your code until you understand what's going on. Also, a disassembler helps a bit here. Bye, bearophile
Jun 15 2011
Byron wrote: In the attached file xmm.d I have a function xnormal that takes a vector ( alias float[4] ) an computes the unit vector. The SSE code seems to work fine, but it keeps returning [nan, nan, nan, nan ] and the writeln prints the same. But if I change the return from r ( output vector ) to v ( input vector ) it prints the correct normal vector, and returns the input vector. Is this my bug or a compiler bug? DMD32 v2.053 OS X const(vector) xnormal( ref const(vector) v ) { vector r; asm { mov EAX, v; movups XMM0, [EAX]; //load vector movaps XMM2, XMM0; // copy original data // find x^2 + y^2 + z^2 + w^2 mulps XMM0, XMM0; // xx, yy, zz, ww movaps XMM1, XMM0; // copy, cause we will write into X0 shufps XMM0, XMM1, 0x4e; // 0100 1110 zwxy addps XMM0, XMM1; // xyzw + zwxy movaps XMM1, XMM0; // copy, cause we will write into X0 shufps XMM0, XMM1, 0x11; // 0001 0001 (y+w)(x+z)(y+w)(x+z) addps XMM0, XMM1; // (x+z)(y+w)(z+x)(w+y) + (y+w)(x+z)(y+w)(x+z) // (x+z+y+w)(y+w+x+z)(z+x+y+w)(w+y+x+z) rsqrtps XMM0, XMM0; // 1/sqrt(XMM0) mulps XMM2, XMM0; // x/sqrt(x^2+y^2+z^2+w^2) , ... movups r, XMM2; } writeln( "Result: ", r, "\t", v ); return r; } I would like to use D for a thesis projects, but wont be able to if its still this buggy. -Byron << xmm.d >> It seems it is a backend bug in DMD, as the same code works just fine with GDC (frontend version 2.052 though; this might need some further investigation). Timon
Jun 15 2011
I reduced the complexity of the problem, seems to be SSE and returning local copies. $ dmd -run db.d v: [1, 2, 3, 4] test1 r: [nan, nan, nan, nan] test1: [nan, nan, nan, nan] test2 r: [1, 2, 3, 4] test2: [1, 2, 3, 4] halle109-251:asm byro //db.d import std.stdio; alias float[4] vector; const(vector) test1( ref const(vector) v ) { vector r; asm { mov EAX, v; movups XMM0, [EAX]; movups r, XMM0; } writeln( "test1 r: ", r ); return r; } const(vector) test2( ref const(vector) v ) { vector r, s; asm { mov EAX, v; movups XMM0, [EAX]; movups r, XMM0; } writeln( "test2 r: ", r ); s = r; return s; } void main() { vector v = [1,2,3,4]; writeln( "v: ", v ); writeln( "test1: ", test1(v)); writeln( "test2: ", test2(v)); } -Byron begin 644 db.d M:6UP;W)T('-T9"YS=&1I;SL*"F%L:6%S(&9L;V%T6S1=('9E8W1O<CL*"F-O M("!V96-T;W( <CL*("` (&%S;0H ("` >PH ("` ("` (&UO=B!%05 L('8[ M"B` ("` ("` ;6]V=7!S(%A-33`L(%M%05A=.PH ("` ("` (&UO=G5P<R!R M+"!834TP.PH ("` ?0H ("` =W)I=&5L;B (G1E<W0Q('(Z("(L('( *3L* M("` (')E='5R;B!R.PI]" IC;VYS="AV96-T;W(I('1E<W0R*"!R968 8V]N M<W0H=F5C=&]R*2!V("D*>PH ("` =F5C=&]R('(L(',["B` ("!A<VT*("` M('L*("` ("` ("!M;W8 14%8+"!V.PH ("` ("` (&UO=G5P<R!834TP+"!; M14%873L*("` ("` ("!M;W9U<', <BP 6$U-,#L*("` ('T*("` ('=R:71E M;&XH(")T97-T,B!R.B`B+"!R("D["B` ("!S(#T <CL*("` (')E='5R;B!S )=BDI.PI]" H* ` end
Jun 16 2011
Same problem with 64-bit dmd on ubuntu. ( change EAX to RAX )
Jun 16 2011
http://d.puremagic.com/issues/show_bug.cgi?id=6166
Jun 16 2011
On 6/16/2011 10:20 AM, Byron wrote: "I reduced the complexity of the problem, seems to be SSE and returning local copies." What you've run into is the "named return value" optimization. 'r' is rewritten by the compiler as a reference to a vector in the caller's stack frame; this avoids unnecessary copying when returning r. The trouble is, the inline assembler does no such rewrites. I'll file this in bugzilla. In the meantime, you've already discovered the workaround in test2().
Jun 16 2011