digitalmars.D - One usage of GNU C
- bearophile (134/134) Jul 07 2010 This post is about a recent situation where I have used C instead of D2....
This post is about a recent situation where I have used C instead of D2. It's just a small example, feel free to ignore this post. This is the reduced version of a small C program (I have written the same program in D2 too), the little foo() is a critical function for the performance of the whole program: #define N 200 double m1[N][N]; double m2[N][N]; double foo(unsigned char* used, int current) { double sum = 0.0; int i; for (i = 0; i < N; i++) if (!used[i]) sum += m2[current][i] * (1.0 + m1[current][i]); return sum; } int main() { return 0; } Where about half of used[] bytes are zero. This is the 32bit asm of the foo() loop, generated by GCC 4.5 with: -O3 -fomit-frame-pointer -mfpmath=sse -msse2 -ffast-math L3: cmpb $0, (%ecx,%eax) jne L2 movsd _m1(%edx,%eax,8), %xmm0 addsd %xmm2, %xmm0 mulsd _m2(%edx,%eax,8), %xmm0 addsd %xmm0, %xmm1 L2: addl $1, %eax cmpl $200, %eax jne L3 You can speed up that code in D unrolling the loop once and using two sum, but the performance gain is not much. In D you can also rewrite that loop in assembly, but this loses the possibility to inline foo() (and in this program the inlining of foo() is important, I think dmd isn't doing it anyway), and I need too much time to write that in asm, it's not immediate at all. In C + GCC extensions it's almost easy to use XMM registers: #define N 200 #if (N % 2) #define N_NEXT_EVEN (N+1) #else #define N_NEXT_EVEN N #endif double m1[N][N_NEXT_EVEN]; double m2[N][N_NEXT_EVEN]; typedef double double2 __attribute__ ((vector_size(16))); #define first(pair) (((double*)&pair)[0]) #define second(pair) (((double*)&pair)[1]) double foo2(unsigned char used[N], int current) { double2 *m2_current2 = (double2*)&(m2[current]); double2 *m1_current2 = (double2*)&(m1[current]); double2 one = { 1.0, 1.0 }; double sum = 0.0; int ipair; for (ipair = 0; ipair < (N / 2); ipair++) if ((!used[2 * ipair]) || (!used[2 * ipair + 1])) { double2 adder = m2_current2[ipair] * (one + m1_current2[ipair]); if (!used[2 * ipair]) sum += first(adder); if (!used[2 * ipair + 1]) sum += second(adder); } if ((N & 1) && (!used[N-1])) sum += m2[current][N-1] * (1.0 + m1[current][N-1]); return sum; } int main() { if ((unsigned int)&m2[0][0] % 16 || (unsigned int)&m1[0][0] % 16) return 1; return 0; } This C code is not very clean, but it's far better than writing the same thing in asm (there are many other ways to write that code, but this seems efficient). The 32bit asm of the foo2() loop: L11: cmpb $0, 1(%edx) jne L4 movapd (%ecx,%eax), %xmm0 addpd %xmm4, %xmm0 mulpd (%ebx,%eax), %xmm0 L6: unpckhpd %xmm0, %xmm0 addsd %xmm0, %xmm1 L4: addl $16, %eax addl $2, %edx cmpl $1600, %eax je L10 L5: cmpb $0, (%edx) jne L11 cmpb $0, 1(%edx) movapd (%ecx,%eax), %xmm0 addpd %xmm3, %xmm0 mulpd (%ebx,%eax), %xmm0 movapd %xmm0, %xmm2 addsd %xmm2, %xmm1 je L6 addl $16, %eax addl $2, %edx cmpl $1600, %eax jne L5 Writing this by hand, plus a bit of code to set up the loop, can require almost a day or so to me, because I'm slow and not expert in asm yet, while writing that C code has required a short enough time and I've created only one bug along the way. D language is cuter and more handy because instead of writing something like: #if (N % 2) #define N_NEXT_EVEN (N+1) #else #define N_NEXT_EVEN N #endif I can write something nicer as: static if (N % 2) enum int N_NEXT_EVEN = N + 1; else enum int N_NEXT_EVEN = N; Or probably even: enum int N_NEXT_EVEN = N % 2 ? (N + 1) : N; But for this program C+GCC give more power, so here C is better despite it looks a little worse. Eventually I'd like to be able to write code like this in D2 (untested): // this whole function will need to be inlined double foo3(ubyte[N] used, int current) { auto m1_current2 = cast(double[2][])m1[current]; auto m2_current2 = cast(double[2][])m2[current]; double[2] one = [0, 0]; // no heap allocations here double sum = 0.0; foreach (ipair; 0 .. N / 2) if (!used[2 * ipair] || !used[2 * ipair + 1]) { double[2][] adder = m2_current2[ipair][] * (one[] + m1_current2[ipair][]); if (!used[2 * ipair]) sum += adder[0]; if (!used[2 * ipair + 1]) sum += adder[1]; } if ((N & 1) && (!used[N-1])) sum += m2[current][$-1] * (1.0 + m1[current][$-1]); return sum; } In theory you can already write code like that in D, but the mapping to single SSE2 instructions is not present yet (I have an enhancement request on this for LLVM: http://llvm.org/bugs/show_bug.cgi?id=6956 ). Bye, bearophile
Jul 07 2010