www.digitalmars.com         C & C++   DMDScript  

digitalmars.D - pi sample program w/ metaprogramming optimizations

reply "Craig Black" <cblack ara.com> writes:
I wanted to explore optimizing code in D so I spent a few hours optimizing
the pi.d sample program that comes with the dmd compiler.  I used some neat
metaprogramming tricks.  On my computer it is almost twice as fast as the
original version.

-Craig


begin 666 pi2.d


M,#L-
M;BAC:&%R6UU;72!A<F=S*0T*>PT*"71I;65?="!S=&%R=&EM92P 96YD=&EM

M<ULQ75LP72PB)60B+"9Q*3L-" E](&5L<V4 >PT*"0EP<FEN=&8H(E5S86=E

M("AQ(#P



M+R\ 0V]M<'5T92!O;F4 ;6]R92!D:6=I="!T:&%N('=E(&1I<W!L87D =&\ 

M('$ *R Q.PT*"70N;&5N9W1H(#T


M:6UE*3L-" T*"2\O(%)E='5R;B!T;R!T:&4 ;G5M8F5R(&]F(&1I9VET<R!W



M<%MI72DI.PT*"7!R:6YT9B B7&XB*3L-" EP<FEN=&8H(B5L9"!S96-O;F1S
M('1O(&-O;7!U=&4 <&D =VET:"!A('!R96-I<VEO;B!O9B E9"!D:6=I=',N

M+RH 5&5M<&QA=&4 =&AA="!D971E<FUI;F5S(&EF(&$ ;G5M8F5R(&ES(&$ 

M92!I<U!O=S(H=6EN="!N*2![(&-O;G-T(&)O;VP :7-0;W<R(#T *&X )2 R



M;"!I<U!O=S( /2!F86QS93L ?0T*=&5M<&QA=&4 :7-0;W<R*'5I;G0 ;B Z

M('1E;7!L871E('1O(&=E="!G;V]D(&-O;7!I;&4 =&EM92!O<'1I;6EZ871I
M;VYS(&9O< T*("H 9&EV:7-I;VX 86YD(&UU;'1I<&QI8V%T:6]N( T*("HO




M(&EF*&ES4&]W,B$H9&EV:7-O<BDI( T*"0D)"7( /2!B("4 9&EV:7-O<CL 
M+R\ 8V]M<&EL97( 8V%N(&]P=&EM:7IE(&UO9'5L=7, 9F]R('!O=V5R<R!O

M6VE=(#T <3L-" D)?0T*"7T-"GT-" T*+R\ 0V]M<'5T97, =&AE(&%R8W1A
M;F=E;G0 ;V8 , T*=F]I9"!A<F-T86XR*"D-"GL-" EI;G0 ;CL-" T*"71;

M('L-" D);75L*&XI.PT*"0EF87-T9&EV(2 T*3L-" D)9&EV*&X *ST ,BD[

M"65L<V4-" D)"7-U8B I.PT*"7T =VAI;&4 *"%T:7-Z97)O*"DI.PT*?0T*


M*3L-" EA9&0H*3L-" EN(#T



M9F]R("AI;G0 :B ]('$[(&H /CT ,#L :BTM*0T*"7L-" D):68 *'1;:ET 
M*R!P6VI=(#X





M" EF;W( *&EN="!I(#T <3L :2 ^/2 P.R!I+2TI('L-" D):6YT(&( /2!T


M;G0 8SL-" T*"69O<B H:6YT(&D /2!Q.R!I(#X
M("  ("  ("  ("  (&EN="!B(#T



M87-T97( =&\ 9&\ 9FQO871I;G0 <&]I;G0 ;75L=&EP;&EC871I;VX =&AA
M; T*("  ("  ("  *B!I;G1E9V5R(&1I=FES:6]N+B!4:&ES(&ES(&)E8V%U
M<V4 =&AE<F4 :7, ;F\ :6YT96=E<B!D:79I<VEO;B!I;G-T<G5C=&EO;BX-
M"B  ("  ("  ("H 56YD97( =&AE(&AO;V0L(&EN=&5G97( 9&EV:7-I;VX 



M("  ("  ("!I;G0 8B ]('1;:5T *R!R("H ,3 [( T*"0EI;G0 <75O=&EE
M;G0 /2!C87-T*&EN="DH8V%S="AD;W5B;&4I8B J(&ED:78I.PT*"0ER(#T 
M8B M('%U;W1I96YT("H 9&EV:7-O<CL-" D)=%MI72 ]('%U;W1I96YT.PT*
M"7T-"GT-" T*:6YT('1I<WIE<F\H*0T*>PT*"69O<B H:6YT(&L /2 P.R!K
M(#P]('$[(&LK*RD-" D):68 *'1;:UT (3T ,"D-" D)"7)E='5R;B!F86QS

`
end
Apr 29 2006
next sibling parent reply "Craig Black" <cblack ara.com> writes:
	charset="iso-8859-1"
Content-Transfer-Encoding: quoted-printable

For some reason the attachment doesn't seem to be working, at least for =
me.  Here's the code just in case.  Please excuse the spacing.  For some =
reason the tabs are only represented by one space.

import std.c.stdio;
import std.c.stdlib;
import std.c.time;

const int LONG_TIME=3D4000;

byte[] p;
byte[] t;
int q;

int main(char[][] args)
{
 time_t startime, endtime;

 if (args.length =3D=3D 2) {
  sscanf(&args[1][0],"%d",&q);
 } else {
  printf("Usage: pi [precision]\n");
  exit(55);
 }

 if (q < 0)
 {
  printf("Precision was too low, running with precision of 0.\n");
  q =3D 0;
 }

 if (q > LONG_TIME)
 {
     printf("Be prepared to wait a while...\n");
 }

 // Compute one more digit than we display to compensate for rounding
 q++;

 p.length =3D q + 1;
 t.length =3D q + 1;

 // Compute pi
 std.c.time.time(&startime);
 arctan2();
 arctan3();
 mul4();
 std.c.time.time(&endtime);

 // Return to the number of digits we want to display
 q--;

 // Print pi
 printf("pi =3D %d.",cast(int)(p[0]));
 for (int i =3D 1; i <=3D q; i++)
  printf("%d",cast(int)(p[i]));
 printf("\n");
 printf("%ld seconds to compute pi with a precision of %d =
digits.\n",endtime-startime,q);

 return 0;
}

/* Template that determines if a number is a power of=20
 * 2 at compile time
 */
template isPow2(uint n) { const bool isPow2 =3D (n % 2) !=3D 0 ? false : =
isPow2!(n / 2); }
template isPow2(uint n : 0) { const bool isPow2 =3D false; }
template isPow2(uint n : 1) { const bool isPow2 =3D false; }
template isPow2(uint n : 2) { const bool isPow2 =3D true; }

/* Use a template to get good compile time optimizations for
 * division and multiplication=20
 */
template fastdiv(int divisor)
{
 void fastdiv()
 {
  int r; // remainder

=20
  for (int i =3D 0; i <=3D q; i++) {
   int b =3D t[i] + r * 10;
   int q =3D b / divisor;
   static if(isPow2!(divisor))=20
    r =3D b % divisor; // compiler can optimize modulus for powers of 2
   else=20
    r =3D b - q * divisor;
   t[i] =3D q;
  }
 }
}

// Computes the arctangent of 2
void arctan2()
{
 int n;

 t[0] =3D 1;
 fastdiv!(2);
 add();
 n =3D 1;
 do {
  mul(n);
  fastdiv!(4);
  div(n +=3D 2);
  if ((((n-1) / 2) % 2) =3D=3D 0)
   add();
  else
   sub();
 } while (!tiszero());
}


// Computes the arctangent of 3
void arctan3()
{
 int n;

 t[0] =3D 1;
 fastdiv!(3);
 add();
 n =3D 1;
 do {
  mul(n);
  fastdiv!(9);
  div(n +=3D 2);
  if ((((n-1) / 2) % 2) =3D=3D 0)
   add();
  else
   sub();
 } while (!tiszero());
}

void add()
{

 for (int j =3D q; j >=3D 0; j--)
 {
  if (t[j] + p[j] > 9) {
   p[j] +=3D t[j] - 10;
   p[j-1]++;
  } else
   p[j] +=3D t[j];
 }
}

void sub()
{
 for (int j =3D q; j >=3D 0; j--)
  if (p[j] < t[j]) {
   p[j] -=3D t[j] - 10;
   p[j-1]--;
  } else
   p[j] -=3D t[j];
}


void mul(int multiplier)
{
 int c;


 for (int i =3D q; i >=3D 0; i--) {
  int b =3D t[i] * multiplier + c;
  c =3D b / 10;
  t[i] =3D b - c * 10;=20
 }
}


void mul4()
{
 int c;

 for (int i =3D q; i >=3D 0; i--) {
                int b =3D p[i] * 4 + c;
                c =3D b / 10;
  p[i] =3D b - c * 10;=20
 }
}


void div(int divisor)
{
 int r; // remainder

        /* Optimization: It's faster to do floatint point multiplication =
than
         * integer division. This is because there is no integer =
division instruction.
         * Under the hood, integer division is essentially =
floating-point division.
         */
        double idiv =3D 1.0 / divisor;

 for (int i =3D 0; i <=3D q; i++) {
                int b =3D t[i] + r * 10;=20
  int quotient =3D cast(int)(cast(double)b * idiv);
  r =3D b - quotient * divisor;
  t[i] =3D quotient;
 }
}

int tiszero()
{
 for (int k =3D 0; k <=3D q; k++)
  if (t[k] !=3D 0)
   return false;
 return true;
}
Apr 29 2006
parent James Dunne <james.jdunne gmail.com> writes:
Craig Black wrote:
 For some reason the attachment doesn't seem to be working, at least for 
 me.  Here's the code just in case.  Please excuse the spacing.  For some 
 reason the tabs are only represented by one space.
 
 [snip]
  
Attachments don't work properly with the web-interface, but any decent newsgroup reader will work fine. It's been this way for quite a while... -- Regards, James Dunne
Apr 30 2006
prev sibling parent "Craig Black" <cblack ara.com> writes:
Here' a version that uses more templates.  It performs the same as the first
version I submitted.

import std.c.stdio;
import std.c.stdlib;
import std.c.time;

const int LONG_TIME=4000;

byte[] p;
byte[] t;
int q;

int main(char[][] args)
{
 if (args.length == 2) {
  sscanf(&args[1][0],"%d",&q);
 } else {
  printf("Usage: pi [precision]\n");
  exit(55);
 }

 if (q < 0)
 {
  printf("Precision was too low, running with precision of 0.\n");
  q = 0;
 }

 if (q > LONG_TIME)
 {
     printf("Be prepared to wait a while...\n");
 }

 // Compute one more digit than we display to compensate for rounding
 q++;

 p.length = q + 1;
 t.length = q + 1;

 // Compute pi
 time_t startime, endtime;
 std.c.time.time(&startime);
 arctan!(2);
 arctan!(3);
 fastmul!(4);
 std.c.time.time(&endtime);

 // Return to the number of digits we want to display
 q--;

 // Print pi
 printf("pi = %d.",cast(int)(p[0]));
 for (int i = 1; i <= q; i++)
  printf("%d",cast(int)(p[i]));
 printf("\n");
 printf("%ld seconds to compute pi with a precision of %d
digits.\n",endtime-startime,q);

 return 0;
}

/* Template that determines if a number is a power of
 * 2 at compile time
 */
template isPow2(uint n) { const bool isPow2 = (n % 2) != 0 ? false :
isPow2!(n / 2); }
template isPow2(uint n : 0) { const bool isPow2 = false; }
template isPow2(uint n : 1) { const bool isPow2 = false; }
template isPow2(uint n : 2) { const bool isPow2 = true; }

/* Use a template to get good compile time optimizations for
 * division and multiplication
 */
template fastdiv(int divisor)
{
 void fastdiv()
 {
  int r; // remainder


  for (int i = 0; i <= q; i++) {
   int b = t[i] + r * 10;
   int q = b / divisor;
   static if(isPow2!(divisor))
    r = b % divisor; // compiler can optimize modulus for powers of 2
   else
    r = b - q * divisor;
   t[i] = q;
  }
 }
}

/* Template to compute the arctangent.
 * The constant input parameter allows the compiler to optimize.
 */
template arctan(int s)
{
 void arctan()
 {
  int n;

  t[0] = 1;
  fastdiv!(s);
  add();
  n = 1;
  do {
   mul(n);
   fastdiv!(s*s);
   div(n += 2);
   if ((((n-1) / 2) % 2) == 0)
    add();
   else
    sub();
  } while (!tiszero());
 }
}

void add()
{
 for (int j = q; j >= 0; j--)
 {
  if (t[j] + p[j] > 9) {
   p[j] += t[j] - 10;
   p[j-1]++;
  } else
   p[j] += t[j];
 }
}

void sub()
{
 for (int j = q; j >= 0; j--)
  if (p[j] < t[j]) {
   p[j] -= t[j] - 10;
   p[j-1]--;
  } else
   p[j] -= t[j];
}


void mul(int multiplier)
{
 int c;

 for (int i = q; i >= 0; i--) {
  int b = t[i] * multiplier + c;
  c = b / 10;
  t[i] = b - c * 10;
 }
}

/* Fast multiplication using template */
template fastmul(int multiplier)
{
 void fastmul()
 {
  int c;

  for (int i = q; i >= 0; i--) {
   int b = p[i] * multiplier + c;
   c = b / 10;
   p[i] = b - c * 10;
  }
 }
}


void div(int divisor)
{
 int r; // remainder

        /* Optimization: It's faster to do floatint point multiplication
than
         * integer division. This is because there is no integer division
instruction.
         * Under the hood, integer division is essentially floating-point
division.
         */
        double idiv = 1.0 / divisor;

 for (int i = 0; i <= q; i++) {
                int b = t[i] + r * 10;
  int quotient = cast(int)(cast(double)b * idiv);
  r = b - quotient * divisor;
  t[i] = quotient;
 }
}

int tiszero()
{
 for (int k = 0; k <= q; k++)
  if (t[k] != 0)
   return false;
 return true;
}
Apr 29 2006