www.digitalmars.com         C & C++   DMDScript  

digitalmars.D.announce - Template BigNum implemented without runtime loops

reply BCS <BCS pathlink.com> writes:
I have implemented a arbitrary precision integer type using a struct and 
templates. Currently it supports ==, +, -, *, +=, -= and *=. It should 
be fairly efficient as it uses static foreaches and asm blocks to unroll 
all of the looping and conditionals.

I haven't tested the multiplication much yet so it may well contain 
bugs. (If it doesn't I will be vary surprised)

On my system DMD crashes for size > 73 (e.i. 73 * 32b = 2336b).

Here it is.
<code>
import std.stdio;

debug
{
     template decimalDigit(int n)
     {
       const char[] decimalDigit = "0123456789"[n..n+1];
     }

     template itoa(long n)
     {
       static if (n < 0)
          const char[] itoa = "-" ~ itoa!(-n);
       else static if (n < 10)
          const char[] itoa = decimalDigit!(n);
       else
          const char[] itoa = itoa!(n/10L) ~ decimalDigit!(n%10L);
     }
}

unittest
{
     BigNum!(5) tmp,add,sum;

     tmp.set = uint .max;
     assert(tmp.values[0] == uint.max, "set(uint) failed");
     assert(tmp == uint.max, "opEqual(uint) false neg");
     assert(tmp != 1, "opEqual(uint) false pos");

     add.set = 1;
     assert(add.values[0] == 1, "set(uint) failed");
     assert(add == 1, "opEqual(uint) false neg");
     assert(add != uint.max, "opEqual(uint) false pos");

     tmp.values[1] = uint.max;

     sum = tmp+add;
     assert(sum.values[0] == 0, "low word failed");
     assert(sum.values[1] == 0, "carry word failed");
     assert(sum.values[2] == 1, "high word failed");

     sum = tmp;
     sum += add;
     assert(sum.values[0] == 0, "low word failed");
     assert(sum.values[1] == 0, "carry word failed");
     assert(sum.values[2] == 1, "high word failed");

     tmp.values[2] = tmp.values[0] = 0;
     tmp.values[1] = 1;
     sum = tmp - add;
     assert(sum.values[0] == uint.max, "low word failed");
     assert(sum.values[1] == 0, "borrow word failed");
     assert(sum.values[2] == 0, "high word failed");

     sum = tmp;
     sum -= add;
     assert(sum.values[0] == uint.max, "low word failed");
     assert(sum.values[1] == 0, "borrow word failed");
     assert(sum.values[2] == 0, "high word failed");

     sum = tmp*add;
}


template BigNum(uint size)
{
     static if(size > 73)
     pragma(mgs, "Ok, I'll try it. But don't say I didn't warn you!!!");
     static if(size <= 2)
     static assert(false, "BigNum!(size) is not valid for size <= 2");
     else
         alias build!(size, size-2, size-1) BigNum;
}

template build(uint size, uint i, V...)
{
     static if(i==1)
         alias use!(size, 1, V) build;
     else
         alias build!(size, i-1, i, V) build;
}

template use(uint size, V...)
{
     struct use
     {
             //DMD0.174 seg-v for size > 73
         static if(size > 73)
             pragma(mgs, "You have set a new world record!!!!");

         static assert(0 == values.offsetof);
         uint[size] values;

         void Emit()
         {
             for(int i=size-1; i>0; i--)
                 writef("%x:",values[i]);
             writef("%x\n",values[0]);
         }

         void set(uint val)
         {
             values[0] = val;
             static if(size>1)
                 foreach(inout v; values[1..$])
                     v = 0;
         }

         bool opEquals(uint i)
         {
             if (values[0] != i) return false;
             static if(size>1)
                 foreach(inout v; values[1..$])
                     if(v != 0) return false;
             return true;
         }

         bool opEquals(use!(size, V)  i)
         {
             return i.values[] == values[];
         }

         use!(size, V) opAdd(use!(size, V) to)
         {
             uint* ths  = this.values.ptr;
             uint* dest = to.values.ptr;
                 // DON'T USE "this" after this line!!!!!!!
             asm
             {
                 mov EAX, ths[0];
                 mov EBX, dest[0];
                 mov EDX, [EAX+0];
                 add [EBX+0], EDX;
             }
             foreach(j,i; V)
             {
                 const uint off = V[j]*4;   // this is a work-around
                 asm
                 {
                     mov EDX, [EAX+off];
                     adc [EBX+off], EDX;
                 }
             }
             return to;
         }

         use!(size, V) opSub(use!(size, V) to)
         {
             uint* ths  = this.values.ptr;
             uint* dest = to.values.ptr;
                 // DON'T USE "this" after this line!!!!!!!
             asm
             {                // dest = ths - dest
                 mov EAX, ths[0];    // dest = *EAX - dest
                 mov EBX, dest[0];    // dest = *EAX - *EBX

                 mov EDX, [EAX+0];    // dest = EDX - *EBX
                 sub EDX, [EBX+0];    // EDX = EDX - *EBX
                 mov [EBX+0], EDX;    // *EBX
                             // dest
             }
             foreach(j,i; V)
             {
                 const uint off = V[j]*4;     // this is a work-around
                 asm
                 {
                                 // dest = ths - dest
                                 // dest = *EAX - *EBX
                     mov EDX, [EAX+off];    // dest = EDX - *EBX
                     sbb EDX, [EBX+off];    // EDX = EDX - ECX
                     mov [EBX+off], EDX;    // *EBX
                                 // dest
                 }
             }
             return to;
         }

         use!(size, V) opMul(use!(size, V) to)
         {
             uint* ths  = this.values.ptr;
             use!(size, V) ret;
                 // DON'T USE "this" after this line!!!!!!!

             // EDX:EAX    mul in/out
             asm
             {
             // load values.ptr[0] -> ECX
             mov ECX, ths[0];
             mov ECX, [ECX];
             // load to.values.ptr[0] -> EAX
             mov EAX, to[0];
             // EDX:EAX = mul(EAX, ECX)
             imul ECX;
             // load EAX -> ret.values[0]
             mov ret[0], EAX;
             // load EDX -> ret.values[1]
             mov ret[1], EDX;
             }

             debug pragma(msg, itoa!(__LINE__)~": set ret[0] from ths[0] 
and to[0]");
             debug pragma(msg, itoa!(__LINE__)~": set ret[1] from ths[0] 
and to[0]");

             foreach(i,j; V)
             {
                     // get rank of this row
                 const uint OutI = V[i];     // this is a work-around
                 asm
                 {
                 // load values.ptr[OutI] -> ECX
                 mov ECX, ths;
                 mov ECX, [ECX+OutI];
                 // load to.values.ptr[0] -> EAX
                 mov EAX, to[0];
                 // EDX:EAX = mul(EAX, ECX)
                 imul ECX;

                 // load ret.values[OutI] -> EBX
                 mov EBX, ret[OutI];
                 // EBX = add(EBX, EAX)
                 add EBX, EAX;
                 // load EBX -> ret.values[OutI]
                 mov ret[OutI], EBX;
                 }
                 debug pragma(msg, itoa!(__LINE__)~": set 
ret["~itoa!(OutI)~"] from ths["~itoa!(OutI)~"] and to[0]");

                 static if(OutI+1 < size)
                 {
                     asm
                     {
                     // load ret.values[OutI+1] -> EBX
                     mov EBX, ret[OutI+1];
                     // EBX = adc(EBX, EDX)
                     adc EBX, EDX;
                     // load EBX -> ret.values[OutI+1]
                     mov ret[OutI+1], EBX;
                             nop;
                     }
                     debug pragma(msg, itoa!(__LINE__)~": set 
ret["~itoa!(OutI+1)~"] from ths["~itoa!(OutI)~"] and to[0]");
                 }

                 foreach(k,l; V)
                 {
                     const uint InI = V[i];     // this is a work-around
                     const uint DesI = OutI + InI;
                     static if(DesI < size)
                     {
                         asm
                         {
                         // load to.values.ptr[InI] -> EAX
                         mov EAX, to[InI];
                         // EDX:EAX = mul(EAX, ECX)
                         imul ECX;

                         // load ret.values[DesI] -> EBX
                         mov EBX, ret[DesI];
                         // EBX = add(EBX, EAX)
                         add EBX, EAX;
                         // load EBX -> ret.values[DesI]
                         mov ret[DesI], EBX;
                         }
                         debug pragma(msg, itoa!(__LINE__)~": set 
ret["~itoa!(DesI)~"] from ths["~itoa!(OutI)~"] and to["~itoa!(InI)~"]");

                         static if(DesI+1 < size)
                         {
                             asm
                             {
                             // load ret.values[DesI+1] -> EBX
                             mov EBX, ret[DesI+1];
                             // EBX = adc(EBX, EDX)
                             adc EBX, EDX;
                             // load EBX -> ret.values[DesI+1]
                             mov ret[DesI+1], EBX;
                             }
                             debug pragma(msg, itoa!(__LINE__)~": set 
ret["~itoa!(DesI+1)~"] from ths["~itoa!(OutI)~"] and to["~itoa!(InI)~"]");
                         }
                     }
                 }
             }
             return ret;
         }


         use!(size, V) opAddAssign(use!(size, V) to)
         {
             uint* ths  = this.values.ptr;
             uint* dest = to.values.ptr;
                 // DON'T USE "this" after this line!!!!!!!
             asm
             {
                 mov EBX, ths[0];
                 mov EAX, dest[0];
                 mov EDX, [EAX+0];
                 add [EBX+0], EDX;
                 // no store needed because dest is memory
             }
             foreach(j,i; V)
             {
                 const uint off = V[j]*4;// this is a work-around
                 asm
                 {
                     mov EDX, [EAX+off];
                     adc [EBX+off], EDX;
                     // no store needed because dest is memory
                 }
             }
             return to;
         }

         use!(size, V) opSubAssign(use!(size, V) to)
         {
             uint* ths  = this.values.ptr;
             uint* dest = to.values.ptr;
                 // DON'T USE "this" after this line!!!!!!!
             asm
             {
                 mov EBX, ths[0];
                 mov EAX, dest[0];
                 mov EDX, [EAX+0];
                 sub [EBX+0], EDX;
                 // no store needed because dest is memory
             }
             foreach(j,i; V)
             {
                 const uint off = V[j]*4;// this is a work-around
                 asm
                 {
                     mov EDX, [EAX+off];
                     sbb [EBX+off], EDX;
                     // no store needed because dest is memory
                 }
             }
             return to;
         }

         use!(size, V) opMulAssign(use!(size, V) to)
         {
             return *this = opMul(to);
         }
     }
}
</code>
Dec 11 2006
next sibling parent reply Don Clugston <dac nospam.com.au> writes:
BCS wrote:
 I have implemented a arbitrary precision integer type using a struct and 
 templates. Currently it supports ==, +, -, *, +=, -= and *=. It should 
 be fairly efficient as it uses static foreaches and asm blocks to unroll 
 all of the looping and conditionals.
 
 I haven't tested the multiplication much yet so it may well contain 
 bugs. (If it doesn't I will be vary surprised)
 
 On my system DMD crashes for size > 73 (e.i. 73 * 32b = 2336b).
Of course, there's not much value in unrolling loops to that extent (cache misses will cost you more than you'll gain by eliminating the last couple of branch instructions). However, the use of tuples and static foreach to enforce asm unrolling is very cool. This ought to work with mixins as well as direct asm instructions. I'd never found a way of mixing in multiple blocks of code at the same level -- this technique will probably solve the problem. (And it might be a way of doing very high performance expression templates :-) ).
Dec 12 2006
parent BCS <BCS pathlink.com> writes:
Don Clugston wrote:
 BCS wrote:
 
 I have implemented a arbitrary precision integer type using a struct 
 and templates. Currently it supports ==, +, -, *, +=, -= and *=. 
[...]
 On my system DMD crashes for size > 73 (e.i. 73 * 32b = 2336b).
Of course, there's not much value in unrolling loops to that extent (cache misses will cost you more than you'll gain by eliminating the last couple of branch instructions).
Yah, unrolling the 2Kb version probably won't gain much, however I expect that it would do 128b/256b and such just fine. Hmm, I'll have to look into having it do a loop for the higher size values and then benchmark the two versions.
 
 However, the use of tuples and static foreach to enforce asm unrolling 
 is very cool. This ought to work with mixins as well as direct asm 
 instructions. I'd never found a way of mixing in multiple blocks of code 
 at the same level -- this technique will probably solve the problem.
 (And it might be a way of doing very high performance expression 
 templates :-)  ).
I guess it does more as a proof of concept that as a REALLY high precision type.
Dec 12 2006
prev sibling parent BCS <BCS pathlink.com> writes:
version next.bugfix

There was a typo bug and a logical bug in the opMul code.
Also opAssign(uint) is now implemented as well as opDiv(uint) and 
opDivAssign(uint).


http://www.webpages.uidaho.edu/~shro8822/bignum.d
Dec 13 2006