www.digitalmars.com         C & C++   DMDScript  

D - A third example use of C macros that has not been discussed

reply "Paul Sheer" <psheer icon.co.za> writes:
There are many situations where you want to reimplement
a piece of code several times, with a different type
for each. This can happen for optimization, or for
marshalling functions. This is the strongest reason
for support of a macro preprocessor.

This example shows an optimized memcpy function that
copies in words at a time, properly accounting for
possible alignment differences on processors that
do not support non-aligned word stores.

It is elegantly done with macros of course.

-paul

--------

#include <stddef.h>	/* size_t */
#include <stdint.h>	/* uintptr_t, uint8_t ... uint64_t */

/*
 * This must be set to the most efficient copying type - usually
 * unsigned long.  Its size must be a power of two, because memcpy()
 * below uses (size - 1) as an alignment mask.
 */
typedef unsigned long cpy_t;

/*
 * word_copy (t, d, s, count): copy count bytes from s to d, moving objects
 * of the unsigned integer type t wherever possible.
 *
 * Leading bytes are copied singly until d is aligned for t, then two words
 * per iteration, then one word at a time, then the trailing bytes.  Only
 * the alignment of d is checked, so the caller must make sure that s and d
 * have the same offset modulo sizeof (t).
 *
 * Every argument is evaluated exactly once and the caller's count is left
 * unmodified.  Locals carry a wc_ prefix so that they do not capture the
 * names used in the arguments.
 *
 * NOTE(review): the buffers are accessed through t *, which a compiler may
 * treat as a strict-aliasing violation for objects of another type; a build
 * that installs this as the real memcpy probably needs -fno-strict-aliasing
 * or an equivalent - confirm against the toolchain in use.
 */
#define word_copy(t,d,s,count)						\
    do {								\
	unsigned char *wc_d8 = (unsigned char *) (d);			\
	const unsigned char *wc_s8 = (const unsigned char *) (s);	\
	size_t wc_n = (count);						\
	t *wc_dst;							\
	const t *wc_src;						\
	/* single bytes until the destination is aligned for t */	\
	while (wc_n && ((uintptr_t) wc_d8 & (sizeof (t) - 1))) {	\
	    *wc_d8++ = *wc_s8++;					\
	    wc_n--;							\
	}								\
	wc_dst = (t *) wc_d8;						\
	wc_src = (const t *) wc_s8;					\
	/* two words per iteration: both loads precede both stores */	\
	while (wc_n >= sizeof (t) * 2) {				\
	    t wc_a0 = wc_src[0];					\
	    t wc_a1 = wc_src[1];					\
	    wc_dst[0] = wc_a0;						\
	    wc_dst[1] = wc_a1;						\
	    wc_src += 2;						\
	    wc_dst += 2;						\
	    wc_n -= sizeof (t) * 2;					\
	}								\
	/* at most one whole word is left */				\
	while (wc_n >= sizeof (t)) {					\
	    *wc_dst++ = *wc_src++;					\
	    wc_n -= sizeof (t);						\
	}								\
	/* trailing bytes, fewer than sizeof (t) */			\
	wc_d8 = (unsigned char *) wc_dst;				\
	wc_s8 = (const unsigned char *) wc_src;				\
	while (wc_n) {							\
	    *wc_d8++ = *wc_s8++;					\
	    wc_n--;							\
	}								\
    } while (0)

/*
 * byte_copy (t, d, s, count): copy count bytes from s to d one byte at a
 * time.  The type argument t is unused; it is kept so that the macro has
 * the same signature as word_copy.  Every argument is evaluated once and
 * the caller's count is left unmodified.
 */
#define byte_copy(t,d,s,count)						\
    do {								\
	unsigned char *bc_d8 = (unsigned char *) (d);			\
	const unsigned char *bc_s8 = (const unsigned char *) (s);	\
	size_t bc_n = (count);						\
	while (bc_n) {							\
	    *bc_d8++ = *bc_s8++;					\
	    bc_n--;							\
	}								\
    } while (0)

/*
 * Copy count bytes from _src to _dest and return _dest.
 *
 * The widest word size (at most sizeof (cpy_t)) for which both pointers
 * have the same misalignment is selected, so that once the destination has
 * been aligned the source is aligned too and every word access is aligned.
 */
void *memcpy (void *_dest, const void *_src, size_t count)
{
    size_t f = sizeof (cpy_t);

    /* check alignment: halve f until both addresses agree modulo f;
       f == 1 always matches, so the loop terminates */
    while (((uintptr_t) _src & (f - 1)) != ((uintptr_t) _dest & (f - 1)))
	f >>= 1;

    switch (f) {
    case 8:
	word_copy (uint64_t, _dest, _src, count);
	break;
    case 4:
	word_copy (uint32_t, _dest, _src, count);
	break;
    case 2:
	word_copy (uint16_t, _dest, _src, count);
	break;
    default:
	/* f == 1, or a width with no word case above: plain byte copy */
	byte_copy (uint8_t, _dest, _src, count);
	break;
    }
    return _dest;
}
Feb 01 2003
next sibling parent Andy Friesen <andy ikagames.com> writes:
Maybe I'm missing something, but it seems to me that you could do that
with templates pretty easily.  In so doing, you get a bit of type safety,
and you prevent arguments from being evaluated more than once.

     /**
      * Word-at-a-time copy loop, instantiated once per word type T.
      */
     template CopyLoop(T)
     {
         /**
          * Copies count bytes from src to dest, moving whole T-sized words
          * wherever possible.
          *
          * Only the alignment of dest is checked, so src and dest are
          * expected to share the same offset modulo T.sizeof.  A count of
          * zero or less copies nothing.
          */
         void word_copy(T* dest, T* src, int count)
         {
             int word = cast(int) T.sizeof;
             ubyte* d8 = cast(ubyte*) dest;
             ubyte* s8 = cast(ubyte*) src;

             // Single bytes until the destination is aligned for T.
             while (count > 0 && (cast(size_t) d8 & (T.sizeof - 1)) != 0)
             {
                 *d8++ = *s8++;
                 count--;
             }

             T* wdst = cast(T*) d8;
             T* wsrc = cast(T*) s8;

             // Two words per iteration: both loads precede both stores.
             while (count >= word * 2)
             {
                 T a0 = wsrc[0];
                 T a1 = wsrc[1];
                 wdst[0] = a0;
                 wdst[1] = a1;
                 wsrc += 2;
                 wdst += 2;
                 count -= word * 2;
             }

             // At most one whole word is left.
             while (count >= word)
             {
                 *wdst++ = *wsrc++;
                 count -= word;
             }

             // Trailing bytes, fewer than T.sizeof.
             d8 = cast(ubyte*) wdst;
             s8 = cast(ubyte*) wsrc;
             while (count > 0)
             {
                 *d8++ = *s8++;
                 count--;
             }
         }
     }


Paul Sheer wrote:
 There are many situations where you want to reimplement
 a piece of code several times, with a different type
 for each. This can happen for optimization, or for
 marshalling functions. This is the strongest reason
 for support of a macro preprocessor.
 
 This example shows an optimized memcpy function that
 copies in words at a time, properly accounting for
 possible alignment differences on processors that
 do not support non-aligned word stores.
 
 It is eligantly done with macros of course.
 
 -paul
 
 --------
 
 /* this must be set to the most efficient copying type - usually
 unsigned long: */
 typedef unsigned long cpy_t;
 
 #define word_copy(t,d,s,count)					\
     do {							\
 	unsigned int c;						\
 	char *d8 = (char *) (d);				\
 	char *s8 = (char *) (s);				\
 	register t a0, a1, *dst, *src;				\
 	c = (unsigned long) (d) & (sizeof (t) - 1);		\
 	while (count && (c & (sizeof (t) - 1)))			\
 	     (*d8++ = *s8++), count--, c++;			\
 	dst = (t *) d8;						\
 	src = (t *) s8;						\
 	while (count >= (sizeof (t)) * 2) {			\
 	    a0 = src[0];					\
 	    a1 = src[1];					\
 	    count -= (sizeof (t)) * 2;				\
 	    dst[0] = a0;					\
 	    dst[1] = a1;					\
 	    src += 2;						\
 	    dst += 2;						\
 	}							\
 	while (count >= (sizeof (t))) {				\
 	    *dst++ = *src++;					\
 	    count -= sizeof (t);				\
 	}							\
 	d8 = (char *) dst;					\
 	s8 = (char *) src;					\
 	while (count--)						\
 	    *d8++ = *s8++;					\
     } while (0)
 
 #define byte_copy(t,d,s,count)					\
     do {							\
 	char *d8 = (char *) (d);				\
 	char *s8 = (char *) (s);				\
 	while (count--)						\
 	    *d8++ = *s8++;					\
     } while (0)
 
 void *memcpy (void *_dest, const void *_src, size_t count)
 {
     unsigned int f;
 /* check alignment */
     f = sizeof (cpy_t);
     while ((((unsigned long) _src) & (f - 1)) != 
                 (((unsigned long) _dest & (f - 1))))
 	f >>= 1;
     switch (f) {
     case 8:
 	word_copy (u_int64_t, _dest, _src, count);
 	break;
     case 4:
 	word_copy (u_int32_t, _dest, _src, count);
 	break;
     case 2:
 	word_copy (u_int16_t, _dest, _src, count);
 	break;
     case 1:
 	byte_copy (u_int8_t, _dest, _src, count);
 	break;
     }
     return (void *) _dest;
 }
 
 
Feb 01 2003
prev sibling parent "Mike Wynn" <mike.wynn l8night.co.uk> writes:
IMHO:
the compiler should generate an optimised memcpy from the  src

memcpy( foo, bar, len );

(inlined if optimised for speed, either way the fastest for the platform and
the cpu's supported instruction set).

and you've not put a Duff's device in there (they're legal in D); tight
loops kill performance.
Also, on some architectures unaligned int reads are allowed and less expensive
than 4 byte reads. I'm sure less expensive than 4 byte reads and 4 branches.

Mike.


"Paul Sheer" <psheer icon.co.za> wrote in message
news:b1gs7d$2i9c$1 digitaldaemon.com...
 There are many situations where you want to reimplement
 a piece of code several times, with a different type
 for each. This can happen for optimization, or for
 marshalling functions. This is the strongest reason
 for support of a macro preprocessor.

 This example shows an optimized memcpy function that
 copies in words at a time, properly accounting for
 possible alignment differences on processors that
 do not support non-aligned word stores.

 It is eligantly done with macros of course.

 -paul

 --------

 /* this must be set to the most efficient copying type - usually
 unsigned long: */
 typedef unsigned long cpy_t;

 #define word_copy(t,d,s,count) \
     do { \
 unsigned int c; \
 char *d8 = (char *) (d); \
 char *s8 = (char *) (s); \
 register t a0, a1, *dst, *src; \
 c = (unsigned long) (d) & (sizeof (t) - 1); \
 while (count && (c & (sizeof (t) - 1))) \
      (*d8++ = *s8++), count--, c++; \
 dst = (t *) d8; \
 src = (t *) s8; \
 while (count >= (sizeof (t)) * 2) { \
     a0 = src[0]; \
     a1 = src[1]; \
     count -= (sizeof (t)) * 2; \
     dst[0] = a0; \
     dst[1] = a1; \
     src += 2; \
     dst += 2; \
 } \
 while (count >= (sizeof (t))) { \
     *dst++ = *src++; \
     count -= sizeof (t); \
 } \
 d8 = (char *) dst; \
 s8 = (char *) src; \
 while (count--) \
     *d8++ = *s8++; \
     } while (0)

 #define byte_copy(t,d,s,count) \
     do { \
 char *d8 = (char *) (d); \
 char *s8 = (char *) (s); \
 while (count--) \
     *d8++ = *s8++; \
     } while (0)

 void *memcpy (void *_dest, const void *_src, size_t count)
 {
     unsigned int f;
 /* check alignment */
     f = sizeof (cpy_t);
     while ((((unsigned long) _src) & (f - 1)) !=
                 (((unsigned long) _dest & (f - 1))))
 f >>= 1;
     switch (f) {
     case 8:
 word_copy (u_int64_t, _dest, _src, count);
 break;
     case 4:
 word_copy (u_int32_t, _dest, _src, count);
 break;
     case 2:
 word_copy (u_int16_t, _dest, _src, count);
 break;
     case 1:
 byte_copy (u_int8_t, _dest, _src, count);
 break;
     }
     return (void *) _dest;
 }
Feb 01 2003