www.digitalmars.com         C & C++   DMDScript  

digitalmars.D - fun with mixins

reply Chad J <gamerChad _spamIsBad_gmail.com> writes:
Here's the gist of the attached source:

// Mixin template demonstrating the trick: the assignment to `value` is written
// as the initializer of a throwaway variable so that it counts as a declaration.
// `value`, `some` and `expression` are looked up where the template is mixed in.
template doSomething()
{
   auto dummy = value = some + expression;
}

// Mixes doSomething into its body; the mixed-in declaration assigns
// some + expression to the local `value` before it is returned.
uint func( uint some, uint expression )
{
   uint value = 52;
   mixin doSomething!();
   return value;
}

It seems pretty hackish to me, yet useful.

Attached is a really long-winded alpha blending routine.  The advantage 
is that it's perhaps the most generalized alpha blending routine I've 
ever written that is still decently fast (yeah, could be a lot better 
with simd, gpu usage, or <insert common optimization that doesn't work 
in general on my pda>).  It could soon do things totally unrelated to 
alpha blending.  It seems kinda like something that the C preprocessor 
would be used for, though the thought of using C kinda scares me, and I 
have hope that D templates/mixins are up to the job.  Maybe someday when 
I have a lot of time on my hands I can figure out how to make the 
templates generate runtime for-loops, complete with custom-tailored 
innerloop code, which would make it a lot easier to optimize edge cases 
like sourcePixel[i+1] where there may or may not be an i+1'th pixel and 
I don't want to afford an 'if'.

I have to wonder, has someone done this stuff already (the mixin trick, 
or some sort of graphics routine framework in D)?
Jan 26 2007
parent reply Bill Baxter <dnewsgroup billbaxter.com> writes:
I confess I'm not sure what all is going on in your code there.  At a 
glance it looks like there's a lot of hard coded 8/16/24/32's in there. 
  *Seems* like you should be able to make something even more general 
than that and perhaps in the process make it even leaner and meaner.  :-)

I'm looking forward to the day when someone cranks out something like 
GIL using D.

   http://opensource.adobe.com/gil/presentation/index.htm

And a better AGG using D uber-templates would be nice too.

   http://www.antigrain.com/

--bb

Chad J wrote:
 Here's the gist of the attached source:
 
 // Mixin template demonstrating the trick: the assignment to `value` is written
 // as the initializer of a throwaway variable so that it counts as a declaration.
 // `value`, `some` and `expression` are looked up where the template is mixed in.
 template doSomething()
 {
   auto dummy = value = some + expression;
 }
 
 // Mixes doSomething into its body; the mixed-in declaration assigns
 // some + expression to the local `value` before it is returned.
 uint func( uint some, uint expression )
 {
   uint value = 52;
   mixin doSomething!();
   return value;
 }
 
 It seems pretty hackish to me, yet useful.
 
 Attached is a really long-winded alpha blending routine.  The advantage 
 is that it's perhaps the most generalized alpha blending routine I've 
 ever written that is still decently fast (yeah, could be a lot better 
 with simd, gpu usage, or <insert common optimization that doesn't work 
 in general on my pda>).  It could soon do things totally unrelated to 
 alpha blending.  It seems kinda like something that the C preprocessor 
 would be used for, though the thought of using C kinda scares me, and I 
 have hope that D templates/mixins are up to the job.  Maybe someday when 
 I have a lot of time on my hands I can figure out how to make the 
 templates generate runtime for-loops, complete with custom-tailored 
 innerloop code, which would make it a lot easier to optimize edge cases 
 like sourcePixel[i+1] where there may or may not be an i+1'th pixel and 
 I don't want to afford an 'if'.
 
 I have to wonder, has someone done this stuff already (the mixin trick, 
 or some sort of graphics routine framework in D)?
 
 
 ------------------------------------------------------------------------
 
 /+ Alpha blended blitting routine. +/
 
 import std.stdio;
 
 version = SDL;
 version( SDL )
 {
 	import derelict.sdl.sdl;
 }
 
 // TODO: RGB32?
 // Pixel format identifiers used as template arguments throughout this module.
 enum : uint
 {
 	INVALID = 0, // default value of Surface.RGBAformat
 	RGBA32,      // handled as 32 bits per pixel; alpha is extracted with the surface's alpha mask
 	RGB24,       // handled as 24 bits per pixel (3 bytes)
 	RGB16_555,   // handled as 16 bits per pixel
 	RGB16_565,   // handled as 16 bits per pixel
 	RGBA8_I32, // indexed to 32 bit values
 	A8,          // 8 bit alpha only; the colour comes from blit's srcColor argument
 }
 
 // Mixin template: loads the source pixel at index si for the given source
 //   format into srgb and/or alpha.  Those variables, together with source, si
 //   and sourceAMask, are expected to exist in the scope it is mixed into.
 // Template bodies may only hold declarations, so every assignment is written
 //   as the initializer of a throwaway readS_dummyN variable.
 private template readSource( uint RGBA )
 {
 	static if ( RGBA == RGBA32 )
 	{
 		// Split the pixel into its alpha bits and its colour bits.
 		// NOTE(review): alpha keeps the bit position given by sourceAMask; it is
 		//   not shifted down here -- confirm that blend expects that.
 		uint readS_dummy1 = srgb = source[si];
 		uint readS_dummy2 = alpha = srgb & sourceAMask;
 		uint readS_dummy3 = srgb = srgb & ~sourceAMask;
 	}
 	else static if ( RGBA == RGB24 )
 	{
 		// There is no such thing as an array with 24-bit elements, so we have
 		//   to use pointers.  
 		// NOTE(review): this reads 4 bytes starting at the pixel, i.e. one byte
 		//   beyond it -- confirm this is safe for the last pixel of the buffer.
 		uint readS_dummy1 = srgb = *(cast(uint*)(source + si));
 	}
 	else static if ( RGBA == RGB16_555 || RGBA == RGB16_565 )
 	{
 		// cast(uint) is not necessary in all cases, only if dest is 32 bpp
 		uint readS_dummy1 = srgb = cast(uint)source[si];
 	}
 	else static if ( RGBA == RGBA8_I32 )
 	{
 		// Look the 8 bit index up in rgbaTable, then split as for RGBA32.
 		// NOTE(review): rgbaTable is not declared in the visible source.
 		uint readS_dummy1 = srgb = rgbaTable[source[si]];
 		uint readS_dummy2 = alpha = srgb & sourceAMask;
 		uint readS_dummy3 = srgb = srgb & ~sourceAMask;
 	}
 	else static if ( RGBA == A8 )
 	{
 		// Alpha-only source: only alpha is read, srgb is left untouched.
 		uint readS_dummy1 = alpha = cast(uint)source[si];
 	}
 	else
 	{
 		pragma(msg,"Invalid source RGBA format for reading.");
 		static assert(0);
 	}
 }
 
 // Mixin template: loads the destination pixel at index di into drgb.
 // For formats where the later write touches more bits than the pixel owns
 //   (RGB24, and 16 bpp when half16bpp selects one half of a 32 bit word) the
 //   value that was read is also kept in drgbOriginal so the write can restore
 //   the bits it must not change.
 // dest, di and drgb are expected to exist in the scope it is mixed into.
 // Template bodies may only hold declarations, so every assignment is written
 //   as the initializer of a throwaway readD_dummyN variable.
 private template readDestination( uint RGBA, ubyte half16bpp = NOT_APPLICABLE )
 {
 	static if ( RGBA == RGBA32 )
 	{
 		uint readD_dummy1 = drgb = dest[di];
 	}
 	else static if ( RGBA == RGB24 )
 	{
 		// There is no such thing as an array with 24-bit elements, so we have
 		//   to use pointers.
 		uint readD_dummy1 = drgb = *(cast(uint*)(dest + di));
 		
 		// Since we can't write 24 bits, we can either write 3 bytes (slow),
 		//   or we can overwrite 8 bits of the next pixel.  The latter is
 		//   faster and can be done safely if we overwrite those 8 bits with
 		//   their previous contents.
 		uint drgbOriginal = drgb;
 	}
 	else static if ( RGBA == RGB16_555 || RGBA == RGB16_565 )
 	{
 		uint readD_dummy1 = drgb = dest[di];
 		
 		static if ( half16bpp == LOW_ADDRESS_HALF ||
 		            half16bpp == HIGH_ADDRESS_HALF )
 		{
 			// Store the original values of both pixels being read.
 			// When reading and writing 2 pixels at a time, it is impossible
 			//   to prevent overwriting a pixel that we don't want to.  At
 			//   least not without some rather complicated code.  So instead,
 			//   we just make sure that the pixel we don't want to overwrite
 			//   is overwritten with its original value.  The original value
 			//   is stored here.
 			// drgb holds the word that was just read; the previously used
 			//   name destReadResult is not declared anywhere.
 			uint drgbOriginal = drgb;
 		}
 	}
 	else static if ( RGBA == RGBA8_I32 )
 	{
 		// NOTE(review): rgbaTable is not declared in the visible source.
 		uint readD_dummy1 = drgb = rgbaTable[dest[di]];
 	}
 	else
 	{
 		pragma(msg,"Invalid destination RGBA format for reading.");
 		static assert(0);
 	}
 }
 
 // Mixin template: reads the source pixel and then the destination pixel by
 //   mixing in readSource and readDestination for the given formats.
 // half16bpp is forwarded to readDestination only.
 private template read( uint sourceRGBA, uint destRGBA, 
                        ubyte half16bpp = NOT_APPLICABLE )
 {
 	mixin readSource!( sourceRGBA );
 	mixin readDestination!( destRGBA, half16bpp );
 }
 
 // Mixin template: converts the freshly read source colour in srgb from the
 //   source pixel layout to the destination pixel layout, in place.
 // Combinations that need no conversion are accepted and generate nothing;
 //   unsupported combinations stop compilation with static assert(0).
 // srgb is expected to exist in the scope this template is mixed into.
 private template convert( uint sourceRGBA, uint destRGBA )
 {
 	static if ( sourceRGBA == RGBA32 || sourceRGBA == RGB24 ||
 	            sourceRGBA == RGBA8_I32 )
 	{
 		static if ( destRGBA == RGBA32 || destRGBA == RGB24 )
 		{
 			// Same channel layout: srgb is already usable, do nothing.
 		}
 		// This must be chained with "else"; as an independent static if its
 		//   else branch rejected every 32/24 bit destination.
 		else static if ( destRGBA == RGB16_565 )
 		{
 			// Here we must shrink a 32 bit pixel from the source into a
 			//   16 bit pixel.
 			// in this situation we write the 16 bit resultant pixels one at
 			//   a time so the extra 16 bits will be safely discarded.
 			uint convert_dummy1 =
 			srgb = ((0xf800 & (srgb >> 8 )) +
 			        (0x07e0 & (srgb >> 5 )) +
 			        (0x001f & (srgb >> 3 )));
 		}
 		else static assert(0);
 	}
 	else static if ( sourceRGBA == RGB16_565 )
 	{
 		static if ( destRGBA == RGBA32 || destRGBA == RGB24 )
 		{
 			// Here we must expand a 16 bit pixel from the source into a
 			//   32 bit pixel.
 			// In this situation we read the 16 bit pixels one at a time
 			//   so the extra 16 bits can be safely discarded.
 			uint convert_dummy1 =
 			srgb = (((srgb & 0xf800) << 8 ) +
 			        ((srgb & 0x07e0) << 5 ) +
 			        ((srgb & 0x001f) << 3 ));
 		}
 		else static if ( destRGBA == RGB16_565 )
 		{
 			// Same layout on both sides: do nothing.
 		}
 		else static assert(0);
 	}
 	else static if ( sourceRGBA == A8 )
 	{
 		// Alpha-only source: srgb was preset by the caller, nothing to convert.
 	}
 	else static assert(0);
 }
 
 // Mixin template: alpha blends srgb onto drgb and leaves the result in drgb.
 // RGBA is the destination format.  The names srgb, drgb, alpha, sourceAMask,
 //   destbpp and sourceRGBA are expected to exist in the scope it is mixed into.
 // Template bodies may only hold declarations, so every assignment is written
 //   as the initializer of a throwaway blend_dummyN variable.
 private template blend( uint RGBA )
 {
 	// Note that this will get it right regardless of which color is in which
 	//   channel.  Of course, the channels' placements must be correct.
 	// It also preserves the destination's alpha channel, if present.
 	static if ( RGBA == RGBA32 || RGBA == RGB24 || RGBA == RGBA8_I32 ||
 	            RGBA == RGB16_565 || RGBA == RGB16_555 )
 	{
 		static if ( RGBA == RGBA32 || RGBA == RGB24 || RGBA == RGBA8_I32 )
 		{
 			const shift = 8;
 			const evenMask = 0x00ff00ff;
 		}
 		else
 		{
 			// For 16bpp formats:
 			// alpha must be a 5 bit value (the 3 hi bits MUST be clear)
 			// this does 2 16bit pixels at a time in one 32 bit word.
 			// endianness doesn't matter on 565 formats due to symmetry
 			// TODO: take into account endianness on 555 formats
 			//        (probably only noticeable on big endian machines)
 			const shift = 5;
 			const evenMask = 0x07e0f81f;
 		}
 		const oddMask = ~evenMask;
 		
 		static if ( RGBA == RGBA32 || RGBA == RGBA8_I32 )
 			uint originalDestAlpha = drgb & sourceAMask;
 		
 		// Reduce alpha from 8 to 5 bits only when readSource has just loaded a
 		//   fresh per-pixel alpha (RGBA32, RGBA8_I32 and A8 sources).  For the
 		//   other sources alpha is the constant that blit already reduced once
 		//   before the loop; shifting it again on every pixel made it decay to 0.
 		static if ( destbpp == 16 &&
 		            ( sourceRGBA == RGBA32 || sourceRGBA == RGBA8_I32 ||
 		              sourceRGBA == A8 ) )
 			uint blend_dummy1 = alpha = alpha >> 3;
 		
 		static if ( destbpp == 16 && sourceRGBA == A8 )
 		{
 			// Extract the middle channel and shift it into the high 16 bits, giving
 			//   at least 5 bits above it to hold the multiplication overflow, and at
 			//   least 5 bits below it to hold the high channel's multiplication
 			//   overflow.
 			uint sourceChannels = ((srgb << 16) | srgb) & evenMask;
 			uint destChannels =   ((drgb << 16) | drgb) & evenMask;
 			
 			// do the blending
 			uint blend_temp =
 				(((sourceChannels - destChannels) * alpha) >> shift) + destChannels;
 			
 			// Now we move the middle channel from the high 16 bits, back into its
 			//   rightful place in the middle.
 			uint blend_dummy2 =
 			drgb = (blend_temp & (evenMask & 0x0000ffff)) |
 				  ((blend_temp & (evenMask & 0xffff0000)) >> 16 );
 		}
 		else
 		{
 			// The channels selected by evenMask and by oddMask are blended
 			//   separately so each has free bits beside it for the product.
 			uint blend_dummy2 =
 			drgb =
 				((((((srgb & evenMask)-(drgb & evenMask))  * alpha) >> shift) + drgb) & evenMask) |
 				((((((srgb & oddMask )-(drgb & oddMask )) >> shift)  * alpha) + drgb) & oddMask);
 		}
 		
 		static if ( RGBA == RGBA32 || RGBA == RGBA8_I32 ) // preserve alpha
 			uint blend_dummy3 = drgb = (drgb & ~sourceAMask) | originalDestAlpha;
 	}
 	else
 	{
 		pragma(msg,"Invalid RGBA format for alpha blending.");
 		static assert(0);
 	}
 }
 
 
 // Mixin template: stores the blended pixel drgb to the destination at index
 //   di, then advances si and di by sourceIncrement and destIncrement.
 // For RGB24 and for half-word 16 bpp writes, the bits of the 32 bit word that
 //   do not belong to the pixel are restored from drgbOriginal.
 // dest, di, si, drgb, drgbOriginal, sourceIncrement and destIncrement are
 //   expected to exist in the scope it is mixed into.
 private template write( uint RGBA, ubyte half16bpp = NOT_APPLICABLE )
 {
 	
 	static if ( RGBA == RGBA32 )
 	{
 		uint write_dummy1 = dest[di] = drgb;
 	}
 	else static if ( RGBA == RGB24 )
 	{
 		uint* address = cast(uint*)(dest + di);
 		
 		// Keep the byte that belongs to the neighbouring pixel.
 		version ( BigEndian )
 			uint write_dummy1 = *address = (drgb & 0xffffff00) | (drgbOriginal & 0x000000ff);
 		else
 			uint write_dummy1 = *address = (drgb & 0x00ffffff) | (drgbOriginal & 0xff000000);
 	}
 	else static if ( RGBA == RGB16_565 || RGBA == RGB16_555 )
 	{
 		// for selecting the lowest or highest pixel in terms of
 		//   address in memory rather than place in the word/register
 		version ( BigEndian )
 			const writeMask = 0x0000ffff;
 		else
 			const writeMask = 0xffff0000;
 		
 		static if ( half16bpp == HIGH_ADDRESS_HALF )
 			uint write_dummy1 = dest[di] = (drgb & writeMask) | (drgbOriginal & ~writeMask);
 		else static if ( half16bpp == LOW_ADDRESS_HALF )
 			uint write_dummy1 = dest[di] = (drgb & ~writeMask) | (drgbOriginal & writeMask);
 		else
 			uint write_dummy1 = dest[di] = drgb;
 	}
 	// TODO:  writing RGBA8_I32.  needs an algo to reverse a 32 bpp value into
 	//          an 8 bit indexed value.
 	else
 	{
 		// This message used to be a copy of blend's ("for alpha blending").
 		pragma(msg,"Invalid destination RGBA format for writing.");
 		static assert(0);
 	}
 	
 	uint write_dummy2 = si = si + sourceIncrement;
 	uint write_dummy3 = di = di + destIncrement;
 }
 
 // Values for the half16bpp template argument: which 16 bit half of a 32 bit
 //   destination word, by memory address, a read/write step is allowed to change.
 private enum : ubyte
 {
 	NOT_APPLICABLE = 0, // default: the whole value that was read is written back
 	LOW_ADDRESS_HALF,   // only the pixel at the lower address is replaced
 	HIGH_ADDRESS_HALF,  // only the pixel at the higher address is replaced
 }
 
 // Mixin template: one complete pixel step -- read, convert, blend, write --
 //   for the given source and destination formats.
 // A half16bpp selector is only accepted together with a 16 bpp destination.
 private template innerLoop( uint sourceRGBA, uint destRGBA, 
                             ubyte half16bpp = NOT_APPLICABLE )
 {
 	static if ( half16bpp != NOT_APPLICABLE &&
 	            destRGBA != RGB16_565 && destRGBA != RGB16_555 )
 	{
 		pragma(msg,"The half16bpp argument is only to be used when the "
 		           "destination format is 16 bits per pixel.");
 		static assert(0);
 	}
 	
 	// The four stages run in the order they are mixed in.
 	mixin read!( sourceRGBA, destRGBA, half16bpp );
 	mixin convert!( sourceRGBA, destRGBA );
 	mixin blend!( destRGBA );
 	mixin write!( destRGBA, half16bpp );
 }
 
 // Mixin template: for either the source surface (isSource == true) or the
 //   destination surface, declares the scanline padding and a view of the pixel
 //   data typed for that surface's bit depth, then publishes them under the
 //   names spadding/source or dpadding/dest.
 // srcSurface, dstSurface, sourcebpp and destbpp are expected to exist in the
 //   scope it is mixed into.
 private template calculatePaddingAndArrays( bool isSource )
 {

 	// Pick the surface this instance describes, its bit depth, and the bit
 	//   depth of the surface on the other side of the blit.
 	static if ( isSource )
 	{
 		alias srcSurface surface;
 		alias sourcebpp bpp;
 		alias destbpp otherbpp;
 	}
 	else
 	{
 		alias dstSurface surface;
 		alias destbpp bpp;
 		alias sourcebpp otherbpp;
 	}
 	
 	// Padding is the amount of extra data at the end of a scanline used to
 	//   ensure that the end of the scanline lines up on a 32 bit boundary.
 	// spadding = source padding
 	// dpadding = dest padding
 	// The unit that padding is measured in changes depending on the source
 	//   and destination format.
 	// The amount of data that is handled in each iteration also changes,
 	//   and is reflected by the different types of arrays.  
 	
 	static if ( bpp == 32 )
 	{
 		// NOTE(review): assumes pitch == width * 4 for 32 bpp surfaces -- confirm.
 		auto padding = 0;
 		uint[] pixelData = cast(uint[])surface.pixels;
 	}
 	else static if ( bpp == 24 )
 	{
 		// padding measured in bytes
 		auto padding = surface.pitch - (surface.width * 3);
 		ubyte* pixelData = surface.pixels.ptr;
 	}
 	else static if ( bpp == 16 )
 	{
 		static if ( otherbpp != 16 /+otherbpp == 32 || otherbpp == 24 || otherbpp ==
8+/ )
 		{
 			// padding measured in shorts
 			auto padding = (surface.pitch >> 1) - surface.width;
 			ushort[] pixelData = cast(ushort[])surface.pixels;
 		}
 		else
 		{
 			// Both surfaces are 16 bpp: the data is viewed as 32 bit words, two
 			//   pixels at a time, and no padding is accounted for.
 			auto padding = 0;
 			uint[] pixelData = cast(uint[])surface.pixels;
 		}
 	}
 	else static if ( bpp == 8 )
 	{
 		auto padding = surface.pitch - surface.width; // padding measured in bytes
 		ubyte[] pixelData = surface.pixels;
 	}
 	else
 		static assert(0);
 	
 	// Publish the results under the names the other templates use.
 	static if ( isSource )
 	{
 		alias padding spadding;
 		alias pixelData source;
 	}
 	else
 	{
 		alias padding dpadding;
 		alias pixelData dest;
 	}
 }
 
 // This function shall do no clipping.  
 
 /// Alpha blends a width x height rectangle of srcSurface, starting at
 /// (sourceX, sourceY), onto dstSurface at (destX, destY).  No clipping is done.
 ///
 /// sourceRGBA and destRGBA select the pixel formats at compile time.
 /// srcColor is the colour used when the source is alpha-only (A8).
 /// alpha is the blend factor for sources that carry no alpha of their own;
 /// sources with per-pixel alpha overwrite it for every pixel.
 ///
 /// With version SDL, surfaces that wrap an SDL_Surface are locked for the
 /// whole blit and unlocked again when the function returns or throws.
 void blit( uint sourceRGBA, uint destRGBA )
 		( short sourceX, short sourceY, 
 		short destX, short destY, short width, short height, 
 		inout Surface srcSurface, inout Surface dstSurface, 
 		uint srcColor, uint alpha )
 {
 	// this stuff just determines the bits per pixel of the source and
 	//   destination surfaces
 	static if ( sourceRGBA == RGBA32 )
 		const sourcebpp = 32;
 	else static if ( sourceRGBA == RGB24 )
 		const sourcebpp = 24;
 	else static if ( sourceRGBA == RGB16_565 || sourceRGBA == RGB16_555 )
 		const sourcebpp = 16;
 	else
 		const sourcebpp = 8;
 	
 	static if ( destRGBA == RGBA32 )
 		const destbpp = 32;
 	else static if ( destRGBA == RGB24 )
 		const destbpp = 24;
 	else static if ( destRGBA == RGB16_565 || destRGBA == RGB16_555 )
 		const destbpp = 16;
 	else
 		const destbpp = 8;
 	
 	static if ( (sourcebpp == 32 || sourcebpp == 24) && destbpp == 16 )
 		const convert32to16 = true;
 	else
 		const convert32to16 = false;
 	
 	static if ( sourcebpp == 16 && (destbpp == 32 || destbpp == 24) )
 		const convert16to32 = true;
 	else
 		const convert16to32 = false;
 	
 	// Duplicate the colour into both halves of the word for 16 bpp targets.
 	static if ( (destRGBA == RGB16_565 || destRGBA == RGB16_555) && sourceRGBA != A8 )
 		srcColor |= (srcColor << 16);
 	
 	// 16 bpp blending works with a 5 bit alpha.
 	static if ( destbpp == 16 )
 		alpha >>= 3;
 	
 	// note that the padding quantities are necessarily zero if
 	//   unitWidth = width / 2;
 	//   that's important because they have different units of measurement!
 	
 	mixin calculatePaddingAndArrays!( true );
 	mixin calculatePaddingAndArrays!( false );
 	
 	static if ( destbpp == 24 )
 	{
 		uint lineWidth = width * 3;
 		
 		static if ( sourcebpp == 24 )
 		{
 			// same as: unitSrcSurfaceWidth = srcSurface.width * 3;
 			uint unitSrcSurfaceWidth = srcSurface.pitch - spadding;
 			uint unitSrcWidth = lineWidth;
 		}
 		else
 		{
 			uint unitSrcSurfaceWidth = srcSurface.width;
 			uint unitSrcWidth = width;
 		}
 		
 		uint unitDstSurfaceWidth = dstSurface.pitch - dpadding;
 		
 		uint unitDstWidth = lineWidth;
 	}
 	else static if ( sourcebpp == 16 && destbpp == 16 )
 	{
 		uint lineWidth = width / 2; // because we do 2 pixels at a time
 		
 		// The +(width & 1) part is used to make the division round up.
 		uint unitSrcSurfaceWidth = (srcSurface.width / 2) + (srcSurface.width & 1);
 		uint unitDstSurfaceWidth = (dstSurface.width / 2) + (dstSurface.width & 1);
 		
 		// The lineWidth variable rounds down on division, so it may be
 		//   missing a pixel.  That is desirable since we don't want alphablend
 		//   onto the pixel next to the missing pixel.  Of course, we will
 		//   handle the missing pixel individually, but it is still useful to
 		//   have access to a rounded-up version of the blit's width.
 		uint unitSrcWidth = lineWidth + (width & 1);
 		uint unitDstWidth = unitSrcWidth;
 	}
 	else
 	{
 		uint lineWidth = width;
 		uint unitSrcWidth = width;
 		uint unitDstWidth = width;
 		uint unitSrcSurfaceWidth = srcSurface.width;
 		uint unitDstSurfaceWidth = dstSurface.width;
 	}
 	
 	uint sourceAMask = srcSurface.alphaMask;
 	
 	version( SDL )
 	{
 		// A version block opens no scope of its own, so the scope(exit) guards
 		//   below belong to the function body and run when blit is left.
 		//   (Placed inside the "if" blocks they ran as soon as the "if" ended,
 		//   unlocking the surfaces before any pixel was touched.)
 		auto sourceSdlSurface = srcSurface.sdl_surface;
 		bool srcLocked = false;
 		if ( sourceSdlSurface !is null )
 			srcLocked = lock( sourceSdlSurface );
 		scope(exit)
 		{
 			if ( srcLocked )
 				SDL_UnlockSurface( sourceSdlSurface );
 		}
 		
 		// The destination lock must use dstSurface (it used srcSurface before).
 		auto destSdlSurface = dstSurface.sdl_surface;
 		bool dstLocked = false;
 		if ( destSdlSurface !is null )
 			dstLocked = lock( destSdlSurface );
 		scope(exit)
 		{
 			if ( dstLocked )
 				SDL_UnlockSurface( destSdlSurface );
 		}
 	}
 	
 	static if ( sourcebpp == 24 )
 		uint sourceIncrement = 3;
 	else
 		uint sourceIncrement = 1;
 	
 	static if ( destbpp == 24 )
 		uint destIncrement = 3;
 	else
 		uint destIncrement = 1;
 	
 	// Length of one complete scanline, padding included, in index units.
 	int sourceStride = unitSrcSurfaceWidth + spadding;
 	int destStride   = unitDstSurfaceWidth + dpadding;
 	
 	// Since we are not necessarily blitting across the entire width of the
 	//   destination surface or source surface, we have to skip some of the
 	//   pixels on the end of the current scanline and on the beginning of
 	//   the next scanline.
 	// Add that to the padding (which is explained above), and the result
 	//   is these source/dest LineExtra variables.
 	int sLineExtra = sourceStride - unitSrcWidth;
 	int dLineExtra = destStride - unitDstWidth;
 	
 	// initialize the index variables
 	// si = source index
 	// di = destination index
 	// Rows are a full stride apart; leaving the padding out of the row offset
 	//   pointed at the wrong pixels on padded surfaces.
 	int si = (sourceX * sourceIncrement) + (sourceStride * sourceY);
 	int di = (destX   * destIncrement)   + (destStride * destY);
 	
 	// nextLine is always ahead of di by the amount of pixels left in one line
 	//   of the blit.
 	int nextLine;
 	
 	// endi is the index to stop at.  di advances by one stride per blitted
 	//   row, so the end lies "height" strides after the start; without the
 	//   padding the loop stopped early on padded surfaces.
 	int endi = di + (destStride * height);
 	
 	assert( lineWidth + dLineExtra == destStride );
 	
 	uint srgb;
 	uint drgb;
 	
 	// An alpha-only source has no colour of its own; use the supplied one.
 	static if ( sourceRGBA == A8 )
 		srgb = srcColor;
 	
 	while( di < endi )
 	{
 		nextLine = di + lineWidth;
 		
 		static if ( convert32to16 || convert16to32 )
 			mixin innerLoop!(sourceRGBA,destRGBA,HIGH_ADDRESS_HALF);
 		
 		while( di < nextLine )
 		{
 			mixin innerLoop!(sourceRGBA,destRGBA);
 		}
 		
 		static if ( convert32to16 || convert16to32 )
 			mixin innerLoop!(sourceRGBA,destRGBA,LOW_ADDRESS_HALF);
 		
 		// skip to the start of the blit rectangle on the next scanline
 		si += sLineExtra;
 		di += dLineExtra;
 	}
 }
 
 version( SDL )
 {
 	
 	// Locks the surface when SDL requires locking and it is not locked yet.
 	// Returns true only if this call took the lock, i.e. the caller has to
 	//   unlock the surface again.
 	private bool lock( SDL_Surface* surface )
 	{
 		if ( !SDL_MUSTLOCK( surface ) )
 			return false;
 		if ( surface.locked )
 			return false;
 		
 		safe_SDL_LockSurface( surface );
 		return true;
 	}
 	
 	// automatically throw errors resulting from the SDL_LockSurface function.
 	// Calls SDL_LockSurface and throws an Exception carrying SDL's error text
 	//   when the call reports failure (a non-zero result).
 	private void safe_SDL_LockSurface( SDL_Surface* surface )
 	{
 		if ( SDL_LockSurface( surface ) == 0 )
 			return;
 		
 		// Slice SDL's C string up to its terminator; the concatenation below
 		//   copies it into the exception message.
 		char* rawMessage = SDL_GetError();
 		auto messageLength = std.c.string.strlen(rawMessage);
 		throw new Exception( "SDL_LockSurface failed to lock a surface: " ~ rawMessage[0..messageLength] );
 	}
 }
 
 /// Describes a block of pixel data for blit: the raw bytes plus the geometry
 /// and format needed to interpret them.
 struct Surface
 {
 	ubyte[] pixels;
 	uint alphaMask = 0;
 	ushort width = 0xffff;
 	ushort height = 0xffff;
 	ushort pitch = 0xffff; /// width of a scanline in bytes.  
 	ushort RGBAformat = INVALID;
 	
 	version ( SDL )
 	{
 		/// The wrapped SDL surface, or null when the pixels came from elsewhere.
 		SDL_Surface* sdl_surface = null;
 	}
 	
 	/// Builds a Surface over caller-supplied pixel data.
 	/// width and height are in pixels.
 	static Surface opCall( ubyte[] pixels, uint alphaMask,
 	                       ushort width, ushort height, ushort pitch,
 	                       ushort RGBAformat )
 	{
 		assert( pixels !is null );
 		
 		Surface described;
 		described.pixels     = pixels;
 		described.alphaMask  = alphaMask;
 		described.width      = width;
 		described.height     = height;
 		described.pitch      = pitch;
 		described.RGBAformat = RGBAformat;
 		return described;
 	}
 	
 	version ( SDL )
 	{
 		/// Builds a Surface that views the pixel memory of an SDL surface
 		/// (pitch * h bytes) and remembers the SDL surface itself.
 		static Surface opCall( SDL_Surface* surface, ushort RGBAformat )
 		{
 			Surface wrapped;
 			auto byteCount = surface.pitch * surface.h;
 			wrapped.pixels = cast(ubyte[])surface.pixels[0.. byteCount];
 			wrapped.width  = surface.w;
 			wrapped.height = surface.h;
 			wrapped.pitch  = surface.pitch;
 			assert ( RGBAformat != INVALID );
 			wrapped.RGBAformat  = RGBAformat;
 			wrapped.alphaMask   = surface.format.Amask;
 			wrapped.sdl_surface = surface;
 			return wrapped;
 		}
 	}
 }
Jan 26 2007
parent Chad J <gamerChad _spamIsBad_gmail.com> writes:
Oh, sorry, I'm not so much talking about optimizations or bit twiddling, 
but rather D's templating.  It's all about the mixin template.  Normally 
you can't have statements in template bodies, only declarations.  Well, 
I can get around that for most code by prepending the statement with 
"auto dummyVar = ", which turns it into a declaration.  The end result 
has an extraneous assignment to a variable that never gets used, but I 
think the compiler (dmd at least) optimizes that away because I tried 
making a lot of such dummy variables in the middle of my loop and 
performance never decreased.

Thanks for the links.  As far as my example is concerned, GIL is very 
similar to what I am trying to accomplish.  What I would like to do is 
have a situation where you hand the graphics lib a template that 
contains the simple graphics operations you want to perform on each 
source pixel, and you can access as many destination pixels as you want 
and they can be at any coordinates.  The graphics lib takes care of 
creating all of the loops, iteration, weird setup code, and stuff like 
that.  The stuff that the programmer doesn't control should be at least 
nearly as performant as hand rolled code, and ideally this should be 
easily optimized with SIMD instructions.  On top of such a framework, it 
should be easy enough to roll common canned routines like alphablend, 
gradient, hue, bumpmap, rotozoom, even 3d transforms, etc etc, and I 
suppose that would accomplish something like antigrain.

Bill Baxter wrote:
 I confess I'm not sure what all is going on in your code there.  At a 
 glance it looks like there's a lot of hard coded 8/16/24/32's in there. 
  *Seems* like you should be able to make something even more general 
 than that and perhaps in the process make it even leaner and meaner.  :-)
 
 I'm looking forward to the day when someone cranks out something like 
 GIL using D.
 
   http://opensource.adobe.com/gil/presentation/index.htm
 
 And a better AGG using D uber-templates would be nice too.
 
   http://www.antigrain.com/
 
 --bb
 
 Chad J wrote:
 
 Here's the gist of the attached source:

 // Mixin template demonstrating the trick: the assignment to `value` is written
 // as the initializer of a throwaway variable so that it counts as a declaration.
 // `value`, `some` and `expression` are looked up where the template is mixed in.
 template doSomething()
 {
   auto dummy = value = some + expression;
 }

 // Mixes doSomething into its body; the mixed-in declaration assigns
 // some + expression to the local `value` before it is returned.
 uint func( uint some, uint expression )
 {
   uint value = 52;
   mixin doSomething!();
   return value;
 }

 It seems pretty hackish to me, yet useful.

 Attached is a really long-winded alpha blending routine.  The 
 advantage is that it's perhaps the most generalized alpha blending 
 routine I've ever written that is still decently fast (yeah, could be 
 a lot better with simd, gpu usage, or <insert common optimization that 
 doesn't work in general on my pda>).  It could soon do things totally 
 unrelated to alpha blending.  It seems kinda like something that the C 
 preprocessor would be used for, though the thought of using C kinda 
 scares me, and I have hope that D templates/mixins are up to the job.  
 Maybe someday when I have a lot of time on my hands I can figure out 
 how to make the templates generate runtime for-loops, complete with 
 custom-tailored innerloop code, which would make it a lot easier to 
 optimize edge cases like sourcePixel[i+1] where there may or may not 
 be an i+1'th pixel and I don't want to afford an 'if'.

 I have to wonder, has someone done this stuff already (the mixin 
 trick, or some sort of graphics routine framework in D)?


 ------------------------------------------------------------------------

 /+ Alpha blended blitting routine. +/

 import std.stdio;

 version = SDL;
 version( SDL )
 {
     import derelict.sdl.sdl;
 }

 // TODO: RGB32?
 // Pixel format identifiers used as template arguments throughout this module.
 enum : uint
 {
     INVALID = 0,
     RGBA32,    // read as 32 bit values; alpha is extracted with sourceAMask
     RGB24,     // read through a pointer, 24 bits per pixel
     RGB16_555, // read as 16 bit values
     RGB16_565, // read as 16 bit values
     RGBA8_I32, // indexed to 32 bit values
     A8,        // only alpha is read from the source
 }

 // Mixin template: loads the source pixel at index si for the given source
 //   format into srgb and/or alpha.  Those variables, together with source, si
 //   and sourceAMask, are expected to exist in the scope it is mixed into.
 // Template bodies may only hold declarations, so every assignment is written
 //   as the initializer of a throwaway readS_dummyN variable.
 // (Line breaks restored: re-wrapping had pulled the RGB24 read into a comment
 //   and left comment text outside of comments.)
 private template readSource( uint RGBA )
 {
     static if ( RGBA == RGBA32 )
     {
         // Split the pixel into its alpha bits and its colour bits.
         uint readS_dummy1 = srgb = source[si];
         uint readS_dummy2 = alpha = srgb & sourceAMask;
         uint readS_dummy3 = srgb = srgb & ~sourceAMask;
     }
     else static if ( RGBA == RGB24 )
     {
         // There is no such thing as an array with 24-bit elements, so
         //   we have to use pointers.
         uint readS_dummy1 = srgb = *(cast(uint*)(source + si));
     }
     else static if ( RGBA == RGB16_555 || RGBA == RGB16_565 )
     {
         // cast(uint) is not necessary in all cases, only if dest is
         //   32 bpp
         uint readS_dummy1 = srgb = cast(uint)source[si];
     }
     else static if ( RGBA == RGBA8_I32 )
     {
         // Look the 8 bit index up in rgbaTable, then split as for RGBA32.
         // NOTE(review): rgbaTable is not declared in the visible source.
         uint readS_dummy1 = srgb = rgbaTable[source[si]];
         uint readS_dummy2 = alpha = srgb & sourceAMask;
         uint readS_dummy3 = srgb = srgb & ~sourceAMask;
     }
     else static if ( RGBA == A8 )
     {
         // Alpha-only source: only alpha is read, srgb is left untouched.
         uint readS_dummy1 = alpha = cast(uint)source[si];
     }
     else
     {
         pragma(msg,"Invalid source RGBA format for reading.");
         static assert(0);
     }
 }

 // Mixin template: loads the destination pixel at index di into drgb.
 // For RGB24, and for 16 bpp when half16bpp selects one half of a 32 bit word,
 //   the value that was read is also kept in drgbOriginal.
 // dest, di and drgb are expected to exist in the scope it is mixed into.
 // (Line breaks restored: re-wrapping had pulled declarations into comments
 //   and left comment text outside of comments.)
 private template readDestination( uint RGBA, ubyte half16bpp = NOT_APPLICABLE )
 {
     static if ( RGBA == RGBA32 )
     {
         uint readD_dummy1 = drgb = dest[di];
     }
     else static if ( RGBA == RGB24 )
     {
         // There is no such thing as an array with 24-bit elements, so
         //   we have to use pointers.
         uint readD_dummy1 = drgb = *(cast(uint*)(dest + di));
         
         // Since we can't write 24 bits, we can either write 3 bytes
         //   (slow), or we can overwrite 8 bits of the next pixel.  The
         //   latter is faster and can be done safely if we overwrite those
         //   8 bits with their previous contents.
         uint drgbOriginal = drgb;
     }
     else static if ( RGBA == RGB16_555 || RGBA == RGB16_565 )
     {
         uint readD_dummy1 = drgb = dest[di];
         
         static if ( half16bpp == LOW_ADDRESS_HALF ||
                     half16bpp == HIGH_ADDRESS_HALF )
         {
             // Store the original values of both pixels being read.
             // When reading and writing 2 pixels at a time, it is
             //   impossible to prevent overwriting a pixel that we don't
             //   want to.  At least not without some rather complicated
             //   code.  So instead, we just make sure that the pixel we
             //   don't want to overwrite is overwritten with its original
             //   value.  The original value is stored here.
             // drgb holds the word that was just read; the previously used
             //   name destReadResult is not declared anywhere.
             uint drgbOriginal = drgb;
         }
     }
     else static if ( RGBA == RGBA8_I32 )
     {
         // NOTE(review): rgbaTable is not declared in the visible source.
         uint readD_dummy1 = drgb = rgbaTable[dest[di]];
     }
     else
     {
         pragma(msg,"Invalid destination RGBA format for reading.");
         static assert(0);
     }
 }

 // Mixin template: reads the source pixel and then the destination pixel by
 //   mixing in readSource and readDestination for the given formats.
 // half16bpp is forwarded to readDestination only.
 private template read( uint sourceRGBA, uint destRGBA, 
                        ubyte half16bpp = NOT_APPLICABLE )
 {
     mixin readSource!( sourceRGBA );
     mixin readDestination!( destRGBA, half16bpp );
 }

 // Mixin template: converts the freshly read source colour in srgb from the
 //   source pixel layout to the destination pixel layout, in place.
 // Combinations that need no conversion are accepted and generate nothing;
 //   unsupported combinations stop compilation with static assert(0).
 // srgb is expected to exist in the scope this template is mixed into.
 private template convert( uint sourceRGBA, uint destRGBA )
 {
     static if ( sourceRGBA == RGBA32 || sourceRGBA == RGB24 ||
                 sourceRGBA == RGBA8_I32 )
     {
         static if ( destRGBA == RGBA32 || destRGBA == RGB24 )
         {
             // Same channel layout: srgb is already usable, do nothing.
         }
         // This must be chained with "else"; as an independent static if its
         //   else branch rejected every 32/24 bit destination.
         else static if ( destRGBA == RGB16_565 )
         {
             // Here we must shrink a 32 bit pixel from the source into a
             //   16 bit pixel.
             // in this situation we write the 16 bit resultant pixels
             //   one at a time so the extra 16 bits will be safely discarded.
             uint convert_dummy1 =
             srgb = ((0xf800 & (srgb >> 8 )) +
                     (0x07e0 & (srgb >> 5 )) +
                     (0x001f & (srgb >> 3 )));
         }
         else static assert(0);
     }
     else static if ( sourceRGBA == RGB16_565 )
     {
         static if ( destRGBA == RGBA32 || destRGBA == RGB24 )
         {
             // Here we must expand a 16 bit pixel from the source into a
             //   32 bit pixel.
             // In this situation we read the 16 bit pixels one at a time
             //   so the extra 16 bits can be safely discarded.
             uint convert_dummy1 =
             srgb = (((srgb & 0xf800) << 8 ) +
                     ((srgb & 0x07e0) << 5 ) +
                     ((srgb & 0x001f) << 3 ));
         }
         else static if ( destRGBA == RGB16_565 )
         {
             // Same layout on both sides: do nothing.
         }
         else static assert(0);
     }
     else static if ( sourceRGBA == A8 )
     {
         // Alpha-only source: srgb was preset by the caller, nothing to convert.
     }
     else static assert(0);
 }

 // Mixin template: alpha blends srgb onto drgb and leaves the result in drgb.
 // RGBA is the destination format.  The names srgb, drgb, alpha, sourceAMask,
 //   destbpp and sourceRGBA are expected to exist in the scope it is mixed into.
 // (Line breaks restored: re-wrapping had pulled code into comments and left
 //   comment text outside of comments.)
 private template blend( uint RGBA )
 {
     // Note that this will get it right regardless of which color is in
     //   which channel.  Of course, the channels' placements must be correct.
     // It also preserves the destination's alpha channel, if present.
     static if ( RGBA == RGBA32 || RGBA == RGB24 || RGBA == RGBA8_I32 ||
                 RGBA == RGB16_565 || RGBA == RGB16_555 )
     {
         static if ( RGBA == RGBA32 || RGBA == RGB24 || RGBA == RGBA8_I32 )
         {
             const shift = 8;
             const evenMask = 0x00ff00ff;
         }
         else
         {
             // For 16bpp formats:
             // alpha must be a 5 bit value (the 3 hi bits MUST be clear)
             // this does 2 16bit pixels at a time in one 32 bit word.
             // endianness doesn't matter on 565 formats due to symmetry
             // TODO: take into account endianness on 555 formats
             //        (probably only noticeable on big endian machines)
             const shift = 5;
             const evenMask = 0x07e0f81f;
         }
         const oddMask = ~evenMask;
         
         static if ( RGBA == RGBA32 || RGBA == RGBA8_I32 )
             uint originalDestAlpha = drgb & sourceAMask;
         
         // Reduce alpha from 8 to 5 bits only when readSource has just loaded
         //   a fresh per-pixel alpha (RGBA32, RGBA8_I32 and A8 sources).  For
         //   the other sources alpha is a constant, and shifting it again on
         //   every pixel made it decay to 0.
         static if ( destbpp == 16 &&
                     ( sourceRGBA == RGBA32 || sourceRGBA == RGBA8_I32 ||
                       sourceRGBA == A8 ) )
             uint blend_dummy1 = alpha = alpha >> 3;
         
         static if ( destbpp == 16 && sourceRGBA == A8 )
         {
             // Extract the middle channel and shift it into the high
             //   16 bits, giving at least 5 bits above it to hold the
             //   multiplication overflow, and at least 5 bits below it to
             //   hold the high channel's multiplication overflow.
             uint sourceChannels = ((srgb << 16) | srgb) & evenMask;
             uint destChannels =   ((drgb << 16) | drgb) & evenMask;
             
             // do the blending
             uint blend_temp =
                 (((sourceChannels - destChannels) * alpha) >> shift) +
                 destChannels;
             
             // Now we move the middle channel from the high 16 bits,
             //   back into its rightful place in the middle.
             uint blend_dummy2 =
             drgb = (blend_temp & (evenMask & 0x0000ffff)) |
                   ((blend_temp & (evenMask & 0xffff0000)) >> 16 );
         }
         else
         {
             // The channels selected by evenMask and by oddMask are blended
             //   separately so each has free bits beside it for the product.
             uint blend_dummy2 =
             drgb =
                 ((((((srgb & evenMask)-(drgb & evenMask))  * alpha) >> shift) + drgb) & evenMask) |
                 ((((((srgb & oddMask )-(drgb & oddMask )) >> shift)  * alpha) + drgb) & oddMask);
         }
         
         static if ( RGBA == RGBA32 || RGBA == RGBA8_I32 ) // preserve alpha
             uint blend_dummy3 = drgb = (drgb & ~sourceAMask) |
                                        originalDestAlpha;
     }
     else
     {
         pragma(msg,"Invalid RGBA format for alpha blending.");
         static assert(0);
     }
 }


 // Mixin template: stores the blended pixel(s) in `drgb` back into `dest` at
 // index `di`, then advances the source and destination indices.
 //
 // RGBA is the destination pixel format.  half16bpp selects, for 16bpp
 // destinations, whether only the pixel at the higher or lower memory address
 // of the word is replaced (the other half is restored from `drgbOriginal`).
 //
 // Expects `dest`, `di`, `si`, `drgb`, `drgbOriginal`, `sourceIncrement` and
 // `destIncrement` to be visible in the scope it is mixed into.
 private template write( uint RGBA, ubyte half16bpp = NOT_APPLICABLE )
 {

     static if ( RGBA == RGBA32 )
     {
         uint write_dummy1 = dest[di] = drgb;
     }
     else static if ( RGBA == RGB24 )
     {
         // A whole 32 bit word is written; the byte that does not belong to
         //   this pixel is restored from the original destination word.
         uint* address = cast(uint*)(dest + di);

         version ( BigEndian )
             uint write_dummy1 = *address = (drgb & 0xffffff00) |
                                            (drgbOriginal & 0x000000ff);
         else
             uint write_dummy1 = *address = (drgb & 0x00ffffff) |
                                            (drgbOriginal & 0xff000000);
     }
     else static if ( RGBA == RGB16_565 || RGBA == RGB16_555 )
     {
         // for selecting the lowest or highest pixel in terms of
         //   address in memory rather than place in the word/register
         version ( BigEndian )
             const writeMask = 0x0000ffff;
         else
             const writeMask = 0xffff0000;

         static if ( half16bpp == HIGH_ADDRESS_HALF )
             uint write_dummy1 = dest[di] = (drgb & writeMask) |
                                            (drgbOriginal & ~writeMask);
         else static if ( half16bpp == LOW_ADDRESS_HALF )
             uint write_dummy1 = dest[di] = (drgb & ~writeMask) |
                                            (drgbOriginal & writeMask);
         else
             uint write_dummy1 = dest[di] = drgb;
     }
     // TODO:  writing RGBA8_I32.  needs an algo to reverse a 32 bpp value
     //          into an 8 bit indexed value.
     else
     {
         pragma(msg,"Invalid RGBA format for alpha blending.");
         static assert(0);
     }

     // Step to the next source and destination pixel.
     uint write_dummy2 = si = si + sourceIncrement;
     uint write_dummy3 = di = di + destIncrement;
 }

 // Values for the `half16bpp` template argument.  They pick a pixel by its
 // position in memory (lower or higher address) rather than by its position
 // inside the 32 bit word.
 private enum : ubyte
 {
     NOT_APPLICABLE    = 0, // no half selected; the default
     LOW_ADDRESS_HALF  = 1, // only the pixel at the lower address
     HIGH_ADDRESS_HALF = 2, // only the pixel at the higher address
 }

 // Mixin template: the body of one blit iteration.  It chains the read,
 // convert, blend and write mixins, in that order, for the given source and
 // destination formats.
 //
 // half16bpp is forwarded to the read and write stages and may only be set
 // when the destination is one of the 16bpp formats.
 private template innerLoop( uint sourceRGBA, uint destRGBA,
                             ubyte half16bpp = NOT_APPLICABLE )
 {
     // Reject a half selection for any destination that is not 16bpp.
     static if ( half16bpp != NOT_APPLICABLE &&
                 destRGBA != RGB16_565 && destRGBA != RGB16_555 )
     {
         pragma(msg,"The half16bpp argument is only to be used when the "
                    "destination format is 16 bits per pixel.");
         static assert(0);
     }

     mixin read!( sourceRGBA, destRGBA, half16bpp );
     mixin convert!( sourceRGBA, destRGBA );
     mixin blend!( destRGBA );
     mixin write!( destRGBA, half16bpp );
 }

 // Mixin template: declares the scanline padding and a suitably typed view
 // of the pixel data for one of the two surfaces taking part in a blit.
 //
 // With isSource == true it works on `srcSurface` and publishes the results
 // as `spadding` / `source`; otherwise it works on `dstSurface` and publishes
 // `dpadding` / `dest`.  Expects `srcSurface`, `dstSurface`, `sourcebpp` and
 // `destbpp` to be visible in the scope it is mixed into.
 private template calculatePaddingAndArrays( bool isSource )
 {

     static if ( isSource )
     {
         alias srcSurface surface;
         alias sourcebpp bpp;
         alias destbpp otherbpp;
     }
     else
     {
         alias dstSurface surface;
         alias destbpp bpp;
         alias sourcebpp otherbpp;
     }

     // Padding is the amount of extra data at the end of a scanline used to
     //   ensure that the end of the scanline lines up on a 32 bit boundary.
     // spadding = source padding
     // dpadding = dest padding
     // In this case, the units padding is measured in change depending on
     //   the source and destination format.
     // The amount of data that is handled in each iteration also changes,
     //   and is reflected by the different types of arrays.
     static if ( bpp == 32 )
     {
         auto padding = 0;
         uint[] pixelData = cast(uint[])surface.pixels;
     }
     else static if ( bpp == 24 )
     {
         // padding measured in bytes
         auto padding = surface.pitch - (surface.width * 3);
         ubyte* pixelData = surface.pixels.ptr;
     }
     else static if ( bpp == 16 )
     {
         static if ( otherbpp != 16 /+otherbpp == 32 || otherbpp == 24
                                      || otherbpp == 8+/ )
         {
             // padding measured in shorts
             auto padding = (surface.pitch >> 1) - surface.width;
             ushort[] pixelData = cast(ushort[])surface.pixels;
         }
         else
         {
             // 16bpp on both sides: the data is viewed as 32 bit words
             //   (two pixels each) and no padding is accounted for.
             auto padding = 0;
             uint[] pixelData = cast(uint[])surface.pixels;
         }
     }
     else static if ( bpp == 8 )
     {
         // padding measured in bytes
         auto padding = surface.pitch - surface.width;
         ubyte[] pixelData = surface.pixels;
     }
     else
         static assert(0);

     static if ( isSource )
     {
         alias padding spadding;
         alias pixelData source;
     }
     else
     {
         alias padding dpadding;
         alias pixelData dest;
     }
 }

 // Alpha-blends a rectangle of srcSurface onto dstSurface.
 //
 // This function shall do no clipping.
 //
 // Template parameters:
 //   sourceRGBA, destRGBA - pixel formats of the source and destination.
 // Parameters:
 //   sourceX, sourceY - top-left corner of the rectangle in the source
 //   destX, destY     - top-left corner of the rectangle in the destination
 //   width, height    - size of the rectangle, in pixels
 //   srcSurface       - surface that is read from
 //   dstSurface       - surface that is blended onto
 //   srcColor         - color used as the source pixel when sourceRGBA == A8
 //   alpha            - blend factor; reduced to 5 bits here for 16bpp
 //                      destinations
 void blit( uint sourceRGBA, uint destRGBA )
         ( short sourceX, short sourceY,
           short destX, short destY, short width, short height,
           inout Surface srcSurface, inout Surface dstSurface,
           uint srcColor, uint alpha )
 {
     // this stuff just determines the bits per pixel of the source and
     //   destination surfaces
     static if ( sourceRGBA == RGBA32 )
         const sourcebpp = 32;
     else static if ( sourceRGBA == RGB24 )
         const sourcebpp = 24;
     else static if ( sourceRGBA == RGB16_565 || sourceRGBA == RGB16_555 )
         const sourcebpp = 16;
     else
         const sourcebpp = 8;

     static if ( destRGBA == RGBA32 )
         const destbpp = 32;
     else static if ( destRGBA == RGB24 )
         const destbpp = 24;
     else static if ( destRGBA == RGB16_565 || destRGBA == RGB16_555 )
         const destbpp = 16;
     else
         const destbpp = 8;
     //

     static if ( (sourcebpp == 32 || sourcebpp == 24) && destbpp == 16 )
         const convert32to16 = true;
     else
         const convert32to16 = false;

     static if ( sourcebpp == 16 && (destbpp == 32 || destbpp == 24) )
         const convert16to32 = true;
     else
         const convert16to32 = false;

     // For 16bpp destinations two pixels are processed per 32 bit word, so
     //   the constant source color is replicated into both halves.
     static if ( (destRGBA == RGB16_565 || destRGBA == RGB16_555) &&
                 sourceRGBA != A8 )
         srcColor |= (srcColor << 16);

     // 16bpp blending needs a 5 bit alpha; reduce it once, up front.
     static if ( destbpp == 16 )
         alpha >>= 3;

     // note that the padding quantities are necessarily zero if
     //   unitWidth = width / 2;
     //   that's important because they have different units of measurement!

     mixin calculatePaddingAndArrays!( true );
     mixin calculatePaddingAndArrays!( false );

     static if ( destbpp == 24 )
     {
         uint lineWidth = width * 3;

         static if ( sourcebpp == 24 )
         {
             // same as: unitSrcSurfaceWidth = srcSurface.width * 3;
             uint unitSrcSurfaceWidth = srcSurface.pitch - spadding;
             uint unitSrcWidth = lineWidth;
         }
         else
         {
             uint unitSrcSurfaceWidth = srcSurface.width;
             uint unitSrcWidth = width;
         }

         uint unitDstSurfaceWidth = dstSurface.pitch - dpadding;

         uint unitDstWidth = lineWidth;
     }
     else static if ( sourcebpp == 16 && destbpp == 16 )
     {
         uint lineWidth = width / 2; // because we do 2 pixels at a time

         // The +(width & 1) part is used to make the division round up.
         uint unitSrcSurfaceWidth = (srcSurface.width / 2) +
                                    (srcSurface.width & 1);
         uint unitDstSurfaceWidth = (dstSurface.width / 2) +
                                    (dstSurface.width & 1);

         // The lineWidth variable rounds down on division, so it may be
         //   missing a pixel.  That is desirable since we don't want
         //   alphablend onto the pixel next to the missing pixel.  Of
         //   course, we will handle the missing pixel individually, but it
         //   is still useful to have access to a rounded-up version of the
         //   blit's width.
         uint unitSrcWidth = lineWidth + (width & 1);
         uint unitDstWidth = unitSrcWidth;
     }
     else
     {
         uint lineWidth = width;
         uint unitSrcWidth = width;
         uint unitDstWidth = width;
         uint unitSrcSurfaceWidth = srcSurface.width;
         uint unitDstSurfaceWidth = dstSurface.width;
     }

     uint sourceAMask = srcSurface.alphaMask;

     version( SDL )
     {
         // Lock both surfaces for the whole duration of the blit.  The
         //   scope(exit) guards are registered at function level (a version
         //   block does not open a new scope) so that the surfaces stay
         //   locked until blit returns or throws; a guard placed inside the
         //   `if` body would unlock the surface as soon as that body ended.
         auto sourceSdlSurface = srcSurface.sdl_surface;
         bool srcLocked = false;
         if ( sourceSdlSurface !is null )
             srcLocked = lock( sourceSdlSurface );
         scope(exit)
         {
             if ( srcLocked )
                 SDL_UnlockSurface( sourceSdlSurface );
         }

         // The destination's SDL surface comes from dstSurface.
         auto destSdlSurface = dstSurface.sdl_surface;
         bool dstLocked = false;
         if ( destSdlSurface !is null )
             dstLocked = lock( destSdlSurface );
         scope(exit)
         {
             if ( dstLocked )
                 SDL_UnlockSurface( destSdlSurface );
         }
     }

     static if ( sourcebpp == 24 )
         uint sourceIncrement = 3;
     else
         uint sourceIncrement = 1;

     static if ( destbpp == 24 )
         uint destIncrement = 3;
     else
         uint destIncrement = 1;

     // Since we are not necessarily blitting accross the entire width of the
     //   destination surface or source surface, we have to skip some of the
     //   pixels on the end of the current scanline and on the beginning of
     //   the next scanline.
     // Add that to the padding (which is explained above), and the result
     //   is these source/dest LineExtra variables.
     int sLineExtra = unitSrcSurfaceWidth + spadding - unitSrcWidth;
     int dLineExtra = unitDstSurfaceWidth + dpadding - unitDstWidth;

     // initialize the index variables
     // si = source index
     // di = destination index
     // A full scanline is the surface width plus its padding; that is the
     //   distance the indices advance per line in the loop below, so the
     //   same stride must be used to find the first line.
     int si = (sourceX * sourceIncrement) +
              ((unitSrcSurfaceWidth + spadding) * sourceY);
     int di = (destX   * destIncrement)   +
              ((unitDstSurfaceWidth + dpadding) * destY);

     // nextLine is always ahead of di by the amount of pixels left in one
     //   line of the blit.
     int nextLine;

     // endi is the index to stop at: `height` full scanlines past the start.
     int endi = di + ((unitDstSurfaceWidth + dpadding) * height);

     assert( lineWidth + dLineExtra == unitDstSurfaceWidth + dpadding );

     // TODO: remove this
     void writeHex ( char[] name, uint number )
     {
         writef( "(",name,std.string.toString( cast(ulong)number,
                                               cast(uint)16 ),")|" );
     }
     //

     uint srgb;
     uint drgb;

     // With an alpha-only source every pixel uses the same constant color.
     static if ( sourceRGBA == A8 )
         srgb = srcColor;

     while( di < endi )
     {
         nextLine = di + lineWidth;

         static if ( convert32to16 || convert16to32 )
             mixin innerLoop!(sourceRGBA,destRGBA,HIGH_ADDRESS_HALF);

         while( di < nextLine )
         {
             mixin innerLoop!(sourceRGBA,destRGBA);
         }

         static if ( convert32to16 || convert16to32 )
             mixin innerLoop!(sourceRGBA,destRGBA,LOW_ADDRESS_HALF);

         // Skip the rest of this scanline and the start of the next one.
         si += sLineExtra;
         di += dLineExtra;
     }
 }

 version( SDL )
 {
     
     // Locks `surface` if SDL_MUSTLOCK says it needs locking and it is not
     // locked already.  Returns true only when this call took the lock, so
     // the caller knows whether it has to unlock again.
     private bool lock( SDL_Surface* surface )
     {
         if ( !SDL_MUSTLOCK( surface ) || surface.locked )
             return false;

         safe_SDL_LockSurface( surface );
         return true;
     }
     
     // Calls SDL_LockSurface and automatically throws an Exception carrying
     // the SDL_GetError text if it reports failure (a non-zero result).
     private void safe_SDL_LockSurface( SDL_Surface* surface )
     {
         if ( SDL_LockSurface( surface ) != 0 )
         {
             // Slice the C string returned by SDL; the concatenation below
             //   copies it into the exception message.
             char* sdlError = SDL_GetError();
             char[] error = sdlError[0..std.c.string.strlen(sdlError)];
             throw new Exception(
                 "SDL_LockSurface failed to lock a surface: "~error );
         }
     }
 }

 /// Describes a block of pixel data: the raw bytes, its dimensions, the
 /// length of a scanline and the pixel format.
 struct Surface
 {
     ubyte[] pixels;
     uint alphaMask = 0;
     ushort width = 0xffff;
     ushort height = 0xffff;
     ushort pitch = 0xffff; /// width of a scanline in bytes.
     ushort RGBAformat = INVALID;

     /// Builds a Surface around caller-supplied pixel data.
     /// width and height are in pixels.
     static Surface opCall( ubyte[] pixels, uint alphaMask,
                            ushort width, ushort height, ushort pitch,
                            ushort RGBAformat )
     {
         assert( pixels !is null );

         Surface made;
         made.pixels     = pixels;
         made.alphaMask  = alphaMask;
         made.width      = width;
         made.height     = height;
         made.pitch      = pitch;
         made.RGBAformat = RGBAformat;
         return made;
     }

     version ( SDL )
     {
         SDL_Surface* sdl_surface = null;

         /// Builds a Surface that views the pixel memory of an SDL surface.
         /// The pixel slice covers pitch * h bytes; RGBAformat must not be
         /// INVALID.
         static Surface opCall( SDL_Surface* surface, ushort RGBAformat )
         {
             Surface made;

             auto byteCount = surface.pitch * surface.h;
             made.pixels = cast(ubyte[])surface.pixels[0..byteCount];
             made.pitch  = surface.pitch;
             made.width  = surface.w;
             made.height = surface.h;

             assert ( RGBAformat != INVALID );

             made.sdl_surface = surface;
             made.alphaMask   = surface.format.Amask;
             made.RGBAformat  = RGBAformat;
             return made;
         }
     }
 }
Jan 26 2007