digitalmars.D - fun with mixins
- Chad J (27/27) Jan 26 2007 Here's the gist of the attached source:
- Bill Baxter (11/673) Jan 26 2007 I confess I'm not sure what all is going on in your code there. At a
- Chad J (23/738) Jan 26 2007 Oh, sorry, I'm not so much talking about optimizations or bit twiddling,...
Here's the gist of the attached source: template doSomething() { auto dummy = value = some + expression; } uint func( uint some, uint expression ) { uint value = 52; mixin doSomething!(); return value; } It seems pretty hackish to me, yet useful. Attached is a really long-winded alpha blending routine. The advantage is that it's perhaps the most generalized alpha blending routine I've ever written that is still decently fast (yeah, could be a lot better with simd, gpu usage, or <insert common optimization that doesn't work in general on my pda>). It could soon do things totally unrelated to alpha blending. It seems kinda like something that the C preprocessor would be used for, though the thought of using C kinda scares me, and I have hope that D templates/mixins are up to the job. Maybe someday when I have a lot of time on my hands I can figure out how to make the templates generate runtime for-loops, complete with custom-tailored innerloop code, which would make it a lot easier to optimize edge cases like sourcePixel[i+1] where there may or may not be an i+1'th pixel and I don't want to afford an 'if'. I have to wonder, has someone done this stuff already (the mixin trick, or some sort of graphics routine framework in D)?
Jan 26 2007
I confess I'm not sure what all is going on in your code there. At a glance it looks like there's a lot of hard coded 8/16/24/32's in there. *Seems* like you should be able to make something even more general than that and perhaps in the process make it even leaner and meaner. :-) I'm looking forward to the day when someone cranks out something like GIL using D. http://opensource.adobe.com/gil/presentation/index.htm And a better AGG using D uber-templates would be nice too. http://www.antigrain.com/ --bb Chad J wrote:Here's the jist of the attached source: template doSomething() { auto dummy = value = some + expression; } uint func( uint some, uint expression ) { uint value = 52; mixin doSomething!(); return value; } It seems pretty hackish to me, yet useful. Attached is a really long-winded alpha blending routine. The advantage is that it's perhaps the most generalized alpha blending routine I've ever written that is still decently fast (yeah, could be a lot better with simd, gpu usage, or <insert common optimization that doesn't work in general on my pda>). It could soon do things totally unrelated to alpha blending. It seems kinda like something that the C preprocessor would be used for, though the thought of using C kinda scares me, and I have hope that D templates/mixins are up to the job. Maybe someday when I have a lot of time on my hands I can figure out how to make the templates generate runtime for-loops, complete with custom-tailored innerloop code, which would make it a lot easier to optimize edge cases like sourcePixel[i+1] where there may or may not be an i+1'th pixel and I don't want to afford an 'if'. I have to wonder, has someone done this stuff already (the mixin trick, or some sort of graphics routine framework in D)? ------------------------------------------------------------------------ /+ Alpha blended blitting routine. +/ import std.stdio; version = SDL; version( SDL ) { import derelict.sdl.sdl; } // TODO: RGB32? 
/// Pixel formats understood by the blitting templates below.
enum : uint
{
    INVALID = 0,
    RGBA32,
    RGB24,
    RGB16_555,
    RGB16_565,
    RGBA8_I32, // indexed to 32 bit values
    A8,
}

// ---------------------------------------------------------------------------
// All of the templates below are meant to be used as *mixins* inside blit().
// A template body may only contain declarations, so every statement is
// disguised as the initializer of a throw-away "dummy" variable.  The
// templates read and write names that exist at the mixin site (srgb, drgb,
// alpha, si, di, source, dest, sourceAMask, ...), not template parameters.
// ---------------------------------------------------------------------------

/// Reads one source element at index si into srgb (and, for formats that
/// carry alpha, into alpha).
private template readSource( uint RGBA )
{
    static if ( RGBA == RGBA32 )
    {
        uint readS_dummy1 = srgb = source[si];
        // NOTE(review): alpha is the masked value, still sitting at the
        // mask's bit position; blend() multiplies by it as if it were a
        // small integer.  Confirm whether a shift down is intended.
        uint readS_dummy2 = alpha = srgb & sourceAMask;
        uint readS_dummy3 = srgb = srgb & ~sourceAMask;
    }
    else static if ( RGBA == RGB24 )
    {
        // There is no such thing as an array with 24-bit elements, so we have
        // to use pointers.
        uint readS_dummy1 = srgb = *(cast(uint*)(source + si));
    }
    else static if ( RGBA == RGB16_555 || RGBA == RGB16_565 )
    {
        // cast(uint) is not necessary in all cases, only if dest is 32 bpp
        uint readS_dummy1 = srgb = cast(uint)source[si];
    }
    else static if ( RGBA == RGBA8_I32 )
    {
        // NOTE(review): rgbaTable is not declared anywhere in this file; it
        // must be supplied at the mixin site before this branch can compile.
        uint readS_dummy1 = srgb = rgbaTable[source[si]];
        uint readS_dummy2 = alpha = srgb & sourceAMask;
        uint readS_dummy3 = srgb = srgb & ~sourceAMask;
    }
    else static if ( RGBA == A8 )
    {
        uint readS_dummy1 = alpha = cast(uint)source[si];
    }
    else
    {
        pragma(msg,"Invalid source RGBA format for reading.");
        static assert(0);
    }
}

/// Reads one destination element at index di into drgb.  For formats where
/// the later write touches more bits than one pixel, the untouched value is
/// kept in drgbOriginal so write() can restore it.
private template readDestination( uint RGBA, ubyte half16bpp = NOT_APPLICABLE )
{
    static if ( RGBA == RGBA32 )
    {
        uint readD_dummy1 = drgb = dest[di];
    }
    else static if ( RGBA == RGB24 )
    {
        // There is no such thing as an array with 24-bit elements, so we have
        // to use pointers.
        uint readD_dummy1 = drgb = *(cast(uint*)(dest + di));

        // Since we can't write 24 bits, we can either write 3 bytes (slow),
        // or we can overwrite 8 bits of the next pixel.  The latter is
        // faster and can be done safely if we overwrite those 8 bits with
        // their previous contents.
        uint drgbOriginal = drgb;
    }
    else static if ( RGBA == RGB16_555 || RGBA == RGB16_565 )
    {
        uint readD_dummy1 = drgb = dest[di];
        static if ( half16bpp == LOW_ADDRESS_HALF ||
                    half16bpp == HIGH_ADDRESS_HALF )
        {
            // Store the original values of both pixels being read.
            // When reading and writing 2 pixels at a time, it is impossible
            // to prevent overwriting a pixel that we don't want to.  At
            // least not without some rather complicated code.  So instead,
            // we just make sure that the pixel we don't want to overwrite
            // is overwritten with its original value.  The original value
            // is stored here.
            // (Was "destReadResult", a name that is never declared; the
            // value just read lives in drgb, as in the RGB24 branch above.)
            uint drgbOriginal = drgb;
        }
    }
    else static if ( RGBA == RGBA8_I32 )
    {
        // NOTE(review): rgbaTable is not declared anywhere in this file.
        uint readD_dummy1 = drgb = rgbaTable[dest[di]];
    }
    else
    {
        pragma(msg,"Invalid destination RGBA format for reading.");
        static assert(0);
    }
}

/// Reads both the source and the destination element.
private template read( uint sourceRGBA, uint destRGBA,
                       ubyte half16bpp = NOT_APPLICABLE )
{
    mixin readSource!( sourceRGBA );
    mixin readDestination!( destRGBA, half16bpp );
}

/// Converts the source color in srgb to the destination's bit layout.
private template convert( uint sourceRGBA, uint destRGBA )
{
    static if ( sourceRGBA == RGBA32 || sourceRGBA == RGB24 ||
                sourceRGBA == RGBA8_I32 )
    {
        static if ( destRGBA == RGBA32 || destRGBA == RGB24 )
        {
            // do nothing: srgb already holds the value in the right layout
        }
        // This used to be a plain "static if", which made the trailing
        // "else static assert(0)" fire for every 32/24 bit destination.
        else static if ( destRGBA == RGB16_565 )
        {
            // Here we must shrink a 32 bit pixel from the source into a
            // 16 bit pixel.
            // in this situation we write the 16 bit resultant pixels one at
            // a time so the extra 16 bits will be safely discarded.
            uint convert_dummy1 = srgb =
                ((0xf800 & (srgb >> 8 )) +
                 (0x07e0 & (srgb >> 5 )) +
                 (0x001f & (srgb >> 3 )));
        }
        else
            static assert(0);
    }
    else static if ( sourceRGBA == RGB16_565 )
    {
        static if ( destRGBA == RGBA32 || destRGBA == RGB24 )
        {
            // Here we must expand a 16 bit pixel from the source into a
            // 32 bit pixel.
            // In this situation we read the 16 bit pixels one at a time
            // so the extra 16 bits can be safely discarded.
            uint convert_dummy1 = srgb =
                (((srgb & 0xf800) << 8 ) +
                 ((srgb & 0x07e0) << 5 ) +
                 ((srgb & 0x001f) << 3 ));
        }
        else static if ( destRGBA == RGB16_565 )
        {
            // do nothing
        }
        else
            static assert(0);
    }
    else static if ( sourceRGBA == A8 )
    {
        // do nothing: blit() preloads srgb with srcColor for A8 sources
    }
    else
        static assert(0);
}

/// Blends srgb onto drgb using alpha; the result is left in drgb.
private template blend( uint RGBA )
{
    // Note that this will get it right regardless of which color is in which
    // channel.  Of course, the channels' placements must be correct.
    // It also preserves the destination's alpha channel, if present.
    static if ( RGBA == RGBA32 || RGBA == RGB24 || RGBA == RGBA8_I32 ||
                RGBA == RGB16_565 || RGBA == RGB16_555 )
    {
        static if ( RGBA == RGBA32 || RGBA == RGB24 || RGBA == RGBA8_I32 )
        {
            const shift = 8;
            const evenMask = 0x00ff00ff;
        }
        else
        {
            // For 16bpp formats:
            // alpha must be a 5 bit value (the 3 hi bits MUST be clear)
            // this does 2 16bit pixels at a time in one 32 bit word.
            // endianness doesn't matter on 565 formats due to symmetry
            // TODO: take into account endianness on 555 formats
            //   (probably only noticeable on big endian machines)
            const shift = 5;
            const evenMask = 0x07e0f81f;
        }

        const oddMask = ~evenMask;

        static if ( RGBA == RGBA32 || RGBA == RGBA8_I32 )
            uint originalDestAlpha = drgb & sourceAMask;

        // Reduce alpha to 5 bits for 16bpp destinations.  This is done into
        // a local on purpose: the old code shifted the shared alpha variable
        // itself on every pixel, so a constant alpha (source formats that do
        // not reload it per pixel) decayed to zero after a couple of pixels.
        static if ( destbpp == 16 )
            uint blendAlpha = alpha >> 3;
        else
            uint blendAlpha = alpha;

        static if ( destbpp == 16 && sourceRGBA == A8 )
        {
            // Extract the middle channel and shift it into the high 16 bits, giving
            // at least 5 bits above it to hold the multiplication overflow, and at
            // least 5 bits below it to hold the high channel's multiplication overflow.
            uint sourceChannels = ((srgb << 16) | srgb) & evenMask;
            uint destChannels   = ((drgb << 16) | drgb) & evenMask;

            // do the blending
            uint blend_temp =
                (((sourceChannels - destChannels) * blendAlpha) >> shift)
                + destChannels;

            // Now we move the middle channel from the high 16 bits, back into its
            // rightful place in the middle.
            uint blend_dummy2 = drgb =
                (blend_temp & (evenMask & 0x0000ffff)) |
                ((blend_temp & (evenMask & 0xffff0000)) >> 16 );
        }
        else
        {
            uint blend_dummy2 = drgb =
                ((((((srgb & evenMask)-(drgb & evenMask)) * blendAlpha) >> shift) + drgb) & evenMask) |
                ((((((srgb & oddMask )-(drgb & oddMask )) >> shift) * blendAlpha) + drgb) & oddMask);
        }

        static if ( RGBA == RGBA32 || RGBA == RGBA8_I32 ) // preserve alpha
            uint blend_dummy3 = drgb = (drgb & ~sourceAMask) | originalDestAlpha;
    }
    else
    {
        pragma(msg,"Invalid RGBA format for alpha blending.");
        static assert(0);
    }
}

/// Stores drgb to the destination at di, then advances si and di by
/// sourceIncrement / destIncrement.
private template write( uint RGBA, ubyte half16bpp = NOT_APPLICABLE )
{
    static if ( RGBA == RGBA32 )
    {
        uint write_dummy1 = dest[di] = drgb;
    }
    else static if ( RGBA == RGB24 )
    {
        // A full 32 bit word is written; the byte that belongs to the
        // neighbouring pixel is restored from drgbOriginal.
        uint* address = cast(uint*)(dest + di);
        version ( BigEndian )
            uint write_dummy1 = *address =
                (drgb & 0xffffff00) | (drgbOriginal & 0x000000ff);
        else
            uint write_dummy1 = *address =
                (drgb & 0x00ffffff) | (drgbOriginal & 0xff000000);
    }
    else static if ( RGBA == RGB16_565 || RGBA == RGB16_555 )
    {
        // for selecting the lowest or highest pixel in terms of
        //   address in memory rather than place in the word/register
        version ( BigEndian )
            const writeMask = 0x0000ffff;
        else
            const writeMask = 0xffff0000;

        static if ( half16bpp == HIGH_ADDRESS_HALF )
            uint write_dummy1 = dest[di] =
                (drgb & writeMask) | (drgbOriginal & ~writeMask);
        else static if ( half16bpp == LOW_ADDRESS_HALF )
            uint write_dummy1 = dest[di] =
                (drgb & ~writeMask) | (drgbOriginal & writeMask);
        else
            uint write_dummy1 = dest[di] = drgb;
    }
    // TODO: writing RGBA8_I32.  needs an algo to reverse a 32 bpp value into
    //   an 8 bit indexed value.
    else
    {
        // Message used to say "for alpha blending" (copied from blend()).
        pragma(msg,"Invalid destination RGBA format for writing.");
        static assert(0);
    }

    uint write_dummy2 = si = si + sourceIncrement;
    uint write_dummy3 = di = di + destIncrement;
}

/// Selects which 16 bit half of a 32 bit word a read/write applies to.
private enum : ubyte
{
    NOT_APPLICABLE = 0,
    LOW_ADDRESS_HALF,
    HIGH_ADDRESS_HALF,
}

/// One complete read -> convert -> blend -> write step.
private template innerLoop( uint sourceRGBA, uint destRGBA,
                            ubyte half16bpp = NOT_APPLICABLE )
{
    static if ( !(destRGBA == RGB16_565 || destRGBA == RGB16_555)
                && half16bpp > 0 )
    {
        pragma(msg,"The half16bpp argument is only to be used when the "
                   "destination format is 16 bits per pixel.");
        static assert(0);
    }

    mixin read!(sourceRGBA,destRGBA,half16bpp);
    mixin convert!(sourceRGBA,destRGBA);
    mixin blend!(destRGBA);
    mixin write!(destRGBA,half16bpp);
}

/// Declares the per-surface padding and a suitably typed view of the pixel
/// data, exposed as spadding/source (isSource) or dpadding/dest.
private template calculatePaddingAndArrays( bool isSource )
{
    static if ( isSource )
    {
        alias srcSurface surface;
        alias sourcebpp bpp;
        alias destbpp otherbpp;
    }
    else
    {
        alias dstSurface surface;
        alias destbpp bpp;
        alias sourcebpp otherbpp;
    }

    // Padding is the amount of extra data at the end of a scanline used to
    //   ensure that the end of the scanline lines up on a 32 bit boundary.
    // spadding = source padding
    // dpadding = dest padding
    // In this case, the units padding is measured in change depending on
    //   the source and destination format.
    // The amount of data that is handled in each iteration also changes,
    //   and is reflected by the different types of arrays.
    static if ( bpp == 32 )
    {
        auto padding = 0;
        uint[] pixelData = cast(uint[])surface.pixels;
    }
    else static if ( bpp == 24 )
    {
        // padding measured in bytes
        auto padding = surface.pitch - (surface.width * 3);
        ubyte* pixelData = surface.pixels.ptr;
    }
    else static if ( bpp == 16 )
    {
        static if ( otherbpp != 16
                    /+otherbpp == 32 || otherbpp == 24 || otherbpp == 8+/ )
        {
            // padding measured in shorts
            auto padding = (surface.pitch >> 1) - surface.width;
            ushort[] pixelData = cast(ushort[])surface.pixels;
        }
        else
        {
            // 16 -> 16: two pixels are handled per 32 bit word
            auto padding = 0;
            uint[] pixelData = cast(uint[])surface.pixels;
        }
    }
    else static if ( bpp == 8 )
    {
        auto padding = surface.pitch - surface.width; // padding measured in bytes
        ubyte[] pixelData = surface.pixels;
    }
    else
        static assert(0);

    static if ( isSource )
    {
        alias padding spadding;
        alias pixelData source;
    }
    else
    {
        alias padding dpadding;
        alias pixelData dest;
    }
}

/**
 * Alpha blends a width x height rectangle of srcSurface, starting at
 * (sourceX, sourceY), onto dstSurface at (destX, destY).
 *
 * sourceRGBA and destRGBA are the pixel formats (see the enum at the top)
 * and select the inner loop at compile time.  srcColor is the color used
 * when the source is A8 (alpha only).  alpha is the blend factor used for
 * source formats that do not load a per-pixel alpha in readSource.
 *
 * This function shall do no clipping.
 */
void blit( uint sourceRGBA, uint destRGBA )
    ( short sourceX, short sourceY,
      short destX, short destY,
      short width, short height,
      inout Surface srcSurface, inout Surface dstSurface,
      uint srcColor, uint alpha )
{
    // this stuff just determines the bits per pixel of the source and
    //   destination surfaces
    static if ( sourceRGBA == RGBA32 )
        const sourcebpp = 32;
    else static if ( sourceRGBA == RGB24 )
        const sourcebpp = 24;
    else static if ( sourceRGBA == RGB16_565 || sourceRGBA == RGB16_555 )
        const sourcebpp = 16;
    else
        const sourcebpp = 8;

    static if ( destRGBA == RGBA32 )
        const destbpp = 32;
    else static if ( destRGBA == RGB24 )
        const destbpp = 24;
    else static if ( destRGBA == RGB16_565 || destRGBA == RGB16_555 )
        const destbpp = 16;
    else
        const destbpp = 8;

    static if ( (sourcebpp == 32 || sourcebpp == 24) && destbpp == 16 )
        const convert32to16 = true;
    else
        const convert32to16 = false;

    static if ( sourcebpp == 16 && (destbpp == 32 || destbpp == 24) )
        const convert16to32 = true;
    else
        const convert16to32 = false;

    static if ( (destRGBA == RGB16_565 || destRGBA == RGB16_555) &&
                sourceRGBA != A8 )
        srcColor |= (srcColor << 16);

    // alpha is intentionally NOT pre-shifted here any more; blend() derives
    // the 5 bit value for 16bpp destinations from the unmodified alpha on
    // every pixel.

    // note that the padding quantities are necessarily zero if
    //   unitWidth = width / 2;
    // that's important because they have different units of measurement!
    mixin calculatePaddingAndArrays!( true );
    mixin calculatePaddingAndArrays!( false );

    // NOTE(review): a 24bpp source combined with a non-24bpp destination
    // falls into the last branch, which measures the source row in pixels
    // although the source is then addressed in bytes.  Left as found.
    static if ( destbpp == 24 )
    {
        uint lineWidth = width * 3;

        static if ( sourcebpp == 24 )
        {
            // same as: unitSrcSurfaceWidth = srcSurface.width * 3;
            uint unitSrcSurfaceWidth = srcSurface.pitch - spadding;
            uint unitSrcWidth = lineWidth;
        }
        else
        {
            uint unitSrcSurfaceWidth = srcSurface.width;
            uint unitSrcWidth = width;
        }

        uint unitDstSurfaceWidth = dstSurface.pitch - dpadding;
        uint unitDstWidth = lineWidth;
    }
    else static if ( sourcebpp == 16 && destbpp == 16 )
    {
        uint lineWidth = width / 2; // because we do 2 pixels at a time

        // The +(width & 1) part is used to make the division round up.
        uint unitSrcSurfaceWidth = (srcSurface.width / 2) + (srcSurface.width & 1);
        uint unitDstSurfaceWidth = (dstSurface.width / 2) + (dstSurface.width & 1);

        // The lineWidth variable rounds down on division, so it may be
        //   missing a pixel.  That is desirable since we don't want alphablend
        //   onto the pixel next to the missing pixel.  Of course, we will
        //   handle the missing pixel individually, but it is still useful to
        //   have access to a rounded-up version of the blit's width.
        uint unitSrcWidth = lineWidth + (width & 1);
        uint unitDstWidth = unitSrcWidth;
    }
    else
    {
        uint lineWidth = width;
        uint unitSrcWidth = width;
        uint unitDstWidth = width;
        uint unitSrcSurfaceWidth = srcSurface.width;
        uint unitDstSurfaceWidth = dstSurface.width;
    }

    uint sourceAMask = srcSurface.alphaMask;

    version( SDL )
    {
        // The unlock guard has to sit at function scope.  It used to be
        // declared inside the "if" bodies, where scope(exit) runs as soon as
        // the if-block ends, i.e. before a single pixel was touched.
        bool srcLocked = false;
        bool dstLocked = false;
        auto sourceSdlSurface = srcSurface.sdl_surface;
        // This used to read srcSurface.sdl_surface, so the destination was
        // never locked.
        auto destSdlSurface = dstSurface.sdl_surface;

        scope(exit)
        {
            if ( dstLocked )
                SDL_UnlockSurface( destSdlSurface );
            if ( srcLocked )
                SDL_UnlockSurface( sourceSdlSurface );
        }

        if ( sourceSdlSurface !is null )
            srcLocked = lock( sourceSdlSurface );
        if ( destSdlSurface !is null )
            dstLocked = lock( destSdlSurface );
    }

    static if ( sourcebpp == 24 )
        uint sourceIncrement = 3;
    else
        uint sourceIncrement = 1;

    static if ( destbpp == 24 )
        uint destIncrement = 3;
    else
        uint destIncrement = 1;

    // Distance between the starts of two consecutive scanlines, in the same
    // units as si/di.  The row advance below already included the padding;
    // the start indices and endi did not, which put them on the wrong row
    // (and ended the loop early) whenever padding was non-zero.
    uint srcStride = unitSrcSurfaceWidth + spadding;
    uint dstStride = unitDstSurfaceWidth + dpadding;

    // Since we are not necessarily blitting across the entire width of the
    //   destination surface or source surface, we have to skip some of the
    //   pixels on the end of the current scanline and on the beginning of
    //   the next scanline.
    // Add that to the padding (which is explained above), and the result
    //   is these source/dest LineExtra variables.
    int sLineExtra = srcStride - unitSrcWidth;
    int dLineExtra = dstStride - unitDstWidth;

    // initialize the index variables
    // si = source index
    // di = destination index
    int si = (sourceX * sourceIncrement) + (srcStride * sourceY);
    int di = (destX * destIncrement) + (dstStride * destY);

    // nextLine is always ahead of di by the amount of pixels left in one line
    //   of the blit.
    int nextLine;

    // endi is the index to stop at.
    int endi = di + (dstStride * height);

    assert( lineWidth + dLineExtra == dstStride );

    uint srgb;
    uint drgb;

    static if ( sourceRGBA == A8 )
        srgb = srcColor;

    // NOTE(review): the HIGH/LOW half steps are enabled for 32<->16
    // conversions, yet innerLoop rejects a half argument unless the
    // destination is 16bpp, so convert16to32 cannot instantiate.  The
    // intended condition is not evident from this file; left as found.
    while( di < endi )
    {
        nextLine = di + lineWidth;

        static if ( convert32to16 || convert16to32 )
            mixin innerLoop!(sourceRGBA,destRGBA,HIGH_ADDRESS_HALF);

        while( di < nextLine )
        {
            mixin innerLoop!(sourceRGBA,destRGBA);
        }

        static if ( convert32to16 || convert16to32 )
            mixin innerLoop!(sourceRGBA,destRGBA,LOW_ADDRESS_HALF);

        si += sLineExtra;
        di += dLineExtra;
    }
}

version( SDL )
{
    import std.c.string; // strlen, used by safe_SDL_LockSurface

    /// Locks surface if SDL requires it and it is not locked already.
    /// Returns true when this call took the lock (so the caller must unlock).
    private bool lock( SDL_Surface* surface )
    {
        if ( SDL_MUSTLOCK( surface ) && !surface.locked )
        {
            safe_SDL_LockSurface( surface );
            return true;
        }
        return false;
    }

    /// Calls SDL_LockSurface and throws an Exception carrying SDL's error
    /// text if it reports failure.
    private void safe_SDL_LockSurface( SDL_Surface* surface )
    {
        if ( SDL_LockSurface( surface ) != 0 )
        {
            char* sdlError = SDL_GetError();
            char[] error = sdlError[0..std.c.string.strlen(sdlError)];
            throw new Exception(
                "SDL_LockSurface failed to lock a surface: "~error );
        }
    }
}

/// Description of a block of pixel memory that blit() can read or write.
struct Surface
{
    ubyte[] pixels;
    uint alphaMask = 0;
    ushort width = 0xffff;
    ushort height = 0xffff;
    ushort pitch = 0xffff; /// width of a scanline in bytes.
    ushort RGBAformat = INVALID;

    /// Builds a Surface over caller-owned pixel memory.
    /// width and height are in pixels.
    static Surface opCall( ubyte[] pixels, uint alphaMask,
                           ushort width, ushort height,
                           ushort pitch, ushort RGBAformat )
    {
        Surface result;
        assert( pixels !is null );
        result.pixels = pixels;
        result.width = width;
        result.height = height;
        result.pitch = pitch;
        result.RGBAformat = RGBAformat;
        result.alphaMask = alphaMask;
        return result;
    }

    version ( SDL )
    {
        SDL_Surface* sdl_surface = null;

        /// Builds a Surface that views the pixel memory of an SDL surface
        /// and remembers the SDL surface so blit() can lock it.
        static Surface opCall( SDL_Surface* surface, ushort RGBAformat )
        {
            Surface result;
            result.pixels = cast(ubyte[])surface.pixels[0..
                surface.pitch * surface.h];
            result.width = surface.w;
            result.height = surface.h;
            result.pitch = surface.pitch;
            assert ( RGBAformat != INVALID );
            result.RGBAformat = RGBAformat;
            result.alphaMask = surface.format.Amask;
            result.sdl_surface = surface;
            return result;
        }
    }
}
Jan 26 2007
Oh, sorry, I'm not so much talking about optimizations or bit twiddling, but rather D's templating. It's all about the mixin template. Normally you can't have statements in template bodies, only declarations. Well, I can get around that for most code by prepending the statement with "auto dummyVar = ", which turns it into a declaration. The end result has an extraneous assignment to a variable that never gets used, but I think the compiler (dmd at least) optimizes that away because I tried making a lot of such dummy variables in the middle of my loop and performance never decreased. Thanks for the links. As far as my example is concerned, GIL is very similar to what I am trying to accomplish. What I would like to do is have a situation where you hand the graphics lib a template that contains the simple graphics operations you want to perform on each source pixel, and you can access as many destination pixels as you want and they can be at any coordinates. The graphics lib takes care of creating all of the loops, iteration, weird setup code, and stuff like that. The stuff that the programmer doesn't control should be at least nearly as performant as hand rolled code, and ideally this should be easily optimized with SIMD instructions. On top of such a framework, it should be easy enough to roll common canned routines like alphablend, gradient, hue, bumpmap, rotozoom, even 3d transforms, etc etc, and I suppose that would accomplish something like antigrain. Bill Baxter wrote:I confess I'm not sure what all is going on in your code there. At a glance it looks like there's a lot of hard coded 8/16/24/32's in there. *Seems* like you should be able to make something even more general than that and perhaps in the process make it even leaner and meaner. :-) I'm looking forward to the day when someone cranks out something like GIL using D. http://opensource.adobe.com/gil/presentation/index.htm And a better AGG using D uber-templates would be nice too. 
http://www.antigrain.com/ --bb Chad J wrote:Here's the jist of the attached source: template doSomething() { auto dummy = value = some + expression; } uint func( uint some, uint expression ) { uint value = 52; mixin doSomething!(); return value; } It seems pretty hackish to me, yet useful. Attached is a really long-winded alpha blending routine. The advantage is that it's perhaps the most generalized alpha blending routine I've ever written that is still decently fast (yeah, could be a lot better with simd, gpu usage, or <insert common optimization that doesn't work in general on my pda>). It could soon do things totally unrelated to alpha blending. It seems kinda like something that the C preprocessor would be used for, though the thought of using C kinda scares me, and I have hope that D templates/mixins are up to the job. Maybe someday when I have a lot of time on my hands I can figure out how to make the templates generate runtime for-loops, complete with custom-tailored innerloop code, which would make it a lot easier to optimize edge cases like sourcePixel[i+1] where there may or may not be an i+1'th pixel and I don't want to afford an 'if'. I have to wonder, has someone done this stuff already (the mixin trick, or some sort of graphics routine framework in D)? ------------------------------------------------------------------------ /+ Alpha blended blitting routine. +/ import std.stdio; version = SDL; version( SDL ) { import derelict.sdl.sdl; } // TODO: RGB32? enum : uint { INVALID = 0, RGBA32, RGB24, RGB16_555, RGB16_565, RGBA8_I32, // indexed to 32 bit values A8, } private template readSource( uint RGBA ) { static if ( RGBA == RGBA32 ) { uint readS_dummy1 = srgb = source[si]; uint readS_dummy2 = alpha = srgb & sourceAMask; uint readS_dummy3 = srgb = srgb & ~sourceAMask; } else static if ( RGBA == RGB24 ) { // There is no such thing as an array with 24-bit elements, so we have // to use pointers. 
uint readS_dummy1 = srgb = *(cast(uint*)(source + si)); } else static if ( RGBA == RGB16_555 || RGBA == RGB16_565 ) { // cast(uint) is not necessary in all cases, only if dest is 32 bpp uint readS_dummy1 = srgb = cast(uint)source[si]; } else static if ( RGBA == RGBA8_I32 ) { uint readS_dummy1 = srgb = rgbaTable[source[si]]; uint readS_dummy2 = alpha = srgb & sourceAMask; uint readS_dummy3 = srgb = srgb & ~sourceAMask; } else static if ( RGBA == A8 ) { uint readS_dummy1 = alpha = cast(uint)source[si]; } else { pragma(msg,"Invalid source RGBA format for reading."); static assert(0); } } private template readDestination( uint RGBA, ubyte half16bpp = NOT_APPLICABLE ) { static if ( RGBA == RGBA32 ) { uint readD_dummy1 = drgb = dest[di]; } else static if ( RGBA == RGB24 ) { // There is no such thing as an array with 24-bit elements, so we have // to use pointers. uint readD_dummy1 = drgb = *(cast(uint*)(dest + di)); // Since we can't write 24 bits, we can either write 3 bytes (slow), // or we can overwrite 8 bits of the next pixel. The latter is // faster and can be done safely if we overwrite those 8 bits with // their previous contents. uint drgbOriginal = drgb; } else static if ( RGBA == RGB16_555 || RGBA == RGB16_565 ) { uint readD_dummy1 = drgb = dest[di]; static if ( half16bpp == LOW_ADDRESS_HALF || half16bpp == HIGH_ADDRESS_HALF ) { // Store the original values of both pixels being read. // When reading and writing 2 pixels at a time, it is impossible // to prevent overwriting a pixel that we don't want to. At // least not without some rather complicated code. So instead, // we just make sure that the pixel we don't want to overwrite // is overwritten with it's original value. The original value // is stored here. 
uint drgbOriginal = destReadResult; } } else static if ( RGBA == RGBA8_I32 ) { uint readD_dummy1 = drgb = rgbaTable[dest[di]]; } else { pragma(msg,"Invalid destination RGBA format for reading."); static assert(0); } } private template read( uint sourceRGBA, uint destRGBA, ubyte half16bpp = NOT_APPLICABLE ) { mixin readSource!( sourceRGBA ); mixin readDestination!( destRGBA, half16bpp ); } private template convert( uint sourceRGBA, uint destRGBA ) { static if ( sourceRGBA == RGBA32 || sourceRGBA == RGB24 || sourceRGBA == RGBA8_I32 ) { static if ( destRGBA == RGBA32 || destRGBA == RGB24 ) { alias sourceReadResult srgb; // do nothing } static if ( destRGBA == RGB16_565 ) { // Here we must shrink a 32 bit pixel from the source into a // 16 bit pixel. // in this situation we write the 16 bit resultant pixels one at // a time so the extra 16 bits will be safely discarded. uint convert_dummy1 = srgb = ((0xf800 & (sourceReadResult >> 8 )) + (0x07e0 & (sourceReadResult >> 5 )) + (0x001f & (sourceReadResult >> 3 ))); } else static assert(0); } else static if ( sourceRGBA == RGB16_565 ) { static if ( destRGBA == RGBA32 || destRGBA == RGB24 ) { // Here we must expand a 16 bit pixel from the source into a // 32 bit pixel. // In this situation we read the 16 bit pixels one at a time // so the extra 16 bits can be safely discarded. uint convert_dummy1 = srgb = (((sourceReadResult & 0xf800) << 8 ) + ((sourceReadResult & 0x07e0) << 5 ) + ((sourceReadResult & 0x001f) << 3 )); } else static if ( destRGBA == RGB16_565 ) { //alias sourceReadResult srgb; // do nothing } else static assert(0); } else static if ( sourceRGBA == A8 ) { //alias srcColor srgb; } else static assert(0); } private template blend( uint RGBA ) { // Note that this will get it right regardless of which color is in which // channel. Of course, the channels' placements must be correct. // It also preserves the destination's alpha channel, if present. 
static if ( RGBA == RGBA32 || RGBA == RGB24 || RGBA == RGBA8_I32 || RGBA == RGB16_565 || RGBA == RGB16_555 ) { static if ( RGBA == RGBA32 || RGBA == RGB24 || RGBA == RGBA8_I32 ) { const shift = 8; const evenMask = 0x00ff00ff; } else { // For 16bpp formats: // alpha must be a 5 bit value (the 3 hi bits MUST be clear) // this does 2 16bit pixels at a time in one 32 bit word. // endianness doesn't matter on 565 formats due to symmetry // TODO: take into account endianness on 555 formats // (probably only noticable on big endian machines) const shift = 5; const evenMask = 0x07e0f81f; } const oddMask = ~evenMask; static if ( RGBA == RGBA32 || RGBA == RGBA8_I32 ) uint originalDestAlpha = drgb & sourceAMask; static if ( destbpp == 16 ) uint blend_dummy1 = alpha = alpha >> 3; static if ( destbpp == 16 && sourceRGBA == A8 ) { // Extract the middle channel and shift it into the high 16 bits, giving // at least 5 bits above it to hold the multiplication overflow, and at // least 5 bits below it to hold the high channel's multiplication overflow. uint sourceChannels = ((srgb << 16) | srgb) & evenMask; uint destChannels = ((drgb << 16) | drgb) & evenMask; // do the blending uint blend_temp = (((sourceChannels - destChannels) * alpha) >> shift) + destChannels; // Now we move the middle channel from the high 16 bits, back into its // rightful place in the middle. 
// (tail of the `blend` template, whose head precedes this span; tokens unchanged)
            uint blend_dummy2 = drgb =
                (blend_temp & (evenMask & 0x0000ffff)) |
                ((blend_temp & (evenMask & 0xffff0000)) >> 16 );
        }
        else
        {
            uint blend_dummy2 = drgb =
                ((((((srgb & evenMask)-(drgb & evenMask)) * alpha) >> shift) + drgb) & evenMask) |
                ((((((srgb & oddMask )-(drgb & oddMask )) >> shift) * alpha) + drgb) & oddMask);
        }

        static if ( RGBA == RGBA32 || RGBA == RGBA8_I32 ) // preserve alpha
            uint blend_dummy3 = drgb = (drgb & ~sourceAMask) | originalDestAlpha;
    }
    else
    {
        pragma(msg,"Invalid RGBA format for alpha blending.");
        static assert(0);
    }
}

/**
 * Mixin: stores the blended value `drgb` into `dest` at index `di`, then
 * advances `si` by `sourceIncrement` and `di` by `destIncrement`.
 *
 * All of `dest`, `di`, `si`, `drgb`, `drgbOriginal`, `sourceIncrement` and
 * `destIncrement` are looked up in the scope the template is mixed into.
 * The `xxx_dummy = a = b` declarations are the trick described at the top
 * of the post: a declaration whose initializer performs the assignment.
 *
 * Params:
 *   RGBA       = destination pixel format.
 *   half16bpp  = for 16 bpp destinations, which half of the 32 bit word to
 *                replace (LOW_ADDRESS_HALF / HIGH_ADDRESS_HALF); with
 *                NOT_APPLICABLE the whole value is stored.
 */
private template write( uint RGBA, ubyte half16bpp = NOT_APPLICABLE )
{
    static if ( RGBA == RGBA32 )
    {
        uint write_dummy1 = dest[di] = drgb;
    }
    else static if ( RGBA == RGB24 )
    {
        // A whole uint is written through the pointer, so the one byte that
        // is not part of this 24 bit pixel is restored from drgbOriginal.
        uint* address = cast(uint*)(dest + di);
        version ( BigEndian )
            uint write_dummy1 = *address = (drgb & 0xffffff00) | (drgbOriginal & 0x000000ff);
        else
            uint write_dummy1 = *address = (drgb & 0x00ffffff) | (drgbOriginal & 0xff000000);
    }
    else static if ( RGBA == RGB16_565 || RGBA == RGB16_555 )
    {
        // for selecting the lowest or highest pixel in terms of
        // address in memory rather than place in the word/register
        version ( BigEndian )
            const writeMask = 0x0000ffff;
        else
            const writeMask = 0xffff0000;

        static if ( half16bpp == HIGH_ADDRESS_HALF )
            uint write_dummy1 = dest[di] = (drgb & writeMask) | (drgbOriginal & ~writeMask);
        else static if ( half16bpp == LOW_ADDRESS_HALF )
            uint write_dummy1 = dest[di] = (drgb & ~writeMask) | (drgbOriginal & writeMask);
        else
            uint write_dummy1 = dest[di] = drgb;
    }
    // TODO: writing RGBA8_I32. needs an algo to reverse a 32 bpp value into
    // an 8 bit indexed value.
    else
    {
        pragma(msg,"Invalid RGBA format for alpha blending.");
        static assert(0);
    }

    uint write_dummy2 = si = si + sourceIncrement;
    uint write_dummy3 = di = di + destIncrement;
}

/// Values for the `half16bpp` template argument of `write` and `innerLoop`.
private enum : ubyte
{
    NOT_APPLICABLE = 0,
    LOW_ADDRESS_HALF,
    HIGH_ADDRESS_HALF,
}

/**
 * Mixin: one iteration of the blit, i.e. `read`, `convert`, `blend` and
 * `write` mixed in, in that order.
 *
 * Compilation fails if `half16bpp` is non-zero while the destination format
 * is not one of the 16 bpp formats.
 */
private template innerLoop( uint sourceRGBA, uint destRGBA, ubyte half16bpp = NOT_APPLICABLE )
{
    static if ( !(destRGBA == RGB16_565 || destRGBA == RGB16_555) && half16bpp > 0 )
    {
        pragma(msg,"The half16bpp argument is only to be used when the "
                   "destination format is 16 bits per pixel.");
        static assert(0);
    }

    mixin read!(sourceRGBA,destRGBA,half16bpp);
    mixin convert!(sourceRGBA,destRGBA);
    mixin blend!(destRGBA);
    mixin write!(destRGBA,half16bpp);
}

/**
 * Mixin: declares the scanline padding and the typed pixel array of one of
 * the two surfaces of a blit.
 *
 * With `isSource == true` it works on `srcSurface`/`sourcebpp` and exposes
 * the results as `spadding` and `source`; otherwise it works on
 * `dstSurface`/`destbpp` and exposes `dpadding` and `dest`.  The surfaces
 * and the bpp constants are taken from the scope it is mixed into.
 */
private template calculatePaddingAndArrays( bool isSource )
{
    static if ( isSource )
    {
        alias srcSurface surface;
        alias sourcebpp bpp;
        alias destbpp otherbpp;
    }
    else
    {
        alias dstSurface surface;
        alias destbpp bpp;
        alias sourcebpp otherbpp;
    }

    // Padding is the amount of extra data at the end of a scanline used to
    // ensure that the end of the scanline lines up on a 32 bit boundary.
    // spadding = source padding
    // dpadding = dest padding
    // In this case, the units padding is measured in change depending on
    // the source and destination format.
    // The amount of data that is handled in each iteration also changes,
    // and is reflected by the different types of arrays.
    static if ( bpp == 32 )
    {
        auto padding = 0;
        uint[] pixelData = cast(uint[])surface.pixels;
    }
    else static if ( bpp == 24 )
    {
        // padding measured in bytes
        auto padding = surface.pitch - (surface.width * 3);
        ubyte* pixelData = surface.pixels.ptr;
    }
    else static if ( bpp == 16 )
    {
        static if ( otherbpp != 16 /+otherbpp == 32 || otherbpp == 24 || otherbpp == 8+/ )
        {
            // padding measured in shorts
            auto padding = (surface.pitch >> 1) - surface.width;
            ushort[] pixelData = cast(ushort[])surface.pixels;
        }
        else
        {
            // 16 bpp on both sides: two pixels are handled per uint.
            auto padding = 0;
            uint[] pixelData = cast(uint[])surface.pixels;
        }
    }
    else static if ( bpp == 8 )
    {
        auto padding = surface.pitch - surface.width; // padding measured in bytes
        ubyte[] pixelData = surface.pixels;
    }
    else
        static assert(0);

    static if ( isSource )
    {
        alias padding spadding;
        alias pixelData source;
    }
    else
    {
        alias padding dpadding;
        alias pixelData dest;
    }
}

/**
 * Alpha blends a rectangle of `srcSurface` onto `dstSurface`.
 *
 * This function shall do no clipping.
 *
 * Template params:
 *   sourceRGBA = pixel format of the source surface.
 *   destRGBA   = pixel format of the destination surface.
 *
 * Params:
 *   sourceX, sourceY = top-left corner of the rectangle in the source.
 *   destX, destY     = top-left corner of the rectangle in the destination.
 *   width, height    = size of the rectangle.
 *   srcSurface       = surface to read from.
 *   dstSurface       = surface to blend onto.
 *   srcColor         = used as the source colour when sourceRGBA is A8; it
 *                      is duplicated into the upper 16 bits when the
 *                      destination is 16 bpp and the source is not A8.
 *   alpha            = blend factor; shifted right by 3 for 16 bpp
 *                      destinations.
 *
 * When built with version SDL, surfaces that carry an `sdl_surface` are
 * locked (if they need it) for the duration of the call; a failing lock
 * raises the Exception thrown by safe_SDL_LockSurface.
 */
void blit( uint sourceRGBA, uint destRGBA )
    ( short sourceX, short sourceY,
      short destX, short destY,
      short width, short height,
      inout Surface srcSurface, inout Surface dstSurface,
      uint srcColor, uint alpha )
{
    // this stuff just determines the bits per pixel of the source and
    // destination surfaces
    static if ( sourceRGBA == RGBA32 )
        const sourcebpp = 32;
    else static if ( sourceRGBA == RGB24 )
        const sourcebpp = 24;
    else static if ( sourceRGBA == RGB16_565 || sourceRGBA == RGB16_555 )
        const sourcebpp = 16;
    else
        const sourcebpp = 8;

    static if ( destRGBA == RGBA32 )
        const destbpp = 32;
    else static if ( destRGBA == RGB24 )
        const destbpp = 24;
    else static if ( destRGBA == RGB16_565 || destRGBA == RGB16_555 )
        const destbpp = 16;
    else
        const destbpp = 8;

    //
    static if ( (sourcebpp == 32 || sourcebpp == 24) && destbpp == 16 )
        const convert32to16 = true;
    else
        const convert32to16 = false;

    static if ( sourcebpp == 16 && (destbpp == 32 || destbpp == 24) )
        const convert16to32 = true;
    else
        const convert16to32 = false;

    static if ( (destRGBA == RGB16_565 || destRGBA == RGB16_555) &&
                sourceRGBA != A8 )
        srcColor |= (srcColor << 16);

    static if ( destbpp == 16 )
        alpha >>= 3;

    // note that the padding quantities are necessarily zero if
    // unitWidth = width / 2;
    // that's important because they have different units of measurement!
    mixin calculatePaddingAndArrays!( true );
    mixin calculatePaddingAndArrays!( false );

    static if ( destbpp == 24 )
    {
        uint lineWidth = width * 3;

        static if ( sourcebpp == 24 )
        {
            // same as: unitSrcSurfaceWidth = srcSurface.width * 3;
            uint unitSrcSurfaceWidth = srcSurface.pitch - spadding;
            uint unitSrcWidth = lineWidth;
        }
        else
        {
            uint unitSrcSurfaceWidth = srcSurface.width;
            uint unitSrcWidth = width;
        }

        uint unitDstSurfaceWidth = dstSurface.pitch - dpadding;
        uint unitDstWidth = lineWidth;
    }
    else static if ( sourcebpp == 16 && destbpp == 16 )
    {
        uint lineWidth = width / 2; // because we do 2 pixels at a time

        // The +(width & 1) part is used to make the division round up.
        uint unitSrcSurfaceWidth = (srcSurface.width / 2) + (srcSurface.width & 1);
        uint unitDstSurfaceWidth = (dstSurface.width / 2) + (dstSurface.width & 1);

        // The lineWidth variable rounds down on division, so it may be
        // missing a pixel.  That is desirable since we don't want alphablend
        // onto the pixel next to the missing pixel.  Of course, we will
        // handle the missing pixel individually, but it is still useful to
        // have access to a rounded-up version of the blit's width.
        uint unitSrcWidth = lineWidth + (width & 1);
        uint unitDstWidth = unitSrcWidth;
    }
    else
    {
        uint lineWidth = width;
        uint unitSrcWidth = width;
        uint unitDstWidth = width;
        uint unitSrcSurfaceWidth = srcSurface.width;
        uint unitDstSurfaceWidth = dstSurface.width;
    }

    uint sourceAMask = srcSurface.alphaMask;

    version( SDL )
    {
        // The flags and the scope(exit) live at function level on purpose:
        // a scope guard placed inside an `if { }` block runs when that block
        // ends, which would unlock the surfaces before the blit starts.
        // (A version block does not open a scope of its own.)
        bool srcLocked = false;
        bool dstLocked = false;
        auto sourceSdlSurface = srcSurface.sdl_surface;
        auto destSdlSurface = dstSurface.sdl_surface;

        scope(exit)
        {
            if ( dstLocked )
                SDL_UnlockSurface( destSdlSurface );
            if ( srcLocked )
                SDL_UnlockSurface( sourceSdlSurface );
        }

        if ( sourceSdlSurface !is null )
            srcLocked = lock( sourceSdlSurface );
        if ( destSdlSurface !is null )
            dstLocked = lock( destSdlSurface );
    }

    static if ( sourcebpp == 24 )
        uint sourceIncrement = 3;
    else
        uint sourceIncrement = 1;

    static if ( destbpp == 24 )
        uint destIncrement = 3;
    else
        uint destIncrement = 1;

    // Since we are not necessarily blitting across the entire width of the
    // destination surface or source surface, we have to skip some of the
    // pixels on the end of the current scanline and on the beginning of
    // the next scanline.
    // Add that to the padding (which is explained above), and the result
    // is these source/dest LineExtra variables.
    int sLineExtra = unitSrcSurfaceWidth + spadding - unitSrcWidth;
    int dLineExtra = unitDstSurfaceWidth + dpadding - unitDstWidth;

    // initialize the index variables
    // si = source index
    // di = destination index
    // The distance between the starts of two consecutive scanlines is the
    // unit width of the surface plus its padding; that is the stride the
    // LineExtra values above (and the assert below) are built on, so the
    // start row and the end index have to use it as well.
    int si = (sourceX * sourceIncrement) + ((unitSrcSurfaceWidth + spadding) * sourceY);
    int di = (destX * destIncrement) + ((unitDstSurfaceWidth + dpadding) * destY);

    // nextLine is always ahead of di by the amount of pixels left in one line
    // of the blit.
    int nextLine;

    // endi is the index to stop at.
    //int endi = destX + unitWidth + (unitDstSurfaceWidth + dpadding) * (destY + height);
    int endi = di + ((unitDstSurfaceWidth + dpadding) * height);

    assert( lineWidth + dLineExtra == unitDstSurfaceWidth + dpadding );

    // TODO: remove this
    void writeHex ( char[] name, uint number )
    {
        writef( "(",name,std.string.toString( cast(ulong)number, cast(uint)16 ),")|" );
    }

    //
    uint srgb;
    uint drgb;

    static if ( sourceRGBA == A8 )
        srgb = srcColor;

    while( di < endi )
    {
        nextLine = di + lineWidth;

        static if ( convert32to16 || convert16to32 )
            mixin innerLoop!(sourceRGBA,destRGBA,HIGH_ADDRESS_HALF);

        while( di < nextLine )
        {
            mixin innerLoop!(sourceRGBA,destRGBA);
        }

        static if ( convert32to16 || convert16to32 )
            mixin innerLoop!(sourceRGBA,destRGBA,LOW_ADDRESS_HALF);

        si += sLineExtra;
        di += dLineExtra;
    }
}

version( SDL )
{
    /**
     * Locks `surface` if SDL_MUSTLOCK says it needs locking and it is not
     * locked already.
     *
     * Returns: true if this call took the lock (the caller then has to
     * unlock it), false otherwise.
     */
    private bool lock( SDL_Surface* surface )
    {
        if ( SDL_MUSTLOCK( surface ) && !surface.locked )
        {
            safe_SDL_LockSurface( surface );
            return true;
        }

        return false;
    }

    // automatically throw errors resulting from the SDL_LockSurface function.
    private void safe_SDL_LockSurface( SDL_Surface* surface )
    {
        if ( SDL_LockSurface( surface ) != 0 )
        {
            char* sdlError = SDL_GetError();
            char[] error = sdlError[0..std.c.string.strlen(sdlError)];
            throw new Exception( "SDL_LockSurface failed to lock a surface: "~error );
        }
    }
}

/// Description of a block of pixel data that `blit` can read from or write to.
struct Surface
{
    ubyte[] pixels;
    uint alphaMask = 0;
    ushort width = 0xffff;
    ushort height = 0xffff;
    ushort pitch = 0xffff; /// width of a scanline in bytes.
    ushort RGBAformat = INVALID;

    /// Builds a Surface over caller-supplied pixel data.
    /// width and height are in pixels.
    static Surface opCall( ubyte[] pixels, uint alphaMask,
                           ushort width, ushort height,
                           ushort pitch, ushort RGBAformat )
    {
        Surface result;
        assert( pixels !is null );
        result.pixels = pixels;
        result.width = width;
        result.height = height;
        result.pitch = pitch;
        result.RGBAformat = RGBAformat;
        result.alphaMask = alphaMask;
        return result;
    }

    version ( SDL )
    {
        SDL_Surface* sdl_surface = null;

        /// Builds a Surface that views the pixel data of an SDL surface;
        /// size, pitch and alpha mask are copied from `surface`.
        static Surface opCall( SDL_Surface* surface, ushort RGBAformat )
        {
            Surface result;
            // checked before the first dereference, like `pixels` in the
            // other opCall
            assert( surface !is null );
            result.pixels = cast(ubyte[])surface.pixels[0.. surface.pitch * surface.h];
            result.width = surface.w;
            result.height = surface.h;
            result.pitch = surface.pitch;
            assert ( RGBAformat != INVALID );
            result.RGBAformat = RGBAformat;
            result.alphaMask = surface.format.Amask;
            result.sdl_surface = surface;
            return result;
        }
    }
}
Jan 26 2007