I've decided to stop apologizing for using MMX everywhere. Everyone has it, and it's just too much fun.
I thought I'd put up some code to do wicked-fast crossfades -- I got a lot of attention at MetaCreations for all the image transition code I did there -- ahem, related, but not the same code. This is, uh, quite a bit faster. :)
This code has benchmarked at 103fps on my P3/600 for crossfading between two 640x480 images to a third. Even with bitblt, it can still do 60fps, which is fast enough.
Also, my brother-in-law Brent Elliott works at Intel, and he gave me some of the prefetch code on this page. Prefetch lets you tell the processor, "Hey dummy, go get this memory, I'm going to need it." It's worth maybe a 30% speedup here. Unfortunately, there's no compiler support, so you have to do it at the icky opcode level.
What I learned: saving multiplies isn't always worth it. This code does four multiplies instead of two, and it's better for it.
// 0x00FF in each of the four 16-bit lanes of an MMX register.
// CrossFade XORs the broadcast 8-bit opacity weight with this mask, which
// yields (255 - weight) in every lane, i.e. the complementary blend weight.
UINT64 neg64 = 0x00FF00FF00FF00FF;
// Hand-encoded PREFETCHNTA, emitted as raw bytes instead of a mnemonic:
//   0F 18 /0      PREFETCHNTA m8
//   ModRM 41h     [ecx + disp8]      (42h = [edx + disp8])
//   disp8 60h     96 bytes ahead of the pointer
// 96 bytes is three cache lines assuming 32-byte lines (the "three cache
// line prefetch" of the original note) -- TODO confirm for the target CPU.
// Both macros must be expanded inside an __asm block; they read only the
// named register.
#define pfNTA_ECX __asm __emit 0x0f __asm __emit 0x18 __asm __emit 0x41 __asm __emit 0x60
#define pfNTA_EDX __asm __emit 0x0f __asm __emit 0x18 __asm __emit 0x42 __asm __emit 0x60
// Cross-fades two 32-bit-per-pixel bitmaps into `result` using MMX.
//
// For every 8-bit channel of every pixel in the area common to all three
// bitmaps:
//     result = (dst * (255 - k) + src * k) >> 8
// where k = opac - 1 for opac >= 1 and k = 0 for opac == 0. So opac == 0
// yields (almost exactly) `dst` and opac == 256 yields (almost exactly) `src`.
// The two weights sum to 255 and the sum is shifted right by 8, so a channel
// value of 255 comes out as at most 254.
//
// opac    blend amount; values above 256 are clamped to 256.
// result  destination bitmap (written).
// src     bitmap weighted by k.
// dst     bitmap weighted by 255 - k.
//
// Any width is handled: pixels are processed two at a time and an odd
// trailing pixel is blended on its own. Always returns `success`.
error CrossFade(uint32 opac, Bitmap &result, Bitmap &src, Bitmap &dst)
{
    // Clip to the pixels that exist in all three bitmaps; reading `dst`
    // outside its own bounds would otherwise be possible.
    Rect area = result.Size();
    area.Intersect(src.Size());
    area.Intersect(dst.Size());
    uint32 h = area.Height();
    uint32 w = area.Width();

    // NOTE(review): assumes Rect reports a zero width/height for an empty
    // intersection -- confirm against Rect's implementation.
    if (h == 0 || w == 0)
        return success;             // no MMX state touched yet, no emms needed

    // The weight must fit in 8 bits: larger values break the XOR-with-0x00FF
    // complement and overflow the 16-bit products of pmullw.
    if (opac > 256) opac = 256;
    if (opac > 0) opac--;

    uint32 pairs = w / 2;           // iterations of the 2-pixel loop
    uint32 odd   = w & 1;           // 1 if a single trailing pixel remains

    for (uint32 y = 0; y < h; y++) {
        uint32 *p0 = result.Pixel(0, y);
        uint32 *s0 = src.Pixel(0, y);
        uint32 *s1 = dst.Pixel(0, y);
        __asm {
            // Register values are not guaranteed to survive between separate
            // __asm blocks (and MMX registers alias the x87 stack), so the
            // blend constants are rebuilt here for every row:
            //   mm7 = 0                     (zero source for byte->word unpack)
            //   mm6 = k in all four words   (weight applied to src)
            //   mm5 = 255 - k in all words  (weight applied to dst)
            pxor        mm7, mm7
            movd        mm6, opac
            punpcklwd   mm6, mm6
            punpckldq   mm6, mm6
            movq        mm5, mm6
            pxor        mm5, neg64

            mov         eax, pairs
            mov         ebx, p0
            mov         ecx, s0
            mov         edx, s1

            // The loop below is bottom-tested; skip it when the row holds
            // fewer than two pixels so it never writes past the row.
            test        eax, eax
            jz          tailpixel

        pixelloop:
            // Load two pixels from each source and widen bytes to words.
            movq        mm0, [edx]
            movq        mm1, [ecx]
            movq        mm2, mm0
            movq        mm3, mm1
            add         edx, 8
            punpcklbw   mm0, mm7
            punpcklbw   mm1, mm7
            add         ecx, 8
            punpckhbw   mm2, mm7
            punpckhbw   mm3, mm7

            // Weight, sum, scale back to 8 bits.
            pmullw      mm0, mm5
            pmullw      mm1, mm6
            pmullw      mm2, mm5
            pmullw      mm3, mm6
            paddw       mm0, mm1
            psrlw       mm0, 8
            paddw       mm2, mm3
            psrlw       mm2, 8

            packuswb    mm0, mm2
            movq        [ebx], mm0
            add         ebx, 8

            // Hint upcoming source data into the cache (prefetch never faults,
            // so running past the end of the row is harmless).
            pfNTA_ECX
            pfNTA_EDX

            dec         eax
            jnz         pixelloop

        tailpixel:
            // Blend the single leftover pixel of an odd-width row with the
            // same arithmetic, using 32-bit loads/stores only.
            mov         eax, odd
            test        eax, eax
            jz          rowdone
            movd        mm0, dword ptr [edx]
            movd        mm1, dword ptr [ecx]
            punpcklbw   mm0, mm7
            punpcklbw   mm1, mm7
            pmullw      mm0, mm5
            pmullw      mm1, mm6
            paddw       mm0, mm1
            psrlw       mm0, 8
            packuswb    mm0, mm0
            movd        dword ptr [ebx], mm0

        rowdone:
        }
    }

    // Leave MMX state so later x87 floating-point code works.
    __asm emms
    return success;
}