; Software Optimization Guide for AMD64 Processors
; 25112 Rev. 3.06 September 2005
; Chapter 9, Optimizing with SIMD Instructions, page 234
; THE 4 ASM LINES BELOW LOAD THE FUNCTION’S ARGUMENTS INTO GENERAL-PURPOSE
; REGISTERS (GPRs)
; eax = address of Transposed Rotation Matrix
; edx = address of vertices to rotate
; ecx = # of vertices to rotate
; ebx = address of rotated vertices
;==============================================================================
mov eax, [ebp+8] ; EAX = ->R (transposed rotation matrix)
mov edx, [ebp+12] ; EDX = ->v (vertices to rotate)
mov ecx, [ebp+16] ; ECX = num_vertices_to_rotate
mov ebx, [ebp+20] ; EBX = ->rotv (rotated vertices)
femms ; Clear MMX state.
ALIGN 16 ; Align the loop entry label that follows to a 16-byte boundary.
;==============================================================================
; THIS LOOP ROTATES "num_vertices_to_rotate" VERTICES BY THE TRANSPOSED
; ROTATION MATRIX "R" PASSED INTO THE ROUTINE AND STORES THE ROTATED
; VERTICES TO "rotv".
;==============================================================================
rotate_vertices_loop:
; Each pass reads one vertex [x,y,z,w] from [edx] and multiplies it by the
; 4x4 matrix at [eax]; MM3 and MM5 accumulate the two halves of the result.
add ebx,16 ; Increment ->rotv to next transformed vertex.
movq mm0,[edx] ; MM0 = [y,x]
movq mm1,[edx+8] ; MM1 = [w,z]
add edx,16 ; Increment ->v to next vertex.
movq mm2,mm0 ; MM2 = [y,x]
movq mm3,[eax] ; MM3 = [R01,R00]
punpckldq mm0,mm0 ; MM0 = [x,x]
movq mm4,[eax+16] ; MM4 = [R11,R10]
pfmul mm3,mm0 ; MM3 = [x*R01,x*R00]
punpckhdq mm2,mm2 ; MM2 = [y,y]
pfmul mm4,mm2 ; MM4 = [y*R11,y*R10]
movq mm5,[eax+8] ; MM5 = [R03,R02]
movq mm7,[eax+24] ; MM7 = [R13,R12]
movq mm6,mm1 ; MM6 = [w,z]
pfmul mm5,mm0 ; MM5 = [x*R03,x*R02]
movq mm0,[eax+32] ; MM0 = [R21,R20]
punpckldq mm1,mm1 ; MM1 = [z,z]
pfmul mm7,mm2 ; MM7 = [y*R13,y*R12]
movq mm2,[eax+40] ; MM2 = [R23,R22]
pfmul mm0,mm1 ; MM0 = [z*R21,z*R20]
pfadd mm3,mm4 ; MM3 = [x*R01+y*R11,x*R00+y*R10]
movq mm4,[eax+48] ; MM4 = [R31,R30]
pfmul mm2,mm1 ; MM2 = [z*R23,z*R22]
pfadd mm5,mm7 ; MM5 = [x*R03+y*R13,x*R02+y*R12]
movq mm1,[eax+56] ; MM1 = [R33,R32]
punpckhdq mm6,mm6 ; MM6 = [w,w]
pfadd mm3,mm0 ; MM3 = [x*R01+y*R11+z*R21,x*R00+y*R10+z*R20]
pfmul mm4,mm6 ; MM4 = [w*R31,w*R30]
pfmul mm1,mm6 ; MM1 = [w*R33,w*R32]
pfadd mm5,mm2 ; MM5 = [x*R03+y*R13+z*R23,x*R02+y*R12+z*R22]
pfadd mm3,mm4 ; MM3 = [x*R01+y*R11+z*R21+w*R31,