
;==============================================================================
;THE 4 ASM LINES BELOW LOAD THE FUNCTION’S ARGUMENTS INTO GENERAL-PURPOSE
;REGISTERS (GPRs)
;eax = address of Transposed Rotation Matrix
;edx = address of vertices to rotate
;ecx = # of vertices to rotate
;ebx = address of rotated vertices
;==============================================================================

    mov       eax, [ebp+8]        ; EAX = ->R
    mov       edx, [ebp+12]       ; EDX = ->v
    mov       ecx, [ebp+16]       ; ECX = num_vertices_to_rotate
    mov       ebx, [ebp+20]       ; EBX = ->rotv
    femms                         ; Clear MMX state.
    ALIGN     16                  ; Ensure optimal branch alignment.
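The stack offsets above match a 32-bit cdecl call with a standard EBP frame: the
return address sits at [ebp+4] and the first argument at [ebp+8], with each later
argument 4 bytes higher. A hypothetical C prototype for such a routine (the names
are illustrative, not taken from the guide) would be:

    /* Hypothetical prototype; with cdecl and a standard EBP frame,
       arguments start at [ebp+8] in 4-byte steps. */
    void rotate_vertices(const float *R,   /* [ebp+8]:  transposed rotation matrix */
                         const float *v,   /* [ebp+12]: vertices to rotate         */
                         int n,            /* [ebp+16]: # of vertices to rotate    */
                         float *rotv);     /* [ebp+20]: rotated vertices           */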

;==============================================================================
;THIS LOOP ROTATES "num_vertices_to_rotate" VERTICES BY THE TRANSPOSED
;ROTATION MATRIX "R" PASSED INTO THE ROUTINE AND STORES THE ROTATED
;VERTICES TO "rotv".
;==============================================================================
rotate_vertices_loop:

    add       ebx, 16             ; Increment ->rotv to next rotated vertex.
    movq      mm0, [edx]          ; MM0 = [y,x]
    movq      mm1, [edx+8]        ; MM1 = [w,z]
    add       edx, 16             ; Increment ->v to next vertex.
    movq      mm2, mm0            ; MM2 = [y,x]
    movq      mm3, [eax]          ; MM3 = [R01,R00]
    punpckldq mm0, mm0            ; MM0 = [x,x]
    movq      mm4, [eax+16]       ; MM4 = [R11,R10]
    pfmul     mm3, mm0            ; MM3 = [x*R01,x*R00]
    punpckhdq mm2, mm2            ; MM2 = [y,y]
    pfmul     mm4, mm2            ; MM4 = [y*R11,y*R10]
    movq      mm5, [eax+8]        ; MM5 = [R03,R02]
    movq      mm7, [eax+24]       ; MM7 = [R13,R12]
    movq      mm6, mm1            ; MM6 = [w,z]
    pfmul     mm5, mm0            ; MM5 = [x*R03,x*R02]
    movq      mm0, [eax+32]       ; MM0 = [R21,R20]
    punpckldq mm1, mm1            ; MM1 = [z,z]
    pfmul     mm7, mm2            ; MM7 = [y*R13,y*R12]
    movq      mm2, [eax+40]       ; MM2 = [R23,R22]
    pfmul     mm0, mm1            ; MM0 = [z*R21,z*R20]
    pfadd     mm3, mm4            ; MM3 = [x*R01+y*R11,x*R00+y*R10]
    movq      mm4, [eax+48]       ; MM4 = [R31,R30]
    pfmul     mm2, mm1            ; MM2 = [z*R23,z*R22]
    pfadd     mm5, mm7            ; MM5 = [x*R03+y*R13,x*R02+y*R12]
    movq      mm1, [eax+56]       ; MM1 = [R33,R32]
    punpckhdq mm6, mm6            ; MM6 = [w,w]
    pfadd     mm3, mm0            ; MM3 = [x*R01+y*R11+z*R21,x*R00+y*R10+z*R20]
    pfmul     mm4, mm6            ; MM4 = [w*R31,w*R30]
    pfmul     mm1, mm6            ; MM1 = [w*R33,w*R32]
    pfadd     mm5, mm2            ; MM5 = [x*R03+y*R13+z*R23,x*R02+y*R12+z*R22]
    pfadd     mm3, mm4            ; MM3 = [x*R01+y*R11+z*R21+w*R31,
                                  ;        x*R00+y*R10+z*R20+w*R30]
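Read together, the register comments spell out an ordinary 4x4 matrix-vector
product in which the matrix is stored pre-transposed: the coefficient pair
needed for two output components at a time (for example [R01,R00]) sits in
adjacent memory, so each pair can be fetched with a single movq and no
shuffling. The following minimal C sketch shows the full per-vertex
computation the comments describe; the Vertex type and the
rotate_vertices_ref name are illustrative assumptions, not identifiers from
the guide.

    #include <stddef.h>

    /* Illustrative types: each vertex is four packed floats [x,y,z,w],
       matching the 16-byte stride the assembly uses. R is the transposed
       rotation matrix, so R[i][j] corresponds to Rij in the comments. */
    typedef struct { float x, y, z, w; } Vertex;

    static void rotate_vertices_ref(const float R[4][4], const Vertex *v,
                                    size_t n, Vertex *rotv)
    {
        for (size_t i = 0; i < n; i++) {
            /* Each output component is the dot product of the input vertex
               with one row of R (one column of the original rotation). */
            rotv[i].x = v[i].x * R[0][0] + v[i].y * R[1][0]
                      + v[i].z * R[2][0] + v[i].w * R[3][0];
            rotv[i].y = v[i].x * R[0][1] + v[i].y * R[1][1]
                      + v[i].z * R[2][1] + v[i].w * R[3][1];
            rotv[i].z = v[i].x * R[0][2] + v[i].y * R[1][2]
                      + v[i].z * R[2][2] + v[i].w * R[3][2];
            rotv[i].w = v[i].x * R[0][3] + v[i].y * R[1][3]
                      + v[i].z * R[2][3] + v[i].w * R[3][3];
        }
    }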
