
imul ecx, 2    ; ECX = # of quadwords of vertices to rotate
add  edi, edx  ; EDI = -> end of "v"
add  eax, edx  ; EAX = -> end of "rotv"
neg  ecx       ; ECX = -# of quadwords of vertices to rotate
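; Note: Biasing EDI and EAX to the end of each array and counting a negative
; quadword index up toward zero lets the loop below advance its counter and
; test for completion with a single ADD/JNZ pair, while [edi+8*ecx] and
; [eax+8*ecx] still walk "v" and "rotv" from front to back.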

;==============================================================================
;THE 4 ASM LINES BELOW LOAD THE TRANSPOSED ROTATION MATRIX "R" INTO XMM0-XMM3
;IN THE FOLLOWING MANNER:
;xmm0 = column 0 of "R" or row 0 of "R" transpose
;xmm1 = column 1 of "R" or row 1 of "R" transpose
;xmm2 = column 2 of "R" or row 2 of "R" transpose
;xmm3 = column 3 of "R" or row 3 of "R" transpose
;==============================================================================

movaps xmm0, [esi]     ; XMM0 = [R30,R20,R10,R00]
movaps xmm1, [esi+16]  ; XMM1 = [R31,R21,R11,R01]
movaps xmm2, [esi+32]  ; XMM2 = [R32,R22,R12,R02]
movaps xmm3, [esi+48]  ; XMM3 = [R33,R23,R13,R03]
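; Note: Holding the columns of "R" in XMM0-XMM3 turns each matrix-vector
; product into four scalar-times-column multiplies combined with vertical
; ADDPS operations, so no horizontal (across-register) adds are required.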

;==============================================================================
;THIS LOOP ROTATES "num_vertices_to_rotate" VERTICES BY THE TRANSPOSED
;ROTATION MATRIX "R" PASSED INTO THE ROUTINE AND STORES THE ROTATED
;VERTICES TO "rotv".
;==============================================================================
ALIGN 16  ; Align address of loop to a 16-byte boundary.

rotate_vertices_loop:
movlps   xmm4, [edi+8*ecx]   ; XMM4=[,,v1,v0]
movlps   xmm6, [edi+8*ecx+8] ; XMM6=[,,v3,v2]
unpcklps xmm4, xmm4          ; XMM4=[v1,v1,v0,v0]
unpcklps xmm6, xmm6          ; XMM6=[v3,v3,v2,v2]
movhlps  xmm5, xmm4          ; XMM5=[,,v1,v1]
movhlps  xmm7, xmm6          ; XMM7=[,,v3,v3]
movlhps  xmm4, xmm4          ; XMM4=[v0,v0,v0,v0]
mulps    xmm4, xmm0          ; XMM4=[R30*v0,R20*v0,R10*v0,R00*v0]
movlhps  xmm5, xmm5          ; XMM5=[v1,v1,v1,v1]
mulps    xmm5, xmm1          ; XMM5=[R31*v1,R21*v1,R11*v1,R01*v1]
movlhps  xmm6, xmm6          ; XMM6=[v2,v2,v2,v2]
mulps    xmm6, xmm2          ; XMM6=[R32*v2,R22*v2,R12*v2,R02*v2]
addps    xmm4, xmm5          ; XMM4=[R30*v0+R31*v1,R20*v0+R21*v1,
                             ;       R10*v0+R11*v1,R00*v0+R01*v1]
movlhps  xmm7, xmm7          ; XMM7=[v3,v3,v3,v3]
mulps    xmm7, xmm3          ; XMM7=[R33*v3,R23*v3,R13*v3,R03*v3]
addps    xmm6, xmm7          ; XMM6=[R32*v2+R33*v3,R22*v2+R23*v3,
                             ;       R12*v2+R13*v3,R02*v2+R03*v3]
addps    xmm4, xmm6          ; XMM4=New rotated vertex

movntps  [eax+8*ecx], xmm4   ; Store rotated vertex to "rotv".
add      ecx, 2              ; Decrement the # of quadwords to rotate by 2.
jnz      rotate_vertices_loop

sfence                       ; Finish all memory writes.
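; Note: MOVNTPS performs weakly-ordered non-temporal stores that bypass the
; cache; the SFENCE guarantees those stores are globally visible before the
; routine returns to its caller.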

;==============================================================================
;INSTRUCTIONS BELOW RESTORE THE REGISTER STATE WITH WHICH THIS ROUTINE
;WAS ENTERED
;==============================================================================
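For reference, the following is a minimal C sketch of what the routine above
computes, written with SSE intrinsics. The function name rotate_vertices and
its parameter layout are illustrative assumptions, not part of the routine's
actual interface: r points to the transposed rotation matrix (column i of "R"
beginning at r + 4*i), and v and rotv are 16-byte-aligned arrays of n
four-float vertices.

#include <xmmintrin.h>

/* Illustrative sketch only: names and parameter layout are assumptions. */
void rotate_vertices(const float *r, const float *v, float *rotv, int n)
{
    __m128 c0 = _mm_load_ps(r);      /* column 0 of "R" */
    __m128 c1 = _mm_load_ps(r + 4);  /* column 1 of "R" */
    __m128 c2 = _mm_load_ps(r + 8);  /* column 2 of "R" */
    __m128 c3 = _mm_load_ps(r + 12); /* column 3 of "R" */
    for (int i = 0; i < n; i++) {
        const float *vi = v + 4 * i;
        /* Splat each vertex component, multiply by the matching column,
           and sum: rot = c0*v0 + c1*v1 + c2*v2 + c3*v3 = R * vertex.  */
        __m128 rot = _mm_add_ps(
            _mm_add_ps(_mm_mul_ps(c0, _mm_set1_ps(vi[0])),
                       _mm_mul_ps(c1, _mm_set1_ps(vi[1]))),
            _mm_add_ps(_mm_mul_ps(c2, _mm_set1_ps(vi[2])),
                       _mm_mul_ps(c3, _mm_set1_ps(vi[3]))));
        _mm_stream_ps(rotv + 4 * i, rot); /* non-temporal, like MOVNTPS */
    }
    _mm_sfence(); /* drain the non-temporal stores, like SFENCE above */
}

The two-way ADDPS pairing mirrors the assembly loop, which also sums
(c0*v0 + c1*v1) and (c2*v2 + c3*v3) before the final add.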
