Software Optimization Guide for AMD64 Processors

25112 Rev. 3.06 September 2005

;[ebp+8] = ->r

;[ebp+12] = ->rcp_sqrt_r

;[ebp+16] = num_points

;==============================================================================

push ebx

push esi

push edi

;==============================================================================

;THE FIRST 3 ASM LINES BELOW LOAD THE FUNCTION'S ARGUMENTS INTO GENERAL-PURPOSE

;REGISTERS (GPRS)

;esi = address of "r"'s to calculate the reciprocal square root of

;edi = address of "rcp_sqrt_r"'s to store reciprocal square root to

;ecx = num_points

;==============================================================================

mov

esi,[ebp+8]

; ESI = ->r

mov

edi,[ebp+12]

; EDI = ->rcp_sqrt_r

mov

ecx,[ebp+16]

; ECX = num_points

mov

edx,ecx

; EDX = num_points

mov

eax,ecx

; EAX = num_points

shl

edx,2

; EDX = 4*num_points

shr

eax,4

; EAX = num_points/16

add

edi,edx

; EDI = -> end of "rcp_sqrt_r"

add

esi,edx

; ESI = -> end of "r"

neg

ecx

; ECX = -num_points (negative element index relative to the ends of the arrays)

or

eax,eax

; If num_points/16 = 0, then skip

;reciprocal square root.

jz skip_recprcl_sqrt_4xloop

; Unroll loop by 4 to work

;on 16 floats at a time.

;==============================================================================

;THIS LOOP RECIPROCATES AND SQUARE ROOTS 16 FLOATING-POINT NUMBERS EACH

;LOOP ITERATION AND WORKS WITH THOSE ELEMENTS OF "r" THAT OCCUPY A

;FULL CACHELINE

;==============================================================================

ALIGN 16

 

; Align address of loop to a 16-byte boundary.

reciprocal_sqrt_4xloop:

 

prefetchnta [esi+4*ecx+256]

; Prefetch the elements "r" 4 cache lines

 

 

; ahead to reciprocate and squareroot 4 loops

 

 

; from now.

movaps

xmm0, [esi+4*ecx]

; XMM0=[r3,r2,r1,r0]

sqrtps

xmm0, xmm0

; XMM0=[sqrtr3,sqrtr2,sqrtr1,sqrtr0]

rcpps

xmm0, xmm0

; XMM0=[1/sqrtr3,1/sqrtr2,1/sqrtr1,1/sqrtr0]

movaps

xmm1, [esi+4*ecx+16]

; XMM1=[r7,r6,r5,r4]

sqrtps

xmm1, xmm1

; XMM1=[sqrtr7,sqrtr6,sqrtr5,sqrtr4]

rcpps

xmm1, xmm1

; XMM1=[1/sqrtr7,1/sqrtr6,1/sqrtr5,1/sqrtr4]

movaps

xmm2, [esi+4*ecx+32]

; XMM2=[r11,r10,r9,r8]

sqrtps

xmm2, xmm2

; XMM2=[sqrtr11,sqrtr10,sqrtr9,sqrtr8]

rcpps

xmm2, xmm2

; XMM2=[1/sqrtr11,1/sqrtr10,1/sqrtr9,1/sqrtr8]

movaps

xmm3, [esi+4*ecx+48]

; XMM3=[r15,r14,r13,r12]

sqrtps

xmm3, xmm3

; XMM3=[sqrtr15,sqrtr14,sqrtr13,sqrtr12]

rcpps

xmm3, xmm3

; XMM3=[1/sqrtr15,1/sqrtr14,1/sqrtr13,1/sqrtr12]

movntps

[edi+4*ecx], xmm0

; Store reciprocal square root to rcp_sqrt_r.

212

Optimizing with SIMD Instructions

Chapter 9

Page 228
Image 228
AMD 250 manual 212