212

25112 Rev. 3.06 September 2005

;[ebp+8] = ->r

;[ebp+12] = ->rcp_sqrt_r

;[ebp+16] = num_points

;==============================================================================

; Preserve the registers this routine saves on entry, one instruction per
; line. ESI and EDI are overwritten by the argument loads that follow.
push	ebx
push	esi
push	edi

;==============================================================================

;THE FIRST 3 ASM LINES BELOW LOAD THE FUNCTION'S ARGUMENTS INTO GENERAL-PURPOSE

;REGISTERS (GPRS)

;esi = address of "r"'s to calculate the reciprocal square root of

;edi = address of "rcp_sqrt_r"'s to store reciprocal square root to

;ecx = num_points

;==============================================================================

mov	esi,[ebp+8]	; ESI = ->r
mov	edi,[ebp+12]	; EDI = ->rcp_sqrt_r
mov	ecx,[ebp+16]	; ECX = num_points
mov	edx,ecx	; EDX = num_points
mov	eax,ecx	; EAX = num_points
shl	edx,2	; EDX = 4*num_points (byte length of each array)
shr	eax,4	; EAX = num_points/16
add	edi,edx	; EDI = -> end of "rcp_sqrt_r"
add	esi,edx	; ESI = -> end of "r"
neg	ecx	; ECX = -num_points, so [esi+4*ecx] and [edi+4*ecx]
	; address the first element of each array.
or	eax,eax	; If num_points/16 = 0, then skip
	; reciprocal square root.
jz	skip_recprcl_sqrt_4xloop	; Unroll loop by 4 to work
	; on 16 floats at a time.

;==============================================================================

;THIS LOOP RECIPROCATES AND SQUARE ROOTS 16 FLOATING-POINT NUMBERS EACH

;LOOP ITERATION AND WORKS WITH THOSE ELEMENTS OF "r" THAT OCCUPY A

;FULL CACHELINE

;==============================================================================

; Each pass loads four groups of 4 packed floats from "r" (addressed as
; ESI + 4*ECX), takes the packed square root and then the packed reciprocal
; estimate of each group, and writes results to "rcp_sqrt_r" (EDI + 4*ECX).
; MOVAPS and MOVNTPS require 16-byte aligned memory operands, so both arrays
; must be 16-byte aligned.
ALIGN 16		; Align address of loop to a 16-byte boundary.
reciprocal_sqrt_4xloop:
prefetchnta [esi+4*ecx+256]		; Prefetch the elements "r" 4 cache lines
		; ahead to reciprocate and squareroot 4 loops
		; from now.
movaps	xmm0, [esi+4*ecx]	; XMM0=[r3,r2,r1,r0]
sqrtps	xmm0, xmm0	; XMM0=[sqrtr3,sqrtr2,sqrtr1,sqrtr0]
rcpps	xmm0, xmm0	; XMM0=[1/sqrtr3,1/sqrtr2,1/sqrtr1,1/sqrtr0]
movaps	xmm1, [esi+4*ecx+16]	; XMM1=[r7,r6,r5,r4]
sqrtps	xmm1, xmm1	; XMM1=[sqrtr7,sqrtr6,sqrtr5,sqrtr4]
rcpps	xmm1, xmm1	; XMM1=[1/sqrtr7,1/sqrtr6,1/sqrtr5,1/sqrtr4]
movaps	xmm2, [esi+4*ecx+32]	; XMM2=[r11,r10,r9,r8]
sqrtps	xmm2, xmm2	; XMM2=[sqrtr11,sqrtr10,sqrtr9,sqrtr8]
rcpps	xmm2, xmm2	; XMM2=[1/sqrtr11,1/sqrtr10,1/sqrtr9,1/sqrtr8]
movaps	xmm3, [esi+4*ecx+48]	; XMM3=[r15,r14,r13,r12]
sqrtps	xmm3, xmm3	; XMM3=[sqrtr15,sqrtr14,sqrtr13,sqrtr12]
rcpps	xmm3, xmm3	; XMM3=[1/sqrtr15,1/sqrtr14,1/sqrtr13,1/sqrtr12]
movntps	[edi+4*ecx], xmm0	; Store reciprocal square root to rcp_sqrt_r.

Optimizing with SIMD Instructions

Chapter 9

AMD 250 manual 212