; Finish all memory writes.

25112 Rev. 3.06 September 2005

Software Optimization Guide for AMD64 Processors

; Tail of the 16-floats-per-iteration loop. The reciprocal_sqrt_4xloop label
; and the code that fills XMM1-XMM3 are defined outside this excerpt.
; ECX is the running float index shared with reciprocal_sqrt_1xloop; the exit
; check at skip_recprcl_sqrt_4xloop stops when ECX reaches 0, so adding to ECX
; reduces the amount of work left.
        movntps [edi+4*ecx+16], xmm1        ; Store reciprocal square root to rcp_sqrt_r.
        movntps [edi+4*ecx+32], xmm2        ; Store reciprocal square root to rcp_sqrt_r.
        movntps [edi+4*ecx+48], xmm3        ; Store reciprocal square root to rcp_sqrt_r.
        add     ecx, 16                     ; Advance the index by 16 floats, i.e. 16 fewer
                                            ; reciprocal square roots left to calculate.
        dec     eax                         ; Decrement # of 16-float reciprocal square
                                            ; root loops to perform by 1.
        jnz     reciprocal_sqrt_4xloop
        jmp     skip_recprcl_sqrt_4xloop    ; Jump into loop to calculate reciprocal
                                            ; square root of floats that don't
                                            ; occupy a full cache line.

 

;==============================================================================
; THIS LOOP RECIPROCATES AND SQUARE ROOTS 1 FLOATING POINT NUMBER EACH
; LOOP ITERATION
;
; ESI+4*ECX addresses the source float and EDI+4*ECX the destination float.
; ECX is incremented once per float and the loop exits when it reaches 0, so
; ECX is expected to be <= 0 on entry (NOTE(review): register setup is outside
; this excerpt -- confirm against the prologue).
;==============================================================================
ALIGN 16                                    ; Align address of loop to a 16-byte boundary.
reciprocal_sqrt_1xloop:
        movss   xmm0, [esi+4*ecx]           ; XMM0=[,,,r0]
        sqrtss  xmm0, xmm0                  ; XMM0=[,,,sqrt(r0)]
        rcpss   xmm0, xmm0                  ; XMM0=[,,,1/sqrt(r0)] (RCPSS is an approximation)
        movss   [edi+4*ecx], xmm0           ; Store reciprocal square root to rcp_sqrt_r.
        inc     ecx                         ; Step the index toward 0: one fewer
                                            ; reciprocal square root to calculate.
skip_recprcl_sqrt_4xloop:                   ; Entry point used after the 16-float loop.
        test    ecx, ecx                    ; If ECX != 0, then calculate the reciprocal
                                            ; square root of another float. TEST sets the
                                            ; same flags as OR ecx, ecx without writing ECX.
        jnz     reciprocal_sqrt_1xloop

; Loop exit: fence the stores, restore the saved registers and return.
        sfence                              ; Finish all memory writes (orders the MOVNTPS
                                            ; stores issued by the 16-float loop).
;==============================================================================
; INSTRUCTIONS BELOW RESTORE THE REGISTER STATE WITH WHICH THIS ROUTINE
; WAS ENTERED.
; REGISTERS EAX, ECX, AND EDX ARE CONSIDERED VOLATILE AND ASSUMED TO BE CHANGED,
; WHILE THE REGISTERS BELOW MUST BE PRESERVED IF THE USER IS CHANGING THEM
; NOTE(review): the pop order presumably mirrors the pushes in the prologue,
; which is outside this excerpt -- confirm.
;==============================================================================
        pop     edi
        pop     esi
        pop     ebx
        mov     esp, ebp                    ; Discard any stack space below the frame pointer.
        pop     ebp
;==============================================================================
        ret
_reciprocal_sqrt_sse ENDP
_TEXT   ENDS
END

The preceding code illustrates the use of separate loops for optimal performance. The loop titled reciprocal_sqrt_4xloop works with 16 floating-point numbers in each iteration and is unrolled to keep the processor busy by masking the latencies of the reciprocal and square-root instructions. In

Chapter 9

Optimizing with SIMD Instructions

213

Page 229
Image 229
AMD 250 manual 213