25112 Rev. 3.06 September 2005

Software Optimization Guide for AMD64 Processors

mov [esp+4*ebx+12], esi ; Move into address esp+4*ebx the single-precision
mov [esp+4*ebx+8], edi  ; floating-point sign mask.
mov [esp+4*ebx+4], esi
mov [esp+4*ebx], edi
;==============================================================================

;THE 4 ASM LINES BELOW LOAD THE FUNCTION'S ARGUMENTS INTO GENERAL-PURPOSE
;REGISTERS (GPRS)
;esi = address of array "x"
;edi = address of array "y"
;ecx = # of cmplx products to compute
;eax = address of product to which results are stored
;==============================================================================

mov esi, [ebp+8]  ; esi = ->x
mov edi, [ebp+12] ; edi = ->y
mov ecx, [ebp+16] ; ecx = num_cmplx_elem
mov eax, [ebp+20] ; eax = ->prod

;==============================================================================
;THE 6 ASM LINES BELOW OFFSET THE ADDRESS TO THE ARRAYS x[] AND y[] SUCH
;THAT THEY CAN BE ACCESSED IN THE MOST EFFICIENT MANNER AS ILLUSTRATED
;BELOW IN THE LOOP eight_cmplx_prod_loop WITH THE MINIMUM NUMBER OF
;ADDRESS INCREMENTS
;==============================================================================

mov edx, ecx ; edx = num_cmplx_elem
neg ecx      ; ecx = -num_cmplx_elem
shl edx, 3   ; edx = 8 * num_cmplx_elem = # bytes in x[] and y[] to multiply
add esi, edx ; esi = -> end of x[] to multiply (just past the last element)
add edi, edx ; edi = -> end of y[] to multiply (just past the last element)
add eax, edx ; eax = -> end of prod[] to calculate

 

;==============================================================================
;THIS LOOP MULTIPLIES 4 COMPLEX #s FROM "x[]" UPON 4 COMPLEX #s FROM "y[]"
;AND RETURNS THE PRODUCT IN "prod[]".
;==============================================================================
ALIGN 16 ; Align address of loop to a 16-byte boundary.

eight_cmplx_prod_loop:

 

 

movaps xmm0, [esi+ecx*8]    ; xmm0=[x1i,x1r,x0i,x0r]
movaps xmm1, [esi+ecx*8+16] ; xmm1=[x3i,x3r,x2i,x2r]
movaps xmm4, [edi+ecx*8]    ; xmm4=[y1i,y1r,y0i,y0r]
movaps xmm5, [edi+ecx*8+16] ; xmm5=[y3i,y3r,y2i,y2r]
movaps xmm2, xmm0           ; xmm2=[x1i,x1r,x0i,x0r]
movaps xmm3, xmm1           ; xmm3=[x3i,x3r,x2i,x2r]
movaps xmm6, xmm4           ; xmm6=[y1i,y1r,y0i,y0r]
movaps xmm7, xmm5           ; xmm7=[y3i,y3r,y2i,y2r]

shufps xmm0, xmm0, 10100000b ; xmm0=[x1r,x1r,x0r,x0r]
shufps xmm1, xmm1, 10100000b ; xmm1=[x3r,x3r,x2r,x2r]
shufps xmm2, xmm2, 11110101b ; xmm2=[x1i,x1i,x0i,x0i]
shufps xmm3, xmm3, 11110101b ; xmm3=[x3i,x3i,x2i,x2i]
xorps xmm6, [esp+4*ebx]      ; xmm6=[-y1i,y1r,-y0i,y0r]
xorps xmm7, [esp+4*ebx]      ; xmm7=[-y3i,y3r,-y2i,y2r]

mulps xmm0, xmm4             ; xmm0=[x1r*y1i,x1r*y1r,x0r*y0i,x0r*y0r]
mulps xmm1, xmm5             ; xmm1=[x3r*y3i,x3r*y3r,x2r*y2i,x2r*y2r]
shufps xmm7, xmm7, 10110001b ; xmm7=[y3r,-y3i,y2r,-y2i]

Chapter 9

Optimizing with SIMD Instructions

223

Page 239
Image 239
AMD 250 manual 223