25112 Rev. 3.06 September 2005

Software Optimization Guide for AMD64 Processors

_cmplx_multiply_3dnow PROC NEAR

;==============================================================================

;INSTRUCTIONS BELOW SAVE THE REGISTER STATE WITH WHICH THIS ROUTINE WAS ENTERED

;REGISTERS EAX, ECX, EDX ARE CONSIDERED VOLATILE AND ASSUMED TO BE CHANGED

;WHILE THE REGISTERS BELOW MUST BE PRESERVED IF THE USER IS CHANGING THEM

push ebp

mov ebp, esp

;==============================================================================

;Parameters passed into routine:

;[ebp+8] = ->x

;[ebp+12] = ->y

;[ebp+16] = num_cmplx_elem

;[ebp+20] = ->prod

;==============================================================================

push ebx

push esi

push edi

;==============================================================================

;THE 4 ASM LINES BELOW LOAD THE FUNCTION's ARGUMENTS INTO GENERAL-PURPOSE

;REGISTERS (GPRS)

;esi = address of array "x"

;edi = address of array "y"

;ecx = # of cmplx products to compute

;eax = address of array "prod" to which the results are stored

;==============================================================================

mov esi, [ebp+8]

; esi = ->x

mov edi, [ebp+12]

; edi = ->y

mov ecx, [ebp+16]

; ecx = num_cmplx_elem

mov eax, [ebp+20]

; eax = ->prod

;==============================================================================

;THE 6 ASM LINES BELOW OFFSET THE ADDRESS TO THE ARRAYS x[] AND y[] SUCH

;THAT THEY CAN BE ACCESSED IN THE MOST EFFICIENT MANNER AS ILLUSTRATED

;BELOW IN THE LOOP four_cmplx_prod_loop WITH THE MINIMUM NUMBER OF

;ADDRESS INCREMENTS

;==============================================================================

mov edx, ecx

; edx = num_cmplx_elem

neg ecx

; ecx = -num_cmplx_elem

imul edx, 8

; edx = 8 * num_cmplx_elem = # bytes in x[] and y[] to multiply

add esi, edx

; esi = -> end of x[] to multiply

add edi, edx

; edi = -> end of y[] to multiply

add eax, edx

; eax = -> end of prod[] to calculate

;==============================================================================

;THIS LOOP MULTIPLIES 4 COMPLEX #s FROM "x[]" BY 4 COMPLEX #s FROM "y[]"

;AND RETURNS THE PRODUCTS IN "prod[]".

;==============================================================================

ALIGN 16

; Align address of loop to a 16-byte boundary.

four_cmplx_prod_loop:

movq mm0, QWORD PTR [esi+ecx*8]

; mm0=[x0i,x0r]

movq mm1, QWORD PTR [esi+ecx*8+8]

; mm1=[x1i,x1r]

movq mm2, QWORD PTR [esi+ecx*8+16]

; mm2=[x2i,x2r]

movq mm3, QWORD PTR [esi+ecx*8+24]

; mm3=[x3i,x3r]

Chapter 9

Optimizing with SIMD Instructions

225

Page 241
Image 241
AMD 250 manual 225