Software Optimization Guide for AMD64 Processors

25112 Rev. 3.06 September 2005

mulps xmm2, xmm6

; xmm2=[x1i*y1r,-x1i*y1i,x0i*y0r,-x0i*y0i]

mulps xmm3, xmm7

; xmm3=[x3i*y3r,-x3i*y3i,x2i*y2r,-x2i*y2i]

addps xmm0, xmm2

; xmm0=[x1r*y1i+x1i*y1r,x1r*y1r-x1i*y1i,

;       x0r*y0i+x0i*y0r,x0r*y0r-x0i*y0i]

addps xmm1, xmm3

; xmm1=[x3r*y3i+x3i*y3r,x3r*y3r-x3i*y3i,

;       x2r*y2i+x2i*y2r,x2r*y2r-x2i*y2i]

movntps [eax+ecx*8], xmm0     ; Stream XMM0 and XMM1 to representative

movntps [eax+ecx*8+16], xmm1  ; memory address of prod[].

add ecx, 4                    ; ECX = ECX + 4

jnz eight_cmplx_prod_loop

 

 

sfence

; Finish all memory writes.

;==============================================================================

;INSTRUCTIONS BELOW RESTORE THE REGISTER STATE WITH WHICH THIS ROUTINE WAS

;ENTERED

;REGISTERS EAX, ECX, AND EDX ARE CONSIDERED VOLATILE AND ASSUMED TO BE CHANGED

;WHILE THE REGISTERS BELOW MUST BE PRESERVED IF THE USER IS CHANGING THEM

add esp, 32

pop edi

pop esi

pop ebx

mov esp, ebp

pop ebp

;==============================================================================

ret

_cmplx_multiply_sse ENDP

_TEXT ENDS

END

Listing 26. Complex Multiplication of Streams of Complex Numbers (3DNow!™ Technology)

;cmplx_multiply_3dnow(float *x, float *y, int num_cmplx_elem, float *prod);

;TO ASSEMBLE INTO *.obj DO THE FOLLOWING:

;ml.exe -coff -c cmplx_multiply_3dnow.asm

;

.586

.K3D

.XMM

_TEXT SEGMENT

PUBLIC _cmplx_multiply_3dnow

;cmplx_multiply_3dnow(float *x, float *y, int num_cmplx_elem, float *prod);

;

;TO ASSEMBLE INTO *.obj DO THE FOLLOWING:

;ml.exe -coff -c cmplx_multiply_3dnow.asm

.586

.K3D

.XMM

_TEXT SEGMENT

PUBLIC _cmplx_multiply_3dnow

224

Optimizing with SIMD Instructions

Chapter 9

Page 240
Image 240
AMD 250 manual 224