224 Optimizing with SIMD Instructions Chapter 9
25112 Rev. 3.06 September 2005
Software Optimization Guide for AMD64 Processors
mulps xmm2, xmm6 ; xmm2=[x1i*y1r,-x1i*y1i,x0i*y0r,-x0i*y0i]
mulps xmm3, xmm7 ; xmm3=[x3i*y3r,-x3i*y3i,x2i*y2r,-x2i*y2i]
addps xmm0, xmm2 ; xmm0=[x1r*y1i+x1i*y1r,x1r*y1r-x1i*y1i,
; x0r*y0i+x0i*y0r,x0r*y0r-x0i*y0i]
addps xmm1, xmm3 ; xmm1=[x3r*y3i+x3i*y3r,x3r*y3r-x3i*y3i,
; x2r*y2i+x2i*y2r,x2r*y2r-x2i*y2i]
movntps [eax+ecx*8], xmm0 ; Stream XMM0 and XMM1 to representative
movntps [eax+ecx*8+16], xmm1 ; memory address of prod[].
add ecx, 4 ; ECX = ECX + 4
jnz eight_cmplx_prod_loop
sfence ; Finish all memory writes.
;==============================================================================
; THE INSTRUCTIONS BELOW RESTORE THE REGISTER STATE WITH WHICH THIS ROUTINE
; WAS ENTERED.
; REGISTERS EAX, ECX, AND EDX ARE CONSIDERED VOLATILE AND ARE ASSUMED TO BE
; CHANGED; THE REGISTERS RESTORED BELOW (EDI, ESI, EBX, EBP) MUST BE PRESERVED
; IF THE ROUTINE CHANGES THEM.
add esp, 32
pop edi
pop esi
pop ebx
mov esp, ebp
pop ebp
;==============================================================================
ret
_cmplx_multiply_sse ENDP
_TEXT ENDS
END

Listing 26. Complex Multiplication of Streams of Complex Numbers (3DNow!™ Technology)

; cmplx_multiply_3dnow(float *x, float *y, int num_cmplx_elem, float *prod);
;
; TO ASSEMBLE INTO *.obj DO THE FOLLOWING:
; ml.exe -coff -c cmplx_multiply_3dnow.asm
;
.586
.K3D
.XMM
_TEXT SEGMENT
PUBLIC _cmplx_multiply_3dnow
;cmplx_multiply_3dnow(float *x, float *y, int num_cmplx_elem, float *prod);
;
; TO ASSEMBLE INTO *.obj DO THE FOLLOWING:
; ml.exe -coff -c cmplx_multiply_3dnow.asm
;
.586
.K3D
.XMM
_TEXT SEGMENT
PUBLIC _cmplx_multiply_3dnow