Software Optimization Guide for AMD64 Processors | 25112 Rev. 3.06 September 2005 |
    mulps   xmm2, xmm6
    mulps   xmm3, xmm7
    addps   xmm0, xmm2
    addps   xmm1, xmm3
    movntps [eax+ecx*8], xmm0       ; Stream XMM0 and XMM1 to representative
    movntps [eax+ecx*8+16], xmm1    ; memory address of prod[].
    add     ecx, 4                  ; ECX = ECX + 4
    jnz     eight_cmplx_prod_loop

    sfence                          ; Finish all memory writes.
;==============================================================================
;INSTRUCTIONS BELOW RESTORE THE REGISTER STATE WITH WHICH THIS ROUTINE WAS
;ENTERED
;REGISTERS EAX, ECX, AND EDX ARE CONSIDERED VOLATILE AND ASSUMED TO BE CHANGED
;WHILE THE REGISTERS BELOW MUST BE PRESERVED IF THE USER IS CHANGING THEM
    add     esp, 32
    pop     edi
    pop     esi
    pop     ebx
    mov     esp, ebp
    pop     ebp
;==============================================================================
    ret
_cmplx_multiply_sse ENDP
_TEXT ENDS
END
Listing 26. Complex Multiplication of Streams of Complex Numbers (3DNow!™ Technology)
;cmplx_multiply_3dnow(float *x, float *y, int num_cmplx_elem, float *prod);
;TO ASSEMBLE INTO *.obj DO THE FOLLOWING:
;ml.exe
;
.586
.K3D
.XMM
_TEXT SEGMENT
PUBLIC _cmplx_multiply_3dnow
;cmplx_multiply_3dnow(float *x, float *y, int num_cmplx_elem, float *prod);
;
;TO ASSEMBLE INTO *.obj DO THE FOLLOWING:
;ml.exe
.586
.K3D
.XMM
_TEXT SEGMENT
PUBLIC _cmplx_multiply_3dnow
224 | Optimizing with SIMD Instructions | Chapter 9 |