Software Optimization Guide for AMD64 Processors | 25112 Rev. 3.06 September 2005 |
The following functions use SSE and 3DNow! instructions to illustrate complex multiplication of streams of complex numbers x[] and y[], with the results stored in a product stream prod[]. For these examples, assume that the sizes of x[] and y[] are even multiples of four.
Examples
Listing 25. Complex Multiplication of Streams of Complex Numbers (SSE)
;cmplx_multiply_sse(float *x, float *y, int num_cmplx_elem, float *prod);
;TO ASSEMBLE INTO *.obj DO THE FOLLOWING:
;ml.exe
;
.586
.K3D
.XMM
_TEXT SEGMENT
PUBLIC _cmplx_multiply_sse
_cmplx_multiply_sse PROC NEAR ;==============================================================================
;INSTRUCTIONS BELOW SAVE THE REGISTER STATE WITH WHICH THIS ROUTINE WAS ENTERED
;REGISTERS (EAX, ECX, EDX ARE CONSIDERED VOLATILE AND ASSUMED TO BE CHANGED)
;WHILE THE REGISTERS BELOW MUST BE PRESERVED IF THE USER IS CHANGING THEM
push ebp
mov ebp, esp ;==============================================================================
;parameters passed into routine:
;[ebp+8] = x (float *)
;[ebp+12] = y (float *)
;[ebp+16] = num_cmplx_elem
;[ebp+20] = prod (float *)
push ebx ; preserve contents in ebx, esi, and edi on stack
push esi ;
push edi ;
;===============================================================================
;THE CODE BELOW PUTS THE FLOATING POINT SIGN MASK
;[80000000000000008000000000000000h]
;TO FLIP THE SIGN OF PACKED SINGLE PRECISION NUMBERS BY USING XORPS ;==============================================================================
mov eax, esp ; Copy stack pointer into EAX.
mov ebx, 16
sub esp, 32 ; Subtract 32 bytes from stack pointer.
and eax, 15 ; AND old stack pointer address with 15 to
; determine # of bytes the address is past a
; 16-byte-aligned address.
sub ebx, eax ; EBX = # of bytes above ESP to next
; 16-byte-aligned address.
mov edi, 0h ; EDI = 00000000h
mov esi, 80000000h ; ESI = 80000000h
shr ebx, 2 ; EBX = # of DWORDs past
222 | Optimizing with SIMD Instructions | Chapter 9 |