AMD Athlon™ Processor x86 Code Optimization

22007E/0 — November 1999

;-----------------------------------------------------------------------
; Multiply two large arrays of doubles element by element:
;       a[i] = b[i] * c[i]
; unrolled 8x (8 doubles = 64 bytes of each array per iteration), with
; 3DNow! PREFETCH/PREFETCHW issued ahead of the current position.
;
; Syntax: MASM, 32-bit x86 with x87 FPU and 3DNow! prefetch.
;
; Register roles:
;   EAX = address of array_a (destination)
;   EDX = address of array_b (source)
;   EBX = address of array_c (source)
;   ECX = biased index: starts at -LARGE_NUM and counts up to 0
;
; array_c has its own base register (EBX) because ECX carries the loop
; index; loading the array_c address into ECX would destroy the index
; and make [ECX+ECX*8+...] address garbage.
; NOTE(review): EBX is callee-saved in the usual 32-bit calling
; conventions -- save/restore it if this fragment is placed inside a
; procedure.
;
; array_a/array_b/array_c, LARGE_NUM and ARR_SIZE are defined outside
; this fragment. The [base+ECX*8+ARR_SIZE] addressing only lands on
; element i if ARR_SIZE == LARGE_NUM*8 (array size in bytes) -- confirm
; against the definitions.
; LARGE_NUM must be a non-zero multiple of 8: ECX advances by 8 and the
; loop ends only when ECX is exactly 0.
;-----------------------------------------------------------------------

CACHELINE_BYTES EQU     64                      ;bytes consumed per array per iteration
PREFETCH_AHEAD  EQU     2*CACHELINE_BYTES       ;prefetch two cachelines ahead

        MOV     ECX, (-LARGE_NUM)               ;used biased index
        MOV     EAX, OFFSET array_a             ;get address of array_a
        MOV     EDX, OFFSET array_b             ;get address of array_b
        MOV     EBX, OFFSET array_c             ;get address of array_c

$loop:
        ; The prefetch addresses include the running index so that they
        ; move forward with the loop instead of hitting the same lines
        ; on every iteration.
        PREFETCHW [EAX+ECX*8+ARR_SIZE+PREFETCH_AHEAD]   ;a: two cachelines ahead (will be written)
        PREFETCH  [EDX+ECX*8+ARR_SIZE+PREFETCH_AHEAD]   ;b: two cachelines ahead
        PREFETCH  [EBX+ECX*8+ARR_SIZE+PREFETCH_AHEAD]   ;c: two cachelines ahead

        FLD     QWORD PTR [EDX+ECX*8+ARR_SIZE]          ;b[i]
        FMUL    QWORD PTR [EBX+ECX*8+ARR_SIZE]          ;b[i]*c[i]
        FSTP    QWORD PTR [EAX+ECX*8+ARR_SIZE]          ;a[i] = b[i]*c[i]

        FLD     QWORD PTR [EDX+ECX*8+ARR_SIZE+8]        ;b[i+1]
        FMUL    QWORD PTR [EBX+ECX*8+ARR_SIZE+8]        ;b[i+1]*c[i+1]
        FSTP    QWORD PTR [EAX+ECX*8+ARR_SIZE+8]        ;a[i+1] = b[i+1]*c[i+1]

        FLD     QWORD PTR [EDX+ECX*8+ARR_SIZE+16]       ;b[i+2]
        FMUL    QWORD PTR [EBX+ECX*8+ARR_SIZE+16]       ;b[i+2]*c[i+2]
        FSTP    QWORD PTR [EAX+ECX*8+ARR_SIZE+16]       ;a[i+2] = b[i+2]*c[i+2]

        FLD     QWORD PTR [EDX+ECX*8+ARR_SIZE+24]       ;b[i+3]
        FMUL    QWORD PTR [EBX+ECX*8+ARR_SIZE+24]       ;b[i+3]*c[i+3]
        FSTP    QWORD PTR [EAX+ECX*8+ARR_SIZE+24]       ;a[i+3] = b[i+3]*c[i+3]

        FLD     QWORD PTR [EDX+ECX*8+ARR_SIZE+32]       ;b[i+4]
        FMUL    QWORD PTR [EBX+ECX*8+ARR_SIZE+32]       ;b[i+4]*c[i+4]
        FSTP    QWORD PTR [EAX+ECX*8+ARR_SIZE+32]       ;a[i+4] = b[i+4]*c[i+4]

        FLD     QWORD PTR [EDX+ECX*8+ARR_SIZE+40]       ;b[i+5]
        FMUL    QWORD PTR [EBX+ECX*8+ARR_SIZE+40]       ;b[i+5]*c[i+5]
        FSTP    QWORD PTR [EAX+ECX*8+ARR_SIZE+40]       ;a[i+5] = b[i+5]*c[i+5]

        FLD     QWORD PTR [EDX+ECX*8+ARR_SIZE+48]       ;b[i+6]
        FMUL    QWORD PTR [EBX+ECX*8+ARR_SIZE+48]       ;b[i+6]*c[i+6]
        FSTP    QWORD PTR [EAX+ECX*8+ARR_SIZE+48]       ;a[i+6] = b[i+6]*c[i+6]

        FLD     QWORD PTR [EDX+ECX*8+ARR_SIZE+56]       ;b[i+7]
        FMUL    QWORD PTR [EBX+ECX*8+ARR_SIZE+56]       ;b[i+7]*c[i+7]
        FSTP    QWORD PTR [EAX+ECX*8+ARR_SIZE+56]       ;a[i+7] = b[i+7]*c[i+7]

        ADD     ECX, 8                          ;next 8 products
        JNZ     $loop                           ;until none left (ECX reached 0)

        END

48

Use the 3DNow!™ PREFETCH and PREFETCHW

Page 64
Image 64
AMD x86 manual Mov Ecx, -Largenum