AMD Athlon™ Processor x86 Code Optimization

22007E/0 — November 1999

$xfer:

 

movq

mm0, [eax]

add

edx, 64

movq

mm1, [eax+8]

add

eax, 64

movq

mm2, [eax-48]

movq

[edx-64], mm0

movq

mm0, [eax-40]

movq

[edx-56], mm1

movq

mm1, [eax-32]

movq

[edx-48], mm2

movq

mm2, [eax-24]

movq

[edx-40], mm0

movq

mm0, [eax-16]

movq

[edx-32], mm1

movq

mm1, [eax-8]

movq

[edx-24], mm2

movq

[edx-16], mm0

dec

ecx

movq

[edx-8], mm1

jnz

$xfer

femms

 

}

 

/*
 * Block fill (destination QWORD aligned).
 *
 * Writes the 8-byte pattern loaded from fill_data repeatedly into the
 * buffer addressed by dst_ptr, 64 bytes (eight MOVQ stores) per loop
 * iteration.
 *
 * C variables referenced from the inline assembly:
 *   dst_ptr   - destination address, loaded into edx
 *   blk_size  - block size in bytes; shifted right by 6 to obtain the
 *               number of 64-byte iterations, so any remainder below
 *               64 bytes is not written by this loop
 *   fill_data - 64-bit fill pattern, loaded into mm0
 *
 * Registers written: edx, ecx, mm0, EFLAGS.
 *
 * NOTE(review): the loop is bottom-tested (stores first, then DEC/JNZ).
 * If blk_size is below 64, ecx is 0 on entry and DEC wraps it around,
 * so the loop does not stop after one pass. Callers presumably
 * guarantee blk_size >= 64 -- confirm.
 */
__asm {
    mov     edx, [dst_ptr]          ; edx = current destination address
    mov     ecx, [blk_size]
    shr     ecx, 6                  ; ecx = blk_size / 64 = iteration count
    movq    mm0, [fill_data]        ; mm0 = 8-byte fill pattern

    align 16                        ; align the loop entry point

$fill:
    movq    [edx], mm0              ; bytes  0..7  of the 64-byte chunk
    movq    [edx+8], mm0            ; bytes  8..15
    movq    [edx+16], mm0           ; bytes 16..23
    movq    [edx+24], mm0           ; bytes 24..31
    movq    [edx+32], mm0           ; bytes 32..39
    movq    [edx+40], mm0           ; bytes 40..47
    add     edx, 64                 ; advance to the next chunk; the two
                                    ;   stores below use negative offsets
    movq    [edx-16], mm0           ; bytes 48..55 of the current chunk
    dec     ecx                     ; sets ZF for the JNZ below
    movq    [edx-8], mm0            ; bytes 56..63; MOVQ leaves EFLAGS
                                    ;   unchanged, so ZF from DEC survives
    jnz     $fill                   ; repeat while chunks remain

    femms                           ; leave MMX state (3DNow! fast EMMS)
}

 

116

Use MMX™ Instructions for Block Copies and Block Fills

Page 132
Image 132
AMD x86 manual 116