eax, foo edx, foo+4

25112 Rev. 3.06 September 2005 | Software Optimization Guide for AMD64 Processors

Preferred If Stores Are Close to the Load

movd       mm0, eax
mov        foo+4, edx
punpckldq  mm0, foo+4

Examples—Large-to-small Mismatches

Avoid large-to-small mismatches, as shown in the following code:

64-bit (Avoid)

foo DQ ?                      ; Assume foo is 8-byte aligned.
...
mov QWORD PTR foo, rax        ; Store a QWORD to foo.
mov eax, DWORD PTR foo        ; Load a DWORD from foo.
mov edx, DWORD PTR foo+4      ; Load a DWORD from foo+4.

32-bit (Avoid)

foo DQ ?                      ; Assume foo is 4-byte aligned.
...
fst QWORD PTR foo             ; Store a QWORD in foo.
mov eax, DWORD PTR foo        ; Load a DWORD from foo.
mov edx, DWORD PTR foo+4      ; Load a DWORD from foo+4.

Avoid

movq  foo, mm0
...
mov   eax, foo
mov   edx, foo+4

Preferred

movd    foo, mm0
pswapd  mm0, mm0
movd    foo+4, mm0
pswapd  mm0, mm0
...
mov     eax, foo
mov     edx, foo+4

Preferred If the Contents of MM0 are No Longer Needed

movd       foo, mm0
punpckhdq  mm0, mm0
movd       foo+4, mm0
...
mov        eax, foo
mov        edx, foo+4

Chapter 5

Cache and Memory Optimizations

93

Page 109
Image 109
AMD 250 manual Examples-Large-to-small Mismatches, Preferred If Stores Are Close to the Load