25112 Rev. 3.06 September 2005

Software Optimization Guide for AMD64 Processors

before starting a copy, especially for large blocks. To write data directly to main memory, bypassing the cache, use the MOVNTI instruction instead of MOV for the four store instructions.

;-----------------------------------------------------------------------
; Copy memory in 32-byte blocks using 64-bit loads and stores.
; In:    rsi = source
;        rdi = destination
;        ecx = byte count
; Out:   rsi, rdi advanced by 32 * (ecx >> 5), i.e. past the last
;        whole 32-byte block copied
; Clobb: rax (written via eax), r8, r9, flags; ecx is not modified
; Note:  only whole 32-byte blocks are handled here; the trailing
;        (ecx AND 31) bytes are left for the code after done_32.
;        To write directly to main memory, bypassing the cache, the
;        four stores may use MOVNTI instead of MOV.
;-----------------------------------------------------------------------
        mov     eax, ecx
        shr     eax, 5                  ; eax = number of whole 32-byte blocks
        jz      done_32                 ; fewer than 32 bytes: nothing to do in the loop

        align   16                      ; align the loop to a 16-byte fetch boundary
copy_32_bytes:
        mov     r8, [rsi]               ; read 8 bytes (block offset 0)
        mov     r9, [rsi+8]             ; it's a bit faster to pair two reads
        add     rsi, 32                 ; update source pointer
        mov     [rdi], r8               ; store 8 bytes
        mov     [rdi+8], r9             ; again, pair 2 stores for slight perf gain
        add     rdi, 32                 ; update destination pointer
        mov     r8, [rsi-16]            ; block offset 16 (rsi already advanced by 32)
        mov     r9, [rsi-8]             ; block offset 24; 4 reads + 4 writes per pass
        dec     eax                     ; one 32-byte block done; sets ZF for the jnz below
        mov     [rdi-16], r8            ; store more bytes (mov leaves flags intact)
        mov     [rdi-8], r9             ; store last 8 bytes
        jnz     copy_32_bytes           ; loop while blocks remain (ZF from dec eax)

done_32:
        ; (copy any remaining bytes)

Chapter 5

Cache and Memory Optimizations

121

Page 137
Image 137
AMD 250 manual 121