22007E/0 — November 1999

AMD Athlon™ Processor x86 Code Optimization

Example 4 (Left shift):

;64-bit left shift of the value held in EDX:EAX by the count in ECX.
;The count is applied modulo 64.
;INPUT:    EDX:EAX = operand (EDX = high dword, EAX = low dword)
;          ECX     = shift count
;OUTPUT:   EDX:EAX = operand shifted left by (ECX mod 64)
;DESTROYS: EFlags
        SHLD    EDX, EAX, CL        ;shift EDX:EAX left by (count mod 32):
        SHL     EAX, CL             ; high dword takes bits leaving the low dword
        TEST    ECX, 32             ;is bit 5 of the count set?
        JZ      $lshift_done        ;no - total shift was below 32, finished
        MOV     EDX, EAX            ;yes - shift a further 32 bits:
        XOR     EAX, EAX            ; low dword moves up, low dword becomes 0
$lshift_done:

Example 5 (Right shift):

;shift operand in EDX:EAX right, shift count in ECX (count
;applied modulo 64); vacated high-order bits are filled with zeros
;INPUT:    EDX:EAX = operand (EDX = high dword, EAX = low dword)
;          ECX     = shift count
;OUTPUT:   EDX:EAX = operand shifted right by (ECX mod 64)
;DESTROYS: EFlags
        SHRD    EAX, EDX, CL        ;first apply shift count
        SHR     EDX, CL             ; mod 32 to EDX:EAX
        TEST    ECX, 32             ;need to shift by another 32?
        JZ      $rshift_done        ;no, done
        MOV     EAX, EDX            ;right shift EDX:EAX
        XOR     EDX, EDX            ; by 32 bits (high dword becomes 0)
$rshift_done:

Example 6 (Multiplication):

;-----------------------------------------------------------------------
;_llmul - low-order 64 bits of the product of two 64-bit integers
;
;INPUT:    [ESP+8]:[ESP+4]    multiplicand (high dword:low dword)
;          [ESP+16]:[ESP+12]  multiplier   (high dword:low dword)
;OUTPUT:   EDX:EAX = (multiplicand * multiplier) % 2^64
;DESTROYS: EAX, ECX, EDX, EFlags
;
;Returns with a plain RET, so the arguments stay on the stack.
;-----------------------------------------------------------------------
_llmul  PROC
        MOV     EDX, [ESP+8]        ;multiplicand_hi
        MOV     ECX, [ESP+16]       ;multiplier_hi
        OR      EDX, ECX            ;ZF=1 only when both high dwords are 0
        MOV     EDX, [ESP+12]       ;multiplier_lo   (MOV leaves the flags
        MOV     EAX, [ESP+4]        ;multiplicand_lo  from OR untouched)
        JNZ     $twomul             ;some high dword is nonzero: long path
        MUL     EDX                 ;EDX:EAX = multiplicand_lo * multiplier_lo
        RET                         ;both operands < 2^32: one multiply is enough

$twomul:
        IMUL    EDX, [ESP+8]        ;p3_lo = multiplicand_hi * multiplier_lo
        IMUL    ECX, EAX            ;p2_lo = multiplier_hi * multiplicand_lo
        ADD     ECX, EDX            ;ECX = p2_lo + p3_lo
        MUL     DWORD PTR [ESP+12]  ;EDX:EAX = p1 = multiplicand_lo * multiplier_lo
        ADD     EDX, ECX            ;p1 + p2_lo + p3_lo = result in EDX:EAX
        RET                         ;return to caller
_llmul  ENDP

Efficient 64-Bit Integer Arithmetic

87

Page 103
Image 103
AMD x86 manual Example 4 Left shift, Example 5 Right shift, Example 6 Multiplication