25112 Rev. 3.06 September 2005

Software Optimization Guide for AMD64 Processors

;

; Destroys: EAX, ECX, EDX, EFlags

_ullrem

PROC

 

 

 

push

ebx

 

; Save EBX as per calling convention.

mov

ecx, [esp+20]

; divisor_hi

mov

ebx, [esp+16]

; divisor_lo

mov

edx, [esp+12]

; dividend_hi

mov

eax, [esp+8]

; dividend_lo

test

ecx, ecx

 

; divisor > 2^32 - 1?

jnz

r_big_divisor

; Yes, divisor > 2^32 - 1.

cmp

edx, ebx

 

; Only one division needed (ECX = 0)?

jae

r_two_divs

 

; Need two divisions.

div

ebx

 

; EAX = quotient_lo

mov

eax, edx

 

; EAX = remainder_lo

mov

edx, ecx

 

; EDX = remainder_hi = 0

pop

ebx

 

; Restore EBX per calling convention.

ret

 

 

; Done, return to caller.

r_two_divs:

 

 

 

mov ecx, eax

; Save dividend_lo in ECX.

mov eax, edx

; Get dividend_hi.

xor edx, edx

; Zero-extend it into EDX:EAX.

div ebx

; EAX = quotient_hi, EDX = intermediate remainder

mov eax, ecx

; EAX = dividend_lo

div ebx

; EAX = quotient_lo

mov eax, edx

; EAX = remainder_lo

xor edx, edx

; EDX = remainder_hi = 0

pop ebx

; Restore EBX as per calling convention.

ret

 

; Done, return to caller.

r_big_divisor:

 

 

 

push

edi

 

; Save EDI as per calling convention.

mov

edi, ecx

 

; Save divisor_hi.

shr

edx, 1

 

; Shift both divisor and dividend right

rcr

eax, 1

 

; by 1 bit.

ror

edi, 1

 

 

 

rcr

ebx, 1

 

 

 

bsr

ecx, ecx

 

; ECX = number of remaining shifts

shrd

ebx, edi, cl

; Scale down divisor and dividend such

shrd

eax, edx, cl

; that divisor is less than 2^32

shr

edx, cl

 

; (that is, it fits in EBX).

rol

edi, 1

 

; Restore original divisor_hi in EDI.

div

ebx

 

; Compute quotient.

mov

ebx, [esp+12]

; dividend low word (dividend_lo)

mov

ecx, eax

 

; Save quotient.

imul

edi, eax

 

; quotient * divisor high word (low 32 bits only)

mul

DWORD PTR [esp+20] ;

quotient * divisor low word

add

edx, edi

 

; EDX:EAX = quotient * divisor

sub

ebx, eax

 

; dividend_lo - (quot.*divisor)_lo

mov

ecx, [esp+16]

; dividend_hi

Chapter 8

Integer Optimizations

175

Page 191
Image 191
AMD 250 manual 175