Software Optimization Guide for AMD64 Processors

25112 Rev. 3.06 September 2005

4.The four 4-bit sums can now be rapidly accumulated by multiplying with a so-called magic multiplier. This can be derived from looking at the following chart of partial products:

0p0q0r0s * 01010101 =

:0p0q0r0s

0p:0q0r0s

0p0q:0r0s

0p0q0r:0s

000pxxww:vvuutt0s

Here p, q, r, and s are the 4-bit sums from the previous step, and vv is the final interesting result. The final result is as follows:

z = (y * 0x01010101) >> 24

Integer Version

unsigned int popcount(unsigned int v)

{

unsigned int

retVal;

 

 

__asm {

 

 

 

 

mov

eax,

[v]

; v

 

mov

edx,

eax

; v

 

shr

eax,

1

; v >> 1

and

eax,

055555555h

; (v >> 1) & 0x55555555

sub

edx,

eax

; w = v - ((v >> 1) & 0x55555555)

mov

eax,

edx

; w

 

shr

edx,

2

; w >> 2

and

eax,

033333333h

; w & 0x33333333

and

edx,

033333333h

; (w >> 2) & 0x33333333

add

eax,

edx

; x = (w & 0x33333333) + ((w >> 2) &

 

 

 

;

0x33333333)

mov

edx,

eax

; x

 

shr

eax,

4

; x >> 4

add

eax,

edx

; x + (x >> 4)

and

eax,

00F0F0F0Fh

; y = (x + (x >> 4) & 0x0F0F0F0F)

imul

eax,

001010101h

; y * 0x01010101

shr

eax,

24

; population count = (y *

 

 

 

;

0x01010101) >> 24

mov

retVal, eax

; Store result.

}

return(retVal);

}

180

Integer Optimizations

Chapter 8

Page 196
Image 196
AMD 250 manual Integer Version, 180