Software Optimization Guide for AMD64 Processors

25112 Rev. 3.06 September 2005

;    xmm4 = [d,c,b,a]
;    xmm5 = [D,C,B,A]
;    xmm6 = [h,g,f,e]
;    xmm7 = [H,G,F,E]
;
; and arranges them to look like:
;
;    xmm4 = [E,e,A,a]
;    xmm1 = [F,f,B,b]
;    xmm3 = [G,g,C,c]
;    xmm2 = [H,h,D,d]

; Interleave the elements of xmm4..xmm7 so that corresponding elements of
; each input end up spread across xmm4, xmm1, xmm3 and xmm2.
; Each comment shows the destination register after the instruction,
; written highest DWORD first (rightmost element is the low DWORD).
movaps   xmm3, xmm4             ; xmm3 = [d,c,b,a]  (copy; xmm4 is overwritten next)
movaps   xmm0, xmm5             ; xmm0 = [D,C,B,A]  (copy; xmm5 is overwritten below)
unpcklps xmm4, xmm6             ; xmm4 = [f,b,e,a]
unpckhps xmm3, xmm6             ; xmm3 = [h,d,g,c]
movaps   xmm1, xmm4             ; xmm1 = [f,b,e,a]
movaps   xmm2, xmm3             ; xmm2 = [h,d,g,c]
unpcklps xmm5, xmm7             ; xmm5 = [F,B,E,A]
unpckhps xmm0, xmm7             ; xmm0 = [H,D,G,C]
unpcklps xmm4, xmm5             ; xmm4 = [E,e,A,a]
unpckhps xmm1, xmm5             ; xmm1 = [F,f,B,b]
unpcklps xmm3, xmm0             ; xmm3 = [G,g,C,c]
unpckhps xmm2, xmm0             ; xmm2 = [H,h,D,d]

; Now if we compute the sum of these registers, we get the dot-product
; of the first row of A with vector X:
;
;    a+b+c+d
;
; in the lower DWORD of the resultant XMM register. The dot-product of the
; second row is stored in the second DWORD and so on, such that:
;
;    xmm1 = [E+F+G+H,e+f+g+h,A+B+C+D,a+b+c+d]

; Add the four interleaved registers (xmm4, xmm1, xmm3, xmm2) element-wise.
; Two independent partial sums are formed first, then combined into xmm1.
addps    xmm1, xmm4             ; xmm1 = [E+F,e+f,A+B,a+b]
addps    xmm3, xmm2             ; xmm3 = [G+H,g+h,C+D,c+d]
addps    xmm1, xmm3             ; xmm1 = [E+F+G+H,e+f+g+h,A+B+C+D,a+b+c+d]

220

Optimizing with SIMD Instructions

Chapter 9

Page 236
Image 236
AMD 250 manual 220