22007E/0 — November 1999

AMD Athlon™ Processor x86 Code Optimization

; ---------------------------------------------------------------------
; Vertex transform loop: for each vertex v, computes the 4-component
; result of multiplying the row vector (x, y, z, w) by the 4x4 matrix m
; and stores it to res.
;
; Register roles inside the loop (as used by the instructions below):
;   EAX = base address used for the matrix loads [EAX+Mrc]
;   EDX = pointer to the current input vertex  (advanced by 16 per pass)
;   EBX = pointer to the current result vertex (advanced by 16 per pass)
;   ECX = remaining vertex count
;   MM0-MM7 = scratch
;
; M00, M02, M10, ... M32 are symbols defined outside this excerpt;
; presumably the byte offsets of the matrix element pairs -- confirm
; against their definitions.
;
; Comment notation: each MMX register holds two 32-bit floats and is
; shown as "high low".
;
; NOTE(review): the loop is bottom-tested (DEC ECX / JNZ), so the body
; runs at least once. Any guard against a zero vertex count must be in
; the code preceding this excerpt -- confirm.
;
; The instruction order is kept exactly as published; the interleaving
; of loads, multiplies and adds is part of the original scheduling.
; ---------------------------------------------------------------------
$$xform:
    ADD       EBX, 16                    ;res++ (stores below use negative offsets)
    MOVQ      MM0, QWORD PTR [EDX]       ;v->y v->x
    MOVQ      MM1, QWORD PTR [EDX+8]     ;v->w v->z
    ADD       EDX, 16                    ;v++
    MOVQ      MM2, MM0                   ;v->y v->x
    MOVQ      MM3, QWORD PTR [EAX+M00]   ;m[0][1] m[0][0]
    PUNPCKLDQ MM0, MM0                   ;v->x v->x
    MOVQ      MM4, QWORD PTR [EAX+M10]   ;m[1][1] m[1][0]
    PFMUL     MM3, MM0                   ;v->x*m[0][1] v->x*m[0][0]
    PUNPCKHDQ MM2, MM2                   ;v->y v->y
    PFMUL     MM4, MM2                   ;v->y*m[1][1] v->y*m[1][0]
    MOVQ      MM5, QWORD PTR [EAX+M02]   ;m[0][3] m[0][2]
    MOVQ      MM7, QWORD PTR [EAX+M12]   ;m[1][3] m[1][2]
    MOVQ      MM6, MM1                   ;v->w v->z
    PFMUL     MM5, MM0                   ;v->x*m[0][3] v->x*m[0][2]
    MOVQ      MM0, QWORD PTR [EAX+M20]   ;m[2][1] m[2][0]
    PUNPCKLDQ MM1, MM1                   ;v->z v->z
    PFMUL     MM7, MM2                   ;v->y*m[1][3] v->y*m[1][2]
    MOVQ      MM2, QWORD PTR [EAX+M22]   ;m[2][3] m[2][2]
    PFMUL     MM0, MM1                   ;v->z*m[2][1] v->z*m[2][0]
    PFADD     MM3, MM4                   ;v->x*m[0][1]+v->y*m[1][1]
                                         ; v->x*m[0][0]+v->y*m[1][0]
    MOVQ      MM4, QWORD PTR [EAX+M30]   ;m[3][1] m[3][0]
    PFMUL     MM2, MM1                   ;v->z*m[2][3] v->z*m[2][2]
    PFADD     MM5, MM7                   ;v->x*m[0][3]+v->y*m[1][3]
                                         ; v->x*m[0][2]+v->y*m[1][2]
    MOVQ      MM1, QWORD PTR [EAX+M32]   ;m[3][3] m[3][2]
    PUNPCKHDQ MM6, MM6                   ;v->w v->w
    PFADD     MM3, MM0                   ;v->x*m[0][1]+v->y*m[1][1]+v->z*m[2][1]
                                         ; v->x*m[0][0]+v->y*m[1][0]+v->z*m[2][0]
    PFMUL     MM4, MM6                   ;v->w*m[3][1] v->w*m[3][0]
    PFMUL     MM1, MM6                   ;v->w*m[3][3] v->w*m[3][2]
    PFADD     MM5, MM2                   ;v->x*m[0][3]+v->y*m[1][3]+v->z*m[2][3]
                                         ; v->x*m[0][2]+v->y*m[1][2]+v->z*m[2][2]
    PFADD     MM3, MM4                   ;v->x*m[0][1]+v->y*m[1][1]+v->z*m[2][1]+
                                         ; v->w*m[3][1] v->x*m[0][0]+v->y*m[1][0]+
                                         ; v->z*m[2][0]+v->w*m[3][0]
    MOVQ      [EBX-16], MM3              ;store res->y res->x
    PFADD     MM5, MM1                   ;v->x*m[0][3]+v->y*m[1][3]+v->z*m[2][3]+
                                         ; v->w*m[3][3] v->x*m[0][2]+v->y*m[1][2]+
                                         ; v->z*m[2][2]+v->w*m[3][2]
    MOVQ      [EBX-8], MM5               ;store res->w res->z
    DEC       ECX                        ;numverts--
    JNZ       $$xform                    ;until numverts == 0
    FEMMS                                ;clear MMX state

}

}

Optimized Matrix Multiplication

121

Page 137
Image 137
AMD x86 manual Optimized Matrix Multiplication 121