25112 Rev. 3.06 September 2005

Software Optimization Guide for AMD64 Processors

Btr_prefptr = Btr_ptr + 32; Ctr_prefptr = Ctr_ptr + 8;

//This loop cycles through the rows of the TRANSPOSED C matrix. A row

//of C-transpose is calculated by the code in this loop and then the

//next row is determined in the following loop iteration. There are

//32 rows in C-transpose.

for (Ctr_row_num = 0; Ctr_row_num < 32; Ctr_row_num++) {

//Assign pointers to 4 consecutive rows of A by using the

//address of matrix A passed into the function:

Aptr0 = A;

Aptr1 = Aptr0 + 32;

Aptr2 = Aptr0 + 64;

Aptr3 = Aptr0 + 96;

//This loop contains code that "dots" 8 rows of A upon the present row

//of B-transpose. By looping 4 times, all 32 rows of A are multiplied

//upon the present row of B-transpose.

for (Ctr_8col_blck = 0; Ctr_8col_blck < 4; Ctr_8col_blck++) {

//This instruction prefetches 1/4 of the next row of B-transpose

//upon which matrix A needs to be multiplied. The loop within which

//this code resides is executed 4 times, and by incrementing

//Btr_prefptr (the ptr to the address of B transpose to be

//prefetched) by 8 doubles (or 64 bytes, or 1 cache line) the entire

//contents of the next row of B-transpose are brought to the

//processor in advance when Ctr_row_num in the outer loop is

//incremented

_mm_prefetch(&Btr_prefptr[0], 2);

//This loop below "dots" 4 consecutive rows of A upon a row of

//B-transpose by looping 8 times through code that multiplies and

//accumulates the products of 4 elements of A's rows with 4

//elements of B-transpose's row.

for (n = 0; n

< 8; n++) {

 

Ctr_ptr[0]

+= Aptr0[0]*Btr_ptr[0] +

Aptr0[1]*Btr_ptr[1] +

 

Aptr0[2]*Btr_ptr[2] +

Aptr0[3]*Btr_ptr[3];

Ctr_ptr[1]

+= Aptr1[0]*Btr_ptr[0] +

Aptr1[1]*Btr_ptr[1] +

 

Aptr1[2]*Btr_ptr[2] +

Aptr1[3]*Btr_ptr[3];

Ctr_ptr[2]

+= Aptr2[0]*Btr_ptr[0] +

Aptr2[1]*Btr_ptr[1] +

 

Aptr2[2]*Btr_ptr[2] +

Aptr2[3]*Btr_ptr[3];

Ctr_ptr[3]

+= Aptr3[0]*Btr_ptr[0] +

Aptr3[1]*Btr_ptr[1] +

 

Aptr3[2]*Btr_ptr[2] +

Aptr3[3]*Btr_ptr[3];

//Increment pointers to B-transpose's row and A's rows to

//the next 4 elements to be multiplied and accumulated.

Btr_ptr += 4; Aptr0 += 4; Aptr1 += 4; Aptr2 += 4; Aptr3 += 4;

}

//The pointer to C-transpose is incremented by 4 doubles to

//address the next 4 elements of C-transpose's row to be determined.

Ctr_ptr += 4;

//The pointer to B transpose points to the end of the present

//row. We need to subtract 32 doubles so Btr_ptr points

Chapter 9

Optimizing with SIMD Instructions

203

Page 219
Image 219
AMD 250 manual 203