Software Optimization Guide for AMD64 Processors | 25112 Rev. 3.06 September 2005 |
//again to the top of the column for the next
//4 rows of A to be dotted upon it.
Btr_ptr -= 32;
//The addresses Aptr0, Aptr1, Aptr2, and Aptr3 need to be
//incremented to the next block of 4 rows of A to be multiplied
//upon B's column. 4 rows of A are 128 doubles in size, and in
//the dot-product loop each pointer advances by only 32 doubles, so each
//must be incremented an additional 96 to point to the next
//4 rows of A to be dotted.
Aptr0 += 96; Aptr1 += 96; Aptr2 += 96; Aptr3 += 96;
_mm_prefetch(&Ctr_prefptr[0], 2);
//This loop below "dots" 4 consecutive rows of A upon a row
//of B transpose (Btr_ptr). Each of its 8 iterations
//multiplies and accumulates the products of 4 elements of A's
//rows with 4 elements of B transpose.
for (n = 0; n < 8; n++) {
Ctr_ptr[0] += Aptr0[0]*Btr_ptr[0] + Aptr0[1]*Btr_ptr[1] +
              Aptr0[2]*Btr_ptr[2] + Aptr0[3]*Btr_ptr[3];
Ctr_ptr[1] += Aptr1[0]*Btr_ptr[0] + Aptr1[1]*Btr_ptr[1] +
              Aptr1[2]*Btr_ptr[2] + Aptr1[3]*Btr_ptr[3];
Ctr_ptr[2] += Aptr2[0]*Btr_ptr[0] + Aptr2[1]*Btr_ptr[1] +
              Aptr2[2]*Btr_ptr[2] + Aptr2[3]*Btr_ptr[3];
Ctr_ptr[3] += Aptr3[0]*Btr_ptr[0] + Aptr3[1]*Btr_ptr[1] +
              Aptr3[2]*Btr_ptr[2] + Aptr3[3]*Btr_ptr[3];
//Increment pointers to B transpose's column and A's rows to
//the next 4 elements to be multiplied and accumulated.
Btr_ptr += 4; Aptr0 += 4; Aptr1 += 4; Aptr2 += 4; Aptr3 += 4;
}
//The addresses to prefetch in Btr_prefptr and Ctr_prefptr
//are incremented by 8 doubles, or 64 bytes, or 1 cache line.
//Each loop of the 4 loops of Ctr_8col_blck above brings in a
//new set of 8 doubles and after 4 loops the full column of the
//next column of B and the next set of 8 elements of C to be
//determined are also brought into the cache.
Btr_prefptr += 8;
Ctr_prefptr += 8;
//The pointer Ctr_ptr is incremented by 4 doubles
//to address the next 4 elements of C to be
//determined.
Ctr_ptr += 4;
//The pointer Btr_ptr has advanced 32 doubles, to the end of the current
//row. We need to subtract 32 doubles so Btr_ptr points again
//to the top of the column for the next 4 rows of A to be dotted
//upon it.
204 | Optimizing with SIMD Instructions | Chapter 9 |