204 Optimizing with SIMD Instructions Chapter 9
25112 Rev. 3.06 September 2005
Software Optimization Guide for AMD64 Processors
// again to the top of the column for the next dot-product of
// 4 rows of A upon B-transpose's row vector.
Btr_ptr -= 32;
// The addresses Aptr0, Aptr1, Aptr2, and Aptr3 need to be
// incremented to the next block of 4 rows of A to be multiplied
// upon B's column. 4 rows of A are 128 doubles in size, and in
// the n-loop above they were incremented by 32 already, so they
// must be incremented an additional 96 to point to the next
// 4 rows of A to be dotted.
Aptr0 += 96;
Aptr1 += 96;
Aptr2 += 96;
Aptr3 += 96;
// Issue a prefetch for the address currently held in Ctr_prefptr.
// NOTE(review): the hint value 2 presumably corresponds to
// _MM_HINT_T1 in <xmmintrin.h> -- confirm for the target compiler.
_mm_prefetch(&Ctr_prefptr[0], 2);
// The loop below "dots" 4 consecutive rows of A upon one row of
// B-transpose (i.e., one column of B). Each of its 8 iterations
// multiplies 4 elements from each of the 4 rows of A by the same
// 4 elements of B-transpose's row and accumulates the results into
// Ctr_ptr[0] through Ctr_ptr[3], so the 8 iterations together
// consume 32 elements of each row.
for (n = 0; n < 8; n++) {
// Partial dot product for the row of A addressed by Aptr0.
Ctr_ptr[0] += Aptr0[0]*Btr_ptr[0] + Aptr0[1]*Btr_ptr[1] +
Aptr0[2]*Btr_ptr[2] + Aptr0[3]*Btr_ptr[3];
// Partial dot product for the row of A addressed by Aptr1.
Ctr_ptr[1] += Aptr1[0]*Btr_ptr[0] + Aptr1[1]*Btr_ptr[1] +
Aptr1[2]*Btr_ptr[2] + Aptr1[3]*Btr_ptr[3];
// Partial dot product for the row of A addressed by Aptr2.
Ctr_ptr[2] += Aptr2[0]*Btr_ptr[0] + Aptr2[1]*Btr_ptr[1] +
Aptr2[2]*Btr_ptr[2] + Aptr2[3]*Btr_ptr[3];
// Partial dot product for the row of A addressed by Aptr3.
Ctr_ptr[3] += Aptr3[0]*Btr_ptr[0] + Aptr3[1]*Btr_ptr[1] +
Aptr3[2]*Btr_ptr[2] + Aptr3[3]*Btr_ptr[3];
// Advance the pointer into B-transpose's row and the pointers into
// A's 4 rows to the next 4 elements to be multiplied and
// accumulated.
Btr_ptr += 4;
Aptr0 += 4;
Aptr1 += 4;
Aptr2 += 4;
Aptr3 += 4;
}
// The addresses to prefetch in B-transpose and C-transpose
// are incremented by 8 doubles, or 64 bytes, or 1 cache line.
// Each of the 4 loops of Ctr_8col_blck above brings in a new
// set of 8 doubles, so that after 4 loops the whole of the next
// column of B and the next set of 8 elements of C to be
// determined have also been brought into the cache.
Btr_prefptr += 8;
Ctr_prefptr += 8;
// The pointer to C-transpose is incremented by 4 doubles
// to address the next 4 elements of C-transpose's row to be
// determined.
Ctr_ptr += 4;
// The pointer to B-transpose now points to the end of the present
// row (a column of B). We need to subtract 32 doubles so Btr_ptr
// points again to the start of that row for the next dot-product
// of 4 rows of A upon B-transpose's row vector.