Implementing Fox’s Algorithm
void Fox(int n, GRID_INFO_T* grid,
LOCAL_MATRIX_T* local_C) {
/* Storage for the sub-matrix of A */
/* Calculate addresses for circular shift of B */
source = (grid->my_row + 1) % grid->q;
dest = (grid->my_row + grid->q - 1) % grid->q;
/* Storage for the broadcast block of A */
temp_A = Local_matrix_allocate(n_bar);
for (stage = 0; stage < grid->q; stage++) {
bcast_root = (grid->my_row + stage) % grid->q;
if (bcast_root == grid->my_col) {
MPI_Bcast(local_A, 1, local_matrix_mpi_t,
bcast_root, grid->row_comm);
Local_matrix_multiply(local_A, local_B,
MPI_Bcast(temp_A, 1, local_matrix_mpi_t,
bcast_root, grid->row_comm);
Local_matrix_multiply(temp_A, local_B,
MPI_Sendrecv_replace(local_B, 1,
local_matrix_mpi_t,dest, 0, source, 0,
grid->col_comm, &status);