Solution
/*
 * Parallel_matrix_vector_prod
 *
 * Computes this process's slice of y = A * x.
 *
 * Every process contributes its local_n entries of x; MPI_Allgather
 * concatenates those pieces into global_x on every process.  Each of the
 * local_m locally stored rows of A is then multiplied by the gathered
 * vector to produce the matching entry of local_y.
 *
 * Parameters:
 *   local_A   rows of A held by this process; indexed as local_A[i][j]
 *             for 0 <= i < local_m and 0 <= j < n.
 *             NOTE(review): LOCAL_MATRIX_T is declared elsewhere --
 *             presumably a 2-D float array type; confirm.
 *   m, n      global row and column counts of A (m is not read here).
 *   local_x   this process's local_n entries of x.
 *   global_x  receive buffer for the gathered vector; overwritten by this
 *             call.  Must have room for local_n floats from every process
 *             in MPI_COMM_WORLD (n floats when local_n = n/p).
 *   local_y   output; entries 0 .. local_m-1 are overwritten.
 *   local_m   number of rows of A stored locally   (m/p).
 *   local_n   number of entries of x stored locally (n/p).
 *
 * This is a collective operation: every process in MPI_COMM_WORLD must
 * call it.  Assumes p divides both m and n evenly -- TODO confirm at the
 * call site.
 */
void Parallel_matrix_vector_prod(
         LOCAL_MATRIX_T  local_A     /* in      */,
         int             m           /* in      */,
         int             n           /* in      */,
         float           local_x[]   /* in      */,
         float           global_x[]  /* scratch */,
         float           local_y[]   /* out     */,
         int             local_m     /* in      */,
         int             local_n     /* in      */) {

    /* local_m = m/p, local_n = n/p */

    int i, j;

    (void) m;  /* kept for interface symmetry; only local_m is needed */

    /* Assemble the complete x vector on every process. */
    MPI_Allgather(local_x, local_n, MPI_FLOAT,
                  global_x, local_n, MPI_FLOAT,
                  MPI_COMM_WORLD);

    for (i = 0; i < local_m; i++) {
        /* Start each dot product from zero; local_y is output-only. */
        local_y[i] = 0.0f;
        for (j = 0; j < n; j++) {
            local_y[i] = local_y[i] +
                         local_A[i][j]*global_x[j];
        }
    }
}  /* Parallel_matrix_vector_prod */