Hy
我编写了一个 MPI 程序,它将矩阵划分为网格,然后将网格分散在 CPU 之间。这是一个矩阵-矩阵-乘法。我的程序运行良好并输出正确的结果,至少有时是这样。
有时我几乎在开始时就收到中止陷阱 6 错误(在代码中标记),有时我在循环中遇到分段错误 11,我将矩阵从行主顺序重新排列为允许我分散网格的顺序(也在代码中标记)。我还遇到了一些BusError10。错误大部分时间发生在我标记的代码点,但有时也会发生在其他地方。
我真的很绝望,因为它有时会起作用,当它抛出错误时,它甚至不是同一个错误,也不是在代码中的同一点,这是我真的没有得到的。
我还认为,只有当我一个接一个地多次运行程序时,才更有可能发生错误。
你看到我的错误了吗?
这是代码:(很多,但我用长行标记了错误部分)
int main(int argc, char **argv) {
//Initializing communication....
MPI_Init(&argc, &argv);
int size = atoi(argv[1]);
int delta = 10;
int world_rank;
int world_size;
int root = 0;
// MPI_Status mystatus;
MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);
MPI_Comm_size(MPI_COMM_WORLD, &world_size);
// Calculate sqrt of world size
int root_of_worldsize = sqrt((double)world_size);
if (world_rank == root) {
printf("The square-root of the worldsize is %dn", root_of_worldsize);
}
// Setup for initializing groups
int row_rank_a, column_rank_b;
int **rowranks = malloc(root_of_worldsize*sizeof(int*));
int **columnranks = malloc(root_of_worldsize*sizeof(int*));
for (int i = 0; i < root_of_worldsize; i++) {
rowranks[i] = malloc(root_of_worldsize*sizeof(int));
columnranks[i] = malloc(root_of_worldsize*sizeof(int));
for (int j = 0; j < root_of_worldsize ; j++) {
rowranks[i][j] = (i*root_of_worldsize + j);
columnranks[i][j] = (j*root_of_worldsize + i);
}
}
//printing rank array
if (world_rank == root) {
printf("Colum ranks: ");
printf("[");
for (int i = 0; i < root_of_worldsize; i++) {
printf("[");
for (int j = 0; j < root_of_worldsize; j++) {
printf("%d, ", columnranks[i][j]);
}
printf("]");
}
printf("]n");
}
if (world_rank == root) {
printf("Row ranks: ");
printf("[");
for (int i = 0; i < root_of_worldsize; i++) {
printf("[");
for (int j = 0; j < root_of_worldsize; j++) {
printf("%d, ", rowranks[i][j]);
}
printf("]");
}
printf("]n");
}
MPI_Group world_group, rows_groupa, columns_groupb;
MPI_Comm rowa_comm, columb_comm;
//Get world group handle...
MPI_Comm_group(MPI_COMM_WORLD, &world_group);
//check compatibility of size and number of processors
assert(size % world_size == 0);
// Create groups
for (int i = 0; i < root_of_worldsize; i++) {
if (i*root_of_worldsize <= world_rank && world_rank < (i+1)*root_of_worldsize) {
//printf("Rank %d; I am getting assigned to the %d row group.n", world_rank, i+1);
MPI_Group_incl(world_group, root_of_worldsize, rowranks[i], &rows_groupa);
}
if (world_rank % root_of_worldsize == i) {
//printf("Rank %d; I am getting assigned to the %d column group.n", world_rank, i+1);
MPI_Group_incl(world_group, root_of_worldsize, columnranks[i], &columns_groupb);
}
if (world_rank == root) {
printf("n");
}
}
// Create new communicators
MPI_Comm_create(MPI_COMM_WORLD, rows_groupa, &rowa_comm);
MPI_Comm_create(MPI_COMM_WORLD, columns_groupb, &columb_comm);
// Get respective group ranks
MPI_Group_rank(rows_groupa, &row_rank_a);
MPI_Group_rank(columns_groupb, &column_rank_b);
printf("worldrank = %d; rowrank = %d; columnrank = %dn", world_rank, row_rank_a, column_rank_b);
double *matrixA;
double *matrixB;
int chunk_size = size / root_of_worldsize;
if (world_rank == root) {
printf("Chunk size: %dn",chunk_size);
printf("Root of worldsize: %dn", root_of_worldsize);
}
MPI_Barrier(MPI_COMM_WORLD);
if (world_rank == root) {
// Create two matrices
printf("Creating matrices...n");
double *matrixA_i = malloc(size*size*sizeof(double));
double *matrixB_i = malloc(size*size*sizeof(double));
double **matrixA_2d = malloc(root_of_worldsize*sizeof(double*));
for (int i = 0; i < size; i++) {
matrixA_2d[i] = malloc(chunk_size*chunk_size*sizeof(double));
}
double **matrixB_2d = malloc(root_of_worldsize*sizeof(double*));
for (int i = 0; i < size; i++) {
matrixB_2d[i] = malloc(chunk_size*chunk_size*sizeof(double));
}
srand(1234);
for (int i = 0; i < size; i++) {
for (int j = 0; j < size; j++) {
matrixA_i[i*size + j] = rand() % delta + 1;
}
}
srand(2345);
for (int i = 0; i < size; i++) {
for (int j = 0; j < size; j++) {
matrixB_i[i*size + j] = rand() % delta + 1;
}
}
-----------------Abort trap 6 is happening around here or also at the every end of programm------------------------------------------------------
printf("Created matrices.n");
printf("Matrix B:n");
print_contiguous_matrix_array(matrixB_i, size);
printf("Matrix A:n");
print_contiguous_matrix_array(matrixA_i, size);
-----------------In this for loop is the Seg11 fault------------------------------------------------------
// Rearrange the matrix to a "major-row-grid"-matrix
printf("Rearranging matrices for grid scatteringn");
int k = 0;
int j = 0;
int l = 0;
for (int i = 0; i < (size*size); i++) {
if (i == 0) {
//Insert:
//printf("Counters: k->%d, l->%d, j->%dn",k,l,j);
matrixA_2d[k+root_of_worldsize*j][(i%chunk_size) + l*chunk_size] = matrixA_i[i];
//printf("Writing on: [%d][%d]n", k+root_of_worldsize*j, i - ((j*chunk_size*chunk_size*root_of_worldsize)+(l*chunk_size*root_of_worldsize)+(k*chunk_size)));
matrixB_2d[k+root_of_worldsize*j][(i%chunk_size) + l*chunk_size] = matrixB_i[i];
} else {
if (i % chunk_size == 0) {
k++;
if (k > (root_of_worldsize-1)) {
k = 0;
}
//printf("Raised k, k->%dn", k);
}
// Strip counter:
if (i % (chunk_size*chunk_size*root_of_worldsize) == 0) {
j++;
//printf("Raised j, j->%dn", j);
}
// line counter:
if (i % (chunk_size*root_of_worldsize) == 0) {
l++;
if (l > (chunk_size-1)) {
l = 0;
}
//printf("Raised l, l->%dn", l);
}
//Insert:
printf("Counters: k->%d, l->%d, j->%d; i->%dn",k,l,j,i);
matrixA_2d[k+root_of_worldsize*j][(i%chunk_size) + l*chunk_size] = matrixA_i[i];
printf("Writing on: [%d][%d]n", k+root_of_worldsize*j, (i%chunk_size) + l*chunk_size);
matrixB_2d[k+root_of_worldsize*j][(i%chunk_size) + l*chunk_size] = matrixB_i[i];
}
}
free(matrixA_i);
free(matrixB_i);
// 2d to 1d array
//printf("2d A: ");
//print_matrix(matrixA_2d, size, size);
//printf("2d B: ");
//print_matrix(matrixB_2d, size, size);
//Two to one dimensional
printf("converting from to to one dimensionaln");
int counter = 0;
matrixB = malloc(size*size*sizeof(double));
matrixA = malloc(size*size*sizeof(double));
for (int i = 0; i < world_size; i++) {
for (int j = 0; j < chunk_size; j++) {
for (int k = 0; k < chunk_size; k++) {
matrixA[counter] = matrixA_2d[i][j*chunk_size + k];
matrixB[counter] = matrixB_2d[i][j*chunk_size + k];
counter++;
}
}
}
//free 2d
for (int q = 0; q < root_of_worldsize; q++) {
free(matrixA_2d[q]);
free(matrixB_2d[q]);
}
free(matrixB_2d);
free(matrixA_2d);
//printf("Rearranged B ");
//print_contiguous_matrix_array(matrixB, size);
//printf("Rearranged A ");
//print_contiguous_matrix_array(matrixA, size);
}
MPI_Barrier(MPI_COMM_WORLD);
//Scatter....
double *matrixA_chunk = malloc(chunk_size*chunk_size*sizeof(double));
double *matrixB_chunk = malloc(chunk_size*chunk_size*sizeof(double));
double *matrixA_tmp_chunk = malloc(chunk_size*chunk_size*sizeof(double));
double *matrixB_tmp_chunk = malloc(chunk_size*chunk_size*sizeof(double));
double *result_chunk = calloc(chunk_size*chunk_size, sizeof(double));
MPI_Scatter(matrixA, chunk_size*chunk_size, MPI_DOUBLE, matrixA_chunk, chunk_size*chunk_size, MPI_DOUBLE, root, MPI_COMM_WORLD);
MPI_Scatter(matrixB, chunk_size*chunk_size, MPI_DOUBLE, matrixB_chunk, chunk_size*chunk_size, MPI_DOUBLE, root, MPI_COMM_WORLD);
for (int z = 0; z < root_of_worldsize; z++) {
if (row_rank_a == z) {
matrixA_tmp_chunk = matrixA_chunk;
}
MPI_Bcast(matrixA_tmp_chunk, chunk_size*chunk_size, MPI_DOUBLE, z, rowa_comm);
/*if (world_rank == 0) {
printf("temporary A: ");
print_contiguous_matrix_array(matrixA_tmp_chunk, chunk_size);
}*/
if (column_rank_b == z) {
matrixB_tmp_chunk = matrixB_chunk;
MPI_Bcast(matrixB_tmp_chunk, chunk_size*chunk_size, MPI_DOUBLE, z, columb_comm);
}
MPI_Bcast(matrixB_tmp_chunk, chunk_size*chunk_size, MPI_DOUBLE, z, columb_comm);
printf("Iteration: %d; Rank %d; row_rank %d; temporary A matrix: %f, %f, %f, %fn", z, world_rank, row_rank_a, matrixA_tmp_chunk[0], matrixA_tmp_chunk[1], matrixA_tmp_chunk[2], matrixA_tmp_chunk[3]);
/*if (world_rank == 0) {
printf("temporary B: ");
print_contiguous_matrix_array(matrixB_tmp_chunk, chunk_size);
}*/
//calculate
for (int i = 0; i < chunk_size; i++) {
for (int j = 0; j < chunk_size; j++) {
for (int k = 0; k < chunk_size; k++) {
result_chunk[j*chunk_size + i] += (matrixA_tmp_chunk[j*chunk_size + k] * matrixB_tmp_chunk[k*chunk_size + i]);
}
}
}
MPI_Barrier(MPI_COMM_WORLD);
}
double *final_result;
double *contiguous_final_result = NULL;
if (world_rank == root) {
final_result = malloc(size*size*sizeof(double));
contiguous_final_result = malloc(size*size*sizeof(double));
}
MPI_Gather(result_chunk, chunk_size*chunk_size, MPI_DOUBLE, final_result, chunk_size*chunk_size, MPI_DOUBLE, root, MPI_COMM_WORLD);
if (world_rank == root) {
printf("final result major grid: ");
print_contiguous_matrix_array(final_result, size);
}
// Rearrange gridded matrix to row major matrix
if (world_rank == root) {
int l2 = 0;
int k2 = 0;
int s2 = 0;
for (int i = 0; i < (size*size); i++) {
if (i == 0) {
contiguous_final_result[(i%chunk_size) + l2*size + s2*size*chunk_size + k2*chunk_size] = final_result[i];
printf("Access values: i->%d; l->%d; s->%d; k->%d; total->%dn", i, l2, s2, k2, (i%chunk_size) + l2*size + s2*size*chunk_size + k2*chunk_size);
}
else {
if (i % chunk_size == 0) {
l2++;
if (l2 > (chunk_size-1)) {
l2 = 0;
}
}
if (i % (chunk_size*chunk_size*root_of_worldsize) == 0) {
s2++;
}
if (i % (chunk_size*chunk_size) == 0) {
k2++;
if (k2 > (root_of_worldsize-1)) {
k2 = 0;
}
}
contiguous_final_result[(i%chunk_size) + l2*size + s2*size*chunk_size + k2*chunk_size] = final_result[i];
printf("Access values: i->%d; l->%d; s->%d; k->%d; total->%dn", i, l2, s2, k2, (i%chunk_size) + l2*size + s2*size*chunk_size + k2*chunk_size);
}
}
}
if (world_rank == root) {
printf("Row major result: ");
print_contiguous_matrix_array(contiguous_final_result, size);
}
//free!!!!!!
if (world_rank == root) {
free(matrixA);
free(matrixB);
free(final_result);
free(contiguous_final_result);
}
free(matrixA_chunk);
free(matrixB_chunk);
free(result_chunk);
MPI_Finalize();
return 0;
}
提前非常感谢!
问题很可能是这两个分配:
int **rowranks = malloc(root_of_worldsize*sizeof(int));
int **columnranks = malloc(root_of_worldsize*sizeof(int));
在这里,您将变量声明为基本上是指针数组,但不为指针分配内存。如果 int
的大小小于 int*
的大小(通常在所有现代 64 位系统上都是这样),这将导致未定义的行为。
我发现了问题!正是在代码的这一部分:
double *matrixA_i = malloc(size*size*sizeof(double));
double *matrixB_i = malloc(size*size*sizeof(double));
double **matrixA_2d = malloc(root_of_worldsize*sizeof(double*));
for (int i = 0; i < size; i++) {
matrixA_2d[i] = malloc(chunk_size*chunk_size*sizeof(double));
}
double **matrixB_2d = malloc(root_of_worldsize*sizeof(double*));
for (int i = 0; i < size; i++) {
matrixB_2d[i] = malloc(chunk_size*chunk_size*sizeof(double));
}
我没有为 2d 阵列分配正确的大小,非常感谢 @Joachim Pileborg,您的回答让我走上了正确的道路,寻找什么!!