我正在编写一个MPI程序,该程序解决了N皇后区问题,进程0应该部分解决该问题,并让其他进程完成它。我的程序进行了编译,但在创建结构类型并尝试发送结构时,它向我显示了分段错误。这发生在spawn_processes
函数中(由进程0执行,因此segfault发生在进程0中),同时尝试发送subProblemType
。
这是我的代码:
#include <stdio.h>
#include <mpi.h>
#define MAXBOARDSIZE 8
static int board_size=8;
typedef struct boardplus{
int size; // board size
int x; // this is where we need to restart
int y; // idem
int board[MAXBOARDSIZE]; // the actual board, padded to largest instance
} subproblem;
#define INIT 1 // Message to client: subproblem
#define DATA 2 // Message from client with results
#define EXIT 4 // Message from client with CPU time
// Also to client, giving permission to exit
static long int N_solutions;
int solution_count;
void spawn_processes(int rows[board_size],int y){
//printf("_______________________");
subproblem subP;
int col,//column number to start from
count,//number of solutions recieved from a worker
nProc,//total number of processes
proc,
nActive,i;// number of active processes
MPI_Status status;
MPI_Datatype subProblemType;
MPI_Datatype type[4]={MPI_INT,MPI_INT,MPI_INT,MPI_INT};
int block_len[4]={1,1,1,MAXBOARDSIZE};
MPI_Aint disp[4];
MPI_Address(&subP,disp);
MPI_Address(&subP.x,disp+1);
MPI_Address(&subP.y,disp+2);
MPI_Address(&subP.board,disp+3);
int base=disp[0];
for(i=0;i<4;i++) disp[i]-=base;
MPI_Type_create_struct(4,block_len,disp,type,&subProblemType);
MPI_Type_commit(&subProblemType);
MPI_Comm_size(MPI_COMM_WORLD,&nProc);
subP.size=board_size;
subP.y=y;
//subP.board=rows;
for(i=0;i<y;i++)subP.board[i]=rows[i];
printf("spawning processes ....n");
for(col=0,proc=1;proc<nProc && col<board_size;proc++, col++){
printf("sending to process %d n ",proc);
fflush(stdout);
subP.x=col;
MPI_Send(&subP,1,subProblemType,proc,INIT,MPI_COMM_WORLD);
}
nActive=proc-1;
// Receive back results and send out new problems
while(col<board_size){
MPI_Recv(&count,1,MPI_INT,MPI_ANY_SOURCE,DATA,MPI_COMM_WORLD,&status);
proc=status.MPI_SOURCE;
printf("recieved from process %d, found %d solutions n",proc,count);
N_solutions+=count;
subP.x=col++;
MPI_Send(&subP,1,subProblemType,proc,INIT,MPI_COMM_WORLD);
}
// Finally, receive back pending results and send termination
// indication (message with size of zero).
subP.size=0;
while(nActive>0){
MPI_Recv(&count,1,MPI_INT,MPI_ANY_SOURCE,DATA,MPI_COMM_WORLD,&status);
proc=status.MPI_SOURCE;
printf("recieved from process %d, found %d solutions n",proc,count);
--nActive;
N_solutions+=count;
//send a subproblem with size=0 (termination messages)
MPI_Send(&subP,1,subProblemType,proc,INIT,MPI_COMM_WORLD);
}
for (proc = 1; proc < nProc; proc++)
MPI_Send(&proc, 0, MPI_INT, proc, EXIT, MPI_COMM_WORLD);
}
void process_queens(int my_id){
int root=0;
subproblem subP;
MPI_Status status;
int rows[board_size];
int x,y,i;
MPI_Datatype subProblemType;
MPI_Datatype type[4]={MPI_INT,MPI_INT,MPI_INT,MPI_INT};
int block_len[4]={1,1,1,MAXBOARDSIZE};
MPI_Aint disp[4];
MPI_Address(&subP,disp);
MPI_Address(&subP.x,disp+1);
MPI_Address(&subP.y,disp+2);
MPI_Address(&subP.board,disp+3);
int base=disp[0];
for(i=0;i<4;i++) disp[i]-=base;
MPI_Type_create_struct(4,block_len,disp,type,&subProblemType);
MPI_Type_commit(&subProblemType);
printf("process %d waiting to recieve a taskn",my_id);
fflush(stdout);
MPI_Recv(&subP,1,subProblemType,root,INIT,MPI_COMM_WORLD,&status);
while(subP.size>0){
x=subP.x;
y=subP.y;
for(i=0;i<y;i++)rows[i]=subP.board[i];
//rows=subP.board;
if(is_safe(rows,x,y)){
rows[y]=x;
n_queens_solver(rows,y+1);
}
MPI_Send(&N_solutions,1,MPI_INT,root,DATA,MPI_COMM_WORLD);
}
// Final hand-shake: get permission to terminate
MPI_Recv(&N_solutions, 0, MPI_INT, 0, EXIT, MPI_COMM_WORLD, &status);
}
int is_safe(int rows[board_size], int x, int y)
{
int i;
if (y == 0)
return 1;
for (i=0; i < y; ++i) {
if (rows[i] == x || rows[i] == x + y - i || rows[i] == x - y +i)
return 0;
}
return 1;
}
void n_queens_solver(int rows[board_size], int y)
{
int x;
for (x=0; x < board_size; ++x) {
if (is_safe(rows, x, y)) {
rows[y] = x;
if (y == board_size-1) {
++N_solutions;
}
else
n_queens_solver(rows, y+1);
}
}
}
void n_queens_expander(int rows[board_size], int y, int expand_levels)
{
int x;
if(y == expand_levels-1){
spawn_processes(rows,y);
}
else{
for (x=0; x < board_size; ++x) {
if (is_safe(rows, x, y))
{
rows[y] = x;
n_queens_expander(rows, y+1, expand_levels-1);
}
}
}
}
int main(int argc,char *argv[]) {
int rows[board_size];
//int expand_levels=1;
int numproc,my_id;
MPI_Status status;
MPI_Init(&argc,&argv);
MPI_Comm_size(MPI_COMM_WORLD,&numproc);
MPI_Comm_rank(MPI_COMM_WORLD,&my_id);
//printf("number of processes:%d n",numproc);
if(my_id==0){
//printf("process 0 starting...n");
n_queens_expander(rows,0,1);
}
else{
process_queens(my_id);
}
MPI_Finalize();
return 0;
}
您的错误是正确的。。。
MPI_Address(&subP,disp);
MPI_Address(&subP.x,disp+1);
MPI_Address(&subP.y,disp+2);
MPI_Address(&subP.board,disp+3);
int base=disp[0]; // <--------------------- HERE
for(i=0;i<4;i++) disp[i]-=base;
在LP64系统(包括运行OS X、FreeBSD、Solaris或Linux的64位x86系统)上,MPI_Aint
为8字节长,而int
仅为4字节长。subP
是一个堆栈变量,x64上主线程的堆栈位于虚拟地址空间的高处,因此在分配给base
时会发生截断,计算的位移与实际位移无关。
解决方案:base
的类型应为MPI_Aint
。
解决方案2:
for(i=1;i<4;i++) disp[i]-=disp[0];
disp[0] = 0;
所有列中都存在相同的问题,但由于主崩溃且不发送数据,工作线程中的MPI_Recv
从不向内存写入,因此它们不会出错。
请记住始终使用-Wall
进行编译,并注意编译器产生的警告消息。
虽然我不熟悉MPI,但直觉和一些快速的谷歌搜索表明,你需要在结构中为该数组声明一个类型,而不是将其作为整数传递。*将其作为整型传递可能只会将指向数组中第一个项的指针发送到远程机器,然后可能是远程机器上的某个伪内存地址,当远程机器使用该地址时,会导致segfault。
看看这个答案:为包含指针的结构创建MPI_Datatype。你也许可以摆脱这个基本想法。请注意,从简短的略读来看,Hristo Iliev在该线程中的响应可能是您的程序结构上更好的方法。
*我假设这就是MPI_Datatype type[4]={MPI_INT,MPI_INT,MPI_INT,MPI_INT};
行的意思
你有这个:
static int board_size=8;
typedef struct boardplus{
int size; // board size
int x; // this is where we need to restart
int y; // idem
int board[MAXBOARDSIZE]; // the actual board, padded to largest instance
} subproblem;
并制作这样的类型:
MPI_Datatype subProblemType;
MPI_Datatype type[4]={MPI_INT,MPI_INT,MPI_INT,MPI_INT};
int block_len[4]={1,1,1,MAXBOARDSIZE};
MPI_Aint disp[4];
MPI_Address(&subP,disp);
MPI_Address(&subP.x,disp+1);
MPI_Address(&subP.y,disp+2);
MPI_Address(&subP.board,disp+3);
int base=disp[0];
for(i=0;i<4;i++) disp[i]-=base;
MPI_Type_create_struct(4,block_len,disp,type,&subProblemType);
我对你对位移的修正持怀疑态度。更常见的是,处理MPI_Address类型的事情的方法是使用MPI_BOTTOM作为缓冲区。因此,您的发送不是发送subP,而是如下所示:
MPI_Send(MPI_BOTTOM,1,subProblemType,proc,INIT,MPI_COMM_WORLD);