首先,我确定我是法国人,我的英语不是很好。
我正在研究MPI应用程序,我遇到了一些问题,我希望有人可以帮助我。
正如我帖子标题中所报告的,当我必须终止我的应用程序然后调用MPI_Finalize函数时,我尝试使用线程来侦听。
但是,我的应用程序没有完成更正。更准确地说,我得到以下消息:
[XPS-2720:27441] * 过程接收信号 *
[XPS-2720:27441] 信号:分段错误 (11)
[XPS-2720:27441] 信号代码:地址未映射 (1)
[XPS-2720:27441] 地址失败:0x7f14077a3b6d
[XPS-2720:27440] * 过程接收信号 *
[XPS-2720:27440] 信号:分段错误 (11)
[XPS-2720:27440] 信号代码:地址未映射 (1)
[XPS-2720:27440] 地址失败:0x7fb11d07bb6d
mpirun 注意到节点 lagniez-XPS-2720 上具有 PID 27440 的进程等级 1 在信号 11 上退出(分段错误)。
我的从属代码是:
#include "mpi.h"
#include <stdio.h>
#include <stdlib.h>
#include <signal.h>
#include <unistd.h>
#include <sys/types.h>
#include <pthread.h>
#include <cassert>
#define send_data_tag 1664
#define send_kill_tag 666
void *finilizeMPICom(void *intercomm)
{
printf("the finilizeMPICom was calledn");
MPI_Comm parentcomm = * ((MPI_Comm *) intercomm);
MPI_Status status;
int res;
// sleep(10);
MPI_Recv(&res, 1, MPI_INT, 0, send_kill_tag, parentcomm, &status);
int rank;
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
printf("we receive something %d -- %dn", rank, res);
MPI_Finalize();
exit(0);
}// finilizeMPICom
int main( int argc, char *argv[])
{
int numtasks, rank, len, rc;
char hostname[MPI_MAX_PROCESSOR_NAME];
int provided, claimed;
rc = MPI_Init_thread(0, 0, MPI_THREAD_MULTIPLE, &provided);
MPI_Query_thread( &claimed );
if (rc != MPI_SUCCESS || provided != 3)
{
printf ("Error starting MPI program. Terminating.n");
MPI_Abort(MPI_COMM_WORLD, rc);
}
MPI_Comm_rank(MPI_COMM_WORLD,&rank);
MPI_Comm parentcomm;
MPI_Comm_get_parent(&parentcomm);
/* create a second thread to listen when we have to kill the program */
pthread_t properlyKill;
if(pthread_create(&properlyKill, NULL, finilizeMPICom, (void *) &parentcomm))
{
fprintf(stderr, "Error creating threadn");
return 0;
}
assert(parentcomm != MPI_COMM_NULL);
MPI_Status status;
int root_process, ierr, num_rows_to_receive;
int mode;
MPI_Recv( &mode, 1, MPI_INT, 0, send_data_tag, parentcomm, &status);
printf("c The solver works in the mode %dn", mode);
printf("I sent a message %dn", rank);
// if(rank != 1) sleep(100);
int res = 1;
MPI_Send(&res, 1, MPI_INT, 0, send_data_tag, parentcomm);
printf("we want to listen for somethiing %dn", rank);
int rescc = 1;
MPI_Recv(&rescc, 1, MPI_INT, 0, send_data_tag, parentcomm, &status);
printf("I received the message %d %dn", rescc, rank);
if(rescc == 1000)
{
printf("~~~~~~~~>>> I print the solution %dn", rank);
int res3 = 1001;
MPI_Send(&res3, 1, MPI_INT, 0, send_data_tag, parentcomm);
}
else printf("I do not understand %dn", rank);
printf("I wait the thread to kill the programm %dn", rank);
pthread_join(properlyKill, (void**)&(res));
return 0;
}
对于主人,我有:
int main(int argc, char **argv)
{
Parser *p = new Parser("slave.xml");
MPI_Init(&argc, &argv);
if(p->method == "concurrent")
{
ConcurrentManager cc(p->instance, p->solvers);
cc.run();
}
else
{
cerr << "c The only available methods are: concurrent, eps (Embarrassingly Parallel Search) or tree" << endl;
exit(1);
}
delete(p);
MPI_Finalize();
exit(0);
}// main
/**
Create a concurrent manager (means init the data structures to run
the solvers).
@param[in] _instance, the benchmark path
@param[in] _solvers, the set of solvers that will be ran
*/
ConcurrentManager::ConcurrentManager(string _instance, vector<Solver> &_solvers) :
instance(_instance), solvers(_solvers)
{
cout << "cnc Concurrent manager called" << endl;
nbSolvers = _solvers.size();
np = new int[nbSolvers];
cmds = new char*[nbSolvers];
arrayOfArgs = new char **[nbSolvers];
infos = new MPI_Info[nbSolvers];
for(int i = 0 ; i<nbSolvers ; i++)
{
np[i] = solvers[i].npernode;
cmds[i] = new char[(solvers[i].executablePath).size() + 1];
strcpy(cmds[i], (solvers[i].executablePath).c_str());
arrayOfArgs[i] = new char *[(solvers[i].options).size() + 1];
for(unsigned int j = 0 ; j<(solvers[i].options).size() ; j++)
{
arrayOfArgs[i][j] = new char[(solvers[i].options[j]).size() + 1];
strcpy(arrayOfArgs[i][j], (solvers[i].options[j]).c_str());
}
arrayOfArgs[i][(solvers[i].options).size()] = NULL;
MPI_Info_create(&infos[i]);
char hostname[solvers[i].hostname.size()];
strcpy(hostname, solvers[i].hostname.c_str());
MPI_Info_set(infos[i], "host", hostname);
}
sizeComm = 0;
}// constructor
/**
Wait that at least one process finish and return the code
SOLUTION_FOUND.
@param[in] intercomm, the communicator
*/
void ConcurrentManager::waitForSolution(MPI_Comm &intercomm)
{
MPI_Status arrayStatus[sizeComm], status;
MPI_Request request[sizeComm];
int val[sizeComm], flag;
for(int i = 0 ; i<sizeComm ; i++) MPI_Irecv(&val[i], 1, MPI_INT, i, TAG_MSG, intercomm, &request[i]);
bool solutionFound = false;
while(!solutionFound)
{
for(int i = 0 ; i<sizeComm ; i++)
{
MPI_Test(&request[i], &flag, &arrayStatus[i]);
if(flag)
{
printf("---------------------> %d reveived %dn", i , val[i]);
if(val[i] == SOLUTION_FOUND)
{
int msg = PRINT_SOLUTION;
MPI_Send(&msg, 1, MPI_INT, i, TAG_MSG, intercomm); // ask to print the solution
int msgJobFinished;
MPI_Recv(&msgJobFinished, 1, MPI_INT, i, TAG_MSG, intercomm, &status); // wait the answer
assert(msgJobFinished == JOB_FINISHED);
cout << "I am going to kill everybody" << endl;
int msgKill[sizeComm];
for(int j = 0 ; j<sizeComm ; j++)
{
msgKill[i] = STOP_AT_ONCE;
MPI_Send(&msgKill[i], 1, MPI_INT, j, TAG_KILL, intercomm);
}
solutionFound = true;
break;
} else
{
printf("restart the communication for %dn", i);
MPI_Irecv(&val[i], 1, MPI_INT, i, TAG_MSG, intercomm, &request[i]);
}
}
}
}
}// waitForSolution
/**
Run the solver.
*/
void ConcurrentManager::run()
{
MPI_Comm intercomm;
int errcodes[solvers.size()];
MPI_Comm_spawn_multiple(nbSolvers, cmds, arrayOfArgs, np, infos, 0, MPI_COMM_WORLD, &intercomm, errcodes);
MPI_Comm_remote_size(intercomm, &sizeComm);
cout << "c Solvers are now running: " << sizeComm << endl;
int msg = CONCU_MODE;
for(int i = 0 ; i<sizeComm ; i++) MPI_Send(&msg, 1, MPI_INT, i, TAG_MSG, intercomm); // init the working mode
waitForSolution(intercomm);
}// run
我知道我放了很多代码:(
但是,我不知道问题出在哪里。
请帮我:)
此致敬意。
MPI 文档关于 MPI 如何与线程交互,要求对MPI_Finalize()
调用由主线程执行,即初始化 MPI 的同一线程。 在您的情况下,这也恰好是进程的初始线程。
为了满足 MPI 的要求,您可以重新组织应用程序,使初始线程是等待终止信号然后关闭 MPI 的线程。 然后,它当前所做的其他工作需要移动到不同的线程。