c-openmp串行比并行快

  • 本文关键字:并行 c-openmp c openmp
  • 更新时间 :
  • 英文 :


代码是并行的,但我不知道为什么它比串行的慢,当我把线程添加到7到10时,程序也会变慢。

我一直想弄清楚问题出在哪里,但对我来说很困难

我使for循环并行,但它似乎不起作用。运行代码时没有收到任何错误。

#include <stdio.h>
#include <math.h>
#include <omp.h>
#include <stdlib.h>

/*
 * Jacobi iteration for the steady-state heat equation on an (m+2)x(n+2)
 * grid with fixed boundary temperatures.  Sweeps until the maximum
 * per-cell change drops below `tol`, then prints the grid and timing.
 *
 * Usage: ./heat m n tol      (thread count is then read from stdin)
 */
int main(int argc, char *argv[])
{
    int m, n;
    double tol;
    double tstart, tstop;
    int i, j, iter, nthreads;

    /* Validate the command line before touching argv[1..3]. */
    if (argc < 4) {
        fprintf(stderr, "usage: %s m n tol\n", argv[0]);
        return 1;
    }
    m = atoi(argv[1]);
    n = atoi(argv[2]);
    tol = atof(argv[3]);
    if (m < 1 || n < 1) {
        fprintf(stderr, "m and n must be positive\n");
        return 1;
    }

    /* VLAs: t holds the full grid including the boundary, tnew the interior
     * updates; both sized (m+2)x(n+2) for consistency (the original sized
     * tnew [m+1][n+1], which worked only because indices stop at m,n). */
    double t[m + 2][n + 2], tnew[m + 2][n + 2], diff, difmax, priv_difmax;

    printf("%d %d %lf\n", m, n, tol);   /* original had "...lfn": lost escape */
    printf("Enter the number of threads (max 10) ");
    if (scanf("%d", &nthreads) != 1 || nthreads < 1) {
        fprintf(stderr, "invalid thread count\n");
        return 1;
    }
    omp_set_num_threads(nthreads);

    tstart = omp_get_wtime();

    /* Initialise temperature array.  NOTE: an OpenMP directive is a single
     * logical line -- the original placed `default(shared) private(i,j)` on
     * the NEXT line, where it is not part of the pragma at all. */
    #pragma omp parallel for schedule(static) default(shared) private(i, j)
    for (i = 0; i <= m + 1; i++) {
        for (j = 0; j <= n + 1; j++) {
            t[i][j] = 30.0;
        }
    }

    /* Fix boundary conditions (serial: only O(m+n) work). */
    for (i = 1; i <= m; i++) {
        t[i][0] = 20.0;
        t[i][n + 1] = 100.0;
    }
    for (j = 1; j <= n; j++) {
        t[0][j] = 10.0;
        t[m + 1][j] = 140.0;
    }

    /* Main loop: one Jacobi sweep plus a convergence test per iteration. */
    iter = 0;
    difmax = 1000000.0;
    while (difmax > tol) {
        iter++;

        /* Update temperature for the next iteration. */
        #pragma omp parallel for schedule(static) default(shared) private(i, j)
        for (i = 1; i <= m; i++) {
            for (j = 1; j <= n; j++) {
                tnew[i][j] = (t[i-1][j] + t[i+1][j] + t[i][j-1] + t[i][j+1]) / 4.0;
            }
        }

        /* Work out the maximum difference between old and new temperatures,
         * copying new -> old as we go.  Each thread keeps a private maximum
         * and merges it into difmax ONCE, after its share of the loop; the
         * original entered the critical section on every i-iteration. */
        difmax = 0.0;
        #pragma omp parallel default(shared) private(i, j, diff, priv_difmax)
        {
            priv_difmax = 0.0;
            #pragma omp for schedule(static) nowait
            for (i = 1; i <= m; i++) {
                for (j = 1; j <= n; j++) {
                    diff = fabs(tnew[i][j] - t[i][j]);
                    if (diff > priv_difmax) {
                        priv_difmax = diff;
                    }
                    t[i][j] = tnew[i][j];
                }
            }
            /* One critical section per thread, outside the loop. */
            #pragma omp critical
            if (priv_difmax > difmax) {
                difmax = priv_difmax;
            }
        } /* implicit barrier: difmax is final before the while test */
    }

    /* Stop the clock here so the printing below is not timed (the original
     * called omp_get_wtime() a second time after printing the grid). */
    tstop = omp_get_wtime();

    /* Print results ("%9.11lf" had width/precision swapped). */
    printf("iter = %d  difmax = %11.9lf", iter, difmax);
    for (i = 0; i <= m + 1; i++) {
        printf("\n");
        for (j = 0; j <= n + 1; j++) {
            printf("%3.5lf ", t[i][j]);
        }
    }
    printf("\n");
    printf("time taken is %4.3lf\n", tstop - tstart);
    printf("\n");
    return 0;
}

我没有看到明显的问题,除了以下代码:

// Quoted from the question: per-thread reduction of the maximum temperature
// change.  NOTE (review): the `critical` section below sits INSIDE the
// i-loop, so every thread serialises on it once per row instead of once per
// thread -- this is the defect discussed in the following paragraph.
#pragma omp parallel default(shared) private(i, j, diff, priv_difmax)
{
// per-thread running maximum
priv_difmax = 0.0;
#pragma omp for schedule(static)
for (i=1; i <= m; i++) {
for (j=1; j <= n; j++) {
diff = fabs(tnew[i][j]-t[i][j]);
if (diff > priv_difmax) {
priv_difmax = diff;
}
//** copy new to old temperatures**
t[i][j] = tnew[i][j];
}
// BUG: executed once per i-iteration, not once per thread
#pragma omp critical 
if (priv_difmax > difmax){
difmax = priv_difmax;
}
}
}

priv_difmax复制到difmax的归约部分应该移出循环,以便线程只通过critical部分一次,而不是在外循环的每次迭代中。

// Answer's fix: `nowait` removes the barrier after the worksharing loop
// (no thread needs to wait there), and each thread enters the critical
// section exactly once, after finishing its entire share of the loop.
#pragma omp parallel default(shared) private(i, j, diff, priv_difmax)
{
priv_difmax = 0.0;
#pragma omp for schedule(static) nowait //no need to wait after the loop
for (i=1; i <= m; i++) {
for (j=1; j <= n; j++) {
diff = fabs(tnew[i][j]-t[i][j]);
if (diff > priv_difmax) {
priv_difmax = diff;
}
//** copy new to old temperatures**
t[i][j] = tnew[i][j];
}
}
// Finish the loop first, then update difmax
#pragma omp critical 
if (priv_difmax > difmax){
difmax = priv_difmax;
}
} //Implicit barrier

现在,并行化有一个开销成本,并且可能只对m和n的大值进行加速。你考虑的问题可能太小了。减少开销的方法是合并两个parallel构造,这样线程池就不必派生两次。或者更好的是,将while循环放在parallel构造中,这样我们只需要在每次迭代中同步现有线程,而不是创建和销毁它们:

// Single parallel region wrapping the whole while-loop: the thread team is
// created once and reused, instead of being forked/joined every iteration.
difmax = 1000000.0;
#pragma omp parallel default(shared) private(i, j, diff, priv_difmax)
while (difmax > tol) {
    // All threads must have tested the while-condition before any thread
    // resets difmax; otherwise one thread could read the freshly reset
    // (zero) value, leave the loop early, and deadlock the team at the
    // barriers below.
    #pragma omp barrier
    // One thread resets difmax and increments iter; the implicit barrier
    // at the end of `single` publishes the reset to the whole team
    // (so no `nowait` here).
    #pragma omp single
    { iter++; difmax = 0.0; }
    // Update tnew -- `omp for`, NOT `omp parallel for`: we are already
    // inside a parallel region, and a nested `parallel` would spawn a
    // brand-new team per thread.  (Clauses must also stay on the same
    // logical line as the directive.)
    #pragma omp for schedule(static)
    for (i = 1; i <= m; i++) {
        for (j = 1; j <= n; j++) {
            tnew[i][j] = (t[i-1][j] + t[i+1][j] + t[i][j-1] + t[i][j+1]) / 4.0;
        }
    } // implicit barrier: all of tnew written before it is read below
    // each thread resets its private maximum
    priv_difmax = 0.0;
    // compute difmax and copy new -> old, distributed among threads
    #pragma omp for schedule(static) nowait
    for (i = 1; i <= m; i++) {
        for (j = 1; j <= n; j++) {
            diff = fabs(tnew[i][j] - t[i][j]);
            if (diff > priv_difmax) {
                priv_difmax = diff;
            }
            // copy new to old temperatures
            t[i][j] = tnew[i][j];
        }
    }
    // merge per-thread maxima, one thread at a time
    #pragma omp critical
    if (priv_difmax > difmax) {
        difmax = priv_difmax;
    }
    // barrier so difmax is fully updated before any thread tests the
    // while-condition for the next iteration
    #pragma omp barrier
}

比较代码在串行和并行方式下运行情况的最佳方法，是分别在支持和不支持OpenMP的情况下对其进行编译（例如使用gcc时，分别带上和不带-fopenmp编译器和链接器标志进行编译）。这将有助于判断问题究竟出在并行化本身，还是出在原始串行代码与"准备并行"版本之间的其他修改上。

这样做的目的是弄清楚：从原始串行代码，到并行化代码（在没有OpenMP支持的情况下编译），再到并行代码（使用OpenMP编译）的过程中，时间损失发生在哪一步。

需要使用一些预处理宏，因为如果没有OpenMP支持，编译器将无法识别像omp_get_thread_num()这样的函数。也不必使用omp_get_wtime()：由于所有的计时都是在并行区域之外完成的，因此不需要该特定函数，调用time()同样准确（这需要#include &lt;time.h&gt;）。

// Portability shim: lets the same source compile and run whether or not it
// is built with OpenMP support (e.g. gcc with vs. without -fopenmp).
// Without OpenMP the runtime functions are stubbed to their serial values.
#ifdef _OPENMP
#include <omp.h>
#else
#ifndef ESCAPE_OPENMP_SHIM  /* renamed: identifiers beginning with '_' plus an
                               uppercase letter are reserved for the
                               implementation (and the original misspelled
                               OPENMP as OMPENMP) */
#define omp_get_num_threads() 1
#define omp_get_thread_num()  0
#define omp_get_max_threads() 1  /* was 0: a serial run has one thread */
#define ESCAPE_OPENMP_SHIM
#endif
#endif

最新更新