在子函数中使用openmp的c速度性能



我在一个函数中使用openmp,然后在主函数中调用这个函数,但我发现这比直接将所有内容放入主函数要慢得多。我想知道原因和解决方法。代码1:

#include<omp.h>
#include<stdio.h>
#include<math.h>
/*
 * Computes a triple numerical integral over (mu, nu, z) for 500
 * identical parameter sets, parallelized with OpenMP over the
 * outermost loop.
 *
 * Returns the accumulated integral value.
 */
double func()
{
    double a[500];
    double d_title[500];
    double b[500];
    double c[500];
    /* Fill the parameter arrays with fixed test values. */
    for (int i = 0; i < 500; i++)
    {
        a[i] = 0.3291;
        d_title[i] = 2.414;
        b[i] = 3.8037;
        c[i] = 4086;
    }
    double nu_start = 0;
    double mu_start = 0;
    double z_start = 0;
    double step_nu = 2 * 3.1415926 / 100;  /* azimuthal step: 2*pi/100 */
    double step_mu = 3.1415926 / 100;      /* polar step: pi/100 */
    double step_z = 0;
    double nu = 0;
    double mu = 0;
    double z = 0;
    double integral = 0;
    double d_uv = 0;
    int i = 0;
    int j = 0;
    int k = 0;
    int loop = 0;
    /*
     * FIX: with default(none) every variable referenced inside the region
     * must appear in an explicit data-sharing clause; 'integral' was
     * missing, which the compiler rejects.  It is shared here and updated
     * under the '#pragma omp atomic' below.
     */
#pragma omp parallel for default(none) shared(a, d_title, b, c, nu_start, mu_start, z_start, step_nu, step_mu, integral) private( i,j,k,mu, nu, step_z, z, d_uv)
    for (loop = 0; loop < 500; loop++)
    {
        for (i = 0; i < 100; i++)
        {
            mu = mu_start + (i + 1) * step_mu;
            for (j = 0; j < 100; j++)
            {
                nu = nu_start + (j + 1) * step_nu;
                for (k = 0; k < 500; k++)
                {
                    d_uv = (sin(mu) * sin(mu) * cos(nu) * cos(nu) + sin(mu) * sin(mu) * (a[loop] * sin(nu) - d_title[loop] * cos(nu)) * (a[loop] * sin(nu) - d_title[loop] * cos(nu)) + b[loop] * b[loop] * cos(mu) * cos(mu)) / (c[loop] * c[loop]);
                    step_z = 20 / (d_uv * 500);
                    z = z_start + (k + 1) * step_z;
                    /* The atomic serializes every accumulation -- correct
                     * but slow; a reduction(+:integral) would be faster. */
#pragma omp atomic
                    integral += sin(mu) * (1 - 3 * sin(mu) * sin(mu) * cos(nu) * cos(nu)) * exp(-d_uv * z) * log(1 + z * z) * step_z * step_mu * step_nu / (c[loop] * c[loop]);
                }
            }
        }
    }

    return integral;
}
/*
 * Entry point: runs the computation and PRINTS the result.
 * Observing the result is essential -- if it is discarded, the
 * compiler is allowed to delete the entire computation as dead
 * code, which makes any timing comparison meaningless.
 */
int main()
{
    double a = func();
    printf("integral = %f\n", a);
    return 0;
}

代码2:

除了将函数 func 的内容直接放入 main 函数之外，其余内容与代码1完全相同。

代码1比代码2慢得多。

为什么将所有内容都融合到main()中会更快?

因为您在同一函数中计算出结果后随即将其丢弃，编译器因此被允许消除所有并非获得可观察结果所必需的计算。

从（不再需要的）结果开始反推，整个计算就被直接删除了。只要没有可观察到的副作用，编译器就可以这么做。

函数边界（对于未声明为 inline 或 static 可见性的函数）迫使编译器假设 main() 之外的其他代码也可能需要该结果，因此它无法删除 func() 的主体。

因此,如果你想正确测量,请观察你的结果!仅仅一个简单的printf()就能让一切变得不同。

即使您的代码放在 main() 中时没有被优化掉，您仍然可以做一些简单的改进来提高性能。下面是您的函数和我的改进版本（位于 calculate.c 中）：

#include "calculate.h"
#include <omp.h>
#include <math.h>
/* Problem sizes used throughout this translation unit. */
#define DATA_SIZE 500
#define NUMBER_OF_STEPS 100

/* Parameter tables shared by func() and func2(). */
static double a[DATA_SIZE];
static double d_title[DATA_SIZE];
static double b[DATA_SIZE];
static double c[DATA_SIZE];

/* Populate every parameter table with its fixed test constant.
 * Must be called once before func() or func2(). */
void initialize_data()
{
    for (int idx = 0; idx < DATA_SIZE; ++idx)
    {
        a[idx] = 0.3291;
        d_title[idx] = 2.414;
        b[idx] = 3.8037;
        c[idx] = 4086;
    }
}
/*
 * Baseline (slow) variant of the triple integral over (mu, nu, z):
 * every partial sum is folded into 'integral' under '#pragma omp
 * atomic', so the additions from all threads are serialized.
 * Behavior is identical to the reduction-based func2(); this one
 * exists as the benchmark baseline.
 */
double func()
{
    double nu_start = 0;
    double mu_start = 0;
    double z_start = 0;
    double step_nu = 2 * M_PI / NUMBER_OF_STEPS;  /* azimuthal step */
    double step_mu = M_PI / NUMBER_OF_STEPS;      /* polar step */
    double step_z = 0;
    double nu = 0;
    double mu = 0;
    double z = 0;
    double integral = 0;
    double d_uv = 0;
    int mi = 0;   /* mu index */
    int ni = 0;   /* nu index */
    int zi = 0;   /* z index */
    int loop = 0;
#pragma omp parallel for default(none) shared(a, d_title, b, c, nu_start, mu_start, z_start, step_nu, step_mu, integral) private(mi, ni, zi, mu, nu, step_z, z, d_uv)
    for (loop = 0; loop < DATA_SIZE; loop++)
    {
        for (mi = 0; mi < NUMBER_OF_STEPS; mi++)
        {
            mu = mu_start + (mi + 1) * step_mu;
            for (ni = 0; ni < NUMBER_OF_STEPS; ni++)
            {
                nu = nu_start + (ni + 1) * step_nu;
                for (zi = 0; zi < DATA_SIZE; zi++)
                {
                    d_uv = (sin(mu) * sin(mu) * cos(nu) * cos(nu) + sin(mu) * sin(mu) * (a[loop] * sin(nu) - d_title[loop] * cos(nu)) * (a[loop] * sin(nu) - d_title[loop] * cos(nu)) + b[loop] * b[loop] * cos(mu) * cos(mu)) / (c[loop] * c[loop]);
                    step_z = 20 / (d_uv * DATA_SIZE);
                    z = z_start + (zi + 1) * step_z;
                    /* Serialized accumulation -- the hot spot. */
#pragma omp atomic
                    integral += sin(mu) * (1 - 3 * sin(mu) * sin(mu) * cos(nu) * cos(nu)) * exp(-d_uv * z) * log(1 + z * z) * step_z * step_mu * step_nu / (c[loop] * c[loop]);
                }
            }
        }
    }
    return integral;
}
/*
 * Improved variant of the same triple integral: per-thread partial
 * sums are combined once at the end via reduction(+:integral)
 * instead of an atomic per addition, and 'collapse(2)' spreads the
 * two outer loops across threads.  All locals are declared at
 * first use, so they are automatically private.
 */
double func2()
{
    double integral = 0;
    int p = 0;    /* parameter-set index */
    int im = 0;   /* mu index */
#pragma omp parallel for default(none) shared(a, d_title, b, c) reduction(+: integral) collapse(2)
    for (p = 0; p < DATA_SIZE; p++)
    {
        for (im = 0; im < NUMBER_OF_STEPS; im++)
        {
            const double mu_start = 0;
            const double step_mu = M_PI / NUMBER_OF_STEPS;
            const double mu = mu_start + (im + 1) * step_mu;
            int jn;
            for (jn = 0; jn < NUMBER_OF_STEPS; jn++)
            {
                const double nu_start = 0;
                const double step_nu = 2 * M_PI / NUMBER_OF_STEPS;
                const double nu = nu_start + (jn + 1) * step_nu;
                int kz;
                for (kz = 0; kz < DATA_SIZE; kz++)
                {
                    const double z_start = 0;
                    const double d_uv = (sin(mu) * sin(mu) * cos(nu) * cos(nu) + sin(mu) * sin(mu) * (a[p] * sin(nu) - d_title[p] * cos(nu)) * (a[p] * sin(nu) - d_title[p] * cos(nu)) + b[p] * b[p] * cos(mu) * cos(mu)) / (c[p] * c[p]);
                    const double step_z = 20 / (d_uv * DATA_SIZE);
                    const double z = z_start + (kz + 1) * step_z;
                    integral += sin(mu) * (1 - 3 * sin(mu) * sin(mu) * cos(nu) * cos(nu)) * exp(-d_uv * z) * log(1 + z * z) * step_z * step_mu * step_nu / (c[p] * c[p]);
                }
            }
        }
    }
    return integral;
}

头文件（calculate.h）：

#ifndef CALCULATE_H
#define CALCULATE_H

#ifdef __cplusplus
extern "C"
{
#endif

/* Fill the shared parameter arrays; call once before func()/func2().
 * NOTE: '(void)' is required in C -- an empty '()' declares a function
 * with unspecified parameters, so wrong calls would go undiagnosed. */
void initialize_data(void);
/* Original atomic-based integral (benchmark baseline). */
double func(void);
/* Improved reduction-based integral. */
double func2(void);

#ifdef __cplusplus
}
#endif

#endif /* CALCULATE_H */

使用 Google Benchmark 的主程序（openmp-performance.cpp）：

#include <benchmark/benchmark.h>
#include "calculate.h"
/* Benchmark the original atomic-based implementation. */
static void BM_original_func(benchmark::State& state)
{
    initialize_data();
    for (auto _ : state)
    {
        /* Publishing the result as a counter keeps the computation
         * observable (no dead-code elimination) and lets the two
         * variants be checked for agreement in the report. */
        state.counters["result"] = func();
    }
}

/* Benchmark the reduction-based implementation. */
static void BM_func2(benchmark::State& state)
{
    initialize_data();
    for (auto _ : state)
    {
        state.counters["result"] = func2();
    }
}

BENCHMARK(BM_original_func)->Unit(benchmark::kSecond);
BENCHMARK(BM_func2)->Unit(benchmark::kSecond);

BENCHMARK_MAIN();

我用Makefile构建:

CXXFLAGS+=-Wall -march=native -g -fopenmp -O2
CFLAGS=$(CXXFLAGS)
LDFLAGS=-lpthread -lbenchmark
TARGET = benchmark

# 'all' is a pseudo-target too: without .PHONY a file named 'all'
# would silently stop the build from running.
.PHONY : all clean

all : $(TARGET)

# Rebuild the objects when the shared header changes.
calculate.o openmp-performance.o : calculate.h

$(TARGET) : calculate.o openmp-performance.o
	g++ $(CXXFLAGS) $^ -o $@ $(LDFLAGS)

clean :
	rm -f $(TARGET) *.o

我在Linux机器上构建并执行了代码,但你应该会得到类似的结果:

[dan@cpp-slave openmp-performance]$ make && ./benchmark
cc -Wall -march=native -g -fopenmp -O2   -c -o calculate.o calculate.c
g++ -Wall -march=native -g -fopenmp -O2   -c -o openmp-performance.o openmp-performance.cpp
g++ -Wall -march=native -g -fopenmp -O2 calculate.o openmp-performance.o -o benchmark -lpthread -lbenchmark
2021-11-12T16:13:01+01:00
Running ./benchmark
Run on (4 X 2394 MHz CPU s)
CPU Caches:
L1 Data 32 KiB (x4)
L1 Instruction 32 KiB (x4)
L2 Unified 4096 KiB (x4)
L3 Unified 16384 KiB (x1)
Load Average: 0.58, 0.52, 0.59
---------------------------------------------------------------------------
Benchmark                 Time             CPU   Iterations UserCounters...
---------------------------------------------------------------------------
BM_original_func        104 s           102 s             1 result=46.2432k
BM_func2               27.8 s          26.7 s             1 result=46.2432k

只需稍微更改一下OpenMP声明并将变量向下推一点,我就能够显著提高您的函数的性能:

  1. 用归约（reduction）代替原子操作（atomic）带来了很大的差异。
    1. 原子操作使来自多个线程的所有累加都被串行化
    2. 归约则让每个线程先累加自己的小计，最后再汇总
  2. 预先计算 sin(mu) 和 cos(nu) 没有带来差别，这表明编译器自己已经完成了这一优化

最新更新