在c中调用函数时性能略有下降



我正在优化我为博士学位制作的一些模拟代码,专注于以下目标:

  1. 使其模块化(创建库并将代码分成小块)
  2. 让它至少以与我已经拥有的代码相同的性能运行。
  3. 并行化。

目前,我专注于步骤1和2。我正在玩一个代码,它通过龙格-库塔四阶方法集成了一个非线性方程组,并将结果打印到输出文件中。这个方法是我正在优化的所有其他分析和方法的核心。因此,每一毫秒的运行时间都很重要,因为在某些情况下,我必须调用该方法数百万次。

我有两个版本,我运行了10次:

  1. 在第一个版本中,我定义了函数duffing来处理非线性方程组,函数rk4来处理龙格-库塔积分器的一步:
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <math.h>
void *rk4(int dim, double *x, double t, double h, double *par, double *f,
void (*edosys)(int, double *, double, double *, double *));
void duffing(int dim, double *x, double t, double *par, double *f);
int main(void) {
// Program Parameters
const double pi = 4 * atan(1);
int DIM = 2;
int nP = 1000;
int nDiv = 1000;
int nPar = 5;
// Parameters
double *par = malloc(nPar * sizeof *par);
double *f = malloc(DIM * sizeof *f);
double *x = malloc(DIM * sizeof *x);
par[0] = 1.0;
par[1] = 0.15;
par[2] = 0.01;
par[3] = -0.5;
par[4] = 0.5;
// Initial Conditions
double t = 0.0;
x[0] = 1.0;
x[1] = 0.0;
// Numerical Parameters
double h = (2 * pi) / (nDiv * par[0]);
// Create Output File
FILE *output = fopen("output.txt", "w");
// Time variables
double time_spent = 0.0;
clock_t time_i = clock();
// Solution
fprintf(output, "Time x[0] x[1]n");
fprintf(output, "%.10lf %.10lf %.10lfn", t, x[0], x[1]);
for (int i = 0; i < nP; i++) {
for (int j = 0; j < nDiv; j++) {
rk4(DIM, x, t, h, par, f, duffing);
t = t + h;
fprintf(output, "%.10lf %.10lf %.10lfn", t, x[0], x[1]);
}
}
// Time Spent
clock_t time_f = clock();
time_spent += (double)(time_f - time_i) / CLOCKS_PER_SEC; 
printf("The elapsed time is %f secondsn", time_spent);
// Free Memory
free(par); free(f); free(x);
}
void duffing(int dim, double *x, double t, double *par, double *f) {  
if (dim == 2) {
f[0] = x[1];
f[1] = par[1]*sin(par[0] * t) - 2*par[2]*x[1] - par[3]*x[0] - par[4]*x[0]*x[0]*x[0];    
} 
else if (dim == 6) {
f[0] = x[1];
f[1] = par[1]*sin(par[0] * t) - 2*par[2]*x[1] - par[3]*x[0] - par[4]*x[0]*x[0]*x[0];
for (int i = 0; i < 2; i++) {
f[2 + i] = x[4 + i];
f[4 + i] = -par[3]*x[2 + i] - 3*par[4]*x[0]*x[0]*x[2 + i] - 2*par[2]*x[4 + i];
}
} else {
printf("Wrong dimension (dim) or (ndim) allocated for system of equationsn");
exit(1);
}    
}
void *rk4(int dim, double *x, double t, double h, double *par, double *f,
void (*edosys)(int, double *, double, double *, double *)) {
double tmp[dim], k1[dim], k2[dim], k3[dim], k4[dim];
// Calculate first slope
edosys(dim, x, t, par, k1);
// Assign next value for tmp[dim] to be inserted in the system of edos
for (int i = 0; i < dim; i++) {
tmp[i] = x[i] + 0.5 * (h * k1[i]);
}
// Calculate second slope
edosys(dim, tmp, t + 0.5 * h, par, k2);
// Assign next value for tmp[dim] to be inserted in the system of edos
for (int i = 0; i < dim; i++) {
tmp[i] = x[i] + 0.5 * (h * k2[i]);
}
// Calculate third slope
edosys(dim, tmp, t + 0.5 * h, par, k3);
// Assign next value for tmp[dim] to be inserted in the system of edos
for (int i = 0; i < dim; i++) {
tmp[i] = x[i] + h * k3[i];
}
// Calculate the fourth slope
edosys(dim, tmp, t + h, par, k4);
// Calculate the next value of x[dim]
for (int i = 0; i < dim; i++) {
x[i] = x[i] + (h/6.0) * (k1[i] + 2 * k2[i] + 2 * k3[i] + k4[i]);
}        
}

每次运行的运行时间如下所示:

The elapsed time is 0.718215 seconds
The elapsed time is 0.713928 seconds
The elapsed time is 0.705679 seconds
The elapsed time is 0.713959 seconds
The elapsed time is 0.707523 seconds
The elapsed time is 0.710903 seconds
The elapsed time is 0.708110 seconds
The elapsed time is 0.718513 seconds
The elapsed time is 0.706710 seconds
The elapsed time is 0.710024 seconds
  1. 第二个版本我定义了与之前相同的duffingrk4函数,并添加了第三个称为rk4_solution的函数,该函数处理方法的所有步骤:
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <math.h>
void *rk4(int dim, double *x, double t, double h, double *par, double *f,
void (*edosys)(int, double *, double, double *, double *));
void duffing(int dim, double *x, double t, double *par, double *f);
void rk4_solution(FILE *output, double *x, double t, int dim, int np, int ndiv,
double *par, double *f,
void (*edosys)(int, double *, double, double *, double *));
int main(void) {
// Program Parameters
int DIM = 2;
int nP = 1000;
int nDiv = 1000;
int nPar = 5;
// Parameters
double *par = malloc(nPar * sizeof *par);
double *f = malloc(DIM * sizeof *f);
double *x = malloc(DIM * sizeof *x);
par[0] = 1.0;
par[1] = 0.15;
par[2] = 0.01;
par[3] = -0.5;
par[4] = 0.5;
// Initial Conditions
double t = 0.0;
x[0] = 1.0;
x[1] = 0.0;
// Create Output File
FILE *output = fopen("output.txt", "w");
// Time variables
double time_spent = 0.0;
clock_t time_i = clock();
// Solution
rk4_solution(output, x, t, DIM, nP, nDiv, par, f, duffing);
// Time Spent
clock_t time_f = clock();
time_spent += (double)(time_f - time_i) / CLOCKS_PER_SEC; 
printf("The elapsed time is %f secondsn", time_spent);
// Free Memory
free(par); free(f); free(x);
}
void duffing(int dim, double *x, double t, double *par, double *f) {
if (dim == 2) {
f[0] = x[1];
f[1] = par[1]*sin(par[0] * t) - 2*par[2]*x[1] - par[3]*x[0] - par[4]*x[0]*x[0]*x[0];    
} 
else if (dim == 6) {
f[0] = x[1];
f[1] = par[1]*sin(par[0] * t) - 2*par[2]*x[1] - par[3]*x[0] - par[4]*x[0]*x[0]*x[0];
for (int i = 0; i < 2; i++) {
f[2 + i] = x[4 + i];
f[4 + i] = -par[3]*x[2 + i] - 3*par[4]*x[0]*x[0]*x[2 + i] - 2*par[2]*x[4 + i];
}
} else {
printf("Wrong dimension (dim) or (ndim) allocated for system of equationsn");
exit(1);
}    
}
void *rk4(int dim, double *x, double t, double h, double *par, double *f,
void (*edosys)(int, double *, double, double *, double *)) {
double tmp[dim], k1[dim], k2[dim], k3[dim], k4[dim];
// Calculate first slope
edosys(dim, x, t, par, k1);
// Assign next value for tmp[dim] to be inserted in the system of edos
for (int i = 0; i < dim; i++) {
tmp[i] = x[i] + 0.5 * (h * k1[i]);
}
// Calculate second slope
edosys(dim, tmp, t + 0.5 * h, par, k2);
// Assign next value for tmp[dim] to be inserted in the system of edos
for (int i = 0; i < dim; i++) {
tmp[i] = x[i] + 0.5 * (h * k2[i]);
}
// Calculate third slope
edosys(dim, tmp, t + 0.5 * h, par, k3);
// Assign next value for tmp[dim] to be inserted in the system of edos
for (int i = 0; i < dim; i++) {
tmp[i] = x[i] + h * k3[i];
}
// Calculate the fourth slope
edosys(dim, tmp, t + h, par, k4);
// Calculate the next value of x[dim]
for (int i = 0; i < dim; i++) {
x[i] = x[i] + (h/6.0) * (k1[i] + 2 * k2[i] + 2 * k3[i] + k4[i]);
}        
}
void rk4_solution(FILE *output, double *x, double t, int dim, int np, int ndiv,
double *par, double *f,
void (*edosys)(int, double *, double, double *, double *)) {
const double pi = 4 * atan(1);
// Numerical Parameters
double h = (2 * pi) / (ndiv * par[0]);
fprintf(output, "Time x[0] x[1]n");
fprintf(output, "%.10lf %.10lf %.10lfn", t, x[0], x[1]);
for (int i = 0; i < np; i++) {
for (int j = 0; j < ndiv; j++) {
rk4(dim, x, t, h, par, f, edosys);
t = t + h;
fprintf(output, "%.10lf %.10lf %.10lfn", t, x[0], x[1]);
}
}
}
通过添加第三个函数,运行时间稍微改变为:
The elapsed time is 0.753674 seconds
The elapsed time is 0.748255 seconds
The elapsed time is 0.738883 seconds
The elapsed time is 0.738666 seconds
The elapsed time is 0.736813 seconds
The elapsed time is 0.740047 seconds
The elapsed time is 0.736575 seconds
The elapsed time is 0.739985 seconds
The elapsed time is 0.737410 seconds
The elapsed time is 0.738836 seconds

有人能帮助我理解为什么会发生这种情况,以及如何避免它,因为我在两个版本的代码中做基本相同的操作?

这个结果让我非常担心,所以我在稍微复杂一点的代码中测试了同样的事情,它给了我平均20秒的差异。最后,它可以在几周甚至几个月内影响我最复杂的分析。

提前感谢!

…如何避免它,因为我在两个版本的代码做基本相同的操作?

使它们更相似。


可能/可能不考虑所有时间差异的想法:

不要在rk4_solution()中添加额外的代码

k4_solution()中执行与main()相同的定时。

// Return clock difference
clock_t rk4_solution(FILE *output, double *x, double t, int dim, int np, 
int ndiv, double *par, double *f, 
void (*edosys)(int, double *, double, double *, double *)) {
// Do not time these
const double pi = 4 * atan(1);
// Numerical Parameters
double h = (2 * pi) / (ndiv * par[0]);
// Add 
clock_t time_i = clock();
fprintf(output, "Time x[0] x[1]n");
fprintf(output, "%.10lf %.10lf %.10lfn", t, x[0], x[1]);
for (int i = 0; i < np; i++) {
for (int j = 0; j < ndiv; j++) {
rk4(dim, x, t, h, par, f, edosys);
t = t + h;
fprintf(output, "%.10lf %.10lf %.10lfn", t, x[0], x[1]);
}
}
// Add 
clock_t time_f = clock();
return time_f - time_i;
}

使用restrict

rk4_solution()中,指向x, par, f的数据可能重叠,影响优化潜力,而在main()中,已知它们不重叠。

// void *rk4(int dim, double *x, double t, double h, 
//     double *par, double *f, 
//     void (*edosys)(int, double *, double, double *, double *))
void *rk4(int dim, double * restrict x, double t, double h,
double * restrict par, double * restrict f, 
void (*edosys)(int, double *, double, double *, double *))

有许多方法可以提高这两种方法的性能,但是OP在这里似乎想要确定性能差异

最新更新