动态编程并行化/矢量化与OpenMP



我正在尝试编写一个函数,该函数使用动态编程计算最小成本多边形三角测量,并使用OpenMP对其进行并行化/矢量化。到目前为止,我写的代码返回了正确的结果,但速度太慢了——对于由3000多个点组成的多边形,它甚至在5分钟后都不会停止。这是代码:

#pragma omp declare simd
float dist(float x1, float y1, float x2, float y2)
{
return sqrt((x1 - x2)*(x1 - x2) + (y1 - y2)*(y1 - y2));
}
float triangulate(const vector<Point> &points) {
int n = points.size();
vector<vector<float>> table (n, vector<float>(n, 0));
int threads = omp_get_max_threads();
for (int gap = 0; gap < n; ++gap)
{
for (int i = 0, j = gap; j < n; ++i, ++j)
{
if (j < i+2)
table[i][j] = 0.0;
else
{
int size = j - i - 1;
Point p1 = points[i], p2 = points[j];
//Precompute distance between i and j
float ij = dist(p1.x, p1.y, p2.x, p2.y);
float minimum = MAX;
#pragma omp parallel for simd schedule(static, 64) num_threads(threads) reduction(min:minimum) if(size > 300)
for (int k = i+1; k < j; ++k)
{
Point p3 = points[k];
float perimeter = ij + dist(p1.x, p1.y, p3.x, p3.y) + dist(p2.x, p2.y, p3.x, p3.y) + table[i][k] + table[k][j];
if(perimeter < minimum)
{
minimum = perimeter;
}
}
table[i][j] = minimum;
}
}
}
return table[0][n-1];
}

循环imho的间隙和i,j不能并行化,因此只有k上的for循环可以并行化。我试着配合日程安排的争论,但没有任何进展。我是错过了什么,还是只是这个功能在这种方法中不能更快?

为什么不在i/j上并行?对于给定的i/j对,计算周长仅依赖于table[i][k]table[k][j]的值,其中ik之间或kj之间的间隙小于ij之间的间隙。只要gap上最外层的循环按顺序进行,i/j上的内部循环就具有令人尴尬的并行性,并且可以在没有任何预防措施的情况下并行化。

// Do this here so that we can dispose of the if/then/else in the loop later
table[0][0] = 0.0;
for (int i= 1; i < n; ++i){
table[i][i-1] = 0.0;
table[i][i]   = 0.0;
}
// spawn parallel threads here
#pragma omp parallel default(shared)
for (int gap = 2; gap < n; ++gap)
{
// Loop on i will now be distributed among threads
// Not sure that is possible to place two variables in the loop definition
// so loop on i and compute the corresponding value of j
#pragma omp for
for (int i = 0; i < n-gap; ++i)
{
int j=i+gap; 
int size = gap - 1;
Point p1 = points[i], p2 = points[j];
//Precompute distance between i and j
float ij = dist(p1.x, p1.y, p2.x, p2.y);
float minimum = MAX;
// remove of all directives but simd
#pragma omp simd reduction(min:minimum)
for (int k = i+1; k < j; ++k)
{
Point p3 = points[k];
float perimeter = ij + dist(p1.x, p1.y, p3.x, p3.y) + dist(p2.x, p2.y, p3.x, p3.y) + table[i][k] + table[k][j];
if(perimeter < minimum)
minimum = perimeter;
}
table[i][j] = minimum;
} // implicit barrier, all threads will wait here for loop completion before moving to next value of gap
} //end of loop, end of parallel region

最新更新