如何使用 OpenMP (C) 优化中值滤波器?



我编写了一个程序,其中包含用 C 语言和 OpenCV 实现的两个版本的中值滤波器:一个是顺序执行的,另一个使用 OpenMP 并行化。我的问题是,无论块大小(chunk size)或线程数如何设置,OpenMP 版本的运行速度似乎总是比顺序版本慢。

非常欢迎任何想法/建议!

这是我的顺序代码

void medianFilter (const IplImage* img){
IplImage* output = cvCloneImage(img);
int rows, cols, step;
uchar *data;
rows = output->height;
cols = output->width;
step = output->widthStep;
data = (uchar *)output->imageData;
if(!data)
{ return; }
//create a sliding window of size 9
int window[9];
for(int y = 1; y < rows - 1; y++){
for(int x = 1; x < cols - 1; x++){
// Pick up window element
window[0] = data[(y - 1) * step + (x - 1)];
window[1] = data[y * step + (x - 1)];
window[2] = data[(y + 1) * step + (x - 1)];
window[3] = data[(y - 1) * step + x];
window[4] = data[y * step + x];
window[5] = data[(y + 1) * step + x];
window[6] = data[(y - 1) * step + (x + 1)];
window[7] = data[y * step + (x + 1)];
window[8] = data[(y + 1) * step + (x + 1)];
// Sort the window to find median
insertionSort(window);
// Assign the median to centered element of the matrix
data[y * step + x] = window[4];
}
}
cvNamedWindow("Post-filter", CV_WINDOW_AUTOSIZE);
cvShowImage("Post-filter", output);
cvReleaseImage(&output);
}

这是我的并行化代码

void omp_medianFilter (const IplImage* img){
IplImage* output = cvCloneImage(img);
int rows, cols, step, nthreads;
uchar *data;
rows = output->height;
cols = output->width;
step = output->widthStep;
data = (uchar *)output->imageData;
if(!data)
{ return; }
// Create a sliding window of size 9
int window[9], x, y;
// Set the number of threads to use
omp_set_num_threads(NUM_THREADS);
// Parallel code segment. Window, x and y are private variables for each thread
#pragma omp parallel private(window, x, y)
{
//if(omp_get_thread_num() == 0){
//nthreads = omp_get_num_threads();
//printf("Numer of threads running: %d n", nthreads);
//}
// Parallel for loop with dynamic scheduling and collapsing nested loops
#pragma omp for schedule(dynamic, CHUNK) collapse(2)
for(y = 1; y < rows - 1; y++){
for(x = 1; x < cols - 1; x++){
// Pick up 3x3 window elements
window[0] = data[(y - 1) * step + (x - 1)];
window[1] = data[y * step + (x - 1)];
window[2] = data[(y + 1) * step + (x - 1)];
window[3] = data[(y - 1) * step + x];
window[4] = data[y * step + x];
window[5] = data[(y + 1) * step + x];
window[6] = data[(y - 1) * step + (x + 1)];
window[7] = data[y * step + (x + 1)];
window[8] = data[(y + 1) * step + (x + 1)];
// Sort the window to find median
insertionSort(window);
// Assign the median to centered element of the matrix
data[y * step + x] = window[4];
}
}
}
cvNamedWindow("Post-filter (OMP)", CV_WINDOW_AUTOSIZE);
cvShowImage("Post-filter (OMP)", output);
cvReleaseImage(&output);
}

完整代码

#include <stdio.h>
#include <opencv2/imgproc/imgproc_c.h>
#include <opencv2/highgui/highgui_c.h>
#include <opencv2/core/types_c.h>
#include <sys/time.h>
#include <omp.h>
#define NUM_THREADS 8
#define CHUNK 15000
/*
 * Return the current wall-clock time in seconds, as reported by
 * gettimeofday(), with the microsecond field folded into the fraction.
 */
double get_walltime() {
    struct timeval now;

    gettimeofday(&now, NULL);

    const double whole_seconds = (double)now.tv_sec;
    const double fraction = (double)now.tv_usec * 1e-6;
    return whole_seconds + fraction;
}
/*
 * Sort the nine-element filter window into ascending order, in place,
 * using insertion sort.  The caller must pass an array of at least nine ints.
 */
void insertionSort(int window[])
{
    enum { WINDOW_LEN = 9 };

    /* Element 0 on its own is already sorted, so insertion starts at 1. */
    for (int next = 1; next < WINDOW_LEN; next++) {
        const int key = window[next];
        int slot = next;

        /* Shift larger elements one place right to open a slot for key. */
        while (slot > 0 && window[slot - 1] > key) {
            window[slot] = window[slot - 1];
            slot--;
        }
        window[slot] = key;
    }
}
void medianFilter (const IplImage* img){
IplImage* output = cvCloneImage(img);
int rows, cols, step;
uchar *data;
rows = output->height;
cols = output->width;
step = output->widthStep;
data = (uchar *)output->imageData;
if(!data)
{ return; }
//create a sliding window of size 9
int window[9];
for(int y = 1; y < rows - 1; y++){
for(int x = 1; x < cols - 1; x++){
// Pick up window element
window[0] = data[(y - 1) * step + (x - 1)];
window[1] = data[y * step + (x - 1)];
window[2] = data[(y + 1) * step + (x - 1)];
window[3] = data[(y - 1) * step + x];
window[4] = data[y * step + x];
window[5] = data[(y + 1) * step + x];
window[6] = data[(y - 1) * step + (x + 1)];
window[7] = data[y * step + (x + 1)];
window[8] = data[(y + 1) * step + (x + 1)];
// Sort the window to find median
insertionSort(window);
// Assign the median to centered element of the matrix
data[y * step + x] = window[4];
}
}
cvNamedWindow("Post-filter", CV_WINDOW_AUTOSIZE);
cvShowImage("Post-filter", output);
cvReleaseImage(&output);
}
// Parallelized implementation of median filter
void omp_medianFilter (const IplImage* img){
IplImage* output = cvCloneImage(img);
int rows, cols, step, nthreads;
uchar *data;
rows = output->height;
cols = output->width;
step = output->widthStep;
data = (uchar *)output->imageData;
if(!data)
{ return; }
// Create a sliding window of size 9
int window[9], x, y, j, k, min;
// Set the number of threads to use
omp_set_num_threads(NUM_THREADS);
// Parallel code segment. Window, x and y are private variables for each thread
#pragma omp parallel private(window, x, y, j, k, min)
{
//if(omp_get_thread_num() == 0){
//nthreads = omp_get_num_threads();
//printf("Numer of threads running: %d n", nthreads);
//}
// Parallel for loop with dynamic scheduling and collapsing nested loops
#pragma omp for schedule(dynamic, CHUNK) collapse(2)
for(y = 1; y < rows - 1; y++){
for(x = 1; x < cols - 1; x++){
// Pick up 3x3 window elements
window[0] = data[(y - 1) * step + (x - 1)];
window[1] = data[y * step + (x - 1)];
window[2] = data[(y + 1) * step + (x - 1)];
window[3] = data[(y - 1) * step + x];
window[4] = data[y * step + x];
window[5] = data[(y + 1) * step + x];
window[6] = data[(y - 1) * step + (x + 1)];
window[7] = data[y * step + (x + 1)];
window[8] = data[(y + 1) * step + (x + 1)];
// Sort the window to find median
//insertionSort(window);
for (int j = 0; j < 5; ++j)
{
//   Find position of minimum element
int min = j;
for (int l = j + 1; l < 9; ++l)
if (window[l] < window[min])
min = l;
//   Put found minimum element in its place
const int temp = window[j];
window[j] = window[min];
window[min] = temp;
}

// Assign the median to centered element of the matrix
data[y * step + x] = window[4];
}
}
}
cvNamedWindow("Post-filter (OMP)", CV_WINDOW_AUTOSIZE);
cvShowImage("Post-filter (OMP)", output);
cvReleaseImage(&output);
}
/*
 * Entry point: load the image named by argv[1] as grayscale, display it,
 * run the OpenMP median filter on it and print the elapsed wall-clock time.
 *
 * Returns 0 on success and 1 when no file name was given or the image could
 * not be loaded.
 *
 * Note: the measured interval covers the whole omp_medianFilter() call,
 * which also clones the image and creates/shows the result window.
 */
int main(int argc, char *argv[])
{
    if (argc < 2) {
        fprintf(stderr, "Usage: main <image-file-name>\n");
        return 1;
    }

    /* Load the source image; the filters expect single-channel data. */
    IplImage *src = cvLoadImage(argv[1], CV_LOAD_IMAGE_GRAYSCALE);
    if (!src) {
        fprintf(stderr, "Could not load image: %s\n", argv[1]);
        return 1;
    }

    cvNamedWindow("Original", CV_WINDOW_AUTOSIZE);
    cvShowImage("Original", src);

    /* Sequential baseline, disabled; enable to compare against the OMP run.
    double seq_start = get_walltime();
    medianFilter(src);
    double seq_end = get_walltime();
    printf("Sequential Code Performance: %fs\n", seq_end - seq_start); */

    const double time1 = get_walltime();
    omp_medianFilter(src);
    const double time2 = get_walltime();
    printf("Parallel Code Performance: %fs\n", time2 - time1);

    /* Keep the windows open until a key is pressed. */
    cvWaitKey(0);
    cvReleaseImage(&src);
    return 0;
}

已修复

我采纳了大家给出的许多建议,也确实看到了性能的提升,但这些建议所指出的并不是问题的根源。

事实证明,这是一个非常愚蠢的错误。我是在一台装有 Ubuntu 16.04 的虚拟机(VM)上运行程序的,却忘记给虚拟机增加 CPU 核心数,因此它只使用了 1 个核心,这很可能意味着代码实际上根本没有并行执行。

最新更新