因此,我已经在cuda中实现了(或至少尝试(sobel滤波器,而我的代码如下。当我执行此文件时,我获得了正确的Sobel过滤图像的一半,而另一半为黑色。我无法上传图片,因为它们处于.pgm格式。因此,代码的作用是在.pgm格式的灰度图像中读取的,并使用共享内存概念将SOBEL FILLE掩码卷入。我使用了1024 x 1024 .pgm图像作为输入,然后返回带有一半的边缘的Sobel过滤图像水平切割,因此下半部分为黑色。有人可以在这里帮我吗?另外,我对代码有些好奇,我真的不明白第二批加载的功能,所以您也可以解释一下。
sobel.cu
/* sobel.cu */
#include <stdio.h>
#include <stdlib.h>
#include <float.h>
#include <time.h>
#include "mypgm.h"
#define Mask_width 3
#define Mask_radius Mask_width/2
#define TILE_WIDTH 16
#define w (TILE_WIDTH + Mask_width - 1)
#define clamp(x) (min(max((x), 0.0), 1.0))
__global__ void convolution(float *I, const float* __restrict__ M, float *P, int width, int height) {
__shared__ float N_ds[w][w];
int k;
// First batch loading
int dest = threadIdx.y * TILE_WIDTH + threadIdx.x,
destY = dest / w, destX = dest % w,
srcY = blockIdx.y * TILE_WIDTH + destY - Mask_radius,
srcX = blockIdx.x * TILE_WIDTH + destX - Mask_radius,
src = srcY * width + srcX;
if (srcY >= 0 && srcY < height && srcX >= 0 && srcX < width)
N_ds[destY][destX] = I[src];
else
N_ds[destY][destX] = 0;
for (int iter = 1; iter <= (w*w) / (TILE_WIDTH*TILE_WIDTH); iter++)
{
// Second batch loading
dest = threadIdx.y * TILE_WIDTH + threadIdx.x + TILE_WIDTH * TILE_WIDTH;
destY = dest / w, destX = dest % w;
srcY = blockIdx.y * TILE_WIDTH + destY - Mask_radius;
srcX = blockIdx.x * TILE_WIDTH + destX - Mask_radius;
src = srcY * width + srcX;
if (destY < w) {
if (srcY >= 0 && srcY < height && srcX >= 0 && srcX < width)
N_ds[destY][destX] = I[src];
else
N_ds[destY][destX] = 0;
}
}
__syncthreads();
float accum = 0;
int y, x;
for (y = 0; y < Mask_width; y++)
for (x = 0; x < Mask_width; x++)
accum += N_ds[threadIdx.y + y][threadIdx.x + x] * M[y * Mask_width + x];
y = blockIdx.y * TILE_WIDTH + threadIdx.y;
x = blockIdx.x * TILE_WIDTH + threadIdx.x;
if (y < height && x < width)
P[y * width + x] = accum;
__syncthreads();
}
void sobel_filtering()
/* Spatial filtering of image data */
/* Sobel filter (horizontal differentiation */
/* Input: image1[y][x] ---- Outout: image2[y][x] */
{
/* Definition of Sobel filter in horizontal direction */
float weight[3][3] = { { -1, 0, 1 },
{ -2, 0, 2 },
{ -1, 0, 1 } };
float pixel_value;
int x, y, i, j; /* Loop variable */
float * deviceInputImageData;
float * deviceOutputImageData;
float * deviceMaskData;
cudaMalloc((void **)&deviceInputImageData, x_size1 * y_size1 * sizeof(float));
cudaMalloc((void **)&deviceOutputImageData, x_size1 * y_size1 * sizeof(float));
cudaMalloc((void **)&deviceMaskData, 3 * 3 * sizeof(float));
cudaMemcpy(deviceInputImageData, image1, x_size1 * y_size1 * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(deviceMaskData, weight, 3 * 3 * sizeof(float), cudaMemcpyHostToDevice);
/* Maximum values calculation after filtering*/
printf("Now, filtering of input image is performednn");
x_size2 = x_size1;
y_size2 = y_size1;
for (y = 0; y < y_size2; y++) {
for (x = 0; x < x_size2; x++) {
image2[y][x] = 0;
}
}
dim3 dimGrid(ceil((float)x_size1 / TILE_WIDTH), ceil((float)y_size1 / TILE_WIDTH));
dim3 dimBlock(TILE_WIDTH, TILE_WIDTH);
convolution<<<dimGrid, dimBlock>>>(deviceInputImageData, deviceMaskData, deviceOutputImageData, x_size1, y_size1);
cudaMemcpy(image2,
deviceOutputImageData,
x_size2 * y_size2 * sizeof(float),
cudaMemcpyDeviceToHost);
cudaFree(deviceInputImageData);
cudaFree(deviceOutputImageData);
cudaFree(deviceMaskData);
}
int main()
{
load_image_data(); /* Input of image1 */
clock_t begin = clock();
sobel_filtering(); /* Sobel filter is applied to image1 */
clock_t end = clock();
double time_spent = (double)(end - begin) / CLOCKS_PER_SEC;
printf("nnTiming result of multiplication of matrix-vector: %fn", time_spent);
save_image_data(); /* Output of image2 */
return 0;
}
mypgm.h
/* pgm file IO headerfile ------ mypgm.h */
/* Constant declaration */
//#define MAX_IMAGESIZE 1024
#define MAX_IMAGEWIDTH 3840
#define MAX_IMAGEHEIGHT 2160
#define MAX_BRIGHTNESS 255 /* Maximum gray level */
#define GRAYLEVEL 256 /* No. of gray levels */
#define MAX_FILENAME 256 /* Filename length limit */
#define MAX_BUFFERSIZE 256
/* Global constant declaration */
/* Image storage arrays */
float image1[MAX_IMAGEWIDTH][MAX_IMAGEHEIGHT],
image2[MAX_IMAGEWIDTH][MAX_IMAGEHEIGHT];
int x_size1, y_size1, /* width & height of image1*/
x_size2, y_size2; /* width & height of image2 */
/* Prototype declaration of functions */
void load_image_data( ); /* image input */
void save_image_data( ); /* image output*/
void load_image_file(char *); /* image input */
void save_image_file(char *); /* image output*/
/* Main body of functions */
void load_image_data()
/* Input of header & body information of pgm file */
/* for image1[ ][ ],x_size1,y_size1 */
{
char file_name[MAX_FILENAME];
char buffer[MAX_BUFFERSIZE];
FILE *fp; /* File pointer */
int max_gray; /* Maximum gray level */
int x, y; /* Loop variable */
/* Input file open */
printf("n-----------------------------------------------------n");
printf("Monochromatic image file input routine n");
printf("-----------------------------------------------------nn");
printf(" Only pgm binary file is acceptablenn");
printf("Name of input image file? (*.pgm) : ");
scanf("%s", file_name);
fp = fopen(file_name, "rb");
if (NULL == fp) {
printf(" The file doesn't exist!nn");
exit(1);
}
/* Check of file-type ---P5 */
fgets(buffer, MAX_BUFFERSIZE, fp);
if (buffer[0] != 'P' || buffer[1] != '5') {
printf(" Mistaken file format, not P5!nn");
exit(1);
}
/* input of x_size1, y_size1 */
x_size1 = 0;
y_size1 = 0;
while (x_size1 == 0 || y_size1 == 0) {
fgets(buffer, MAX_BUFFERSIZE, fp);
if (buffer[0] != '#') {
sscanf(buffer, "%d %d", &x_size1, &y_size1);
}
}
/* input of max_gray */
max_gray = 0;
while (max_gray == 0) {
fgets(buffer, MAX_BUFFERSIZE, fp);
if (buffer[0] != '#') {
sscanf(buffer, "%d", &max_gray);
}
}
/* Display of parameters */
printf("n Image width = %d, Image height = %dn", x_size1, y_size1);
printf(" Maximum gray level = %dnn", max_gray);
if (x_size1 > MAX_IMAGEWIDTH || y_size1 > MAX_IMAGEHEIGHT) {
printf(" Image size exceeds %d x %dnn",
MAX_IMAGEWIDTH, MAX_IMAGEHEIGHT);
printf(" Please use smaller images!nn");
exit(1);
}
if (max_gray != MAX_BRIGHTNESS) {
printf(" Invalid value of maximum gray level!nn");
exit(1);
}
/* Input of image data*/
for (y = 0; y < y_size1; y++) {
for (x = 0; x < x_size1; x++) {
image1[y][x] = (unsigned char)fgetc(fp);
}
}
printf("-----Image data input OK-----nn");
printf("-----------------------------------------------------nn");
fclose(fp);
}
void save_image_data()
/* Output of image2[ ][ ], x_size2, y_size2 in pgm format*/
{
char file_name[MAX_FILENAME];
FILE *fp; /* File pointer */
int x, y; /* Loop variable */
/* Output file open */
printf("-----------------------------------------------------n");
printf("Monochromatic image file output routinen");
printf("-----------------------------------------------------nn");
printf("Name of output image file? (*.pgm) : ");
scanf("%s", file_name);
fp = fopen(file_name, "wb");
/* output of pgm file header information */
fputs("P5n", fp);
fputs("# Created by Image Processingn", fp);
fprintf(fp, "%d %dn", x_size2, y_size2);
fprintf(fp, "%dn", MAX_BRIGHTNESS);
/* Output of image data */
for (y = 0; y < y_size2; y++) {
for (x = 0; x < x_size2; x++) {
fputc(image2[y][x], fp);
}
}
printf("n-----Image data output OK-----nn");
printf("-----------------------------------------------------nn");
fclose(fp);
}
void load_image_file(char *filename)
/* Input of header & body information of pgm file */
/* for image1[ ][ ],x_size1,y_size1 */
{
char buffer[MAX_BUFFERSIZE];
FILE *fp; /* File pointer */
int max_gray; /* Maximum gray level */
int x, y; /* Loop variable */
/* Input file open */
fp = fopen(filename, "rb");
if (NULL == fp) {
printf(" The file doesn't exist!nn");
exit(1);
}
/* Check of file-type ---P5 */
fgets(buffer, MAX_BUFFERSIZE, fp);
if (buffer[0] != 'P' || buffer[1] != '5') {
printf(" Mistaken file format, not P5!nn");
exit(1);
}
/* input of x_size1, y_size1 */
x_size1 = 0;
y_size1 = 0;
while (x_size1 == 0 || y_size1 == 0) {
fgets(buffer, MAX_BUFFERSIZE, fp);
if (buffer[0] != '#') {
sscanf(buffer, "%d %d", &x_size1, &y_size1);
}
}
/* input of max_gray */
max_gray = 0;
while (max_gray == 0) {
fgets(buffer, MAX_BUFFERSIZE, fp);
if (buffer[0] != '#') {
sscanf(buffer, "%d", &max_gray);
}
}
if (x_size1 > MAX_IMAGEWIDTH || y_size1 > MAX_IMAGEHEIGHT) {
printf(" Image size exceeds %d x %dnn",
MAX_IMAGEWIDTH, MAX_IMAGEHEIGHT);
printf(" Please use smaller images!nn");
exit(1);
}
if (max_gray != MAX_BRIGHTNESS) {
printf(" Invalid value of maximum gray level!nn");
exit(1);
}
/* Input of image data*/
for (y = 0; y < y_size1; y++) {
for (x = 0; x < x_size1; x++) {
image1[y][x] = (float)fgetc(fp);
}
}
fclose(fp);
}
void save_image_file(char *filename)
/* Output of image2[ ][ ], x_size2, y_size2 */
/* into pgm file with header & body information */
{
FILE *fp; /* File pointer */
int x, y; /* Loop variable */
fp = fopen(filename, "wb");
/* output of pgm file header information */
fputs("P5n", fp);
fputs("# Created by Image Processingn", fp);
fprintf(fp, "%d %dn", x_size2, y_size2);
fprintf(fp, "%dn", MAX_BRIGHTNESS);
/* Output of image data */
for (y = 0; y < y_size2; y++) {
for (x = 0; x < x_size2; x++) {
fputc(image2[y][x], fp);
}
}
fclose(fp);
}
您仅看到图像的一部分的原因是因为您的主机图像缓冲区大小与设备图像缓冲区大小之间的不匹配。
主机上的图像缓冲区如下定义:
#define MAX_IMAGEWIDTH 3840
#define MAX_IMAGEHEIGHT 2160
...
float image1[MAX_IMAGEWIDTH][MAX_IMAGEHEIGHT],
image2[MAX_IMAGEWIDTH][MAX_IMAGEHEIGHT];
然后,您继续加载尺寸1024x1024的PGM图像。然后,您可以创建大小1024x1024的设备存储:
cudaMalloc((void **)&deviceInputImageData, x_size1 * y_size1 * sizeof(float));
cudaMalloc((void **)&deviceOutputImageData, x_size1 * y_size1 * sizeof(float));
其中x_size
和y_size1
由您的PGM加载例程定义,该日程将为1024,对于1024x1024图像。
然后,当您从主机到设备进行副本时(设备 ->主机副本也发生了类似的问题(:
cudaMemcpy(deviceInputImageData, image1, x_size1 * y_size1 * sizeof(float), cudaMemcpyHostToDevice);
您将从主机缓冲区复制连续字节到设备缓冲区。这意味着每个主机缓冲区阵容直至将MAX_IMAGEWIDTH
的完整宽度复制到设备。但这并不是您想要的。您只需要复制每个主机缓冲区直线至复制到设备的x_size1
。
有几种可能的方法来解决此问题。我很确定最简单的只是将MAX_IMAGEWIDTH
和MAX_IMAGEHEIGHT
设置为您打算使用的图像的实际值。当我这样做时,我得到了一个合理的过滤结果。
由于这将限制您处理单个图像大小,因此更好的方法是在阅读PGM标头数据后动态定义主机图像缓冲区的大小。
,也可以使用涉及cudaMemcpy2D
的方法,但这似乎是不必要的。
关于您的第二个问题,"第二批次加载"的原因是因为第一个批次加载仅将共享存储器加载到螺纹块的尺寸,因此它加载了共享内存的16x16"补丁",一个元素,一个元素。线。但是,我们需要加载完整的共享内存数组,因此我们必须进行额外的"批次"加载,以填充与滤波器宽度和高度相关的光环区域。
这是一个修改后的文件,似乎对我有效,演示了动态分配主机图像缓冲区的方法:
#include <stdio.h>
#include <stdlib.h>
#include <float.h>
#include <time.h>
#define MAX_IMAGEWIDTH 2048
#define MAX_IMAGEHEIGHT 2048
#define MAX_BRIGHTNESS 255 /* Maximum gray level */
#define GRAYLEVEL 256 /* No. of gray levels */
#define MAX_FILENAME 256 /* Filename length limit */
#define MAX_BUFFERSIZE 256
/* Global constant declaration */
/* Image storage arrays */
float *image1, *image2;
int x_size1, y_size1, /* width & height of image1*/
x_size2, y_size2; /* width & height of image2 */
/* Prototype declaration of functions */
void load_image_data( ); /* image input */
void save_image_data( ); /* image output*/
void load_image_file(char *); /* image input */
void save_image_file(char *); /* image output*/
/* Main body of functions */
void load_image_data()
/* Input of header & body information of pgm file */
/* for image1[ ][ ],x_size1,y_size1 */
{
char file_name[MAX_FILENAME];
char buffer[MAX_BUFFERSIZE];
FILE *fp; /* File pointer */
int max_gray; /* Maximum gray level */
int x, y; /* Loop variable */
/* Input file open */
printf("n-----------------------------------------------------n");
printf("Monochromatic image file input routine n");
printf("-----------------------------------------------------nn");
printf(" Only pgm binary file is acceptablenn");
printf("Name of input image file? (*.pgm) : ");
scanf("%s", file_name);
fp = fopen(file_name, "rb");
if (NULL == fp) {
printf(" The file doesn't exist!nn");
exit(1);
}
/* Check of file-type ---P5 */
fgets(buffer, MAX_BUFFERSIZE, fp);
if (buffer[0] != 'P' || buffer[1] != '5') {
printf(" Mistaken file format, not P5!nn");
exit(1);
}
/* input of x_size1, y_size1 */
x_size1 = 0;
y_size1 = 0;
while (x_size1 == 0 || y_size1 == 0) {
fgets(buffer, MAX_BUFFERSIZE, fp);
if (buffer[0] != '#') {
sscanf(buffer, "%d %d", &x_size1, &y_size1);
}
}
/* input of max_gray */
max_gray = 0;
while (max_gray == 0) {
fgets(buffer, MAX_BUFFERSIZE, fp);
if (buffer[0] != '#') {
sscanf(buffer, "%d", &max_gray);
}
}
/* Display of parameters */
printf("n Image width = %d, Image height = %dn", x_size1, y_size1);
printf(" Maximum gray level = %dnn", max_gray);
if (x_size1 > MAX_IMAGEWIDTH || y_size1 > MAX_IMAGEHEIGHT) {
printf(" Image size exceeds %d x %dnn",
MAX_IMAGEWIDTH, MAX_IMAGEHEIGHT);
printf(" Please use smaller images!nn");
exit(1);
}
if (max_gray != MAX_BRIGHTNESS) {
printf(" Invalid value of maximum gray level!nn");
exit(1);
}
image1 = (float *)malloc(x_size1*y_size1*sizeof(float));
/* Input of image data*/
for (y = 0; y < y_size1; y++) {
for (x = 0; x < x_size1; x++) {
image1[y*x_size1+x] = (unsigned char)fgetc(fp);
}
}
printf("-----Image data input OK-----nn");
printf("-----------------------------------------------------nn");
fclose(fp);
}
void save_image_data()
/* Output of image2[ ][ ], x_size2, y_size2 in pgm format*/
{
char file_name[MAX_FILENAME];
FILE *fp; /* File pointer */
int x, y; /* Loop variable */
/* Output file open */
printf("-----------------------------------------------------n");
printf("Monochromatic image file output routinen");
printf("-----------------------------------------------------nn");
printf("Name of output image file? (*.pgm) : ");
scanf("%s", file_name);
fp = fopen(file_name, "wb");
/* output of pgm file header information */
fputs("P5n", fp);
fputs("# Created by Image Processingn", fp);
fprintf(fp, "%d %dn", x_size2, y_size2);
fprintf(fp, "%dn", MAX_BRIGHTNESS);
/* Output of image data */
for (y = 0; y < y_size2; y++) {
for (x = 0; x < x_size2; x++) {
fputc(image2[y*x_size2+x], fp);
}
}
printf("n-----Image data output OK-----nn");
printf("-----------------------------------------------------nn");
fclose(fp);
}
#define Mask_width 3
#define Mask_radius Mask_width/2
#define TILE_WIDTH 16
#define w (TILE_WIDTH + Mask_width - 1)
#define clamp(x) (min(max((x), 0.0), 1.0))
__global__ void convolution(float *I, const float* __restrict__ M, float *P, int width, int height) {
__shared__ float N_ds[w][w];
// First batch loading
int dest = threadIdx.y * TILE_WIDTH + threadIdx.x,
destY = dest / w, destX = dest % w,
srcY = blockIdx.y * TILE_WIDTH + destY - Mask_radius,
srcX = blockIdx.x * TILE_WIDTH + destX - Mask_radius,
src = srcY * width + srcX;
if (srcY >= 0 && srcY < height && srcX >= 0 && srcX < width)
N_ds[destY][destX] = I[src];
else
N_ds[destY][destX] = 0;
for (int iter = 1; iter <= (w*w) / (TILE_WIDTH*TILE_WIDTH); iter++)
{
// Second batch loading
dest = threadIdx.y * TILE_WIDTH + threadIdx.x + TILE_WIDTH * TILE_WIDTH;
destY = dest / w, destX = dest % w;
srcY = blockIdx.y * TILE_WIDTH + destY - Mask_radius;
srcX = blockIdx.x * TILE_WIDTH + destX - Mask_radius;
src = srcY * width + srcX;
if (destY < w) {
if (srcY >= 0 && srcY < height && srcX >= 0 && srcX < width)
N_ds[destY][destX] = I[src];
else
N_ds[destY][destX] = 0;
}
}
__syncthreads();
float accum = 0;
int y, x;
for (y = 0; y < Mask_width; y++)
for (x = 0; x < Mask_width; x++)
accum += N_ds[threadIdx.y + y][threadIdx.x + x] * M[y * Mask_width + x];
y = blockIdx.y * TILE_WIDTH + threadIdx.y;
x = blockIdx.x * TILE_WIDTH + threadIdx.x;
if (y < height && x < width)
P[y * width + x] = accum;
}
void sobel_filtering()
/* Spatial filtering of image data */
/* Sobel filter (horizontal differentiation */
/* Input: image1[y][x] ---- Outout: image2[y][x] */
{
/* Definition of Sobel filter in horizontal direction */
float weight[3][3] = { { -1, 0, 1 },
{ -2, 0, 2 },
{ -1, 0, 1 } };
int x, y; /* Loop variable */
float * deviceInputImageData;
float * deviceOutputImageData;
float * deviceMaskData;
cudaMalloc((void **)&deviceInputImageData, x_size1 * y_size1 * sizeof(float));
cudaMalloc((void **)&deviceOutputImageData, x_size1 * y_size1 * sizeof(float));
cudaMalloc((void **)&deviceMaskData, 3 * 3 * sizeof(float));
cudaMemcpy(deviceInputImageData, image1, x_size1 * y_size1 * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(deviceMaskData, weight, 3 * 3 * sizeof(float), cudaMemcpyHostToDevice);
/* Maximum values calculation after filtering*/
printf("Now, filtering of input image is performednn");
x_size2 = x_size1;
y_size2 = y_size1;
image2 = (float *)malloc(x_size2*y_size2*sizeof(float));
for (y = 0; y < y_size2; y++) {
for (x = 0; x < x_size2; x++) {
image2[y*x_size2+x] = 0;
}
}
dim3 dimGrid(ceil((float)x_size1 / TILE_WIDTH), ceil((float)y_size1 / TILE_WIDTH));
dim3 dimBlock(TILE_WIDTH, TILE_WIDTH);
convolution<<<dimGrid, dimBlock>>>(deviceInputImageData, deviceMaskData, deviceOutputImageData, x_size1, y_size1);
cudaMemcpy(image2,
deviceOutputImageData,
x_size2 * y_size2 * sizeof(float),
cudaMemcpyDeviceToHost);
cudaFree(deviceInputImageData);
cudaFree(deviceOutputImageData);
cudaFree(deviceMaskData);
}
int main()
{
load_image_data(); /* Input of image1 */
clock_t begin = clock();
sobel_filtering(); /* Sobel filter is applied to image1 */
clock_t end = clock();
double time_spent = (double)(end - begin) / CLOCKS_PER_SEC;
printf("nnTiming result of multiplication of matrix-vector: %fn", time_spent);
save_image_data(); /* Output of image2 */
return 0;
}
注意以上代码(您的PGM加载例程(具有(IMO(缺陷,因为它要求在PGM文件中的同一行上指定X和Y大小,但据我所知,这不是一个P5 PGM文件的要求。如果将其传递给有效的P5 PGM文件,该文件在文件中的不同行上指定了X和Y大小,则将悬挂。我没有尝试解决这个问题,因为这似乎不是您要问的问题。