Basic CUDA program (writing values to a matrix and std::cout don't work); main function doesn't seem to start



I wrote a very simple CUDA program. I want to assign values to a matrix in device memory, then copy the values back to the host and display them. The program I wrote does not work, and I don't know why. I tried to figure out what I was doing wrong by printing status messages with cout, but even those don't show up, so I think the main function never starts.

Does anyone know where the problem is?

Here is my code:

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <iostream>
#include <stdio.h>
const int N = 1024;

__global__ void matrix(float *d_A)
{
int col = blockIdx.x * blockDim.x + threadIdx.x;
int row = blockIdx.y * blockDim.y + threadIdx.y;

int index = col + row * N;
if (col < N && row < N)
{
d_A[index] = 255;
}
}
int main()
{
std::cout << "Programm begins";
float A[N * N];
float d_A[N * N];
cudaMalloc((void**)&d_A, (N * N)*sizeof(float));
std::cout << "Matrizes allocated";
std::cout << A[0] << " , " << A[1] << " , " << A[2] << " , " << A[3] << " , " << A[4] << " , " << A[5] << "n";
std::cout << A[1024] << " , " << A[1025] << " , " << A[1026] << " , " << A[1027] << " , " << A[1028] << " , " << A[1029] << "n";
matrix << <1024, 1024 >> >(d_A);
std::cout << "Wrote Matrix to local device memory";
std::cout << d_A[0] << " , " << d_A[1] << " , " << d_A[2] << " , " << d_A[3] << " , " << d_A[4] << " , " << d_A[5] << "n";
std::cout << d_A[1024] << " , " << d_A[1025] << " , " << d_A[1026] << " , " << d_A[1027] << " , " << d_A[1028] << " , " << d_A[1029] << "n";

cudaMemcpy(A, d_A, N * N * sizeof(float), cudaMemcpyDeviceToHost);
std::cout << "Wrote Matrix to host memory";
std::cout << A[0] << " , " << A[1] << " , " << A[2] << " , " << A[3] << " , " << A[4] << " , " << A[5] << "n";
std::cout << A[1024] << " , " << A[1025] << " , " << A[1026] << " , " << A[1027] << " , " << A[1028] << " , " << A[1029] << "n";
return 0;
}

There are a few problems with the code you provided.

  1. Dereferencing device memory on the host, e.g. d_A[0], is illegal and leads to undefined behavior.
  2. The data is treated as 2D inside the kernel, while the grid and block are launched as 1D. In that case the row variable is always 0 and effectively plays no part in computing index. Define the grid and block sizes as dim3 types to create a 2D grid and 2D blocks.
  3. Creating large arrays on the stack, such as float A[N*N];, is not recommended. Prefer dynamic allocation with the new operator.
  4. Allocating device memory into the already allocated host array d_A is undefined behavior. If you want to assign device memory to a variable, just declare it as a plain pointer, like float* d_A;.

The fixed code may look like this:

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <iostream>
#include <stdio.h>
const int N = 1024;
__global__ void matrix(float *d_A)
{
int col = blockIdx.x * blockDim.x + threadIdx.x;
int row = blockIdx.y * blockDim.y + threadIdx.y;

int index = col + row * N;
if (col < N && row < N)
{
d_A[index] = 255;
}
}
int main()
{
std::cout << "Programm begins"<<std::endl;
float *A = new float[N*N];
float *d_A;
cudaMalloc((void**)&d_A, (N * N)*sizeof(float));
std::cout << "Matrizes allocated"<<std::endl;
std::cout << A[0] << " , " << A[1] << " , " << A[2] << " , " << A[3] << " , " << A[4] << " , " << A[5] <<std::endl;
std::cout << A[1024] << " , " << A[1025] << " , " << A[1026] << " , " << A[1027] << " , " << A[1028] << " , " << A[1029] <<std::endl;
dim3 block(32,32);
dim3 grid;
grid.x = (N + block.x - 1) / block.x;
grid.y = (N + block.y - 1) / block.y;
matrix << <grid, block >> >(d_A);
std::cout << "Wrote Matrix to local device memory"<<std::endl;
cudaMemcpy(A, d_A, N * N * sizeof(float), cudaMemcpyDeviceToHost);
std::cout << "Wrote Matrix to host memory"<<std::endl;
std::cout << A[0] << " , " << A[1] << " , " << A[2] << " , " << A[3] << " , " << A[4] << " , " << A[5] <<std::endl;
std::cout << A[1024] << " , " << A[1025] << " , " << A[1026] << " , " << A[1027] << " , " << A[1028] << " , " << A[1029] <<std::endl;
cudaFree(d_A);
delete[] A;
return 0;
}

It is strongly recommended to add error checking to every CUDA API call, which greatly simplifies debugging.
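As an illustration, a minimal error-checking sketch could look like the following; the gpuErrchk macro name and the commented usage lines are just one common pattern and not part of the answer above:

#include <cstdio>
#include <cstdlib>
#include "cuda_runtime.h"

// Minimal error-checking helper: prints the CUDA error string and aborts on failure.
#define gpuErrchk(call) { gpuAssert((call), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line)
{
    if (code != cudaSuccess)
    {
        fprintf(stderr, "CUDA error: %s at %s:%d\n", cudaGetErrorString(code), file, line);
        exit(code);
    }
}

// Usage: wrap API calls, and check kernel launches explicitly.
// gpuErrchk(cudaMalloc((void**)&d_A, (N * N) * sizeof(float)));
// matrix<<<grid, block>>>(d_A);
// gpuErrchk(cudaGetLastError());      // reports launch-configuration errors
// gpuErrchk(cudaDeviceSynchronize()); // reports errors raised while the kernel runs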

There are a few issues with your code, and if this is your first step into CUDA and C++, I would simplify the code further. Try this (the important changes are surrounded by comment stars):

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <iostream>
#include <stdio.h>
const int Grids = 256;
const int Threads = 256;
__global__ void matrix(float *d_A)
{
int idx = blockIdx.x * blockDim.x + threadIdx.x;
d_A[idx] = 1.0;
}
int main()
{
std::cout << "Programm begins";
// ****
float *A = new float[Grids * Threads];
float *d_A;
// ****
cudaMalloc((void**)&d_A, (Grids * Threads)*sizeof(float));
matrix<<<Grids, Threads>>>(d_A);
cudaMemcpy(A, d_A, Grids * Threads*sizeof(float), cudaMemcpyDeviceToHost);
for(int i=0; i < (Grids * Threads); ++i)
{
cout << A[i] << ",";
}
// ****
cudaFree(d_A);
delete A;
// ****  
return 0;
}

Also take a look at the basic example here: https://devblogs.nvidia.com/easy-introduction-cuda-c-and-c/

There are several problems here:

1) The host memory you allocate, N * N (1024 * 1024) floats, is large; as a local (stack) allocation it may not fit in the available stack.

2) When you declare the d_A variable as an array, you allocate host memory for it in addition to the device memory, which is not necessary.

3) You never free the device memory for d_A.

4) Your device/GPU may not be able to run 1024 threads per block; in that case the launch can fail silently and your kernel never runs (see the sketch after this list for one way to check this).
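As a rough sketch of point 4, you could query the device limit and check the launch result explicitly; the attribute query and error calls below are standard CUDA runtime functions, but the surrounding structure is only illustrative:

#include <cstdio>
#include "cuda_runtime.h"

int main()
{
    // Query how many threads one block may contain on device 0.
    int maxThreadsPerBlock = 0;
    cudaDeviceGetAttribute(&maxThreadsPerBlock, cudaDevAttrMaxThreadsPerBlock, 0);
    printf("Max threads per block: %d\n", maxThreadsPerBlock);

    // After a kernel launch, cudaGetLastError reports launch-configuration
    // failures (e.g. too many threads per block), and cudaDeviceSynchronize
    // reports errors that happen while the kernel executes.
    // matrix<<<grid, block>>>(d_A);
    // cudaError_t err = cudaGetLastError();
    // if (err != cudaSuccess)
    //     printf("Launch failed: %s\n", cudaGetErrorString(err));
    return 0;
}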
