此代码正在打印"结果为0",它似乎不是从设备复制结果。这是为什么呢?
__global__ void add_arrays(int *a, int *b, int *c, int size){
int index = threadIdx.x + blockIdx.x * blockDim.x;
//avoid accessing beyond the end of the array
if (index < size)
{ c[index] = a[index] + b[index]; }
}
#define N (2048 * 2048)
#define THREADS_PER_BLOCK 512
int main()
{
int *a, *b, *c; // local (host) copy
int *d_a, *d_b, *d_c; // device copy
int size = N * sizeof(int);
//allocating space for device copies ON THE DEVICE
CUDA_CHECK_RETURN(cudaMalloc((void **)&d_a, size));
CUDA_CHECK_RETURN(cudaMalloc((void **)&d_b, size));
CUDA_CHECK_RETURN(cudaMalloc((void **)&d_c, size));
//allocating space for local copies ON THE HOST and initialize: a, b
a = (int *)malloc(size); random_ints(a, N);
b = (int *)malloc(size); random_ints(b, N);
c = (int *)malloc(size); //random_ints(c, N);
//copy inputs to device
CUDA_CHECK_RETURN(cudaMemcpy(d_a, &a, size, cudaMemcpyHostToDevice));
CUDA_CHECK_RETURN(cudaMemcpy(d_b, &b, size, cudaMemcpyHostToDevice));
int num_block = (N + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
add_arrays<<<num_block, THREADS_PER_BLOCK>>>(d_a, d_b, d_c, N);
// Copy result back from Device to Host
CUDA_CHECK_RETURN(cudaMemcpy(c, d_c, size, cudaMemcpyDeviceToHost));
for (int i =0; i<N; ++i) {
printf("Result is %d n", *(c+i));
}
您可能需要考虑发布一个完整的可重现示例,以便它包含您对CUDA_CHECK_RETURN
、random_ints
、包含标题等的定义。如注释中所述,代码中的明显问题是
cudaMemcpy(d_a, &a, size, cudaMemcpyHostToDevice)
应该是
cudaMemcpy(d_a, a, size, cudaMemcpyHostToDevice)
同样,对于b
,a
和b
都是指针。