我需要求和一个向量,它比cuda块中的线程数要长。所以我使用多个块来处理任务。我在每个块中对向量的一部分求和,之后我有两个选择,一个是使用atomicAdd将每个块的和合并,另一个是将结果写入某个全局内存并启动另一个内核进行求和。你建议我用哪种方法?
当我们做reduce sum时,cuda atomicAdd操作比启动另一个内核更快吗?
对于下面的测试用例,使用此处(视频)中从幻灯片16和17中提取的代码,它似乎要快一点。区别在于内核启动开销的成本,这是有意义的:
$ cat t1834.cu
#include <time.h>
#include <sys/time.h>
#include <iostream>
const int BLOCK_SIZE = 1024;
template <typename T>
__global__ void reduce(const T * __restrict__ gdata, T * __restrict__ out, const int N){
__shared__ T sdata[BLOCK_SIZE];
int tid = threadIdx.x;
sdata[tid] = 0.0;
size_t idx = threadIdx.x+blockDim.x*blockIdx.x;
while (idx < N) { // grid stride loop to load data
sdata[tid] += gdata[idx];
idx += gridDim.x*blockDim.x;
}
for (unsigned int s=blockDim.x/2; s>0; s>>=1) {
__syncthreads();
if (tid < s) // parallel sweep reduction
sdata[tid] += sdata[tid + s];
}
if (tid == 0)
#ifndef USE_ATOMIC
out[blockIdx.x] = sdata[0];
#else
atomicAdd(out, sdata[0]);
#endif
}
#define USECPSEC 1000000ULL
unsigned long long dtime_usec(unsigned long long start){
timeval tv;
gettimeofday(&tv, 0);
return ((tv.tv_sec*USECPSEC)+tv.tv_usec)-start;
}
typedef float mt;
const int ds = 1048576*8;
int main(){
mt *h_gdata, *d_gdata, *h_out, *d_out;
h_gdata = new mt[ds];
cudaMalloc(&d_gdata, ds*sizeof(mt));
const int nblocks = 160;
h_out = new mt[1];
cudaMalloc(&d_out, nblocks*sizeof(mt));
for (int i = 0; i < ds; i++) h_gdata[i] = 1;
cudaMemcpy(d_gdata, h_gdata, ds*sizeof(mt), cudaMemcpyHostToDevice);
reduce<<<nblocks, BLOCK_SIZE>>>(d_gdata, d_out, ds); // warm-up
cudaDeviceSynchronize();
cudaMemset(d_out, 0, sizeof(mt));
unsigned long long dt = dtime_usec(0);
reduce<<<nblocks, BLOCK_SIZE>>>(d_gdata, d_out, ds);
#ifndef USE_ATOMIC
reduce<<<1, BLOCK_SIZE>>>(d_out, d_out, nblocks);
#endif
cudaDeviceSynchronize();
dt = dtime_usec(dt);
cudaMemcpy(h_out, d_out, sizeof(mt), cudaMemcpyDeviceToHost);
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess) {std::cout << "CUDA error: " << cudaGetErrorString(err) << std::endl; return 0;}
if (h_out[0] != ds) {std::cout << "Reduce Error: " << h_out[0] << std::endl; return 0;}
std::cout << "Timing: " << dt << "us" << std::endl;
return 0;
}
$ nvcc -lineinfo -arch=sm_70 -O3 -o t1834 t1834.cu -std=c++14 -Wno-deprecated-gpu-targets
$ ./t1834
Timing: 69us
$ nvcc -lineinfo -arch=sm_70 -O3 -o t1834 t1834.cu -std=c++14 -Wno-deprecated-gpu-targets -DUSE_ATOMIC
$ ./t1834
Timing: 66us
$
(CUDA 11.2, Centos 7, V100 GPU)