CUDA - multiple parallel reductions sometimes fail



I have the following problem. I have implemented several different parallel reduction algorithms, and all of them work correctly if each kernel reduces only one value. But now I need to reduce several values (21), and I have no idea why it sometimes works and sometimes does not.

The steps performed are:

  • Compute the relevant value for each thread (in the example I just set them all to 1, since that shows the same behaviour)
  • Load the values into shared memory
  • Synchronize the threads of the block
  • Reduce the values in shared memory

Here is the complete code; just copy & paste it and run.

#include <stdio.h>
#include <math.h>
#include <cuda_runtime.h>
// switch the compiler flag if you don't have the sdk's helper_cuda.h file
#if 1
#include "helper_cuda.h"
#else
#define checkCudaErrors(val) (val)
#define getLastCudaError(msg)
#endif
#ifdef __CDT_PARSER__
#define __global__
#define __device__
#define __shared__
#define __host__
#endif
// compute sum of val over num threads
__device__ float localSum(const float& val, volatile float* reductionSpace, const uint& localId)
{
reductionSpace[localId] = val;  // load data into shared mem
__syncthreads();
// complete loop unroll
if (localId < 128) reductionSpace[localId] += reductionSpace[localId + 128];
__syncthreads();
if (localId < 64) reductionSpace[localId] += reductionSpace[localId + 64];
__syncthreads();
// within one warp (=32 threads) instructions are SIMD synchronous
// -> __syncthreads() not needed
if (localId < 32)
{
reductionSpace[localId] += reductionSpace[localId + 32];
reductionSpace[localId] += reductionSpace[localId + 16];
reductionSpace[localId] += reductionSpace[localId + 8];
reductionSpace[localId] += reductionSpace[localId + 4];
reductionSpace[localId] += reductionSpace[localId + 2];
reductionSpace[localId] += reductionSpace[localId + 1];
}
// Edit: here we need to sync in order to guarantee that the thread with ID 0 is also done...
__syncthreads();
return reductionSpace[0];
}
__global__ void d_kernel(float* od, int n)
{
extern __shared__ float reductionSpace[];
int g_idx = blockIdx.x * blockDim.x + threadIdx.x;
const unsigned int linId = threadIdx.x;
__shared__ float partialSums[21];
float tmp[6] = { 0, 0, 0, 0, 0, 0 };
// for simplification all computations are removed - this version still shows the same behaviour
if (g_idx < n)
{
tmp[0] = 1.0f;
tmp[1] = 1.0f;
tmp[2] = 1.0f;
tmp[3] = 1.0f;
tmp[4] = 1.0f;
tmp[5] = 1.0f;
}
float res = 0.0f;
int c = 0;
for (int i = 0; i < 6; ++i)
{
for (int j = i; j < 6; ++j, ++c)
{
res = tmp[i] * tmp[j];
// compute the sum of the values res for blockDim.x threads. This uses
// the shared memory reductionSpace for calculations
partialSums[c] = localSum(res, reductionSpace, linId);
}
}
__syncthreads();
// write back the sum values for this block
if (linId < 21)
{
atomicAdd(&od[linId], partialSums[linId]);
}
}
int main()
{
int w = 320;
int h = 240;
int n = w * h;
// ------------------------------------------------------------------------------------
float *d_out;
checkCudaErrors(cudaMalloc(&d_out, 21 * sizeof(float)));
float* h_out = new float[21];
int dimBlock = 256;
int dimGrid = (n - 1) / dimBlock + 1;
int sharedMemSize = dimBlock * sizeof(float);
printf("w: %dn", w);
printf("h: %dn", h);
printf("dimBlock: %dn", dimBlock);
printf("dimGrid: %dn", dimGrid);
printf("sharedMemSize: %dn", sharedMemSize);
int failcounter = 0;
float target = (float) n;
int c = 0;
// ------------------------------------------------------------------------------------
// run the kernel for 200 times
for (int run = 0; run < 200; ++run)
{
cudaMemset(d_out, 0, 21 * sizeof(float));
d_kernel<<<dimGrid, dimBlock, sharedMemSize>>>(d_out, n);
getLastCudaError("d_kernel");
checkCudaErrors(cudaMemcpy(h_out, d_out, 21 * sizeof(float), cudaMemcpyDeviceToHost));
// check if the output has target value
// since all threads get value 1 the kernel output corresponds to counting the elements which is w*h=n
bool failed = false;
for (int i = 0; i < 21; ++i)
{
if (fabsf(h_out[i] - target) > 0.01f)
{
++failcounter;
failed = true;
}
}
// if failed, print the elements to show which one failed
if (failed)
{
c = 0;
for (int i = 0; i < 6; ++i)
{
for (int j = i; j < 6; ++j, ++c)
{
printf("%10.7f ", h_out[c]);
}
printf("n");
}
}
}
printf("failcounter: %dn", failcounter);
// ------------------------------------------------------------------------------------
delete[] h_out;
checkCudaErrors(cudaFree(d_out));
// ------------------------------------------------------------------------------------
return 0;
}

Some remarks:

The block size is always 256, so the unrolled loop in localSum() checks against the correct thread IDs. As mentioned at the beginning, over the 200 runs the output is sometimes completely correct, sometimes only 2 values are wrong, and sometimes around 150 values are wrong.

It cannot have anything to do with floating-point precision, since only 1 * 1 is multiplied and stored in the variable res in d_kernel(). I can clearly see that sometimes some threads or blocks just do not seem to run, but I have no idea why. :/
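
To rule out launches that fail silently, one extra check (not part of the code above, only a sketch of what I mean) would be to synchronize right after the launch and inspect the error state before reading the results back:

// Hypothetical extra check in main(): cudaDeviceSynchronize() also surfaces
// errors that occur while the kernel runs, not just launch-configuration errors.
d_kernel<<<dimGrid, dimBlock, sharedMemSize>>>(d_out, n);
getLastCudaError("d_kernel");
checkCudaErrors(cudaDeviceSynchronize());
checkCudaErrors(cudaMemcpy(h_out, d_out, 21 * sizeof(float), cudaMemcpyDeviceToHost));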

Just looking at the results, it should be obvious that there is some kind of race condition, but I simply cannot see the problem.

Does anyone have an idea where the problem is?

Edit:

I have now tested a lot of things and found out that it must have something to do with the block size. If I reduce it to something < 64 and change localSum() accordingly, everything works as expected.

But that makes no sense to me?! I am still doing nothing more than a normal parallel reduction with shared memory, the only difference being that it is executed 21 times per thread.
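
By "change localSum() accordingly" I just mean adjusting the hard-coded 128/64/32 bounds to the block size. For reference, a common way to avoid hard-coding them at all is a template parameter, in the style of the CUDA SDK reduction sample; the following is only a sketch (a hypothetical localSumT, not the exact code I used) and assumes a power-of-two block size of at least 64, like the 256 used above:

// Sketch only: the unroll bounds follow the block size via a template
// parameter, as in the CUDA SDK reduction sample. Assumes a power-of-two
// blockSize >= 64 so the warp-level reads stay inside a buffer of blockSize floats.
template<unsigned int blockSize>
__device__ float localSumT(float val, volatile float* reductionSpace, unsigned int localId)
{
reductionSpace[localId] = val;  // load data into shared mem
__syncthreads();
if (blockSize >= 512) { if (localId < 256) reductionSpace[localId] += reductionSpace[localId + 256]; __syncthreads(); }
if (blockSize >= 256) { if (localId < 128) reductionSpace[localId] += reductionSpace[localId + 128]; __syncthreads(); }
if (blockSize >= 128) { if (localId < 64) reductionSpace[localId] += reductionSpace[localId + 64]; __syncthreads(); }
if (localId < 32)  // last warp
{
if (blockSize >= 64) reductionSpace[localId] += reductionSpace[localId + 32];
reductionSpace[localId] += reductionSpace[localId + 16];
reductionSpace[localId] += reductionSpace[localId + 8];
reductionSpace[localId] += reductionSpace[localId + 4];
reductionSpace[localId] += reductionSpace[localId + 2];
reductionSpace[localId] += reductionSpace[localId + 1];
}
__syncthreads();  // same final barrier as in the full code at the top
return reductionSpace[0];
}

The call in d_kernel() would then be localSumT<256>(res, reductionSpace, linId).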

Edit 2:

Now I am completely confused. The problem is the unrolled loop, or rather the warp-synchronous part. The following localSum() code works:

// compute sum of val over num threads
__device__ float localSum(const float& val, volatile float* reductionSpace, const uint& localId)
{
reductionSpace[localId] = val;  // load data into shared mem
__syncthreads();
for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1)
{
if (localId < s)
{
reductionSpace[localId] += reductionSpace[localId + s];
}
__syncthreads();
}
return reductionSpace[0];
}

But if I unroll the last warp and do not synchronize between the threads, I sometimes again get 2 or 3 wrong results in 2000 runs. So the following code does NOT work:

// compute sum of val over num threads
__device__ float localSum(const float& val, volatile float* reductionSpace, const uint& localId)
{
reductionSpace[localId] = val;  // load data into shared mem
__syncthreads();
for (unsigned int s = blockDim.x / 2; s > 32; s >>= 1)
{
if (localId < s)
{
reductionSpace[localId] += reductionSpace[localId + s];
}
__syncthreads();
}
if (localId < 32)
{
reductionSpace[localId] += reductionSpace[localId + 32];
reductionSpace[localId] += reductionSpace[localId + 16];
reductionSpace[localId] += reductionSpace[localId + 8];
reductionSpace[localId] += reductionSpace[localId + 4];
reductionSpace[localId] += reductionSpace[localId + 2];
reductionSpace[localId] += reductionSpace[localId + 1];
}
return reductionSpace[0];
}

But what sense does that make, given that CUDA executes a warp (32 threads) simultaneously and no __syncthreads() is needed within it?!

I do not need anyone to post working code for me here, but I would really appreciate it if someone with a lot of experience and deep knowledge of CUDA programming could describe the underlying problem here, or at least give me a hint.

The solution is so simple that I am almost ashamed to post it. I was blind and looked everywhere except at the most obvious piece of code: a simple __syncthreads() was missing before the return statement in localSum(). The last warp itself does execute in lockstep, but there is no guarantee that the warp containing thread ID 0 has finished by the time the other warps read reductionSpace[0] and move on... A stupid mistake that I simply did not see.
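
For completeness, here is the unrolled version from Edit 2 with the missing barrier added; this is essentially the same fix that is already contained in the full code at the top:

// compute sum of val over num threads - unrolled version including the fix
__device__ float localSum(const float& val, volatile float* reductionSpace, const uint& localId)
{
reductionSpace[localId] = val;  // load data into shared mem
__syncthreads();
for (unsigned int s = blockDim.x / 2; s > 32; s >>= 1)
{
if (localId < s)
{
reductionSpace[localId] += reductionSpace[localId + s];
}
__syncthreads();
}
if (localId < 32)  // last warp runs in lockstep, no barrier needed inside it
{
reductionSpace[localId] += reductionSpace[localId + 32];
reductionSpace[localId] += reductionSpace[localId + 16];
reductionSpace[localId] += reductionSpace[localId + 8];
reductionSpace[localId] += reductionSpace[localId + 4];
reductionSpace[localId] += reductionSpace[localId + 2];
reductionSpace[localId] += reductionSpace[localId + 1];
}
// The fix: without this barrier, threads of the other warps return (and, in the
// next localSum() call, overwrite reductionSpace) while warp 0 may still be
// summing, so reductionSpace[0] is not guaranteed to be final for them.
__syncthreads();
return reductionSpace[0];
}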

Sorry for the trouble.. :)
