CUDA shared memory write causes unexplainable long latency



This has been driving me crazy. I have a 3D grid of 1D blocks. Each block contains 272 threads. Each thread computes the dot product of two vectors and stores its result in the corresponding position of a __shared__ array of doubles of size [272], the same as the number of threads per block. The main thread calls several kernels and I am adding up the time they take to execute. When I comment out the line that writes to shared memory, I get execution times of about 2,401 ms. When I leave the shared memory write line in, I get very long times, such as 450,309 ms. I tried using int values instead of doubles. I also tried an if(threadIdx.x == 0) statement so that only one thread does the write, to rule out possible bank conflicts. Nothing seems to work.

Here is the calling thread code:

double theta=0;
int count=0;
cudaEventRecord(start,0);
while(theta <180)
{
    theta+=0.18;
    calc_LF<<<gridDim, blockDim>>>(ori_dev, X_dev, Y_dev, Z_dev, F_dev, F_grad_dev, g_oriD, r_vD, LF);
    calc_S<<<gridDim, 272>>>(g_traD, LF, Ci, C);
    count++;
}
cudaEventRecord( stop, 0 );
cudaEventSynchronize( stop );
cudaEventElapsedTime( &elapsedTime, start, stop );
err = cudaGetLastError();
if ( cudaSuccess != err )
{
    fprintf( stderr, "Cuda error in file '%s' in line %i : %s.n",
             __FILE__, __LINE__, cudaGetErrorString( err) );
}
else
{
    fprintf( stderr, "n n Cuda NO error in file '%s' in line %i : %s.n",
             __FILE__, __LINE__, cudaGetErrorString( err) );
    printf("n %d orientation updates: Total Time = %3.10f msn", count, elapsedTime);
}

The problematic kernel is the calc_S kernel, whose code is:

__global__ void calc_S(double* g_traD, double* LF, double* Ci, double* C)
{
    __shared__ double G[H];
    int myTRA[W];
    int tx= threadIdx.x;
    for(int j=0; j<W; j++)
    {
        myTRA[j]= getElement(g_traD, tx, j, W);
    }
    double sum = 0;
    for(int j=0; j<W; j++)
    {
        sum += myTRA[j] * LF[j];
    }       
    // Write your sum to shared memory
    G[threadIdx.x]=sum;
    // __syncthreads();
}

I am using MS Visual Studio 2008 with CUDA 4.2 and a compute capability 2.0 GPU (i.e. a GeForce GTX 580). Notes:

  • 272 threads per block.
  • Hardware thread limit: 1,536 / 272 = at most 5 blocks per SM.
  • Shared memory limit: G[272] doubles = 2,176 bytes needed. 48 KB / 2,176 = at most 22 blocks (which will never happen, but it shows shared memory is not the limiting factor).
  • Registers are not an issue at all.

So it should be possible for 5 blocks to execute concurrently (a quick device-query sketch of the same arithmetic follows below).
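
For reference, here is a minimal host-side sketch of that occupancy arithmetic, using only cudaDeviceProp fields; the constants and variable names are illustrative and not part of my program:

// Minimal sketch of the occupancy arithmetic above, based only on
// cudaDeviceProp fields; threadsPerBlock/smemPerBlock are illustrative names.
#include <cstdio>
#include <cuda_runtime.h>

int main()
{
    cudaDeviceProp props;
    cudaGetDeviceProperties(&props, 0);

    const int    threadsPerBlock = 272;
    const size_t smemPerBlock    = 272 * sizeof(double);   // __shared__ double G[272]

    // Thread-limited blocks per SM: 1536 / 272 = 5 on compute 2.0
    int blocksByThreads = props.maxThreadsPerMultiProcessor / threadsPerBlock;
    // Shared-memory-limited blocks: 48 KB / 2,176 B = 22 on compute 2.0
    int blocksBySmem    = (int)(props.sharedMemPerBlock / smemPerBlock);

    printf("blocks per SM limited by threads: %d\n", blocksByThreads);
    printf("blocks per SM limited by smem   : %d\n", blocksBySmem);
    return 0;
}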

Thanks for any help.

Edit:

Here is a shortened version of the whole code. The whole code runs inside the MatrixMul NVIDIA SDK example.

In the file MatrixMul.cu:

int main(int argc, char** argv)
{
    // reading data from Matlab into double arrays
    //CUDA begins here:
    if(shrCheckCmdLineFlag(argc, (const char**)argv, "device"))
    {
        cutilDeviceInit(argc, argv);
    }
    else
    {
        cutilSafeCall( cudaSetDevice(cutGetMaxGflopsDeviceId()) );
    }
    int devID;
    cudaDeviceProp props;
    // get GPU props
    cutilSafeCall(cudaGetDevice(&devID));
    cutilSafeCall(cudaGetDeviceProperties(&props, devID));
    printf("Device %d: "%s" with Compute %d.%d capabilityn", devID, props.name, props.major, props.minor);
    
    //Declare Device memory for matrices read from Matlab
    double *X_dev;    // size 19 x 1
    double *Y_dev;    // size 19 x 1
    double *Z_dev;    // size 17 x 1
    double *r_vD;     // size 544 x 3
    double *g_oriD;   // size 544 x 3
    double *g_traD;   // size 272 x 544
    double *cov_D;    // size 272 x 272
    double *cov_i_D;  // size 272 x 272
  
    err= cudaMalloc((void**)&X_dev, sizeX*sizeof(double));
    errorCheck(err);
    err= cudaMalloc((void**)&Y_dev, sizeY*sizeof(double));
    errorCheck(err);
    err= cudaMalloc((void**)&Z_dev, sizeZ*sizeof(double));
    errorCheck(err);
    err= cudaMalloc((void**)&r_vD, sizeR_V*sizeof(double));
    errorCheck(err);
    err= cudaMalloc((void**)&g_oriD, sizeG_ori*sizeof(double));
    errorCheck(err);
    err= cudaMalloc((void**)&g_traD, sizeG_tra*sizeof(double));
    errorCheck(err);
    err= cudaMalloc((void**)&cov_D, sizeCov*sizeof(double));
    errorCheck(err); 
    err= cudaMalloc((void**)&cov_i_D, sizeCov_i*sizeof(double));
    errorCheck(err); 
    //Transfer Xs, Ys, and Zs to GPU Global memory
    cudaMemcpy(X_dev,dipole_x_coords, sizeX*sizeof(double), cudaMemcpyHostToDevice);
    errorCheck(err);
    cudaMemcpy(Y_dev,dipole_y_coords, sizeY*sizeof(double), cudaMemcpyHostToDevice);
    errorCheck(err);
    cudaMemcpy(Z_dev,dipole_z_coords, sizeZ*sizeof(double), cudaMemcpyHostToDevice);
    errorCheck(err);
    // Transfer r_v, g_ori, and g_tra to GPU memory
    cudaMemcpy(r_vD, r_v, sizeR_V*sizeof(double), cudaMemcpyHostToDevice);
    errorCheck(err);
    cudaMemcpy(g_oriD,g_ori, sizeG_ori*sizeof(double), cudaMemcpyHostToDevice);
    errorCheck(err);
    cudaMemcpy(g_traD,g_tra, sizeG_tra*sizeof(double), cudaMemcpyHostToDevice);
    errorCheck(err);
    // Transfer cov, and cov_i to GPU memory
    cudaMemcpy(cov_D, cov_post, sizeCov*sizeof(double), cudaMemcpyHostToDevice);
    errorCheck(err);
    cudaMemcpy(cov_i_D,cov_post_i, sizeCov_i*sizeof(double), cudaMemcpyHostToDevice);
    //Specify dimensions of block and grid
    dim3 gridDim(sizeX, sizeY, sizeZ);   // 19 x 19 x 17
    int numThreads=(int) sizeR_V/3;      // numThreads = 544
    dim3 blockDim(numThreads,1,1);       // 544 x 1 x 1 
    //call Cuda wrapper
    float cf = runB(X_dev, Y_dev, Z_dev, r_vD, g_oriD, g_traD, cov_i_D, cov_D, blockDim, gridDim, sizeG_tra, tra_W, tra_H);
    int c=0;
    scanf("%d", c);
    return 0;
}
float runB(double* X_dev, double* Y_dev, double* Z_dev, 
           double* r_vD, double* g_oriD, double* g_traD, double* Ci, double* C,
           dim3 blockDim, dim3 gridDim, int sizeG_tra, int tra_W, int tra_H)
{  
    cudaError err;
    // Calculate the size of thread output in global memory
    size_t size_F = gridDim.x * gridDim.y * gridDim.z * blockDim.x;
    size_t size_F_grad = gridDim.x * gridDim.y * gridDim.z * blockDim.x * 3;
    // Make global memory space for F and F_grad 
    double* F_dev;
    double* F_grad_dev;
    err= cudaMalloc((void**)&F_dev, size_F*sizeof(double));
    errorCheck(err); 
    err= cudaMalloc((void**)&F_grad_dev, size_F_grad*sizeof(double));
    errorCheck(err); 
    //Allocate Device memory for LF 
    double *LF;
    err= cudaMalloc((void**)&LF, 544*sizeof(double));
    errorCheck(err); 
    cudaEvent_t start, stop;
    float elapsedTime;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    double theta=0;
    int count=0;
    cudaEventRecord(start,0);
    while(theta <180)
    {
        theta+=0.18;
        calc_LF<<<gridDim, blockDim>>>(ori_dev, X_dev, Y_dev, Z_dev, F_dev, F_grad_dev, g_oriD, r_vD, LF);
        calc_S<<<gridDim, 272>>>(g_traD, LF, Ci, C);
        count++;
    }
    cudaEventRecord( stop, 0 );
    cudaEventSynchronize( stop );
    cudaEventElapsedTime( &elapsedTime, start, stop );
    err = cudaGetLastError();
    if ( cudaSuccess != err )
    {
        fprintf( stderr, "Cuda error in file '%s' in line %i : %s.n",
                 __FILE__, __LINE__, cudaGetErrorString( err) );
    }
    else
    {
        fprintf( stderr, "n n Cuda NO error in file '%s' in line %i : %s.n",
                 __FILE__, __LINE__, cudaGetErrorString( err) );
        printf("n 180 orientation updates: Total Time = %3.10f msn",elapsedTime);
    }
    return 0;
}

In the file MatrixMul_kernel.cu:

#define HDM_DIM 3 
__global__ void calc_LF(double* ori_dev, double* X_dev, double* Y_dev, double* Z_dev, double* F_dev, double* F_grad_dev, 
                        double* g_oriD, double* r_vD, double* LF)
{ 
    // Get this block's global index 
    int blockId= blockIdx.x + gridDim.x*blockIdx.y + gridDim.x*gridDim.y*blockIdx.z;
    int tx= threadIdx.x;
    // This thread's global index
    int gtx= blockId*blockDim.x + threadIdx.x;
    double r_v[3];
    double g_ori[3];
        
    // Each thread reads 1 row (3 values) of r_vD 
    r_v[0] = getElement(r_vD, tx, 0, HDM_DIM);
    r_v[1] = getElement(r_vD, tx, 1, HDM_DIM);
    r_v[2] = getElement(r_vD, tx, 2, HDM_DIM);
    // Each thread reads 1 row (3 values) of g_oriD (which contains grad.ori data)
    g_ori[0] = getElement(g_oriD, tx, 0, HDM_DIM);
    g_ori[1] = getElement(g_oriD, tx, 1, HDM_DIM);
    g_ori[2] = getElement(g_oriD, tx, 2, HDM_DIM);
    
    //fetch d_ori from global memory
    double d_ori[3];
    for(int i=0; i< 3; i++){
        d_ori[i]= ori_dev[3*gtx+i];
    }   
    //read this block's X, Y, Z location
    double x= X_dev[blockIdx.x];
    double y= Y_dev[blockIdx.y];
    double z= Z_dev[blockIdx.z];     
     
    double c2[HDM_DIM];
    c2[0]= d_ori[1]*z - d_ori[2]*y;
    c2[1]= d_ori[2]*x - d_ori[0]*z;
    c2[2]= d_ori[0]*y - d_ori[1]*x;
    
    // Fetch F and F_grad from global memory
    double F = F_dev[gtx];
    double F_grad[3];
    for(int j=0; j<3; j++)
    {
        F_grad[j] = F_grad_dev[gtx*3+j];
    }
    double c1[HDM_DIM];
    c1[0]= F* c2[0];
    c1[1]= F* c2[1];
    c1[2]= F* c2[2];
    
    double d3= c2[0]*r_v[0] + c2[1]*r_v[1] + c2[2]*r_v[2];
    
    double s2[HDM_DIM];
    for(int j=0; j<HDM_DIM; j++)
    {
        s2[j] = d3*F_grad[j];
    }
    
    double s1[HDM_DIM];
    for(int j=0; j<HDM_DIM; j++)
    {
        s1[j] = c1[j] - s2[j];
    }
    
    double b_v[HDM_DIM];
    for(int j=0; j<HDM_DIM; j++)
    {
        b_v[j] = 1e-7/(F*F) * s1[j];   // 1e-7, not (10^-7): ^ is XOR in C
    }
    double sum=0;
    for(int j=0; j<HDM_DIM; j++)
    {
        sum += b_v[j]*g_ori[j];
    }   
    // Write this thread's value to global memory
    LF[tx]= sum;
 
}       

It is worth mentioning that this calc_LF kernel used to write its final result to shared memory, which increased the execution time from about 500+ ms to about 2,500 ms (i.e. the shared memory write line alone multiplied it by roughly 5).

__global__ void calc_S(double* g_traD, double* LF, double* Ci, double* C)
{
    __shared__ double T[H];
    __shared__ double G[H];
    // Get this block's global index 
    int blockId= blockIdx.x + gridDim.x*blockIdx.y + gridDim.x*gridDim.y*blockIdx.z;
    int tx= threadIdx.x;
    // This thread's global index
    int gtx= blockId*blockDim.x + threadIdx.x;
    
    int myTRA[W];
    double my_LF[W];
    for (int i=0; i<W; i++){
        my_LF[i]= LF[gtx];
    }
    for(int j=0; j<W; j++){
        myTRA[j]= getElement(g_traD, tx, j, W);
    }
    double sum = 0;
    for(int j=0; j<W; j++)
    {
        sum += myTRA[j] * my_LF[j];
    }       
    // Write your sum to shared memory
    G[tx]=sum;
    __syncthreads();
}

Answer:

The effect you are seeing is the result of compiler optimization. Taking a compilable version of your basic kernel code:

#define H (128)
#define W (128)
__device__
double getElement(const double *g, int t, int j, int w)
{
    return g[t + j*w];
}
__global__ 
void calc_S(double* g_traD, double* LF, double* Ci, double* C)
{
    __shared__ double G[H];
    // Get this block's global index 
    int blockId= blockIdx.x + gridDim.x*blockIdx.y + 
                   gridDim.x*gridDim.y*blockIdx.z;
    int tx= threadIdx.x;
    // This thread's global index
    int gtx= blockId*blockDim.x + threadIdx.x;
    int myTRA[W];
    double my_LF[W];
    for (int i=0; i<W; i++){
        my_LF[i]= LF[gtx];
    }
    for(int j=0; j<W; j++){
        myTRA[j]= getElement(g_traD, tx, j, W);
    }
    double sum;
    for(int j=0; j<W; j++)
    {
        sum += myTRA[j] * my_LF[j];
    }       
    // Write your sum to shared memory
    G[tx]=sum;
    __syncthreads();
}

Compiling it with CUDA 5 gives:

$ nvcc -m64 -arch=sm_20 -cubin -Xptxas="-v"  dead_code.cu 
dead_code.cu(13): warning: variable "G" was set but never used
dead_code.cu(13): warning: variable "G" was set but never used
ptxas info    : 0 bytes gmem
ptxas info    : Compiling entry function '_Z6calc_SPdS_S_S_' for 'sm_20'
ptxas info    : Function properties for _Z6calc_SPdS_S_S_
    1536 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info    : Used 23 registers, 1024 bytes smem, 64 bytes cmem[0]

There is a warning about the shared memory variable G being set but never used, but the compiler honours it and emits code that consumes 23 registers. So now, if I comment out G[tx]=sum at the end of the kernel, it compiles like this:

$ nvcc -m64 -arch=sm_20 -cubin -Xptxas="-v"  dead_code.cu 
dead_code.cu(13): warning: variable "G" was declared but never referenced
dead_code.cu(13): warning: variable "G" was declared but never referenced
ptxas info    : 0 bytes gmem
ptxas info    : Compiling entry function '_Z6calc_SPdS_S_S_' for 'sm_20'
ptxas info    : Function properties for _Z6calc_SPdS_S_S_
    0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info    : Used 2 registers, 64 bytes cmem[0]

Now only two registers are used, and the toolchain emits this:

$ cuobjdump -sass dead_code.cubin 
    code for sm_20
        Function : _Z6calc_SPdS_S_S_
    /*0000*/     /*0x00005de428004404*/     MOV R1, c [0x1] [0x100];
    /*0008*/     /*0xfc1fdc03207e0000*/     IMAD.U32.U32 RZ, R1, RZ, RZ;
    /*0010*/     /*0xffffdc0450ee0000*/     BAR.RED.POPC RZ, RZ;
    /*0018*/     /*0x00001de780000000*/     EXIT;

i.e. just four assembly instructions. All of your code is gone.

The underlying source of this effect is compiler dead code removal. The compiler is smart enough to determine that code which has no effect on a global or shared memory output is not needed and can be removed. In this case, once the write to G is removed, the whole kernel is effectively pointless, and the compiler simply optimizes the whole thing away. You can see some other examples of dead code removal and its effects here and here. The latter is in OpenCL, but the same mechanism applies.
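
One common way to benchmark a kernel like this without the work being optimized away is to make the result externally observable, typically by writing it back to global memory. The following is only an illustrative sketch: the output buffer S_out, its indexing, and the simplified kernel body are assumptions added for the example, not the original code.

#define H (272)   // threads per block / rows of g_tra
#define W (544)   // columns of g_tra / length of LF

// Same idea as calc_S, but the per-thread sum is also written to a global
// output buffer S_out (sized gridDim.x*gridDim.y*gridDim.z*blockDim.x), so
// the compiler cannot discard the body as dead code. Launch with H threads
// per block; g_traD is assumed row-major, H x W.
__global__ void calc_S_out(const double* g_traD, const double* LF, double* S_out)
{
    __shared__ double G[H];

    int blockId = blockIdx.x + gridDim.x*blockIdx.y
                + gridDim.x*gridDim.y*blockIdx.z;
    int tx  = threadIdx.x;
    int gtx = blockId*blockDim.x + tx;

    double sum = 0;
    for (int j = 0; j < W; j++)
        sum += g_traD[tx*W + j] * LF[j];

    G[tx] = sum;          // the shared memory write now has a consumer
    __syncthreads();

    S_out[gtx] = G[tx];   // observable result keeps the kernel alive
}

With the result consumed like this, the difference between timings with and without the shared memory write reflects real work, rather than whether the kernel body survives compilation at all.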
