C - How to perform a reduction along the row direction of a huge 2D matrix using CUDA? (max value and the index of the max for each row)



I am trying to implement a reduction along the row direction of a 2D matrix. I started from code I found here on StackOverflow (thanks a lot, Robert!):

thrust::max_element slow in comparison cublasIsamax - More efficient implementation?

The link above shows a custom kernel that performs a reduction over a single row. It splits the input row across many blocks, each with 1024 threads. It works very well.

For the 2D case everything is the same, except that there is now a y grid dimension, and the y dimension of each block is still 1. The problem is that when I try to write the data into shared memory inside each block (inside the max_idx_kernel_reduction_within_block kernel in the code), it takes very long: longer than (# of rows) * (time needed to perform the reduction on a single row). At that point I might as well just run a for loop over the rows. I know I have a lot of elements, but I was expecting something faster than that.

I don't think the memory access pattern is the problem, but I have heard that the total amount of shared memory might be a limitation? (CUDA: Is coalesced global memory access faster than shared memory?) Also, does allocating a large shared memory array slow the program down?
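
(For context on the shared-memory question, the per-block limit can be queried at runtime. Below is a minimal standalone sketch, assuming device 0; with nTPB = 1024, the float vals[] plus int idxs[] arrays in the code further down use about 8 KB per block.)

#include <stdio.h>
#include <cuda_runtime.h>

int main(){
  cudaDeviceProp prop;
  cudaGetDeviceProperties(&prop, 0);  // device 0 assumed
  printf("shared memory per block: %zu bytes\n", prop.sharedMemPerBlock);
  printf("shared memory per SM   : %zu bytes\n", prop.sharedMemPerMultiprocessor);
  return 0;
}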

Any suggestions to make my code faster (the first kernel is the bottleneck)? Thank you very much, any help is greatly appreciated!

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <iostream>
#include <cuda_runtime.h>
#define NCOLS 163317 // number of columns
#define NROWS 8 // number of rows
#define nTPB 1024  // Threads per Block. nTPB should be a power-of-2
#define MAX_BLOCKS_X ((NCOLS/nTPB)+1) // # of blocks I will launch
#define MIN(a,b) ((a>b)?b:a)
#define FLOAT_MIN -1.0f // lowest anticipated number of the data. Values in array will be compared with this and updated with the larger one
#define IDX2F(i,j,ld) ((j-1) * (ld) + ((i) - 1))  // 1 based indexing
#define IDX2C(i,j,ld) ((j) * (ld) + i)  // 0 based indexing

__device__ volatile float blk_vals[NROWS][MAX_BLOCKS_X];
__device__ volatile int   blk_idxs[NROWS][MAX_BLOCKS_X];
// blk_vals and blk_idxs are the results obtained from reduction within each block.
// after 1st reduction, each row will have blk_vals[MAX_BLOCKS_X] array and blk_idxs[MAX_BLOCKS_X]
// these will be passed to the 2nd kernel
__global__ void max_idx_kernel_reduction_within_block(const float *data, const int xSize, const int ySize){  // first kernel. Reduction within blocks
  __shared__ volatile float vals[nTPB]; // Total amount of shared memory per block: 49152 bytes (48 KB). 1024 gives ~ 4KB for single.
  __shared__ volatile int idxs[nTPB]; // ~ 4 KB for single, when nTPB is 1024. each block will have both indices and values
  int idx = threadIdx.x+blockDim.x * blockIdx.x; // idx in the x direction
  float my_val = FLOAT_MIN; // lowest possible number
  int my_idx = -1;  // to check whether you DID perform the kernel. Again, it's the idx in the x dir.
  // sweep from global memory
  while (idx < xSize){   // this ensures you don't go out the size of the array's x direction
    if (data[IDX2C(blockIdx.y,idx,ySize)] > my_val) {my_val = data[IDX2C(blockIdx.y,idx,ySize)]; my_idx = idx;}
    // compare with my_val, and put the bigger value into my_val for next comparison. my_idx is 0 index based
    idx += blockDim.x*gridDim.x;}
                                                                 // until here takes about 6 ms !! very fast!!
  // populate shared memory: takes ~ 270 ms
  vals[threadIdx.x] = my_val;  // put the computed max value for each thread into the shared memory. -> this is the bottleneck!!
  idxs[threadIdx.x] = my_idx;  // do this for index as well -> this is also slow!!


  __syncthreads();

  // sweep in shared memory
  for (int i = (nTPB>>1); i > 0; i>>=1){
    if (threadIdx.x < i)    // the first half threads of the block
      if (vals[threadIdx.x] < vals[threadIdx.x + i]) {vals[threadIdx.x] = vals[threadIdx.x+i]; idxs[threadIdx.x] = idxs[threadIdx.x+i]; }
                            // the above is comparing shared memory of threadIdx.x with shared memory of threadIdx.x + i.
                            // then puts the larger value into shared memory of threadIdx.x
    __syncthreads();}       // so now in each block, shared memory's first element (index 0) is the max value and max value index

  // perform block-level reduction
  if (!threadIdx.x){    // at the shared memory, only the first element (index 0) (actually 2 elements in the first index. max value, and max value index) is what we need.
      blk_vals[blockIdx.y][blockIdx.x] = vals[0]; // For each window (single x row), the first elements of the blocks are stored into the blk_vals[windowNumber][:]
                                                // remember, this is a global variable.
    blk_idxs[blockIdx.y][blockIdx.x] = idxs[0]; // and the max value index
  __syncthreads();
}
}



  // originally the following kernel was in the 1st kernel, performed by the last block. So just use one block for this.
__global__ void max_idx_kernel_final(int *result_maxInd, float *result_maxVal){
  __shared__ volatile float vals[nTPB]; //  Total amount of shared memory per block: 49152 bytes (48 KB). 1024 gives ~ 4KB for single.
  __shared__ volatile int idxs[nTPB]; // ~ 4 KB for single, when nTPB is 1024. each block will have these variables!! (vals and idxs)
  int idx = threadIdx.x;
  float my_val = FLOAT_MIN;
  int my_idx = -1;  // remember, these are local variables, so each thread has this variable. This local variable is independent from other thread's local variable
  while (idx < MAX_BLOCKS_X ){                                                          // ?? confused whether it should be gridDim.x (actual # of blocks launched) or MAX_BLOCKS_X (# of elements in x dir of the global array blk_vals)
    if (blk_vals[blockIdx.y][idx] > my_val)
        {my_val = blk_vals[blockIdx.y][idx]; my_idx = blk_idxs[blockIdx.y][idx]; }
    idx += blockDim.x;} // all threads in this single block (single in the x dir) are working, so you should loop over blockDim.x.
                      // Imagine where gridDim.x (# of blocks) is huge so that you need to loop over to get the max value and index
                      // After this, each thread in the block has a local variable (max value and max value index).
                      // So far it was sort of a reduction, but instead of pairing values we just looped over the blk_vals and blk_idxs
  // populate shared memory
  vals[threadIdx.x] = my_val;   // This is now shared memory. This is because reduction requires comparison between different elements
  idxs[threadIdx.x] = my_idx;   // my_idx value is 0 based. This is done for all blocks (in the y direction)
  __syncthreads();
  // Now the final task is to do reduction for all threads in our single block (single block in the x dir, NROWS blocks in the y dir)!
// sweep in shared memory
  for (int i = (nTPB>>1); i > 0; i>>=1) {
    if (threadIdx.x < i) // the first half threads of the block
      if (vals[threadIdx.x] < vals[threadIdx.x + i]) {vals[threadIdx.x] = vals[threadIdx.x+i]; idxs[threadIdx.x] = idxs[threadIdx.x+i]; }
    __syncthreads();} // now all the results are in threadIdx.x == 0 for each block (there are NROWS blocks in the y dir)
  // 0th thread. the results are in shared memory, not the local memory, so any thread could do the following. We just selected the 0th thread for no reason. If several threads try to do this, that would be a problem, since we'll have to wait for them
  if(!threadIdx.x){
        result_maxInd[blockIdx.y] = idxs[0]; // the final result for each row goes into the corresponding position (blockIdx.y)
        result_maxVal[blockIdx.y] = vals[0];
      }
}



  int main(){
    dim3 grids(MAX_BLOCKS_X, NROWS);
    dim3 threads(nTPB,1);
    dim3 grids2(1,NROWS);
    dim3 threads2(nTPB);
    float *d_vector, *h_vector;
    h_vector = (float*)malloc(NROWS * NCOLS * sizeof(float));

    for (int j = 1; j <= NCOLS; j++) {
      for (int i = 1; i <= NROWS; i++)  {
          h_vector[IDX2F(i,j,NROWS)] = (float) (rand()/(float)RAND_MAX);
      }
    }

    h_vector[IDX2F(2,5,NROWS)] = 10;  // create definite max element
    cudaMalloc(&d_vector, NROWS * NCOLS * sizeof(float));
    cudaMemcpy(d_vector, h_vector, NROWS * NCOLS * sizeof(float), cudaMemcpyHostToDevice);
    //d_vector is a pointer on the device pointing to the beginning of the vector, containing nrElements floats.
    int *max_index;
    float *max_val;
    int *d_max_index;
    float *d_max_val;
    max_index = (int*)malloc(NROWS * sizeof(int));
    max_val = (float*)malloc(NROWS * sizeof(float));
    cudaMalloc((void**)&d_max_index, NROWS * sizeof(int));
    cudaMalloc((void**)&d_max_val, NROWS * sizeof(float));
    max_idx_kernel_reduction_within_block<<<grids, threads>>>(d_vector, NCOLS, NROWS);
    max_idx_kernel_final<<<grids2,threads2>>>(d_max_index, d_max_val);


    cudaMemcpy(max_index, d_max_index, NROWS * sizeof(int), cudaMemcpyDeviceToHost);
    cudaMemcpy(max_val, d_max_val, NROWS * sizeof(float), cudaMemcpyDeviceToHost);
    for(int z=0;z<20;z++)
printf("%d  ",max_index[z]);
    printf("nnn");
    for(int z=0;z<20;z++)
    printf("%f  ",max_val[z]);
    return 0;
  }

There are various problems with your code:

  1. You should use proper CUDA error checking. That's just a standard boilerplate statement I make; I don't think your code was producing any runtime errors. You should also validate your results. I seriously doubt the code was producing sensible results, and the reasons will become evident. If you want to prove this to yourself, modify your data initialization to something that is obviously and easily verifiable (as I have shown below) without making any other changes, and you will see that your program produces incorrect results. A minimal sketch of an error-checking macro follows this list.
  2. Your kernels do not index correctly through the arrays. Perhaps you don't understand the IDX2C and IDX2F macros. They hurt you in two ways: you don't understand the pattern in which they index through your array, and they kill coalescing of your global memory accesses (because of the way you are using them). Whenever we have a construct that is an indexed function of threadIdx.x and threadIdx.y (or blockIdx.y here), then to maintain proper coalescing among adjacent threads it is desirable that the component based on threadIdx.x not be multiplied by any scaling factor. The way you pass parameters to IDX2C in your kernel breaks this rule (and also destroys the row-wise access pattern you want). So for now let's get rid of those macros, since they confuse the issue.
  3. This is an illegal use of __syncthreads():

      if (!threadIdx.x){    // at the shared memory, only the first element (index 0) (actually 2 elements in the first index. max value, and max value index) is what we need.
          blk_vals[blockIdx.y][blockIdx.x] = vals[0]; // For each window (single x row), the first elements of the blocks are stored into the blk_vals[windowNumber][:]
                                            // remember, this is a global variable.
        blk_idxs[blockIdx.y][blockIdx.x] = idxs[0]; // and the max value index
      __syncthreads();
    }
    

    It is illegal to use __syncthreads() in conditional code unless the condition evaluates identically for every thread in the block.

  4. Your printout indexes up to 20 instead of NROWS.
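
For reference (this macro is not part of either code listing; it is the kind of boilerplate item 1 refers to), here is a minimal sketch of CUDA error checking, to be called after API calls and kernel launches:

#include <stdio.h>
#include <stdlib.h>

// Illustrative only: report and abort on the last CUDA error.
// After a kernel launch, cudaGetLastError() catches launch errors; add a
// cudaDeviceSynchronize() before the check if you also want to catch
// asynchronous execution errors.
#define cudaCheckErrors(msg) \
    do { \
      cudaError_t __err = cudaGetLastError(); \
      if (__err != cudaSuccess) { \
        fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
                msg, cudaGetErrorString(__err), __FILE__, __LINE__); \
        exit(1); \
      } \
    } while (0)

Typical usage would be, for example, cudaCheckErrors("kernel launch failed"); immediately after each <<<...>>> launch, and cudaCheckErrors("cudaMemcpy failed"); after the copies.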

With the above changes, your code goes from broken (producing incorrect results) to fixed, and the execution time of the kernels on my system drops from 0.929 ms to 0.656 ms. I attribute essentially all of that to the coalescing fix in item 2 above.
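
To make the indexing change concrete, here is an illustrative pair of kernels (not part of the modified code below) showing only the global-memory sweep. With ySize equal to NROWS, the original IDX2C-based expression makes adjacent threads read floats that are ySize elements apart, while the row-major form lets each warp read consecutive floats:

__global__ void sweep_strided(const float *data, const int xSize, const int ySize){
  int idx = threadIdx.x + blockDim.x * blockIdx.x;
  while (idx < xSize){
    // IDX2C(blockIdx.y, idx, ySize) expands to idx*ySize + blockIdx.y:
    // thread idx and thread idx+1 touch addresses ySize floats apart (uncoalesced).
    float v = data[idx * ySize + blockIdx.y];
    (void)v;  // value unused; this sketch only shows the access pattern
    idx += blockDim.x * gridDim.x;
  }
}

__global__ void sweep_coalesced(const float *data, const int xSize, const int ySize){
  int idx = threadIdx.x + blockDim.x * blockIdx.x;
  int idy = blockIdx.y;
  while (idx < xSize){
    // idy*xSize + idx: adjacent threads read adjacent floats of the same row,
    // so each warp's loads combine into a few wide transactions.
    float v = data[idy * xSize + idx];
    (void)v;
    idx += blockDim.x * gridDim.x;
  }
}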

When I profile the before and after cases with nvprof --metrics gld_efficiency ..., it reports 12.5% efficiency for the original code and 53% for the modified code. Here is the modified code:

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <iostream>
#define NCOLS 163317 // number of columns
#define NROWS 8 // number of rows
#define nTPB 1024  // Threads per Block. nTPB should be a power-of-2
#define MAX_BLOCKS_X ((NCOLS/nTPB)+1) // # of blocks I will launch
#define FLOAT_MIN -1.0f // lowest anticipated number of the data. Values in array will be compared with this and updated with the larger one

__device__ volatile float blk_vals[NROWS][MAX_BLOCKS_X];
__device__ volatile int   blk_idxs[NROWS][MAX_BLOCKS_X];
// blk_vals and blk_idxs are the results obtained from reduction within each block.
// after 1st reduction, each row will have blk_vals[MAX_BLOCKS_X] array and blk_idxs[MAX_BLOCKS_X]
// these will be passed to the 2nd kernel
__global__ void max_idx_kernel_reduction_within_block(const float *data, const int xSize, const int ySize){  // first kernel. Reduction within blocks
  __shared__ volatile float vals[nTPB]; // Total amount of shared memory per block: 49152 bytes (48 KB). 1024 gives ~ 4KB for single.
  __shared__ volatile int idxs[nTPB]; // ~ 4 KB for single, when nTPB is 1024. each block will have both indices and values
  int idx = threadIdx.x+blockDim.x * blockIdx.x; // idx in the x direction
  int idy = blockIdx.y;
  float my_val = FLOAT_MIN; // lowest possible number
  int my_idx = -1;  // to check whether you DID perform the kernel. Again, it's the idx in the x dir.
  // sweep from global memory
  while (idx < xSize){   // this ensures you don't go out the size of the array's x direction
    float temp = data[idy*xSize+idx];
    if (temp > my_val) {my_val = temp; my_idx = idx;}
    // compare with my_val, and put the bigger value into my_val for next comparison. my_idx is 0 index based
    idx += blockDim.x*gridDim.x;}
                                                                 // until here takes about 6 ms !! very fast!!
  // populate shared memory: takes ~ 270 ms
  vals[threadIdx.x] = my_val;  // put the computed max value for each thread into the shared memory. -> this is the bottleneck!!
  idxs[threadIdx.x] = my_idx;  // do this for index as well -> this is also slow!!
  __syncthreads();
  // sweep in shared memory
  for (int i = (nTPB>>1); i > 0; i>>=1){
    if (threadIdx.x < i)    // the first half threads of the block
      if (vals[threadIdx.x] < vals[threadIdx.x + i]) {vals[threadIdx.x] = vals[threadIdx.x+i]; idxs[threadIdx.x] = idxs[threadIdx.x+i]; }
                            // the above is comparing shared memory of threadIdx.x with shared memory of threadIdx.x + i.
                            // then puts the larger value into shared memory of threadIdx.x
    __syncthreads();}       // so now in each block, shared memory's first element (index 0) is the max value and max value index

  // perform block-level reduction
  if (!threadIdx.x){    // at the shared memory, only the first element (index 0) (actually 2 elements in the first index. max value, and max value index) is what we need.
      blk_vals[blockIdx.y][blockIdx.x] = vals[0]; // For each window (single x row), the first elements of the blocks are stored into the blk_vals[windowNumber][:]
                                                // remember, this is a global variable.
      blk_idxs[blockIdx.y][blockIdx.x] = idxs[0]; // and the max value index
  }
}
  // originally the following kernel was in the 1st kernel, performed by the last block. So just use one block for this.
__global__ void max_idx_kernel_final(int *result_maxInd, float *result_maxVal){
  __shared__ volatile float vals[nTPB]; //  Total amount of shared memory per block: 49152 bytes (48 KB). 1024 gives ~ 4KB for single.
  __shared__ volatile int idxs[nTPB]; // ~ 4 KB for single, when nTPB is 1024. each block will have these variables!! (vals and idxs)
  int idx = threadIdx.x;
  int idy = blockIdx.y;
  float my_val = FLOAT_MIN;
  int my_idx = -1;  // remember, these are local variables, so each thread has this variable. This local variable is independent from other thread's local variable
  while (idx < MAX_BLOCKS_X ){                                                          // ?? confused whether it should be gridDim.x (actual # of blocks launched) or MAX_BLOCKS_X (# of elements in x dir of the global array blk_vals)
    float temp = blk_vals[idy][idx];
    if (temp > my_val)
        {my_val = temp; my_idx = blk_idxs[idy][idx]; }
    idx += blockDim.x;} // all threads in this single block (single in the x dir) are working, so you should loop over blockDim.x.
                      // Imagine where gridDim.x (# of blocks) is huge so that you need to loop over to get the max value and index
                      // After this, each thread in the block has a local variable (max value and max value index).
                      // So far it was sort of a reduction, but instead of pairing values we just looped over the blk_vals and blk_idxs
  // populate shared memory
  idx = threadIdx.x;
  vals[idx] = my_val;   // This is now shared memory. This is because reduction requires comparison between different elements
  idxs[idx] = my_idx;   // my_idx value is 0 based. This is done for all blocks (in the y direction)
  __syncthreads();
  // Now the final task is to do reduction for all threads in our single block (single block in the x dir, NROWS blocks in the y dir)!
// sweep in shared memory
  for (int i = (nTPB>>1); i > 0; i>>=1) {
    if (idx < i) // the first half threads of the block
      if (vals[idx] < vals[idx + i]) {vals[idx] = vals[idx+i]; idxs[idx] = idxs[idx+i]; }
    __syncthreads();} // now all the results are in threadIdx.x == 0 for each block (there are NROWS blocks in the y dir)
  // 0th thread. the results are in shared memory, not the local memory, so any thread could do the following. We just selected the 0th thread for no reason. If several threads try to do this, that would be a problem, since we'll have to wait for them
  if(!threadIdx.x){
        result_maxInd[idy] = idxs[0]; // the final result for each row goes into the corresponding position (blockIdx.y)
        result_maxVal[idy] = vals[0];
      }
}

int main(){
    dim3 grids(MAX_BLOCKS_X, NROWS);
    dim3 threads(nTPB,1);
    dim3 grids2(1,NROWS);
    dim3 threads2(nTPB);
    float *d_vector, *h_vector;
    h_vector = (float*)malloc(NROWS * NCOLS * sizeof(float));
    memset(h_vector, 0, NROWS*NCOLS*sizeof(float));
    for (int i =  0; i < NROWS; i++)
      h_vector[i*NCOLS + i] = 10.0f;  // create definite max element per row
    cudaMalloc(&d_vector, NROWS * NCOLS * sizeof(float));
    cudaMemcpy(d_vector, h_vector, NROWS * NCOLS * sizeof(float), cudaMemcpyHostToDevice);
    //d_vector is a pointer on the device pointing to the beginning of the vector, containing nrElements floats.
    int *max_index;
    float *max_val;
    int *d_max_index;
    float *d_max_val;
    max_index = (int*)malloc(NROWS * sizeof(int));
    max_val = (float*)malloc(NROWS * sizeof(float));
    cudaMalloc((void**)&d_max_index, NROWS * sizeof(int));
    cudaMalloc((void**)&d_max_val, NROWS * sizeof(float));
    cudaEvent_t start, stop;
    cudaEventCreate(&start); cudaEventCreate(&stop);
    cudaEventRecord(start);
    max_idx_kernel_reduction_within_block<<<grids, threads>>>(d_vector, NCOLS, NROWS);
    max_idx_kernel_final<<<grids2,threads2>>>(d_max_index, d_max_val);
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);
    float et;
    cudaEventElapsedTime(&et, start, stop);
    printf("elapsed time: %fmsn", et);
    cudaMemcpy(max_index, d_max_index, NROWS * sizeof(int), cudaMemcpyDeviceToHost);
    cudaMemcpy(max_val, d_max_val, NROWS * sizeof(float), cudaMemcpyDeviceToHost);
    for(int z=0;z<NROWS;z++)
      printf("%d  ",max_index[z]);
    printf("nnn");
    for(int z=0;z<NROWS;z++)
      printf("%f  ",max_val[z]);
    printf("n");
    return 0;
}
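
As item 1 suggests, it is worth validating the device results against a straightforward host computation. A hypothetical helper (not part of the answer above) that assumes the row-major layout used in the modified code:

#include <stdio.h>

// Recompute the per-row argmax on the CPU and compare with the kernel output.
// Note: if a row contains duplicate maxima, the GPU may legitimately return a
// different (equally valid) index than the CPU loop below.
bool verify_row_argmax(const float *h_data, const int *gpu_idx, const float *gpu_val,
                       int nrows, int ncols){
  for (int r = 0; r < nrows; r++){
    int best_i = 0;
    float best_v = h_data[r * ncols];
    for (int c = 1; c < ncols; c++){
      float v = h_data[r * ncols + c];
      if (v > best_v) { best_v = v; best_i = c; }
    }
    if (best_i != gpu_idx[r] || best_v != gpu_val[r]){
      printf("row %d mismatch: CPU (%d, %f) vs GPU (%d, %f)\n",
             r, best_i, best_v, gpu_idx[r], gpu_val[r]);
      return false;
    }
  }
  return true;
}

It could be called as, for example, verify_row_argmax(h_vector, max_index, max_val, NROWS, NCOLS) right after the two cudaMemcpy calls back to the host.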
