CUDA:使用两倍的块(平铺?)进行编程



我的3D拉普拉斯解算器工作。我获得了350 Gflop/s的功率,我试图升级它,使其具有两倍的块的更好性能。然而,性能仍然是350 Gflop/s:

 #include <iostream>
 #include <sys/time.h>
 #include <cuda.h>
 #include <ctime>
 #include"res3dcb.cuh"
 #include <math.h>
 using namespace std;
 // Constant statement.
 const int blocksize=32;
 const int N=128;
 const int size=(N+2)*(N+2)*(N+2)*sizeof(float);
 // Let's start the main program.
 int main(void) {
 // Variable statement.
 float time1,time2,time3;
 float *x_d, *y_d; 
 float *x,*y; 
 float gflops;
 float NumOps;
 int power=4; // You can change power as you prefer (but keep 2^x)
 // Init x and y. 
 x = new float[size];
 y = new float[size];
 for (int k=1;k<N+1;k++)
    for (int i=1;i<N+1;i++) 
        for (int j=1;j<N+1;j++) { 
            x[k*(N+2)*(N+2)+i*(N+2)+j]=cos(i+j+k);
        }
 // Shadow cases.
 for (int k=1;k<N+1;k++) {
    for (int i=1;i<N+1;i++) { 
      x[k*(N+2)*(N+2)+i*(N+2)]=x[k*(N+2)*(N+2)+i*(N+2)+1]; 
      x[k*(N+2)*(N+2)+i*(N+2)+N+1]=x[k*(N+2)*(N+2)+i*(N+2)+N];}
    for (int j=0;j<N+2;j++) { 
      x[k*(N+2)*(N+2)+j]=x[k*(N+2)*(N+2)+(N+2)+j]; 
      x[k*(N+2)*(N+2)+(N+1)*(N+2)+j]=x[k*(N+2)*(N+2)+N*(N+2)+j];}
 for (int i=0;i<N+2;i++) 
    for (int j=0;j<N+2;j++) {
        x[(N+2)*i+j]=x[(N+2)*(N+2)+(N+2)*i+j];
        x[(N+1)*(N+2)*(N+2)+(N+2)*i+j]=x[(N+2)*(N+2)*N+(N+2)*i+j];
    }
 // Display of initial matrix.
 int id_stage=-2;
 while (id_stage!=-1) {
     cout<<"Which stage do you want to display? (-1 if you don't want to diplay another one)"<<endl;
     cin>>id_stage;
     cout<<endl;
     if (id_stage != -1) {
    cout<<"Etage "<<id_stage<<" du cube :"<<endl;
    for (int i=0;i<N+2;i++) {
        cout<<"| ";
        for (int j=0;j<N+2;j++) {cout<<x[id_stage*(N+2)*(N+2)+i*(N+2)+j]<<" ";}
        cout<<"|"<<endl;
        }
         cout<<endl;
     }
 }
 // CPU to GPU.
 cudaMalloc( (void**) & x_d, size);
 cudaMalloc( (void**) & y_d, size);
 cudaMemcpy(x_d, x, size, cudaMemcpyHostToDevice) ;
 cudaMemcpy(y_d, y, size, cudaMemcpyHostToDevice) ;
 // Solver parameters.
 dim3 dimGrid(power*N/blocksize, power*N/blocksize);
 dim3 dimBlock(blocksize, blocksize);
 // Solver loop.
 time1=clock();
 res2d<<<dimGrid, dimBlock>>>(x_d, y_d, N, power); 
 time2=clock();
 time3=(time2-time1)/CLOCKS_PER_SEC;
 // Power calculation.
 NumOps=(1.0e-9)*N*N*N*7;
 gflops = ( NumOps / (time3));
 // GPU to CPU.
 cudaMemcpy(y, y_d, size, cudaMemcpyDeviceToHost);
 cudaFree(x_d);
 cudaFree(y_d);
 // Display of final matrix.
 id_stage=-2;
 while (id_stage!=-1) {
    cout<<"Which stage do you want to display? (-1 if you don't want to diplay another one)"<<endl;
    cin>>id_stage;
    cout<<endl;
     if (id_stage != -1) {
        cout<<"Etage "<<id_stage<<" du cube :"<<endl;
        for (int i=0;i<N+2;i++) {
            cout<<"| ";
            for (int j=0;j<N+2;j++) {cout<<y[id_stage*(N+2)*(N+2)+i*(N+2)+j]<<" ";}
            cout<<"|"<<endl;
         }
        cout<<endl;
     }
 }
 cout<<"Time : "<<time3<<endl;
 cout<<"Gflops/s : "<<gflops<<endl;
 }

地点:

__ global__ void res2d(volatile float* x, float* y, int N, int power) 
{
    int i = threadIdx.x + blockIdx.x*(blockDim.x);
    int j = threadIdx.y + blockIdx.y*(blockDim.y);
    int id,jd;
    #pragma unroll //Now let's recude the number of operation per block
    for (int incr=1; incr<power+1; incr++) {
        if (i>(incr-1)*N && i<incr*N && j>(incr-1)*N && j<incr*N) {
            #pragma unroll
            for (int k=(incr-1)*(N/power) ; k<incr*N/power ; k++) {
                id=i-(incr-1)*N;
                jd=j-(incr-1)*N;
                y[(N+2)*(N+2)*(k+1)+(N+2)*(id+1)+jd+1] = x[(N+2)*(N+2)*(k+1)+(N+2)*(id+2)+jd+1] 
                                                       + x[(N+2)*(N+2)*(k+1)+(N+2)*id+jd+1] 
                                                       + x[(N+2)*(N+2)*(k+1)+(N+2)*(id+1)+jd+2] 
                                                       + x[(N+2)*(N+2)*(k+1)+(N+2)*(id+1)+jd] 
                                                       + x[(N+2)*(N+2)*(k+2)+(N+2)*(id+1)+jd+1] 
                                                       + x[(N+2)*(N+2)*k+(N+2)*(id+1)+jd+1] 
                                                       - 6*x[(N+2)*(N+2)*(k+1)+(N+2)*(id+1)+jd+1];
            }   
        }
    }
}
与参数:

dimGrid(power * N/blocksize, power * N/blocksize) & dimBlock(blocksize, blocksize)

问题:

  1. power = 248时,每个块的操作数除以248。但它并没有更快。为什么?

  2. 减少每个区块的操作次数是没有用的吗?

提前感谢您的帮助。

CUDA内核启动是异步的。当你这样做时:

 // Solver loop.
 time1=clock();
 res2d<<<dimGrid, dimBlock>>>(x_d, y_d, N, power); 
 time2=clock();
 time3=(time2-time1)/CLOCKS_PER_SEC;

计时器只捕获API启动延迟,而不是代码的实际执行时间。这就是为什么改变内核中完成的工作量显然对性能没有影响的原因——您的计时方法不正确。

不如这样做:

 // Solver loop.
 time1=clock();
 res2d<<<dimGrid, dimBlock>>>(x_d, y_d, N, power); 
 cudaDeviceSynchronize();
 time2=clock();
 time3=(time2-time1)/CLOCKS_PER_SEC;

插入一个阻塞调用,以确保内核在测量时间之前完成执行。

[此答案作为社区wiki条目添加,以将问题从未回答队列中取出]。

最新更新