CUDA加法与移位指令性能



我正在努力了解我正在处理的一个大型CUDA内核的指令吞吐量。我写了两个小程序来比较加法和移位指令的吞吐量。根据CUDA C编程指南,移位指令的吞吐量是加法指令的一半。然而,当我在特斯拉M2070上测量以下两个程序的时间时,时间完全相同。有人能解释一下为什么会这样吗?

附加程序:

#include <limits.h>
#include <stdio.h>
#include <fstream>
#include <iostream>
#include <cstdlib>
#include <stdint.h>
using namespace std;
__global__ void testAdd(int numIterations, uint1* result){
  int total = 1;
  for(int i=0; i< numIterations;i ++){
    total = total+i;
  }
  result[0] = make_uint1(total);
}
int main(){
  uint1* result;
  cudaMalloc((void**)(&(result)), sizeof(uint1));
  float totalElapsedTime = 0;
  int i;
  for(i = 0; i < 10; i++){
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start, 0);
    testAdd<<<1,1>>>(100000, result);
    cudaError_t e50 = cudaGetLastError();
    if(e50 == cudaSuccess){
      cudaEventRecord(stop, 0);
      cudaEventSynchronize(stop);
      float elapsedTime;
      cudaEventElapsedTime(&elapsedTime, start, stop);
      totalElapsedTime += elapsedTime;;
      //cout << "Elapsed Time:" << elapsedTime << endl;                                                                                                                              
    }else{
      cout << "Error launching kernel: " << e50 << endl;
    }
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
  }
  cout << "Elapsed Time: " << totalElapsedTime/i << endl;
  cudaFree(result);
}

轮班程序:

#include <limits.h>
#include <stdio.h>
#include <fstream>
#include <iostream>
#include <cstdlib>
#include <stdint.h>
using namespace std;
__global__ void testShift(int numIterations, uint1* result){
  int total = 1;
  for(int i=0; i< numIterations;i ++){
    total = total<<i;
  }
  result[0] = make_uint1(total);
}
int main(){
  uint1* result;
  cudaMalloc((void**)(&(result)), sizeof(uint1));
  float totalElapsedTime = 0;
  int i;
  for(i = 0; i < 10; i++){
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start, 0);
    testShift<<<1,1>>>(100000, result);
    cudaError_t e50 = cudaGetLastError();
    if(e50 == cudaSuccess){
      cudaEventRecord(stop, 0);
      cudaEventSynchronize(stop);
      float elapsedTime;
      cudaEventElapsedTime(&elapsedTime, start, stop);
      totalElapsedTime += elapsedTime;;
      //cout << "Elapsed Time:" << elapsedTime << endl;                                                                                                                              
    }else{
      cout << "Error launching kernel: " << e50 << endl;
    }
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
  }
  cout << "Elapsed Time: " << totalElapsedTime/i << endl;
  cudaFree(result);
}

编辑:添加ptx代码添加和移位程序。正如您所看到的,唯一的区别是在第78行,即add指令与shl指令。

添加PTX代码:

        .entry _Z7testAddiP5uint1 (
                .param .s32 __cudaparm__Z7testAddiP5uint1_numIterations,
                .param .u64 __cudaparm__Z7testAddiP5uint1_result)
        {
        .reg .u32 %r<8>;
        .reg .u64 %rd<3>;
        .reg .pred %p<4>;
        .loc    16      10      0
 //   6  #include <stdint.h>
 //   7
 //   8  using namespace std;
 //   9
 //  10  __global__ void testAdd(int numIterations, uint1* result){
$LDWbegin__Z7testAddiP5uint1:
        ld.param.s32    %r1, [__cudaparm__Z7testAddiP5uint1_numIterations];
        mov.u32         %r2, 0;
        setp.le.s32     %p1, %r1, %r2;
        @%p1 bra        $Lt_0_2306;
        ld.param.s32    %r1, [__cudaparm__Z7testAddiP5uint1_numIterations];
        mov.s32         %r3, %r1;
        mov.s32         %r4, 0;
        mov.s32         %r5, 1;
        mov.s32         %r6, %r3;
$Lt_0_1794:
 //<loop> Loop body line 10, nesting depth: 1, estimated iterations: unknown
        .loc    16      13      0
 //  11    int total = 1;
 //  12    for(int i=0; i< numIterations;i ++){
 //  13      total = total+i;
        add.s32         %r5, %r4, %r5;
        add.s32         %r4, %r4, 1;
        .loc    16      10      0
        ld.param.s32    %r1, [__cudaparm__Z7testAddiP5uint1_numIterations];
        .loc    16      13      0
        setp.ne.s32     %p2, %r1, %r4;
//   6  #include <stdint.h>
 //   7
 //   8  using namespace std;
 //   9
 //  10  __global__ void testAdd(int numIterations, uint1* result){
$LDWbegin__Z7testAddiP5uint1:
        ld.param.s32    %r1, [__cudaparm__Z7testAddiP5uint1_numIterations];
        mov.u32         %r2, 0;
        setp.le.s32     %p1, %r1, %r2;
        @%p1 bra        $Lt_0_2306;
        ld.param.s32    %r1, [__cudaparm__Z7testAddiP5uint1_numIterations];
        mov.s32         %r3, %r1;
        mov.s32         %r4, 0;
        mov.s32         %r5, 1;
        mov.s32         %r6, %r3;
$Lt_0_1794:
 //<loop> Loop body line 10, nesting depth: 1, estimated iterations: unknown
        .loc    16      13      0
 //  11    int total = 1;
 //  12    for(int i=0; i< numIterations;i ++){
 //  13      total = total+i;
        add.s32         %r5, %r4, %r5;
        add.s32         %r4, %r4, 1;
        .loc    16      10      0
        ld.param.s32    %r1, [__cudaparm__Z7testAddiP5uint1_numIterations];
        .loc    16      13      0
        setp.ne.s32     %p2, %r1, %r4;
        @%p2 bra        $Lt_0_1794;
        bra.uni         $Lt_0_1282;
$Lt_0_2306:
        mov.s32         %r5, 1;
$Lt_0_1282:
        .loc    16      15      0
 //  14    }
 //  15    result[0] = make_uint1(total);
        ld.param.u64    %rd1, [__cudaparm__Z7testAddiP5uint1_result];
        st.global.u32   [%rd1+0], %r5;
        .loc    16      16      0
 //  16  }
        exit;
$LDWend__Z7testAddiP5uint1:
        } // _Z7testAddiP5uint1

移位PTX代码:

        .entry _Z9testShiftiP5uint1 (
                .param .s32 __cudaparm__Z9testShiftiP5uint1_numIterations,
                .param .u64 __cudaparm__Z9testShiftiP5uint1_result)
        {
        .reg .u32 %r<8>;
        .reg .u64 %rd<3>;
        .reg .pred %p<4>;
        .loc    16      10      0
 //   6  #include <stdint.h>
 //   7
 //   8  using namespace std;
 //   9
 //  10  __global__ void testShift(int numIterations, uint1* result){
$LDWbegin__Z9testShiftiP5uint1:
        ld.param.s32    %r1, [__cudaparm__Z9testShiftiP5uint1_numIterations];
        mov.u32         %r2, 0;
        setp.le.s32     %p1, %r1, %r2;
        @%p1 bra        $Lt_0_2306;
        ld.param.s32    %r1, [__cudaparm__Z9testShiftiP5uint1_numIterations];
        mov.s32         %r3, %r1;
        mov.s32         %r4, 0;
        mov.s32         %r5, 1;
    mov.s32         %r6, %r3;
$Lt_0_1794:
 //<loop> Loop body line 10, nesting depth: 1, estimated iterations: unknown
        .loc    16      13      0
 //  11    int total = 1;
 //  12    for(int i=0; i< numIterations;i ++){
 //  13      total = total<<i;
        shl.b32         %r5, %r5, %r4;
        add.s32         %r4, %r4, 1;
        .loc    16      10      0
        .reg .u64 %rd<3>;
        .reg .pred %p<4>;
        .loc    16      10      0
 //   6  #include <stdint.h>
 //   7
 //   8  using namespace std;
 //   9
 //  10  __global__ void testShift(int numIterations, uint1* result){
$LDWbegin__Z9testShiftiP5uint1:
        ld.param.s32    %r1, [__cudaparm__Z9testShiftiP5uint1_numIterations];
        mov.u32         %r2, 0;
        setp.le.s32     %p1, %r1, %r2;
        @%p1 bra        $Lt_0_2306;
        ld.param.s32    %r1, [__cudaparm__Z9testShiftiP5uint1_numIterations];
        mov.s32         %r3, %r1;
        mov.s32         %r4, 0;
        mov.s32         %r5, 1;
    mov.s32         %r6, %r3;
$Lt_0_1794:
 //<loop> Loop body line 10, nesting depth: 1, estimated iterations: unknown
        .loc    16      13      0
 //  11    int total = 1;
 //  12    for(int i=0; i< numIterations;i ++){
 //  13      total = total<<i;
        shl.b32         %r5, %r5, %r4;
        add.s32         %r4, %r4, 1;
        .loc    16      10      0
        ld.param.s32    %r1, [__cudaparm__Z9testShiftiP5uint1_numIterations];
        .loc    16      13      0
        setp.ne.s32     %p2, %r1, %r4;
        @%p2 bra        $Lt_0_1794;
        bra.uni         $Lt_0_1282;
$Lt_0_2306:
        mov.s32         %r5, 1;
$Lt_0_1282:
        .loc    16      15      0
 //  14    }
 //  15    result[0] = make_uint1(total);
        ld.param.u64    %rd1, [__cudaparm__Z9testShiftiP5uint1_result];
        st.global.u32   [%rd1+0], %r5;
        .loc    16      16      0
 //  16  }
        exit;
$LDWend__Z9testShiftiP5uint1:
        } // _Z9testShiftiP5uint1

@gmemon:如果你想检查GPU汇编代码,PTX在这里没有多大用处,因为它是中间语言。

要获得实际的汇编代码,可以执行以下操作:

  1. 使用NVCC编译程序-保留选项
  2. 使用cuobjdump--在CUBIN文件上转储sass以获取反汇编

CUBIN文件通常称为foo.sm_20.CUBIN或foo.sm_30.CUBIN,具体取决于您的体系结构。

例如,kepler反汇编如下所示:

    /*7458*/     /*0x001b9e85c0000000*/     LDL.CS R46, [R1];
    /*7460*/     /*0x101ade85c0000000*/     LDL.CS R43, [R1+0x4];
    /*7468*/     /*0xf2655c85c8000063*/     STL [R38+0x18fc], R21;
    /*7470*/     /*0x3ee35c036800c000*/     LOP.AND R13, R46, 0xf;
    /*7478*/     /*0x400000076000000c*/     SSY 0x7790;
    /*7488*/     /*0xfcdfdd0348010000*/     IADD RZ.CC, R13, -RZ;
    /*7490*/     /*0xfff1dc63190e0000*/     ISETP.EQ.X.AND P0, pt, RZ, RZ, pt;
    /*7498*/     /*0x800001e74000000b*/     @P0 BRA 0x7780;
    /*74a0*/     /*0xfc001de428000000*/     MOV R0, RZ;
    /*74a8*/     /*0x04039de218000000*/     MOV32I R14, 0x1;
    /*74b0*/     /*0x0403dde218000000*/     MOV32I R15, 0x1;
    /*74b8*/     /*0x626fdca5c8000064*/     STL.64 [R38+0x1918], RZ;

指令语义可以在cuobjdump工具的手册中找到

我建议查看PTX代码中的指令数量-你能为你的两个例子发布PTX代码吗?这应该能为表演提供线索。

顺便说一句,我不确定你是否可以只使用一个线程来可靠地测试性能。

最新更新