我正在努力了解我正在处理的一个大型CUDA内核的指令吞吐量。我写了两个小程序来比较加法和移位指令的吞吐量。根据CUDA C编程指南,移位指令的吞吐量是加法指令的一半。然而,当我在特斯拉M2070上测量以下两个程序的时间时,时间完全相同。有人能解释一下为什么会这样吗?
附加程序:
#include <limits.h>
#include <stdio.h>
#include <fstream>
#include <iostream>
#include <cstdlib>
#include <stdint.h>
using namespace std;
__global__ void testAdd(int numIterations, uint1* result){
int total = 1;
for(int i=0; i< numIterations;i ++){
total = total+i;
}
result[0] = make_uint1(total);
}
int main(){
uint1* result;
cudaMalloc((void**)(&(result)), sizeof(uint1));
float totalElapsedTime = 0;
int i;
for(i = 0; i < 10; i++){
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, 0);
testAdd<<<1,1>>>(100000, result);
cudaError_t e50 = cudaGetLastError();
if(e50 == cudaSuccess){
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
float elapsedTime;
cudaEventElapsedTime(&elapsedTime, start, stop);
totalElapsedTime += elapsedTime;;
//cout << "Elapsed Time:" << elapsedTime << endl;
}else{
cout << "Error launching kernel: " << e50 << endl;
}
cudaEventDestroy(start);
cudaEventDestroy(stop);
}
cout << "Elapsed Time: " << totalElapsedTime/i << endl;
cudaFree(result);
}
轮班程序:
#include <limits.h>
#include <stdio.h>
#include <fstream>
#include <iostream>
#include <cstdlib>
#include <stdint.h>
using namespace std;
__global__ void testShift(int numIterations, uint1* result){
int total = 1;
for(int i=0; i< numIterations;i ++){
total = total<<i;
}
result[0] = make_uint1(total);
}
int main(){
uint1* result;
cudaMalloc((void**)(&(result)), sizeof(uint1));
float totalElapsedTime = 0;
int i;
for(i = 0; i < 10; i++){
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, 0);
testShift<<<1,1>>>(100000, result);
cudaError_t e50 = cudaGetLastError();
if(e50 == cudaSuccess){
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
float elapsedTime;
cudaEventElapsedTime(&elapsedTime, start, stop);
totalElapsedTime += elapsedTime;;
//cout << "Elapsed Time:" << elapsedTime << endl;
}else{
cout << "Error launching kernel: " << e50 << endl;
}
cudaEventDestroy(start);
cudaEventDestroy(stop);
}
cout << "Elapsed Time: " << totalElapsedTime/i << endl;
cudaFree(result);
}
编辑:添加ptx代码添加和移位程序。正如您所看到的,唯一的区别是在第78行,即add指令与shl指令。
添加PTX代码:
.entry _Z7testAddiP5uint1 (
.param .s32 __cudaparm__Z7testAddiP5uint1_numIterations,
.param .u64 __cudaparm__Z7testAddiP5uint1_result)
{
.reg .u32 %r<8>;
.reg .u64 %rd<3>;
.reg .pred %p<4>;
.loc 16 10 0
// 6 #include <stdint.h>
// 7
// 8 using namespace std;
// 9
// 10 __global__ void testAdd(int numIterations, uint1* result){
$LDWbegin__Z7testAddiP5uint1:
ld.param.s32 %r1, [__cudaparm__Z7testAddiP5uint1_numIterations];
mov.u32 %r2, 0;
setp.le.s32 %p1, %r1, %r2;
@%p1 bra $Lt_0_2306;
ld.param.s32 %r1, [__cudaparm__Z7testAddiP5uint1_numIterations];
mov.s32 %r3, %r1;
mov.s32 %r4, 0;
mov.s32 %r5, 1;
mov.s32 %r6, %r3;
$Lt_0_1794:
//<loop> Loop body line 10, nesting depth: 1, estimated iterations: unknown
.loc 16 13 0
// 11 int total = 1;
// 12 for(int i=0; i< numIterations;i ++){
// 13 total = total+i;
add.s32 %r5, %r4, %r5;
add.s32 %r4, %r4, 1;
.loc 16 10 0
ld.param.s32 %r1, [__cudaparm__Z7testAddiP5uint1_numIterations];
.loc 16 13 0
setp.ne.s32 %p2, %r1, %r4;
// 6 #include <stdint.h>
// 7
// 8 using namespace std;
// 9
// 10 __global__ void testAdd(int numIterations, uint1* result){
$LDWbegin__Z7testAddiP5uint1:
ld.param.s32 %r1, [__cudaparm__Z7testAddiP5uint1_numIterations];
mov.u32 %r2, 0;
setp.le.s32 %p1, %r1, %r2;
@%p1 bra $Lt_0_2306;
ld.param.s32 %r1, [__cudaparm__Z7testAddiP5uint1_numIterations];
mov.s32 %r3, %r1;
mov.s32 %r4, 0;
mov.s32 %r5, 1;
mov.s32 %r6, %r3;
$Lt_0_1794:
//<loop> Loop body line 10, nesting depth: 1, estimated iterations: unknown
.loc 16 13 0
// 11 int total = 1;
// 12 for(int i=0; i< numIterations;i ++){
// 13 total = total+i;
add.s32 %r5, %r4, %r5;
add.s32 %r4, %r4, 1;
.loc 16 10 0
ld.param.s32 %r1, [__cudaparm__Z7testAddiP5uint1_numIterations];
.loc 16 13 0
setp.ne.s32 %p2, %r1, %r4;
@%p2 bra $Lt_0_1794;
bra.uni $Lt_0_1282;
$Lt_0_2306:
mov.s32 %r5, 1;
$Lt_0_1282:
.loc 16 15 0
// 14 }
// 15 result[0] = make_uint1(total);
ld.param.u64 %rd1, [__cudaparm__Z7testAddiP5uint1_result];
st.global.u32 [%rd1+0], %r5;
.loc 16 16 0
// 16 }
exit;
$LDWend__Z7testAddiP5uint1:
} // _Z7testAddiP5uint1
移位PTX代码:
.entry _Z9testShiftiP5uint1 (
.param .s32 __cudaparm__Z9testShiftiP5uint1_numIterations,
.param .u64 __cudaparm__Z9testShiftiP5uint1_result)
{
.reg .u32 %r<8>;
.reg .u64 %rd<3>;
.reg .pred %p<4>;
.loc 16 10 0
// 6 #include <stdint.h>
// 7
// 8 using namespace std;
// 9
// 10 __global__ void testShift(int numIterations, uint1* result){
$LDWbegin__Z9testShiftiP5uint1:
ld.param.s32 %r1, [__cudaparm__Z9testShiftiP5uint1_numIterations];
mov.u32 %r2, 0;
setp.le.s32 %p1, %r1, %r2;
@%p1 bra $Lt_0_2306;
ld.param.s32 %r1, [__cudaparm__Z9testShiftiP5uint1_numIterations];
mov.s32 %r3, %r1;
mov.s32 %r4, 0;
mov.s32 %r5, 1;
mov.s32 %r6, %r3;
$Lt_0_1794:
//<loop> Loop body line 10, nesting depth: 1, estimated iterations: unknown
.loc 16 13 0
// 11 int total = 1;
// 12 for(int i=0; i< numIterations;i ++){
// 13 total = total<<i;
shl.b32 %r5, %r5, %r4;
add.s32 %r4, %r4, 1;
.loc 16 10 0
.reg .u64 %rd<3>;
.reg .pred %p<4>;
.loc 16 10 0
// 6 #include <stdint.h>
// 7
// 8 using namespace std;
// 9
// 10 __global__ void testShift(int numIterations, uint1* result){
$LDWbegin__Z9testShiftiP5uint1:
ld.param.s32 %r1, [__cudaparm__Z9testShiftiP5uint1_numIterations];
mov.u32 %r2, 0;
setp.le.s32 %p1, %r1, %r2;
@%p1 bra $Lt_0_2306;
ld.param.s32 %r1, [__cudaparm__Z9testShiftiP5uint1_numIterations];
mov.s32 %r3, %r1;
mov.s32 %r4, 0;
mov.s32 %r5, 1;
mov.s32 %r6, %r3;
$Lt_0_1794:
//<loop> Loop body line 10, nesting depth: 1, estimated iterations: unknown
.loc 16 13 0
// 11 int total = 1;
// 12 for(int i=0; i< numIterations;i ++){
// 13 total = total<<i;
shl.b32 %r5, %r5, %r4;
add.s32 %r4, %r4, 1;
.loc 16 10 0
ld.param.s32 %r1, [__cudaparm__Z9testShiftiP5uint1_numIterations];
.loc 16 13 0
setp.ne.s32 %p2, %r1, %r4;
@%p2 bra $Lt_0_1794;
bra.uni $Lt_0_1282;
$Lt_0_2306:
mov.s32 %r5, 1;
$Lt_0_1282:
.loc 16 15 0
// 14 }
// 15 result[0] = make_uint1(total);
ld.param.u64 %rd1, [__cudaparm__Z9testShiftiP5uint1_result];
st.global.u32 [%rd1+0], %r5;
.loc 16 16 0
// 16 }
exit;
$LDWend__Z9testShiftiP5uint1:
} // _Z9testShiftiP5uint1
@gmemon:如果你想检查GPU汇编代码,PTX在这里没有多大用处,因为它是中间语言。
要获得实际的汇编代码,可以执行以下操作:
- 使用NVCC编译程序-保留选项
- 使用cuobjdump--在CUBIN文件上转储sass以获取反汇编
CUBIN文件通常称为foo.sm_20.CUBIN或foo.sm_30.CUBIN,具体取决于您的体系结构。
例如,kepler反汇编如下所示:
/*7458*/ /*0x001b9e85c0000000*/ LDL.CS R46, [R1];
/*7460*/ /*0x101ade85c0000000*/ LDL.CS R43, [R1+0x4];
/*7468*/ /*0xf2655c85c8000063*/ STL [R38+0x18fc], R21;
/*7470*/ /*0x3ee35c036800c000*/ LOP.AND R13, R46, 0xf;
/*7478*/ /*0x400000076000000c*/ SSY 0x7790;
/*7488*/ /*0xfcdfdd0348010000*/ IADD RZ.CC, R13, -RZ;
/*7490*/ /*0xfff1dc63190e0000*/ ISETP.EQ.X.AND P0, pt, RZ, RZ, pt;
/*7498*/ /*0x800001e74000000b*/ @P0 BRA 0x7780;
/*74a0*/ /*0xfc001de428000000*/ MOV R0, RZ;
/*74a8*/ /*0x04039de218000000*/ MOV32I R14, 0x1;
/*74b0*/ /*0x0403dde218000000*/ MOV32I R15, 0x1;
/*74b8*/ /*0x626fdca5c8000064*/ STL.64 [R38+0x1918], RZ;
指令语义可以在cuobjdump工具的手册中找到
我建议查看PTX代码中的指令数量-你能为你的两个例子发布PTX代码吗?这应该能为表演提供线索。
顺便说一句,我不确定你是否可以只使用一个线程来可靠地测试性能。