Why do cudaMemcpyAsync and kernel launches block, even with an asynchronous stream?



Consider the following program, which enqueues some work onto a non-blocking GPU stream:

#include <iostream>
#include <memory>

using clock_value_t = long long;

// Busy-wait on the device for roughly sleep_cycles clock cycles.
__device__ void gpu_sleep(clock_value_t sleep_cycles) {
    clock_value_t start = clock64();
    clock_value_t cycles_elapsed;
    do { cycles_elapsed = clock64() - start; }
    while (cycles_elapsed < sleep_cycles);
}

// Stream callback: flags that everything enqueued before it has finished.
void callback(cudaStream_t, cudaError_t, void *ptr) {
    *(reinterpret_cast<bool *>(ptr)) = true;
}

__global__ void dummy(clock_value_t sleep_cycles) { gpu_sleep(sleep_cycles); }

int main() {
    const clock_value_t duration_in_clocks = 1e6;
    const size_t buffer_size = 1e7;
    bool callback_executed = false;
    cudaStream_t stream;
    auto host_ptr = std::unique_ptr<char[]>(new char[buffer_size]);
    char* device_ptr;
    cudaMalloc(&device_ptr, buffer_size);
    cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking);
    cudaMemcpyAsync(device_ptr, host_ptr.get(), buffer_size, cudaMemcpyDefault, stream);
    dummy<<<128, 128, 0, stream>>>(duration_in_clocks);
    cudaMemcpyAsync(host_ptr.get(), device_ptr, buffer_size, cudaMemcpyDefault, stream);
    cudaStreamAddCallback(
        stream, callback, &callback_executed, 0 /* fixed and meaningless */);
    auto snapshot = callback_executed;
    std::cout << "Right after we finished enqueuing work, the stream has "
              << (snapshot ? "" : "not ") << "concluded execution." << std::endl;
    cudaStreamSynchronize(stream);
    snapshot = callback_executed;
    std::cout << "After cudaStreamSynchronize, the stream has "
              << (snapshot ? "" : "not ") << "concluded execution." << std::endl;
}

The buffer size and the length of the kernel's sleep are high enough that, since they execute in parallel with the CPU thread, enqueuing should finish well before the GPU work does (8 ms + 8 ms for the copies and 20 ms for the kernel).

However, judging from the trace below, the two cudaMemcpyAsync() calls are effectively synchronous, i.e. they block until the (non-blocking) stream has actually finished copying. Is this intentional behavior? It seems to contradict the relevant section of the CUDA Runtime API documentation. How does it make sense?


Trace (line numbers on the left, times in microseconds):

1 "Start"        "Duration"    "Grid X"                             "Grid Y"  "Grid Z"    "Block X"   "Block Y"                       "Block Z"  
104 14102.830000   59264.347000  "cudaMalloc"
105 73368.351000   19.886000     "cudaStreamCreateWithFlags"
106 73388.850000   8330.257000   "cudaMemcpyAsync"
107 73565.702000   8334.265000   47.683716                            5.587311  "Pageable"  "Device"    "GeForce GTX 650 Ti BOOST (0)"  "1"        
108 81721.124000   2.394000      "cudaConfigureCall"
109 81723.865000   3.585000      "cudaSetupArgument"
110 81729.332000   30.742000     "cudaLaunch (dummy(__int64) [107])"
111 81760.604000   39589.422000  "cudaMemcpyAsync"
112 81906.303000   20157.648000  128                                  1         1           128         1                               1          
113 102073.103000  18736.208000  47.683716                            2.485355  "Device"    "Pageable"  "GeForce GTX 650 Ti BOOST (0)"  "1"        
114 121351.936000  5.560000      "cudaStreamSynchronize"

This seemed weird, so I got in touch with someone from the CUDA driver team, who confirmed the documentation is correct. I was also able to corroborate it:

#include <iostream>
#include <memory>
#include <cstdlib>
#include <ctime>

using clock_value_t = long long;

__device__ void gpu_sleep(clock_value_t sleep_cycles) {
    clock_value_t start = clock64();
    clock_value_t cycles_elapsed;
    do { cycles_elapsed = clock64() - start; }
    while (cycles_elapsed < sleep_cycles);
}

void callback(cudaStream_t, cudaError_t, void *ptr) {
    *(reinterpret_cast<bool *>(ptr)) = true;
}

__global__ void dummy(clock_value_t sleep_cycles) { gpu_sleep(sleep_cycles); }

int main(int argc, char* argv[]) {
    cudaFree(0);  // force CUDA's lazy context initialization up front
    struct timespec start, stop;
    const clock_value_t duration_in_clocks = 1e6;
    const size_t buffer_size = 2 * 1024 * 1024 * (size_t)1024;  // 2 GiB
    bool callback_executed = false;
    cudaStream_t stream;
    void* host_ptr;
    if (argc == 1) {
        host_ptr = malloc(buffer_size);              // pageable host memory
    }
    else {
        cudaMallocHost(&host_ptr, buffer_size, 0);   // pinned host memory
    }
    char* device_ptr;
    cudaMalloc(&device_ptr, buffer_size);
    cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking);

    // Measure how long the calling thread is blocked inside each cudaMemcpyAsync().
    clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &start);
    cudaMemcpyAsync(device_ptr, host_ptr, buffer_size, cudaMemcpyDefault, stream);
    clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &stop);
    double result = (stop.tv_sec - start.tv_sec) * 1e6 + (stop.tv_nsec - start.tv_nsec) / 1e3;
    std::cout << "Elapsed: " << result / 1000 / 1000 << std::endl;

    dummy<<<128, 128, 0, stream>>>(duration_in_clocks);

    clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &start);
    cudaMemcpyAsync(host_ptr, device_ptr, buffer_size, cudaMemcpyDefault, stream);
    clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &stop);
    result = (stop.tv_sec - start.tv_sec) * 1e6 + (stop.tv_nsec - start.tv_nsec) / 1e3;
    std::cout << "Elapsed: " << result / 1000 / 1000 << std::endl;

    cudaStreamAddCallback(
        stream, callback, &callback_executed, 0 /* fixed and meaningless */);
    auto snapshot = callback_executed;
    std::cout << "Right after we finished enqueuing work, the stream has "
              << (snapshot ? "" : "not ") << "concluded execution." << std::endl;
    cudaStreamSynchronize(stream);
    snapshot = callback_executed;
    std::cout << "After cudaStreamSynchronize, the stream has "
              << (snapshot ? "" : "not ") << "concluded execution." << std::endl;
}

This is essentially your code, with a few modifications:

  • Time measurement
  • A switch between allocating the host buffer from pageable or pinned memory
  • A 2 GiB buffer size, so that the copy times are clearly measurable
  • cudaFree(0) to force CUDA's lazy initialization

Here are the results:

$ nvcc -std=c++11 main.cu -lrt
$ ./a.out # using pageable memory
Elapsed: 0.360828 # (memcpyHtoD pageable -> device, fully async)
Elapsed: 5.20288 # (memcpyDtoH device -> pageable, sync)
$ ./a.out 1 # using pinned memory
Elapsed: 4.412e-06 # (memcpyHtoD pinned -> device, fully async)
Elapsed: 7.127e-06 # (memcpyDtoH device -> pinned, fully async)

Copying from pageable memory to the device is slower, but it really is asynchronous.
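One way to check this without relying on wall-clock timing is to query the stream immediately after the call returns. The following is a minimal sketch of my own (buffer size and names are arbitrary, not taken from the programs above); with a large enough pinned buffer, a truly asynchronous copy is normally still in flight when cudaStreamQuery() runs:

#include <cstdio>

int main() {
    const size_t buffer_size = 256 * 1024 * 1024;  // large enough that the copy outlives the call
    void* host_ptr;
    void* device_ptr;
    cudaMallocHost(&host_ptr, buffer_size);        // pinned host memory, so the copy can run fully async
    cudaMalloc(&device_ptr, buffer_size);

    cudaStream_t stream;
    cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking);

    cudaMemcpyAsync(device_ptr, host_ptr, buffer_size, cudaMemcpyDefault, stream);
    // cudaStreamQuery() never blocks: it returns cudaErrorNotReady while work is still pending.
    cudaError_t status = cudaStreamQuery(stream);
    std::printf("Right after cudaMemcpyAsync returned: %s\n",
                status == cudaErrorNotReady ? "copy still in flight (asynchronous)"
                                            : "copy already complete");

    cudaStreamSynchronize(stream);
    cudaFreeHost(host_ptr);
    cudaFree(device_ptr);
    cudaStreamDestroy(stream);
}

With a pageable destination buffer instead, the device-to-host call only returns once the copy has finished, so the same query would typically report the stream as already idle.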

Sorry for my mistake; I deleted my earlier comments to avoid confusing people.

As @RobinToni kindly pointed out, CUDA memory copies are asynchronous only under strict conditions. For the code in question, the problem is mainly the use of non-pinned (i.e. pageable) host memory.

To quote from a separate section of the Runtime API documentation (emphasis mine):

2. API synchronization behavior

The API provides memcpy/memset functions in both synchronous and asynchronous forms, the latter with an "Async" suffix. This is a misnomer as each function may exhibit synchronous or asynchronous behavior depending on the arguments passed to the function.

Asynchronous

  • For transfers from device memory to pageable host memory, the function will return only once the copy has completed

... and that's just half of it! It is also true that

  • For transfers from pageable host memory to device memory, the data will first be staged in pinned host memory, then copied to the device; and the function will return only after the staging has occurred
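When the host buffer has to come from an ordinary allocator, as in the code in question, one option is to page-lock it after the fact with cudaHostRegister(), so that cudaMemcpyAsync() can transfer directly from it instead of going through the blocking staging path. This is a sketch of my own under those assumptions, not something taken from the answer or the documentation above:

#include <cstdlib>

int main() {
    const size_t buffer_size = 256 * 1024 * 1024;   // ordinary pageable allocation, as in the question
    void* host_ptr = malloc(buffer_size);

    // Pin the existing allocation; it behaves like pinned memory until cudaHostUnregister().
    cudaHostRegister(host_ptr, buffer_size, cudaHostRegisterDefault);

    void* device_ptr;
    cudaMalloc(&device_ptr, buffer_size);
    cudaStream_t stream;
    cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking);

    cudaMemcpyAsync(device_ptr, host_ptr, buffer_size, cudaMemcpyDefault, stream);  // can now return immediately
    cudaStreamSynchronize(stream);

    cudaHostUnregister(host_ptr);
    cudaFree(device_ptr);
    cudaStreamDestroy(stream);
    free(host_ptr);
}

Registering pages is itself not cheap, so this mainly pays off when the same buffer is reused for many transfers.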
