Consider the following program, which enqueues some work on a non-blocking GPU stream:
#include <iostream>
#include <memory>
using clock_value_t = long long;
__device__ void gpu_sleep(clock_value_t sleep_cycles) {
clock_value_t start = clock64();
clock_value_t cycles_elapsed;
do { cycles_elapsed = clock64() - start; }
while (cycles_elapsed < sleep_cycles);
}
void callback(cudaStream_t, cudaError_t, void *ptr) {
*(reinterpret_cast<bool *>(ptr)) = true;
}
__global__ void dummy(clock_value_t sleep_cycles) { gpu_sleep(sleep_cycles); }
int main() {
const clock_value_t duration_in_clocks = 1e6;
const size_t buffer_size = 1e7;
bool callback_executed = false;
cudaStream_t stream;
auto host_ptr = std::unique_ptr<char[]>(new char[buffer_size]);
char* device_ptr;
cudaMalloc(&device_ptr, buffer_size);
cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking);
cudaMemcpyAsync(device_ptr, host_ptr.get(), buffer_size, cudaMemcpyDefault, stream);
dummy<<<128, 128, 0, stream>>>(duration_in_clocks);
cudaMemcpyAsync(host_ptr.get(), device_ptr, buffer_size, cudaMemcpyDefault, stream);
cudaStreamAddCallback(
stream, callback, &callback_executed, 0 /* fixed and meaningless */);
auto snapshot = callback_executed;
std::cout << "Right after we finished enqueuing work, the stream has "
<< (snapshot ? "" : "not ") << "concluded execution." << std::endl;
cudaStreamSynchronize(stream);
snapshot = callback_executed;
std::cout << "After cudaStreamSynchronize, the stream has "
<< (snapshot ? "" : "not ") << "concluded execution." << std::endl;
}
The buffer size and the length of the kernel's sleep in cycles are high enough that, as they execute in parallel with the CPU thread, the enqueueing should finish well before they are done (8 ms + 8 ms for the copies, 20 ms for the kernel).
However, judging from the trace below, the two cudaMemcpyAsync() calls appear to actually be synchronous, i.e. they block until the (non-blocking) stream has actually finished copying. Is this intentional behavior? It seems to contradict the relevant section of the CUDA Runtime API documentation. How does that make sense?
Trace (numbered lines, times in microseconds):
1 "Start" "Duration" "Grid X" "Grid Y" "Grid Z" "Block X" "Block Y" "Block Z"
104 14102.830000 59264.347000 "cudaMalloc"
105 73368.351000 19.886000 "cudaStreamCreateWithFlags"
106 73388.850000 8330.257000 "cudaMemcpyAsync"
107 73565.702000 8334.265000 47.683716 5.587311 "Pageable" "Device" "GeForce GTX 650 Ti BOOST (0)" "1"
108 81721.124000 2.394000 "cudaConfigureCall"
109 81723.865000 3.585000 "cudaSetupArgument"
110 81729.332000 30.742000 "cudaLaunch (dummy(__int64) [107])"
111 81760.604000 39589.422000 "cudaMemcpyAsync"
112 81906.303000 20157.648000 128 1 1 128 1 1
113 102073.103000 18736.208000 47.683716 2.485355 "Device" "Pageable" "GeForce GTX 650 Ti BOOST (0)" "1"
114 121351.936000 5.560000 "cudaStreamSynchronize"
That looked strange, so I contacted someone on the CUDA driver team, who confirmed that the documentation is correct. I was also able to confirm it myself:
#include <iostream>
#include <memory>
#include <cstdlib>  // malloc
#include <time.h>   // clock_gettime, struct timespec
using clock_value_t = long long;
__device__ void gpu_sleep(clock_value_t sleep_cycles) {
clock_value_t start = clock64();
clock_value_t cycles_elapsed;
do { cycles_elapsed = clock64() - start; }
while (cycles_elapsed < sleep_cycles);
}
void callback(cudaStream_t, cudaError_t, void *ptr) {
*(reinterpret_cast<bool *>(ptr)) = true;
}
__global__ void dummy(clock_value_t sleep_cycles) { gpu_sleep(sleep_cycles); }
int main(int argc, char* argv[]) {
cudaFree(0);
struct timespec start, stop;
const clock_value_t duration_in_clocks = 1e6;
const size_t buffer_size = 2 * 1024 * 1024 * (size_t)1024;
bool callback_executed = false;
cudaStream_t stream;
void* host_ptr;
if (argc == 1){
host_ptr = malloc(buffer_size);
}
else {
cudaMallocHost(&host_ptr, buffer_size, 0);
}
char* device_ptr;
cudaMalloc(&device_ptr, buffer_size);
cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking);
clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &start);
cudaMemcpyAsync(device_ptr, host_ptr, buffer_size, cudaMemcpyDefault, stream);
clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &stop);
double result = (stop.tv_sec - start.tv_sec) * 1e6 + (stop.tv_nsec - start.tv_nsec) / 1e3;
std::cout << "Elapsed: " << result / 1000 / 1000<< std::endl;
dummy<<<128, 128, 0, stream>>>(duration_in_clocks);
clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &start);
cudaMemcpyAsync(host_ptr, device_ptr, buffer_size, cudaMemcpyDefault, stream);
clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &stop);
result = (stop.tv_sec - start.tv_sec) * 1e6 + (stop.tv_nsec - start.tv_nsec) / 1e3;
std::cout << "Elapsed: " << result / 1000 / 1000 << std::endl;
cudaStreamAddCallback(
stream, callback, &callback_executed, 0 /* fixed and meaningless */);
auto snapshot = callback_executed;
std::cout << "Right after we finished enqueuing work, the stream has "
<< (snapshot ? "" : "not ") << "concluded execution." << std::endl;
cudaStreamSynchronize(stream);
snapshot = callback_executed;
std::cout << "After cudaStreamSynchronize, the stream has "
<< (snapshot ? "" : "not ") << "concluded execution." << std::endl;
}
This is essentially your code, with some modifications:
- time measurements
- a switch for allocating from either pageable or pinned memory
- a buffer size of 2 GiB, to ensure measurable copy times
- cudaFree(0) to force the lazy initialization of the CUDA context
Here are the results:
$ nvcc -std=c++11 main.cu -lrt
$ ./a.out # using pageable memory
Elapsed: 0.360828 # (memcpy HtoD, pageable -> device, fully async)
Elapsed: 5.20288 # (memcpy DtoH, device -> pageable, sync)
$ ./a.out 1 # using pinned memory
Elapsed: 4.412e-06 # (memcpy HtoD, pinned -> device, fully async)
Elapsed: 7.127e-06 # (memcpy DtoH, device -> pinned, fully async)
Copying from pageable memory to the device is simply slower, but it really is asynchronous.
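A complementary way to see the difference is to measure, separately, how long the cudaMemcpyAsync() call blocks the calling thread and how long the copy itself runs on the stream. Below is a minimal sketch of that idea using a CPU clock plus CUDA events; the buffer size and variable names are illustrative and not taken from the code above.
#include <chrono>
#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>
int main() {
    const size_t buffer_size = 256 * 1024 * 1024;  // illustrative: 256 MiB
    char* host_ptr = static_cast<char*>(malloc(buffer_size));  // pageable host memory
    char* device_ptr;
    cudaMalloc(&device_ptr, buffer_size);
    cudaStream_t stream;
    cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking);
    cudaEvent_t start_ev, stop_ev;
    cudaEventCreate(&start_ev);
    cudaEventCreate(&stop_ev);
    // The CPU clock measures how long the API call blocks the calling thread;
    // the events measure how long the copy actually runs on the stream.
    auto t0 = std::chrono::steady_clock::now();
    cudaEventRecord(start_ev, stream);
    cudaMemcpyAsync(device_ptr, host_ptr, buffer_size, cudaMemcpyDefault, stream);
    cudaEventRecord(stop_ev, stream);
    auto t1 = std::chrono::steady_clock::now();
    cudaEventSynchronize(stop_ev);
    float copy_ms = 0.0f;
    cudaEventElapsedTime(&copy_ms, start_ev, stop_ev);
    double api_ms = std::chrono::duration<double, std::milli>(t1 - t0).count();
    std::printf("API call blocked for %.3f ms, copy ran for %.3f ms\n", api_ms, copy_ms);
    cudaEventDestroy(start_ev);
    cudaEventDestroy(stop_ev);
    cudaStreamDestroy(stream);
    cudaFree(device_ptr);
    free(host_ptr);
}
With pageable memory the API-side time should dominate (the call blocks at least until staging is done), while with pinned memory it should be negligible compared to the event-measured copy time.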
I apologize for my mistake. I have deleted my previous comments to avoid confusing people.
As @RobinToni kindly pointed out, CUDA memory copies are asynchronous only under strict conditions. For the code in question, the problem is mostly the use of unpinned (i.e. pageable) host memory.
To quote a separate section of the Runtime API documentation (emphasis mine):
2. API synchronization behavior
The API provides memcpy/memset functions in both synchronous and asynchronous forms, the latter with an "Async" suffix. This is a misnomer, as each function may exhibit synchronous or asynchronous behavior depending on the arguments passed to the function.
Asynchronous:
- For transfers from device memory to pageable host memory, the function will return only once the copy has completed.
and that's just half of it! It is also the case that
- For transfers from pageable host memory to device memory, the data will first be staged in pinned host memory, then copied to the device; and the function will return only after the staging has occurred.
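Given that, one way to make copies like the ones in the original program fully asynchronous, without switching the allocation to cudaMallocHost, would be to pin an existing pageable buffer with cudaHostRegister. The following is a minimal sketch under that assumption; the buffer size and names are illustrative.
#include <cstdlib>
#include <cuda_runtime.h>
int main() {
    const size_t buffer_size = 256 * 1024 * 1024;  // illustrative size
    char* host_ptr = static_cast<char*>(malloc(buffer_size));  // starts out pageable
    char* device_ptr;
    cudaMalloc(&device_ptr, buffer_size);
    // Pin the already-allocated buffer so async copies no longer need the
    // staging step described in the documentation quote above.
    cudaHostRegister(host_ptr, buffer_size, cudaHostRegisterDefault);
    cudaStream_t stream;
    cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking);
    // With the buffer pinned, this call should return immediately and the
    // copy proceeds asynchronously on the stream.
    cudaMemcpyAsync(device_ptr, host_ptr, buffer_size, cudaMemcpyDefault, stream);
    cudaStreamSynchronize(stream);
    cudaHostUnregister(host_ptr);
    cudaStreamDestroy(stream);
    cudaFree(device_ptr);
    free(host_ptr);
}
Note that cudaHostRegister has a noticeable one-time cost, so it pays off mainly when the same buffer is reused across many transfers.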