Cuda Unified memory vs cudaMalloc



我正在尝试进行一些基准测试,以确保使用 CUDA 的统一内存 (UM) 方法不会损害我们的性能。

我正在执行FFT。一种方式我使用UM,一种方式我使用cudaMalloc

之后我比较了结果,它们都匹配(这很好)。

但是,我为 UM 方法获得的时间是 ~0.5ms,而 cudaMalloc 方式为 ~0.04ms(多次执行取平均后)

我正在使用事件记录(event record)来计时。在 cufftExecC2C 调用之前和之后各有一个事件记录。

此外,我添加了另外两个事件记录,以测量任何内存传输到设备之前的时间,以及使用数据后从设备取回数据的时间。

这样做时,我看到 UM 方法需要 ~1.6ms,cudaMalloc 方法需要 ~0.7ms。

下面是执行 UM 方法的代码片段:

// --- Unified Memory (UM) path of the benchmark, as posted in the question ---
cufftHandle plan;
cufftPlan1d(&plan, dataSize, CUFFT_C2C, 1);
// Managed allocations: the same pointer is valid on host and device.
cudaMallocManaged(&inData, dataSize * sizeof(cufftComplex));
cudaMallocManaged(&outData, dataSize * sizeof(cufftComplex));
cudaEvent_t start_before_memHtoD, start_kernel, stop_kernel,
stop_after_memDtoH;
cudaEventCreate(&start_kernel);
cudaEventCreate(&start_before_memHtoD);
cudaEventCreate(&stop_kernel);
cudaEventCreate(&stop_after_memDtoH);
setupWave(dataSize, inData);
cudaEventRecord(start_before_memHtoD);
// Prefetch the input to the device (3rd argument is a device id).
// NOTE(review): outData is NOT prefetched here, so the FFT's writes to it
// page-fault on the device -- the answer below identifies this as the main
// source of the UM slowdown.
cudaMemPrefetchAsync(inData, dataSize * sizeof(cufftComplex), 1);
cudaDeviceSynchronize();
cudaEventRecord(start_kernel);
cufftExecC2C(plan, inData, outData, CUFFT_FORWARD);
cudaEventRecord(stop_kernel);
cudaEventSynchronize(stop_kernel);
// Host-side read of managed outData; pages migrate back on demand.
float sum = 0;
for (int i = 0; i < dataSize; i++) {
sum += outData[i].x + outData[i].y;
}
cudaEventRecord(stop_after_memDtoH);
cudaEventSynchronize(stop_after_memDtoH);
std::cout << "sum for UM is " << sum << std::endl;
float umTime = 0;
float overallUmTime = 0;
cudaEventElapsedTime(&umTime, start_kernel, stop_kernel);
cudaEventElapsedTime(&overallUmTime, start_before_memHtoD,
stop_after_memDtoH);
// NOTE(review): the trailing "n" looks like a "\n" whose backslash was lost
// when the post was formatted.
resultString_um += std::to_string(dataSize) + " samples took "
+ std::to_string(umTime) + "ms,  Overall: "
+ std::to_string(overallUmTime) + "n";
cudaFree(outData);
cudaFree(inData);
cudaEventDestroy(start_kernel);
cudaEventDestroy(stop_kernel);
cudaEventDestroy(start_before_memHtoD);
cudaEventDestroy(stop_after_memDtoH);
cufftDestroy(plan);

以下是 cudaMalloc 方法

// --- Explicit cudaMalloc/cudaMemcpy path of the benchmark, as posted ---
cufftComplex *d_inData;
cufftComplex *d_outData;
// Pageable host staging buffers (malloc, not cudaMallocHost).
inData = (cufftComplex*) (malloc(sizeof(cufftComplex) * dataSize));
outData = (cufftComplex*) (malloc(sizeof(cufftComplex) * dataSize));
cudaMalloc((void**) (&d_inData), dataSize * sizeof(cufftComplex));
cudaMalloc((void**) (&d_outData), dataSize * sizeof(cufftComplex));
cufftHandle plan;
cufftPlan1d(&plan, dataSize, CUFFT_C2C, 1);
cudaEvent_t start_before_memHtoD, start_kernel, stop_kernel,
stop_after_memDtoH;
cudaEventCreate(&start_kernel);
cudaEventCreate(&start_before_memHtoD);
cudaEventCreate(&stop_kernel);
cudaEventCreate(&stop_after_memDtoH);
setupWave(dataSize, inData);
cudaEventRecord(start_before_memHtoD);
cudaMemcpy(d_inData, inData, dataSize * sizeof(cufftComplex),
cudaMemcpyHostToDevice);
cudaEventRecord(start_kernel);
cufftExecC2C(plan, d_inData, d_outData, CUFFT_FORWARD);
cudaEventRecord(stop_kernel);
cudaEventSynchronize(stop_kernel);
// Blocking D2H copy; cudaMemcpyDefault infers the direction via UVA.
cudaMemcpy(outData, d_outData, dataSize * sizeof(cufftComplex),
cudaMemcpyDefault);
// NOTE(review): stop_after_memDtoH is recorded here AND again after the
// sum loop below; the second record overwrites the first, so the host-side
// summation is (perhaps unintentionally) included in the "overall" time.
cudaEventRecord(stop_after_memDtoH);
float sum = 0;
for (int i = 0; i < dataSize; i++) {
sum += outData[i].x + outData[i].y;
}
cudaEventRecord(stop_after_memDtoH);
cudaEventSynchronize(stop_after_memDtoH);
// NOTE(review): message says "UM" but this is the non-UM path.
std::cout << "sum for UM is " << sum << std::endl;
float umTime = 0;
float overallUmTime = 0;
cudaEventElapsedTime(&umTime, start_kernel, stop_kernel);
cudaEventElapsedTime(&overallUmTime, start_before_memHtoD,
stop_after_memDtoH);
resultString_um += std::to_string(dataSize) + " samples took "
+ std::to_string(umTime) + "ms,  Overall: "
+ std::to_string(overallUmTime) + "n";
// BUG (called out in the answer): outData/inData came from malloc, so they
// must be released with free(); cudaFree on them is invalid.
cudaFree(outData);
cudaFree(inData);
cudaFree(d_outData);
cudaFree(d_inData);
cudaEventDestroy(start_kernel);
cudaEventDestroy(stop_kernel);
cudaEventDestroy(start_before_memHtoD);
cudaEventDestroy(stop_after_memDtoH);
cufftDestroy(plan);

在使用统一内存方法来加快速度时,我还能做些什么吗? 我预计 UM 会慢一点,但不会慢这么多。

我们正在将 P100 与 CUDA 9 一起使用(系统为 CentOS 7.3)。

您发布的代码的一个问题是:您没有对 FFT 的输出数据调用 cudaMemPrefetchAsync。根据我的测试,这会产生显著差异。您的代码还有其他一些问题,例如:不应在由 malloc 分配的指针上调用 cudaFree(应使用 free)。

下面是围绕所显示内容构建的完整代码。 当我在 CentOS7.4、CUDA 9.1、Tesla P100 上运行它时,我在托管内存情况下执行的 FFT 时间相当 (3.52ms) 与 在非托管内存情况下执行的FFT(3.45ms):

$ cat t43.cu
#include <cufft.h>
#include <iostream>
#include <string>
//using namespace std;
// 32M complex samples; at sizeof(cufftComplex) == 8 bytes that is 256 MB per buffer.
const int dataSize  = 1048576*32;
// Fill the first `ds` samples of `d` with the constant value (1.0, 0.0),
// i.e. a flat real signal whose FFT concentrates all energy in bin 0.
void setupWave(const int ds, cufftComplex *d) {
    for (int idx = 0; idx < ds; ++idx) {
        d[idx].x = 1.0f;
        d[idx].y = 0.0f;
    }
}
// Complete benchmark: runs the FFT once with managed (UM) buffers and once
// with explicit device buffers, timing both the FFT alone and the overall
// transfer+compute+readback span.  Key fix versus the question's code:
// BOTH managed buffers are prefetched to the device before cufftExecC2C.
int main(){
cufftComplex *inData, *outData;
cufftHandle plan;
cufftPlan1d(&plan, dataSize, CUFFT_C2C, 1);
// ---------- Case 1: unified (managed) memory ----------
cudaMallocManaged(&inData, dataSize * sizeof(cufftComplex));
cudaMallocManaged(&outData, dataSize * sizeof(cufftComplex));
cudaEvent_t start_before_memHtoD, start_kernel, stop_kernel,
stop_after_memDtoH;
cudaEventCreate(&start_kernel);
cudaEventCreate(&start_before_memHtoD);
cudaEventCreate(&stop_kernel);
cudaEventCreate(&stop_after_memDtoH);
setupWave(dataSize, inData);
cudaEventRecord(start_before_memHtoD);
// Prefetch BOTH input and output to device 0 so the FFT neither reads nor
// writes through page faults (the output prefetch is the fix).
cudaMemPrefetchAsync(inData, dataSize * sizeof(cufftComplex), 0);
cudaMemPrefetchAsync(outData, dataSize * sizeof(cufftComplex), 0);
cudaDeviceSynchronize();
cudaEventRecord(start_kernel);
cufftExecC2C(plan, inData, outData, CUFFT_FORWARD);
cudaEventRecord(stop_kernel);
cudaEventSynchronize(stop_kernel);
// Host read of managed outData demand-migrates the pages back.
float sum = 0;
for (int i = 0; i < dataSize; i++) {
sum += outData[i].x + outData[i].y;
}
cudaEventRecord(stop_after_memDtoH);
cudaEventSynchronize(stop_after_memDtoH);
std::cout << "sum for UM is " << sum << std::endl;
float umTime = 0;
float overallUmTime = 0;
cudaEventElapsedTime(&umTime, start_kernel, stop_kernel);
cudaEventElapsedTime(&overallUmTime, start_before_memHtoD,
stop_after_memDtoH);
std::string resultString_um = std::to_string(dataSize) + " samples took " + std::to_string(umTime) + "ms,  Overall: " + std::to_string(overallUmTime) + "n";
std::cout << resultString_um;
cudaEventDestroy(start_kernel);
cudaEventDestroy(stop_kernel);
cudaFree(inData);
cudaFree(outData);
cudaEventDestroy(start_before_memHtoD);
cudaEventDestroy(stop_after_memDtoH);
cufftDestroy(plan);

// ---------- Case 2: explicit device memory ----------
cufftComplex *d_inData;
cufftComplex *d_outData;
inData = (cufftComplex*) (malloc(sizeof(cufftComplex) * dataSize));
outData = (cufftComplex*) (malloc(sizeof(cufftComplex) * dataSize));
cudaMalloc((void**) (&d_inData), dataSize * sizeof(cufftComplex));
cudaMalloc((void**) (&d_outData), dataSize * sizeof(cufftComplex));
//cufftHandle plan;
cufftPlan1d(&plan, dataSize, CUFFT_C2C, 1);
//cudaEvent_t start_before_memHtoD, start_kernel, stop_kernel,
//                stop_after_memDtoH;
cudaEventCreate(&start_kernel);
cudaEventCreate(&start_before_memHtoD);
cudaEventCreate(&stop_kernel);
cudaEventCreate(&stop_after_memDtoH);
setupWave(dataSize, inData);
cudaEventRecord(start_before_memHtoD);
cudaMemcpy(d_inData, inData, dataSize * sizeof(cufftComplex),
cudaMemcpyHostToDevice);
cudaEventRecord(start_kernel);
cufftExecC2C(plan, d_inData, d_outData, CUFFT_FORWARD);
cudaEventRecord(stop_kernel);
cudaEventSynchronize(stop_kernel);
// Blocking D2H copy; cudaMemcpyDefault infers the direction via UVA.
cudaMemcpy(outData, d_outData, dataSize * sizeof(cufftComplex),
cudaMemcpyDefault);
sum = 0;
for (int i = 0; i < dataSize; i++) {
sum += outData[i].x + outData[i].y;
}
cudaEventRecord(stop_after_memDtoH);
cudaEventSynchronize(stop_after_memDtoH);
std::cout << "sum for non-UM is " << sum << std::endl;
//float umTime = 0;
//float overallUmTime = 0;
cudaEventElapsedTime(&umTime, start_kernel, stop_kernel);
cudaEventElapsedTime(&overallUmTime, start_before_memHtoD,
stop_after_memDtoH);
resultString_um = std::to_string(dataSize) + " samples took "
+ std::to_string(umTime) + "ms,  Overall: "
+ std::to_string(overallUmTime) + "n";
std::cout << resultString_um;
// Host buffers came from malloc, so they are released with free() -- not
// cudaFree, which was one of the bugs in the question's code.
free(outData);
free(inData);
cudaFree(d_outData);
cudaFree(d_inData);
cudaEventDestroy(start_kernel);
cudaEventDestroy(stop_kernel);
cudaEventDestroy(start_before_memHtoD);
cudaEventDestroy(stop_after_memDtoH);
cufftDestroy(plan);
}
$ nvcc -std=c++11 -arch=sm_60 -o t43 t43.cu -lcufft
$ ./t43
sum for UM is 3.35544e+07
33554432 samples took 3.520640ms,  Overall: 221.909988
sum for non-UM is 3.35544e+07
33554432 samples took 3.456160ms,  Overall: 278.099426
$

相关内容

  • 没有找到相关文章

最新更新