我需要为CUDA内核的执行计时。最佳实践指南说,我们可以使用事件,也可以使用标准计时函数,例如Windows中的clock()。我的问题是,这两种方法给出的结果完全不同。事实上,与实际运行速度相比,事件给出的时间似乎大得多。
我真正需要的是:先在较小的数据集上运行一个简化版本,从而预测完整计算的运行时间。不幸的是,这种基准测试的结果完全不符合实际,要么太乐观(clock()),要么太悲观(事件)。
你可以这样做:
#include <sys/time.h>
// Host wall-clock timing of a kernel using gettimeofday().
// The elapsed time covers launch overhead plus kernel execution, in milliseconds.
struct timeval t1, t2;
gettimeofday(&t1, 0);
kernel_call<<<dimGrid, dimBlock, 0>>>();
// Launch-configuration errors are only reported through cudaGetLastError().
HANDLE_ERROR( cudaGetLastError() );
// Kernel launches are asynchronous: block until the device is idle before
// reading the clock again (cudaDeviceSynchronize replaces the deprecated
// cudaThreadSynchronize).
HANDLE_ERROR( cudaDeviceSynchronize() );
gettimeofday(&t2, 0);
// Microsecond difference converted to milliseconds.
double time = (1000000.0*(t2.tv_sec-t1.tv_sec) + t2.tv_usec-t1.tv_usec)/1000.0;
printf("Time to generate: %3.1f ms \n", time);
或:
// Device-side timing of a kernel with CUDA events recorded on stream 0.
// cudaEventElapsedTime reports the interval between the two events in milliseconds.
float time;
cudaEvent_t start, stop;
HANDLE_ERROR( cudaEventCreate(&start) );
HANDLE_ERROR( cudaEventCreate(&stop) );
HANDLE_ERROR( cudaEventRecord(start, 0) );
kernel_call<<<dimGrid, dimBlock, 0>>>();
// Launch-configuration errors are only reported through cudaGetLastError().
HANDLE_ERROR( cudaGetLastError() );
HANDLE_ERROR( cudaEventRecord(stop, 0) );
// Wait until the stop event has actually been reached on the device.
HANDLE_ERROR( cudaEventSynchronize(stop) );
HANDLE_ERROR( cudaEventElapsedTime(&time, start, stop) );
printf("Time to generate: %3.1f ms \n", time);
// Release the events so repeated measurements do not leak them.
HANDLE_ERROR( cudaEventDestroy(start) );
HANDLE_ERROR( cudaEventDestroy(stop) );
有一个现成的GpuTimer结构体可供使用:
#ifndef __GPU_TIMER_H__
#define __GPU_TIMER_H__
// Minimal event-based timer for work submitted to stream 0.
//
// Usage: Start(); <launch work>; Stop(); float ms = Elapsed();
// The event pair is created in the constructor and destroyed in the destructor.
// NOTE(review): the struct is copyable, and a copy would destroy the same
// events a second time - pass it by reference or pointer.
struct GpuTimer
{
    cudaEvent_t start;
    cudaEvent_t stop;

    // Creates the two events used to bracket the timed region.
    GpuTimer()
    {
        cudaEventCreate(&start);
        cudaEventCreate(&stop);
    }

    // Destroys both events.
    ~GpuTimer()
    {
        cudaEventDestroy(start);
        cudaEventDestroy(stop);
    }

    // Records the start event on stream 0.
    void Start()
    {
        cudaEventRecord(start, 0);
    }

    // Records the stop event on stream 0.
    void Stop()
    {
        cudaEventRecord(stop, 0);
    }

    // Waits for the stop event and returns the start-to-stop interval in
    // milliseconds. Returns 0.0f if the interval cannot be queried (for
    // example when Stop() was never called) instead of an uninitialized value.
    float Elapsed()
    {
        float elapsed = 0.0f;
        cudaEventSynchronize(stop);
        cudaEventElapsedTime(&elapsed, start, stop);
        return elapsed;
    }
};
#endif /* __GPU_TIMER_H__ */
你的问题已经有了满意的答案。
我编写了用于为C/C++以及CUDA操作计时的类,并希望与其他用户分享,希望对后来的用户有所帮助。您只需要将下面给出的4个文件添加到您的项目中,并按如下方式用#include包含两个头文件:
// --- Timing includes
#include "TimingCPU.h"
#include "TimingGPU.cuh"
这两个类可以如下使用:
为CPU代码段计时
TimingCPU timer_CPU;
timer_CPU.StartCounter();
CPU perations to be timed
std::cout << "CPU Timing = " << timer_CPU.GetCounter() << " ms" << std::endl;
为GPU代码段计时
TimingGPU timer_GPU;
timer_GPU.StartCounter();
GPU perations to be timed
std::cout << "GPU Timing = " << timer_GPU.GetCounter() << " ms" << std::endl;
在这两种情况下,计时都以毫秒为单位。此外,这两个类也可以在linux或windows下使用。
以下是这4个文件:
TimingCPU.cpp
/**************/
/* TIMING CPU */
/**************/
#include "TimingCPU.h"
#ifdef __linux__
#include <sys/time.h>
#include <stdio.h>
// Zeroes the reference timestamp, then immediately starts the counter so a
// freshly constructed timer measures from the moment of construction.
TimingCPU::TimingCPU()
    : cur_time_(0)
{
    StartCounter();
}
// Nothing to release: the Linux timer only holds a plain integer timestamp.
TimingCPU::~TimingCPU()
{
}
// Stores the current wall-clock time, in microseconds, as the reference point
// for the next GetCounter() call. If gettimeofday() fails the previous
// reference point is left untouched.
void TimingCPU::StartCounter()
{
    struct timeval now;
    if (gettimeofday(&now, 0) != 0) {
        return;
    }
    cur_time_ = now.tv_sec * 1000000 + now.tv_usec;
}
double TimingCPU::GetCounter()
{
struct timeval time;
if(gettimeofday( &time, 0 )) return -1;
long cur_time = 1000000 * time.tv_sec + time.tv_usec;
double sec = (cur_time - cur_time_) / 1000000.0;
if(sec < 0) sec += 86400;
cur_time_ = cur_time;
return 1000.*sec;
}
#elif _WIN32 || _WIN64
#include <windows.h>
#include <iostream>
// Windows-only timer state kept out of the public header.
struct PrivateTimingCPU {
// Performance-counter frequency divided by 1000; StartCounter() sets it and
// GetCounter() divides the tick difference by it.
double PCFreq;
// Performance-counter value captured by StartCounter().
__int64 CounterStart;
};
// --- Default constructor
TimingCPU::TimingCPU()
{
    // Allocate the hidden state and zero both fields; the timer holds no
    // usable reference point until StartCounter() is called.
    privateTimingCPU = new PrivateTimingCPU;
    privateTimingCPU->PCFreq = 0.0;
    privateTimingCPU->CounterStart = 0;
}
// --- Default destructor
TimingCPU::~TimingCPU()
{
    // Free the state allocated in the constructor; it was previously leaked.
    // NOTE(review): the class declares no copy constructor or assignment, so
    // a copied TimingCPU would now free the same pointer twice - pass timers
    // by reference or pointer.
    delete privateTimingCPU;
    privateTimingCPU = 0;
}
// --- Starts the timing
// Captures the performance-counter frequency (scaled so that GetCounter()
// yields milliseconds) and the current counter value as the reference point.
// If the frequency query fails, a message is printed and the timer state is
// left unchanged rather than being filled from an unset LARGE_INTEGER.
void TimingCPU::StartCounter()
{
    LARGE_INTEGER li;
    if(!QueryPerformanceFrequency(&li)) {
        std::cout << "QueryPerformanceFrequency failed!\n";
        return;
    }
    // Counts per second divided by 1000 gives counts per millisecond.
    (*privateTimingCPU).PCFreq = double(li.QuadPart)/1000.0;
    QueryPerformanceCounter(&li);
    (*privateTimingCPU).CounterStart = li.QuadPart;
}
// --- Gets the timing counter in ms
double TimingCPU::GetCounter()
{
LARGE_INTEGER li;
QueryPerformanceCounter(&li);
return double(li.QuadPart-(*privateTimingCPU).CounterStart)/(*privateTimingCPU).PCFreq;
}
#endif
TimingCPU.h
// 1 micro-second accuracy
// Returns the time in milliseconds
#ifndef __TIMINGCPU_H__
#define __TIMINGCPU_H__
#ifdef __linux__
// Host-side wall-clock timer (Linux build), implemented in TimingCPU.cpp on
// top of gettimeofday().
class TimingCPU {
private:
// Reference timestamp in microseconds.
// NOTE(review): where long is 32 bits the microsecond count computed in
// TimingCPU.cpp may overflow - confirm the target ABI.
long cur_time_;
public:
// Starts the counter at construction time.
TimingCPU();
~TimingCPU();
// Sets the reference point to the current time.
void StartCounter();
// Returns the milliseconds elapsed since the reference point and then moves
// the reference point to the current time; returns -1 if the clock query fails.
double GetCounter();
};
#elif _WIN32 || _WIN64
// Opaque timer state; defined in TimingCPU.cpp so <windows.h> stays out of this header.
struct PrivateTimingCPU;
// Host-side timer (Windows build), implemented in TimingCPU.cpp on top of
// QueryPerformanceCounter().
class TimingCPU
{
private:
// Allocated in the constructor.
PrivateTimingCPU *privateTimingCPU;
public:
// Allocates the state; StartCounter() must be called before GetCounter().
TimingCPU();
~TimingCPU();
// Captures the counter frequency and the current counter value.
void StartCounter();
// Returns the milliseconds elapsed since the last StartCounter() call.
double GetCounter();
}; // TimingCPU class
#endif
#endif
TimingGPU.cu
/**************/
/* TIMING GPU */
/**************/
#include "TimingGPU.cuh"
#include <cuda.h>
#include <cuda_runtime.h>
// Opaque state behind TimingGPU: the event pair that brackets the timed region.
struct PrivateTimingGPU {
    cudaEvent_t start;
    cudaEvent_t stop;
    bool        live;   // true while start/stop refer to created events
};

// Destroys the current event pair, if there is one.
static void releaseTimingEvents(PrivateTimingGPU *state)
{
    if (state->live) {
        cudaEventDestroy(state->start);
        cudaEventDestroy(state->stop);
        state->live = false;
    }
}

// Replaces any existing event pair with a fresh one created with `flags`,
// then records the start event on stream 0. If either event cannot be
// created, the state is left without live events.
static void armTimingEvents(PrivateTimingGPU *state, unsigned int flags)
{
    // Without this, every StartCounter*() call leaked the previous pair.
    releaseTimingEvents(state);

    if (cudaEventCreateWithFlags(&state->start, flags) != cudaSuccess) {
        return;
    }
    if (cudaEventCreateWithFlags(&state->stop, flags) != cudaSuccess) {
        cudaEventDestroy(state->start);
        return;
    }
    state->live = true;
    cudaEventRecord(state->start, 0);
}

// Default constructor: allocates the state; no events exist until a
// StartCounter*() call.
TimingGPU::TimingGPU()
{
    privateTimingGPU = new PrivateTimingGPU;
    privateTimingGPU->live = false;
}

// Default destructor: destroys any live events and frees the state.
// NOTE(review): the class declares no copy constructor or assignment, so a
// copied TimingGPU would release the same state twice - pass timers by
// reference or pointer.
TimingGPU::~TimingGPU()
{
    releaseTimingEvents(privateTimingGPU);
    delete privateTimingGPU;
    privateTimingGPU = 0;
}

// Starts timing with default events (cudaEventCreate is equivalent to
// creating with cudaEventDefault).
void TimingGPU::StartCounter()
{
    armTimingEvents(privateTimingGPU, cudaEventDefault);
}

// Starts timing with events created using cudaEventBlockingSync.
void TimingGPU::StartCounterFlags()
{
    armTimingEvents(privateTimingGPU, cudaEventBlockingSync);
}

// Gets the counter in ms: records the stop event on stream 0, waits for it
// and returns the start-to-stop interval. May be called repeatedly after one
// StartCounter*() call. Returns 0.0f if no start event is live.
float TimingGPU::GetCounter()
{
    if (!privateTimingGPU->live) {
        return 0.0f;
    }

    float elapsedMs = 0.0f;
    cudaEventRecord(privateTimingGPU->stop, 0);
    cudaEventSynchronize(privateTimingGPU->stop);
    cudaEventElapsedTime(&elapsedMs, privateTimingGPU->start, privateTimingGPU->stop);
    return elapsedMs;
}
TimingGPU.cuh
#ifndef __TIMING_CUH__
#define __TIMING_CUH__
/**************/
/* TIMING GPU */
/**************/
// Events are a part of CUDA API and provide a system independent way to measure execution times on CUDA devices with approximately 0.5
// microsecond precision.
// Opaque timer state; defined in TimingGPU.cu.
struct PrivateTimingGPU;
// Event-based timer for GPU work; implemented in TimingGPU.cu.
class TimingGPU
{
private:
// Allocated in the constructor.
PrivateTimingGPU *privateTimingGPU;
public:
TimingGPU();
~TimingGPU();
// Creates the start/stop events and records the start event on stream 0.
void StartCounter();
// Same as StartCounter(), but creates the events with cudaEventBlockingSync.
void StartCounterFlags();
// Records the stop event, waits for it and returns the elapsed time in ms.
float GetCounter();
}; // TimingGPU class
#endif
如果你想测量GPU时间,你几乎必须使用事件。在nvidia论坛上有一个关于应用程序定时该做什么和不该做什么的讨论。
您可以使用Compute Visual Profiler(计算可视化分析器),它非常适合您的目的。它会测量每个CUDA函数的执行时间,并告诉您它被调用了多少次。