我正在使用 Nvidia GeForce GT 630 显卡运行一个 OpenCL 程序。程序收集一个双精度数组,并将其作为缓冲区参数传递给 GPU 内核,由内核使用该数组进行计算。整个执行过程大约需要 10 分钟。我使用的机器是英特尔(R) 酷睿(TM) i5-3470 CPU。
类似的统计数据,
机器: 英特尔(R) 酷睿(TM) i3 处理器
显卡:GeForce 9500 GT
同一程序所需的时间为 16 分钟。
我猜想时间上的差异可能是显卡不同造成的。因此,我把 Intel(R) Core(TM) i3 机器中的 GeForce 9500 GT 换成了 Intel(R) Core(TM) i5-3470 机器中的那块显卡(GeForce GT 630)。
但运行时间仍然是 16 分钟。按我的理解,程序运行期间 CPU 和 GPU 之间没有交互。既然所有计算都只在同一块 GPU 上进行,谁能告诉我,为什么在低端机器上装了高端显卡之后,运行时间却没有改善?
提前谢谢。
创建内核的代码:
void create_kernel () {
FILE *fp;
char *source_str;
fp = fopen("cw_calc.cl", "r");
if (!fp) {
fprintf(stderr, "Failed to load kernel.n");
exit(1);
}
source_str = (char*)malloc(MAX_SOURCE_SIZE);
source_size = fread(source_str, 1, MAX_SOURCE_SIZE, fp);
fclose(fp);
cl_uint count;
clGetPlatformIDs(10, NULL, &count);
// get all platforms in array platfroms
platform = (cl_platform_id*) malloc(sizeof(cl_platform_id) * count);
clGetPlatformIDs(count, platform, NULL);
ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms);
if (ret != 0) {
printf("clGetPlatformIDs error: %d. couldn't loadn", ret);
exit(1);
}
//////////////////////////////////////////////////////////////////////////////////////
ret = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_DEFAULT, 1, &device_id, &ret_num_devices);
if (ret != 0) {
printf("clGetDeviceIDs error: %d. couldn't loadn", ret);
exit(1);
}
/* Create OpenCL context */
context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &ret);
if (ret != 0) {
printf("clCreateContext error: %d. couldn't loadn", ret);
exit(1);
}
/* Create Command Queue */
command_queue = clCreateCommandQueue(context, device_id, 0, &ret);
if (ret != 0) {
printf("clCreateCommandQueue error: %d. couldn't loadn", ret);
exit(1);
}
/*Initialization complete*/
/* Create Kernel Program from the source */
program = clCreateProgramWithSource(context, 1, (const char **)&source_str,(const size_t *)&source_size, &ret);
if (ret != 0) {
printf("clCreateProgramWithSource error: %d. couldn't loadn", ret);
exit(1);
}
/* Build Kernel Program */
printf("loading GPU kernel..n");
system("date");
ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);
system("date");
if (ret != 0) {
printf("clBuildProgram error: %d. couldn't loadn", ret);
// Determine the size of the log
size_t log_size;
clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
// Allocate memory for the log
char *log = (char *) malloc(log_size);
// Get the log
clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, log_size, log, NULL);
// Print the log
printf("%sn", log);
exit(1);
}
printf("GPU kernel loaded successfuly..n");
/* Create OpenCL Kernel */
printf("creating kernel program..n");
kernel = clCreateKernel(program, "gpu_solve", &ret);
if (ret != 0) {
printf("clCreateKernel error: %d. couldn't loadn", ret);
exit(1);
}
printf("kernel program created successfuly..n");
}
执行内核的代码:
/* Current wall-clock time in seconds, as reported by gettimeofday(). */
static double gpu_solve_wall_seconds(void)
{
    struct timeval now;

    gettimeofday(&now, NULL);
    return (now.tv_usec / 1000000.0) + now.tv_sec;
}

/* Print "<what> error: <status>. couldn't load" and exit(1) when status is non-zero. */
static void gpu_solve_check(cl_int status, const char *what)
{
    if (status != 0) {
        printf("%s error: %d. couldn't load\n", what, status);
        exit(1);
    }
}

/*
 * Run the "gpu_solve" kernel once and copy the four working arrays back.
 *
 * Steps, each timed and reported on stdout:
 *   1. "sending time":  create four device buffers initialised from
 *      cti_spcm, cti_pvpm, cti_frm and cti_ipcm, and bind the six kernel
 *      arguments.
 *   2. "calctime time": enqueue the kernel and wait for it to finish.
 *   3. "recv time":     blocking reads of all four buffers back into the
 *      host arrays, then release of the buffers.
 *
 * The kernel is launched with a global work size of 1, i.e. as a single
 * work-item.
 *
 * Returns 0 on success. Any OpenCL failure prints a diagnostic and
 * terminates the process with exit(1).
 */
int gpu_solve()
{
    cl_int ret;
    cl_event event;
    cl_mem spcmBuffer, pvpmBuffer, frmBuffer, ipcmBuffer;
    size_t global_work_size[1] = {1};
    double started;

    printf("Calling gpu_solve\n");

    /* --- upload: buffers are filled from host memory at creation time --- */
    started = gpu_solve_wall_seconds();

    spcmBuffer = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR, numsp * sizeof(double), (void *) cti_spcm, &ret);
    gpu_solve_check(ret, "clCreateBuffer spcmBuffer");
    pvpmBuffer = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR, numparams * sizeof(double), (void *) cti_pvpm, &ret);
    gpu_solve_check(ret, "clCreateBuffer pvpmBuffer");
    frmBuffer = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR, numreactions * sizeof(double), (void *) cti_frm, &ret);
    gpu_solve_check(ret, "clCreateBuffer frmBuffer");
    ipcmBuffer = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR, numsim_spc * sizeof(double), (void *) cti_ipcm, &ret);
    gpu_solve_check(ret, "clCreateBuffer ipcmBuffer");

    gpu_solve_check(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&ipcmBuffer), "clSetKernelArg 0");
    gpu_solve_check(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&spcmBuffer), "clSetKernelArg 1");
    gpu_solve_check(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&pvpmBuffer), "clSetKernelArg 2");
    gpu_solve_check(clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *)&frmBuffer), "clSetKernelArg 3");
    gpu_solve_check(clSetKernelArg(kernel, 4, sizeof(int), (void *)&ct_numsim_spc), "clSetKernelArg 4");
    gpu_solve_check(clSetKernelArg(kernel, 5, sizeof(double), (void *)&ct_deltime), "clSetKernelArg 5");

    printf("sending time %lf\n", gpu_solve_wall_seconds() - started);

    /* --- compute --- */
    started = gpu_solve_wall_seconds();

    ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, global_work_size, NULL, 0, NULL, &event);
    gpu_solve_check(ret, "clEnqueueNDRangeKernel");
    gpu_solve_check(clWaitForEvents(1, &event), "clWaitForEvents");
    /* The event is only needed for the wait above; release it so it does not leak. */
    clReleaseEvent(event);

    printf("calctime time %lf\n", gpu_solve_wall_seconds() - started);

    /* --- download: CL_TRUE makes each read blocking --- */
    started = gpu_solve_wall_seconds();

    ret = clEnqueueReadBuffer(command_queue, ipcmBuffer, CL_TRUE, 0, numsim_spc * sizeof(double), cti_ipcm, 0, NULL, NULL);
    gpu_solve_check(ret, "clEnqueueReadBuffer ipcmBuffer");
    ret = clEnqueueReadBuffer(command_queue, spcmBuffer, CL_TRUE, 0, numsp * sizeof(double), cti_spcm, 0, NULL, NULL);
    gpu_solve_check(ret, "clEnqueueReadBuffer spcmBuffer");
    ret = clEnqueueReadBuffer(command_queue, pvpmBuffer, CL_TRUE, 0, numparams * sizeof(double), cti_pvpm, 0, NULL, NULL);
    gpu_solve_check(ret, "clEnqueueReadBuffer pvpmBuffer");
    ret = clEnqueueReadBuffer(command_queue, frmBuffer, CL_TRUE, 0, numreactions * sizeof(double), cti_frm, 0, NULL, NULL);
    gpu_solve_check(ret, "clEnqueueReadBuffer frmBuffer");

    /* Best-effort cleanup: a failed release is not treated as fatal. */
    (void) clReleaseMemObject(spcmBuffer);
    (void) clReleaseMemObject(pvpmBuffer);
    (void) clReleaseMemObject(frmBuffer);
    (void) clReleaseMemObject(ipcmBuffer);

    printf("recv time %lf\n", gpu_solve_wall_seconds() - started);
    return 0;
}
内核代码:
/*
 * Fill entries of cti_ipcm from sums and differences of cti_fprm entries,
 * each multiplied by a constant factor.
 *
 * cti_ipcm: output array in global memory; written element by element.
 * cti_fprm: input array in global memory; only read here.
 *
 * The rows of dots and the "~10000 equations" line below are placeholders
 * for equations that were elided from this listing; they are not code.
 * NOTE(review): the visible lines read cti_fprm up to index 6734, so the
 * buffer passed in must be at least that large - confirm against the host
 * allocation.
 */
void calc_ipcm_cc (__global double* cti_ipcm, __global double* cti_fprm) {
cti_ipcm[0] = (-cti_fprm[605] + cti_fprm[3135])*5.000000e-01;
cti_ipcm[1] = (cti_fprm[132] - cti_fprm[1037] + cti_fprm[6734])*1.004016e+01;
cti_ipcm[2] = (cti_fprm[3993] - cti_fprm[4090])*5.000000e-01;
.....
....
~10000 equations to be solved
}
/*
 * Evaluate a sequence of scalar equations that read cti_spcm and write
 * cti_pvpm and cti_fprm.
 *
 * cti_spcm: input array in global memory; only read in the visible lines.
 * cti_pvpm: intermediate values; written, then read back by later lines.
 * cti_fprm: output array; written element by element.
 *
 * Statement order matters: the visible lines already show a later equation
 * reading a cti_pvpm entry that an earlier one wrote (6208, then 4434).
 *
 * The rows of dots and the "~10000 equations" line below are placeholders
 * for equations that were elided from this listing; they are not code.
 * NOTE(review): the visible lines write cti_fprm[7633] and cti_pvpm[6208],
 * so both buffers must be at least that large - confirm against the host
 * allocation.
 */
void gpu_eval (__global double* cti_spcm, __global double* cti_pvpm, __global double* cti_fprm) {
cti_fprm[7632] = ((1.00000000e+00*cti_spcm[2986])/(1.00000000e+00+cti_spcm[2986])) ;
cti_pvpm[6208] = (((1.00000000e+00*cti_spcm[2986])*1.66000000e+00) );
cti_pvpm[4434] = (((cti_pvpm[6208]*cti_spcm[212])/(1.00000000e+00+cti_spcm[212])) );
cti_fprm[7633] = cti_pvpm[4434] ;
.....
....
~10000 equations to be solved
}
/*
 * Kernel entry point: advances a simulated clock from 0 up to 50000 in steps
 * of ct_deltime, evaluating both equation sets once per step.
 *
 * cti_ipcm, cti_spcm, cti_pvpm, cti_fprm: working arrays in global memory,
 *     passed straight through to calc_ipcm_cc() and gpu_eval().
 * ct_numsim_spc: not used by this kernel body.
 * ct_deltime:    time step; see the review note below.
 *
 * The visible code never queries the work-item id, so every work-item that
 * is launched runs the entire loop against the same buffers.
 */
__kernel void gpu_solve(__global double* cti_ipcm,__global double* cti_spcm, __global double* cti_pvpm, __global double* cti_fprm, int ct_numsim_spc, double ct_deltime)
{
    double simtime = 0.0;

    /* NOTE(review): this discards the ct_deltime value supplied by the host
     * and always steps by 1e-5 - confirm whether that is intentional. */
    ct_deltime = 0.00001;

    while (simtime <= 50000.0) {
        /* calc_ipcm_cc is the helper defined in this file. */
        calc_ipcm_cc(cti_ipcm, cti_fprm);
        gpu_eval(cti_spcm, cti_pvpm, cti_fprm);
        simtime = simtime + ct_deltime;
    }
}
您使用的是 CL_DEVICE_TYPE_DEFAULT,我很确定您实际用的是 CPU 而不是 GPU。因为如果单个内核要运行 16 分钟才能完成,GPU 驱动程序早就会因超时而被重置了。