OpenCL CL_INVALID_VALUE Error on clEnqueueWriteBuffer



我正在尝试在OpenCL上运行算法。我使用这个存储库(Source.cpp)作为模板。现在我想把整个程序转换成long类型的算法,而不是float。但我总是得到一个CL_INVALID_VALUE(-30)例外第二个clEnqueueWriteBuffer。我已经浪费了几个小时没有找到错误,所以也许我已经监督了一些明显的东西(我还没有做太多与opencl ..) ?

我的代码(不工作)

#include <cassert>
#include <stdio.h>
#include <stdlib.h>
#include <iostream>
#include <fstream>
#include<time.h>
#include <CL/cl.h>
//#define DATA_SIZE 1024
#define DATA_SIZE 1024
using namespace std;

//$ /f/Tools/OCL_SDK_Light/lib/x86_64/opencl.lib blelloch_scan.cpp
const char* ProgramSource =
"__kernel void add(__global long *input, __global long *output, __global long *temp, int size){n"
"int thid = get_global_id(0); n"
"int offset = 1; n"
"printf('%d',thid); n"
"temp[2*thid] = input[2*thid]; n"
"temp[2*thid+1] = input[2*thid+1]; n"
"for(int d= size>>1; d>0; d >>= 1){ n"
"barrier(CLK_GLOBAL_MEM_FENCE); n"
"if(thid < d){ n"
"int ai = offset*(2*thid + 1)-1; n"
"int bi = offset*(2*thid + 2)-1; n"
"temp[bi] += temp[ai]; } n"
"offset = offset*2; n"
"} n"
"temp[size-1] = 0; n"
"barrier(CLK_GLOBAL_MEM_FENCE); n"
"for(int d = 1; d<size; d *= 2){ n"
"offset >>= 1; barrier(CLK_GLOBAL_MEM_FENCE); n"
"if(thid < d) { n"
"int ai = offset*(2*thid+1)-1; int bi = offset*(2*thid+2)-1; n"
"long t = temp[ai]; temp[ai] = temp[bi]; temp[bi] += t; }  n"
"} n"
"barrier(CLK_GLOBAL_MEM_FENCE); n"
"output[2*thid] = temp[2*thid]; n"
"output[2*thid+1] = temp[2*thid+1]; n"
"}n"
"n";
/*
*/


int main(void)
{
cl_context context;
cl_context_properties properties[3];
cl_command_queue command_queue;
cl_kernel kernel;
cl_program program;
cl_int err;
cl_uint num_platforms = 0;
cl_platform_id* platforms;
cl_device_id device_id;
cl_uint num_of_devices = 0;
cl_mem inputA, inputB, output;
size_t global, loc;
std::cout << "Setup n";
long arr[DATA_SIZE];
long inputDataA[DATA_SIZE];
long results[2 * DATA_SIZE];
long  i;
for (i = 1; i < DATA_SIZE - 1;i++)
{
inputDataA[i-1] = (long)i;
arr[i-1] = (long)i;
}
clock_t ends;
/* --------------------- Get platform ---------------------*/
cl_int clResult = clGetPlatformIDs(0, NULL, &num_platforms);
assert(clResult == CL_SUCCESS);
platforms = (cl_platform_id*)malloc(sizeof(cl_platform_id) * num_platforms);
clResult = clGetPlatformIDs(num_platforms, platforms, NULL);
assert(clResult == CL_SUCCESS);
/* --------------------- ------------ ---------------------*/

/* --------------------- Get devices ---------------------*/
cl_device_id* devices = NULL;
cl_uint num_devices;
clResult = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_GPU, 0, NULL, &num_devices);
assert(clResult == CL_SUCCESS);
devices = (cl_device_id*)malloc(sizeof(cl_device_id) * num_platforms);
if (clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_GPU, num_devices, devices, NULL) != CL_SUCCESS)
{
printf("could not find device id");
}
assert(clResult == CL_SUCCESS);
/* --------------------- ----------- ---------------------*/
properties[0] = CL_CONTEXT_PLATFORM;
properties[1] = 0;
cl_int contextResult;
context = clCreateContext(NULL, 1, &devices[0], NULL, NULL, &contextResult);
assert(contextResult == CL_SUCCESS);
// create command queue using the context and device
command_queue = clCreateCommandQueueWithProperties(context, devices[0], 0, &err);
// create a program from the kernel source code
program = clCreateProgramWithSource(context, 1, (const char**)&ProgramSource, NULL, &err);
// compile the program
if (clBuildProgram(program, 0, NULL, NULL, NULL, NULL) != CL_SUCCESS)
{
printf("Error building programn");
return 1;
}
// specify which kernel from the program to execute
kernel = clCreateKernel(program, "add", &err);
// create buffers for the input and ouput
cl_int result;
inputA = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(long) * DATA_SIZE, NULL, NULL);
inputB = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(long) * DATA_SIZE, NULL, NULL);
output = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(long) * DATA_SIZE, NULL, NULL);
// load data into the input buffer
clResult = clEnqueueWriteBuffer(command_queue, inputA, CL_TRUE, 0, sizeof(long) * DATA_SIZE, inputDataA, 0, NULL, NULL);
assert(clResult == CL_SUCCESS);
clResult = clEnqueueWriteBuffer(command_queue, inputB, CL_TRUE, 0, sizeof(long) * DATA_SIZE , 0, 0, NULL, NULL);
assert(clResult == CL_SUCCESS); // ERROR HERE 
clResult = clEnqueueWriteBuffer(command_queue, output, CL_TRUE, 0, sizeof(long) * DATA_SIZE, 0, 0, NULL, NULL);
assert(clResult == CL_SUCCESS);

int temp = DATA_SIZE;
clock_t start = clock();
// set the argument list for the kernel command
clResult = clSetKernelArg(kernel, 0, sizeof(cl_mem), &inputA);
assert(clResult == CL_SUCCESS);

clResult = clSetKernelArg(kernel, 1, sizeof(cl_mem), &output);
assert(clResult == CL_SUCCESS);

clResult = clSetKernelArg(kernel, 2, sizeof(cl_mem), &inputB);
assert(clResult == CL_SUCCESS);

clResult = clSetKernelArg(kernel, 3, sizeof(int), &temp);
assert(clResult == CL_SUCCESS);

global = DATA_SIZE; // num of processors
loc = 256;
printf("n>> start parallel ---------- n");
// enqueue the kernel command for execution
clResult = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global, &loc, 0, NULL, NULL);
assert(clResult == CL_SUCCESS);
// copy the results from out of the output buffer
clResult = clEnqueueReadBuffer(command_queue, output, CL_TRUE, 0, sizeof(long) * DATA_SIZE, results, 0, NULL, NULL);
assert(clResult == CL_SUCCESS);

clFinish(command_queue);
ends = clock();
// print the results
int k = 1;
for (k = 1;k < 8; k++)
{
printf("%d - ", k);
printf("%d n", results[k]);
}
double time_taken = ((double)(ends - start)) / CLK_TCK;
printf("n>>finished parallel in %lf secondsn", time_taken);
// cleanup - release OpenCL resources
printf("n-------------------------------------n");

/* -------sequential ------- */
printf("n>> start sequential ---------- n");
long prefixSum[DATA_SIZE] = { 0 };
const clock_t startSequential = clock();
prefixSum[0] = arr[0];
long idx = 1;
for (idx = 1; idx < DATA_SIZE; idx++) {
prefixSum[idx] = prefixSum[idx - 1] + arr[idx];
}
const clock_t endSequential = clock();


double seqTime = ((double)(endSequential - startSequential)) / CLK_TCK;
printf("n>> finished sequential in %lfn", seqTime);
for (int j = 0;j < 8; j++)
{
printf("%d - ", j);
printf("%d n", prefixSum[j]);
}

clReleaseMemObject(inputA);
clReleaseMemObject(inputB);
clReleaseMemObject(output);
clReleaseProgram(program);
clReleaseKernel(kernel);
clReleaseCommandQueue(command_queue);
clReleaseContext(context);
return 0;
}

存储库代码(工作):

#include <cassert>
#include <stdio.h>
#include <stdlib.h>
#include <iostream>
#include <fstream>
#include<time.h>
#include <CL/cl.h>
#define DATA_SIZE 1024
using namespace std;
ofstream outfile;

const char* ProgramSource =
"__kernel void add(__global float *input, __global float *output, __global float *temp, int size){n"
"int thid = get_global_id(0); n"
"int offset = 1; n"
"temp[2*thid] = input[2*thid]; n"
"temp[2*thid+1] = input[2*thid+1]; n"
"for(int d= size>>1; d>0; d >>= 1){ n"
"barrier(CLK_GLOBAL_MEM_FENCE); n"
"if(thid < d){ n"
"int ai = offset*(2*thid + 1)-1; n"
"int bi = offset*(2*thid + 2)-1; n"
"temp[bi] += temp[ai]; } n"
"offset = offset*2; n"
"} n"
"temp[size-1] = 0; n"
"barrier(CLK_GLOBAL_MEM_FENCE); n"
"for(int d = 1; d<size; d *= 2){ n"
"offset >>= 1; barrier(CLK_GLOBAL_MEM_FENCE); n"
"if(thid < d) { n"
"int ai = offset*(2*thid+1)-1; int bi = offset*(2*thid+2)-1; n"
"float t = temp[ai]; temp[ai] = temp[bi]; temp[bi] += t; }  n"
"} n"
"barrier(CLK_GLOBAL_MEM_FENCE); n"
"output[2*thid] = temp[2*thid]; n"
"output[2*thid+1] = temp[2*thid+1]; n"
"}n"
"n";
/*
*/

int main(void)
{
cl_uint num_platforms = 0;
cl_context context;
cl_context_properties properties[3];
cl_kernel kernel;
cl_platform_id* platforms;
cl_command_queue command_queue;
cl_program program;
cl_int err;
cl_uint num_of_platforms = 0;
cl_platform_id platform_id;
cl_device_id device_id;
cl_uint num_of_devices = 0;
cl_mem inputA, inputB, output;
outfile.open("shubham.txt");
size_t global, loc;
float inputDataA[DATA_SIZE];
float results[2 * DATA_SIZE] = { 0 };
int i;
for (i = 0; i < DATA_SIZE;i++)
{
inputDataA[i] = (float)i;
}
clock_t start, ends;
/* --------------------- Get platform ---------------------*/
cl_int clResult = clGetPlatformIDs(0, NULL, &num_platforms);
assert(clResult == CL_SUCCESS);
platforms = (cl_platform_id*)malloc(sizeof(cl_platform_id) * num_platforms);
clResult = clGetPlatformIDs(num_platforms, platforms, NULL);
assert(clResult == CL_SUCCESS);
/* --------------------- ------------ ---------------------*/

/* --------------------- Get devices ---------------------*/
cl_device_id* devices = NULL;
cl_uint num_devices;
clResult = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_GPU, 0, NULL, &num_devices);
assert(clResult == CL_SUCCESS);
devices = (cl_device_id*)malloc(sizeof(cl_device_id) * num_platforms);
if (clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_GPU, num_devices, devices, NULL) != CL_SUCCESS)
{
printf("could not find device id");
}
assert(clResult == CL_SUCCESS);
/* --------------------- ----------- ---------------------*/
properties[0] = CL_CONTEXT_PLATFORM;
properties[1] = 0;
cl_int contextResult;
context = clCreateContext(NULL, 1, &devices[0], NULL, NULL, &contextResult);
assert(contextResult == CL_SUCCESS);
// create command queue using the context and device
// create command queue using the context and device
command_queue = clCreateCommandQueueWithProperties(context, devices[0], 0, &err);
// create a program from the kernel source code
program = clCreateProgramWithSource(context, 1, (const char**)&ProgramSource, NULL, &err);
// compile the program
if (clBuildProgram(program, 0, NULL, NULL, NULL, NULL) != CL_SUCCESS)
{
printf("Error building programn");
return 1;
}
// specify which kernel from the program to execute
kernel = clCreateKernel(program, "add", &err);
// create buffers for the input and ouput
inputA = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(float) * DATA_SIZE, NULL, NULL);
inputB = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(float) * DATA_SIZE, NULL, NULL);
output = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(float) * DATA_SIZE, NULL, NULL);
// load data into the input buffer
clEnqueueWriteBuffer(command_queue, inputA, CL_TRUE, 0, sizeof(float) * DATA_SIZE, inputDataA, 0, NULL, NULL);
clEnqueueWriteBuffer(command_queue, inputB, CL_TRUE, 0, sizeof(float) * DATA_SIZE, 0, 0, NULL, NULL);
clEnqueueWriteBuffer(command_queue, output, CL_TRUE, 0, sizeof(float) * DATA_SIZE, 0, 0, NULL, NULL);
int temp = DATA_SIZE;
start = clock();
// set the argument list for the kernel command
clResult = clSetKernelArg(kernel, 0, sizeof(cl_mem), &inputA);
assert(clResult == CL_SUCCESS);
clResult = clSetKernelArg(kernel, 1, sizeof(cl_mem), &output);
assert(clResult == CL_SUCCESS);
clResult = clSetKernelArg(kernel, 2, sizeof(cl_mem), &inputB);
assert(clResult == CL_SUCCESS);
clResult = clSetKernelArg(kernel, 3, sizeof(int), &temp);
assert(clResult == CL_SUCCESS);
global = DATA_SIZE;
loc = 256;
// enqueue the kernel command for execution
clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global, &loc, 0, NULL, NULL);
clFinish(command_queue);
// copy the results from out of the output buffer
clEnqueueReadBuffer(command_queue, output, CL_TRUE, 0, sizeof(float) * DATA_SIZE, results, 0, NULL, NULL);
//clEnqueueReadBuffer(command_queue, inputB, CL_TRUE, 0, sizeof(float) *16, shubh, 0, NULL, NULL);
// print the results
printf("output: ");
for (i = 0;i < 5; i++)
{
printf("%f n", results[i]);
outfile << results[i] << " ";
}
ends = clock();
double time_taken = ((double)(ends - start)) / CLK_TCK;
outfile << endl << "Time taken is : " << time_taken << endl;
clReleaseMemObject(inputA);
clReleaseMemObject(inputB);
clReleaseMemObject(output);
clReleaseProgram(program);
clReleaseKernel(kernel);
clReleaseCommandQueue(command_queue);
clReleaseContext(context);
return 0;
}

Thanks in Advance

我发现你错了。对我来说,它首先不会编译OpenCL C代码,所以我用

进行调试。
char info[1024];
clGetProgramBuildInfo(program, devices[0], CL_PROGRAM_BUILD_LOG, 1024*sizeof(char), (void*)info, NULL); // print build log
printf(info);

获取构建日志:

<kernel>:4:8: warning: multi-character character constant
printf('0',thid);
^
<kernel>:4:8: warning: incompatible integer to pointer conversion passing 'int' to parameter of type '__constant char *'
printf('190',thid);
^~~~
cl_kernel.h:4694:32: note: passing argument to parameter here
printf(constant char * restrict, ...) __asm("llvm.nvvm.internal.printf.cl");

似乎'而不是"是问题所在。修改这一行:

"printf("%d",thid); n"

然后OpenCL C代码编译,我可以重现CL_INVALID_VALUE错误。

这里是问题:您使用clEnqueueWriteBuffer将数据从inputB复制到0。您需要添加c++数组来将数据复制到:

long inputDataA[DATA_SIZE];
long inputDataB[DATA_SIZE];
long outputData[DATA_SIZE];

clResult = clEnqueueWriteBuffer(command_queue, inputA, CL_TRUE, 0, sizeof(long) * DATA_SIZE, inputDataA, 0, NULL, NULL);
assert(clResult == CL_SUCCESS);
clResult = clEnqueueWriteBuffer(command_queue, inputB, CL_TRUE, 0, sizeof(long) * DATA_SIZE, inputDataB, 0, NULL, NULL);
assert(clResult == CL_SUCCESS); // WORKS NOW
clResult = clEnqueueWriteBuffer(command_queue, output, CL_TRUE, 0, sizeof(long) * DATA_SIZE, outputData, 0, NULL, NULL);
assert(clResult == CL_SUCCESS);

然后它工作,我得到这个输出:

Setup
0
>> start parallel ----------
25625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035101234567891011121314151617181920212223242526272829303138438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441564656667686970717273747576777879808182838485868788899091929394953523533543553563573583593603613623633643653663673683693703713723733743753763773783793803813823839697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612741641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863932333435363738394041424344454647484950515253545556575859606162634484494504514524534544554564574584594604614624634644654664674684694704714724734744754764774784798328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628635445455465475485495505515525535545555565575585595605615625635645655665675685695705715725735745751921931941951961971981992002012022032042052062072082092102112122132142152162172182192202212222234804814824834844854864874884894904914924934944954964974984995005015025035045055065075085095105118648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948955125135145155165175185195205215225235245255265275285295305315325335345355365375385395405415425431281291301311321331341351361371381391401411421431441451461471481491501511521531541551561571581598008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308315765775785795805815825835845855865875885895905915925935945955965975985996006016026036046056066072242252262272282292302312322332342352362372382392402412422432442452462472482492502512522532542559609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909916406416426436446456466476486496506516526536546556566576586596606616626636646656666676686696706711601611621631641651661671681691701711721731741751761771781791801811821831841851861871881891901917687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987997367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667679929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210236726736746756766776786796806816826836846856866876886896906916926936946956966976986997007017027039289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589597047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347358968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269271 - 0
2 - 1
3 - 2
4 - 4
5 - 6
6 - 9
7 - 12
>>finished parallel in 0.023000 seconds
-------------------------------------
>> start sequential ----------
>> finished sequential in 0.000000
0 - 1
1 - 3
2 - 6
3 - 10
4 - 15
5 - 21
6 - 28
7 - 36

还请注意,在OpenCL C中,long= 64位整数,但在c++中,long="至少";32位整数,不管出于什么愚蠢的原因。在c++中,你应该使用long long int,因为它总是64位整数。例如,您可以使用typedef int64_t slong;,其中int64_t本身是long long int的类型定义。

另一个问题是程序不是确定性的。当多次执行时,每次都得到不同的结果。一定存在某种竞争条件。我想你错误地认为barrier(CLK_GLOBAL_MEM_FENCE);提供了所有线程的全局同步,但这不是真的。唯一的全局同步是在所需的同步点将内核拆分为多个内核,然后一个接一个地执行它们。


最后,为了使c++中的OpenCL开发更容易,并防止在这样简单的错误上浪费时间,我创建了一个轻量级的OpenCL- wrapper来消除所有的OpenCL代码开销。这样,您的代码缩短了4倍,并且更容易理解:

#include "opencl.hpp"
#define DATA_SIZE 1024
int main() {
Clock clock;
clock.start();
Device device(select_device_with_most_flops()); // compile OpenCL C code for the fastest available device
Memory<slong> arr(device, DATA_SIZE, 1u, true, false);
Memory<slong> inputA(device, DATA_SIZE);
Memory<slong> inputB(device, DATA_SIZE);
Memory<slong> output(device, DATA_SIZE);
for(int i=1; i<DATA_SIZE-1; i++) {
inputA[i-1] = (slong)i;
arr[i-1] = (slong)i;
}
inputA.write_to_device();

Kernel kernel(device, DATA_SIZE, "add", inputA, output, inputB);
kernel.add_constants(DATA_SIZE);
kernel.run();
output.read_from_device();
double time_taken = clock.stop();
// print the results
for(int k=1; k<8; k++) {
printf("%d - ", k);
printf("%d n", output[k]);
}
printf("n>>finished parallel in %lf secondsn", time_taken);
printf("n-------------------------------------n");
printf("n>> start sequential ---------- n");
long prefixSum[DATA_SIZE] = { 0 };
clock.start();
prefixSum[0] = arr[0];
for(long idx=1; idx<DATA_SIZE; idx++) {
prefixSum[idx] = prefixSum[idx-1]+arr[idx];
}
double seqTime = clock.stop();
printf("n>> finished sequential in %lfn", seqTime);
for(int j=0; j<8; j++) {
printf("%d - ", j);
printf("%d n", prefixSum[j]);
}
wait();
return 0;
}
#include "kernel.hpp" // note: unbalanced round brackets () are not allowed and string literals can't be arbitrarily long, so periodically interrupt with )+R(
string opencl_c_container() { return R( // ########################## begin of OpenCL C code ####################################################################
kernel void add(__global long* input, __global long* output, __global long* temp, int size) {
int thid = get_global_id(0);
int offset = 1;
printf("%d",thid);
temp[2*thid] = input[2*thid];
temp[2*thid+1] = input[2*thid+1];
for(int d= size>>1; d>0; d >>= 1) {
barrier(CLK_GLOBAL_MEM_FENCE);
if(thid < d) {
int ai = offset*(2*thid + 1)-1;
int bi = offset*(2*thid + 2)-1;
temp[bi] += temp[ai];
}
offset = offset*2;
}
temp[size-1] = 0;
barrier(CLK_GLOBAL_MEM_FENCE);
for(int d = 1; d<size; d *= 2) {
offset >>= 1; barrier(CLK_GLOBAL_MEM_FENCE);
if(thid < d) {
int ai = offset*(2*thid+1)-1; int bi = offset*(2*thid+2)-1;
long t = temp[ai]; temp[ai] = temp[bi]; temp[bi] += t;
}
}
barrier(CLK_GLOBAL_MEM_FENCE);
output[2*thid] = temp[2*thid];
output[2*thid+1] = temp[2*thid+1];
}
);} // ############################################################### end of OpenCL C code #####################################################################

最新更新