OpenCL:是64位的global_id()不支持?



我是一个OpenCL新手,我不能从编译的内核返回64位值。我做错了什么?

我有一个Intel(R) HD Graphics 520显卡,我想写一个处理64位值的算法。但是当全局id超过4e12(更准确地说是2^32-1)时,它似乎被溢出了。似乎所有东西都是用x64构建的。我用Visual Studio 2019编译,目标:x64。我已经安装了最新的英特尔图形设备驱动程序(30.0.101.1660)。它可以构建内核并工作,除了它使用32位而不是64位!

谁能告诉我我哪里错了?

这是我的代码。对不起,有点长……我已经尽量简短了。我知道,它有一些小故障(例如,不使用原子写),但这只是一个POC代码,这并不像我预期的那样真正起作用。(
#include <cstdio>
#include <cassert>
#include <iostream>
using namespace std;
#include <CL/opencl.h>
int runCL(const cl_ulong n) {
cl_int err = 0;
cl_uint num_platforms;
cl_platform_id platforms[16]; // Can be on stack!
err = clGetPlatformIDs(16, platforms, &num_platforms);
assert(err == 0);
assert(num_platforms);
cl_uint num_devices;
cl_device_id devices[16]; // Can be on stack!
err = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_GPU, 16, devices, &num_devices);
assert(err == 0);
assert(num_devices);
#define PR_DEV_INFO(name, type) invoke([devices]()->type { type wrk; 
cl_uint err = clGetDeviceInfo(devices[0], name, sizeof(wrk), (void*)&wrk, NULL);
assert(err == 0); cout << #name << ": " << wrk << endl; return wrk;})
#define PR_DEV_INFO_CHAR(name) invoke([devices]()->string { size_t size; 
cl_uint err = clGetDeviceInfo(devices[0], name, 0, NULL, &size);
assert(err == 0); char* wrk = new char[size];
err = clGetDeviceInfo(devices[0], name, size, (void*)wrk, NULL);
assert(err == 0); string s(wrk); delete[] wrk;
cout << #name << " [" << size << "]: " << s << endl; return s;})
#define PR_DEV_INFO_ARR(name, type, len) invoke([devices](size_t arr_len)->void { 
type *wrk = new type[arr_len]; 
cl_uint err = clGetDeviceInfo(devices[0], name, sizeof(type)*arr_len, (void*)wrk, NULL);
assert(err == 0); cout << #name << ":";
for(int i=0; i<arr_len;++i) cout << ' ' << wrk[i]; cout << endl; delete[] wrk;}, len)
PR_DEV_INFO_CHAR(CL_DEVICE_NAME);
PR_DEV_INFO_CHAR(CL_DEVICE_VERSION);
PR_DEV_INFO_CHAR(CL_DRIVER_VERSION);
PR_DEV_INFO_CHAR(CL_DEVICE_EXTENSIONS);
PR_DEV_INFO(CL_DEVICE_ADDRESS_BITS, cl_uint);
PR_DEV_INFO(CL_DEVICE_MAX_COMPUTE_UNITS, cl_uint);
const size_t max_item_dim = 
PR_DEV_INFO(CL_DEVICE_MAX_WORK_GROUP_SIZE, size_t);
cl_uint dims = PR_DEV_INFO(CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, cl_uint);
PR_DEV_INFO_ARR(CL_DEVICE_MAX_WORK_ITEM_SIZES, size_t, dims);
cl_context context = clCreateContext(NULL, num_devices, devices, NULL, NULL, &err);
assert(err == 0);
string kernel_txt(
"#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enablen"
"#pragma OPENCL EXTENSION cl_khr_int64_extended_atomics : enablen"
"#pragma OPENCL EXTENSION cles_khr_int64 : enablen"
"__kernel void render(__global ulong * out) {n"
"  size_t gid = get_global_id(0);n"
"  size_t lid = get_local_id(0);n"
"  ulong val = out[lid];n"
"  out[lid] = val < gid ? gid : val;n" // Not atomic!
"  if (lid == 255) out[lid] = sizeof(size_t) * 1000 + sizeof(ulong);n"
"}n");
const char* kernel_mem = kernel_txt.c_str();
// kernel_mem cannot be on stack
cl_program program = clCreateProgramWithSource(context, 1, &kernel_mem, NULL, &err);
assert(err == 0);
//https://www.khronos.org/registry/OpenCL/specs/2.2/html/OpenCL_API.html#compiler-options
const char* options = "-w -Werror -cl-std=CL3.0";
err = clBuildProgram(program, num_devices, devices, options, NULL, NULL);
if (err) {
cerr << "Build error: " << err << endl;
size_t size = 0;
// Just get log size first, then read it again to the proper log
cl_int err2 = clGetProgramBuildInfo(program, devices[0], CL_PROGRAM_BUILD_LOG, 0, NULL, &size);
char* plog = new char[size];
err2 = clGetProgramBuildInfo(program, devices[0], CL_PROGRAM_BUILD_LOG, size, plog, &size);
cerr << "Build log (size: " << size << "): '" << plog << "' [err:" << err2 << "d]" << endl;
delete[] plog;
exit(1);
}
cl_kernel kernel = clCreateKernel(program, "render", &err);
assert(err == 0);
cl_ulong* host_image = new cl_ulong[max_item_dim](); // cannot be on stack!
size_t buffer_size = sizeof(cl_ulong) * max_item_dim;
cl_mem image = clCreateBuffer(context, CL_MEM_READ_WRITE, buffer_size, NULL, &err);
assert(err == 0);
err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &image);
assert(err == 0);
cl_command_queue cmd_queue = clCreateCommandQueueWithProperties(context, devices[0], NULL, &err);
assert(err == 0);
size_t dev_wrk_size[1] = { n };
size_t dev_wrk_offs[1] = { 0 };
size_t loc_wrk_size[1] = { (size_t)max_item_dim };
// https://www.khronos.org/registry/OpenCL/sdk/1.0/docs/man/xhtml/clEnqueueNDRangeKernel.html
err = clEnqueueNDRangeKernel(cmd_queue, kernel, 1, dev_wrk_offs, dev_wrk_size, loc_wrk_size, 0, NULL, NULL);
assert(err == 0);
// Non-blocking read, so we can continue queuing up more kernels
err = clEnqueueReadBuffer(cmd_queue, image, CL_FALSE, 0, buffer_size, host_image, 0, NULL, NULL);
assert(err == 0);
err = clFinish(cmd_queue);
assert(err == 0);
for (int i = 0; i < 256; ++i) cout << '[' << i << ':' << host_image[i] << "]";
cout << '{' << n << '}' << endl;
for (int i = 0; i < 256; ++i) printf("[%d:%zd]", i, host_image[i]);
printf("{%zd}nsize_t:%zd, cl_ulong:%zdn", n, sizeof(size_t), sizeof(cl_ulong));
clReleaseMemObject(image);
clReleaseKernel(kernel);
clReleaseCommandQueue(cmd_queue);
clReleaseContext(context);
delete[] host_image;
return CL_SUCCESS;
}
int main() {
runCL(10'000'000'000ULL);
return 0;
}

在内核中,最后一个返回项(out[255])包含ulongsize_t的大小组合为8008,这似乎是ok的,因为它们都是8字节长。

和输出(我剪掉重复的行):

CL_DEVICE_NAME [25]: Intel(R) HD Graphics 520
CL_DEVICE_VERSION [16]: OpenCL 3.0 NEO
CL_DRIVER_VERSION [14]: 30.0.101.1660
CL_DEVICE_EXTENSIONS [1654]: cl_khr_byte_addressable_store cl_khr_fp16 cl_khr_global_int32_base_atomics cl_khr_global_int32_extended_atomics cl_khr_icd cl_khr_local_int32_base_atomics cl_khr_local_int32_extended_atomics cl_intel_command_queue_families cl_intel_subgroups cl_intel_required_subgroup_size cl_intel_subgroups_short cl_khr_spir cl_intel_accelerator cl_intel_driver_diagnostics cl_khr_priority_hints cl_khr_throttle_hints cl_khr_create_command_queue cl_intel_subgroups_char cl_intel_subgroups_long cl_khr_il_program cl_intel_mem_force_host_memory cl_khr_subgroup_extended_types cl_khr_subgroup_non_uniform_vote cl_khr_subgroup_ballot cl_khr_subgroup_non_uniform_arithmetic cl_khr_subgroup_shuffle cl_khr_subgroup_shuffle_relative cl_khr_subgroup_clustered_reduce cl_intel_device_attribute_query cl_khr_suggested_local_work_size cl_khr_fp64 cl_khr_subgroups cl_intel_spirv_device_side_avc_motion_estimation cl_intel_spirv_media_block_io cl_intel_spirv_subgroups cl_khr_spirv_no_integer_wrap_decoration cl_intel_unified_shared_memory_preview cl_khr_mipmap_image cl_khr_mipmap_image_writes cl_intel_planar_yuv cl_intel_packed_yuv cl_intel_motion_estimation cl_intel_device_side_avc_motion_estimation cl_intel_advanced_motion_estimation cl_khr_int64_base_atomics cl_khr_int64_extended_atomics cl_khr_image2d_from_buffer cl_khr_depth_images cl_khr_3d_image_writes cl_intel_media_block_io cl_khr_gl_sharing cl_khr_gl_depth_images cl_khr_gl_event cl_khr_gl_msaa_sharing cl_intel_dx9_media_sharing cl_khr_dx9_media_sharing cl_khr_d3d10_sharing cl_khr_d3d11_sharing cl_intel_d3d11_nv12_media_sharing cl_intel_sharing_format_query cl_khr_pci_bus_info cl_intel_simultaneous_sharing
CL_DEVICE_ADDRESS_BITS: 64
CL_DEVICE_MAX_COMPUTE_UNITS: 24
CL_DEVICE_MAX_WORK_GROUP_SIZE: 256
CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS: 3
CL_DEVICE_MAX_WORK_ITEM_SIZES: 256 256 256
[0:4294965760][1:4294965761]<...>[253:4294966781][254:4294966782][255:8008]{10000000000}
[0:4294965760][1:4294965761]<...>[253:4294966781][254:4294966782][255:8008]{10000000000}
size_t:8, cl_ulong:8

我用coutprintf(%zd)打印结果,以确保不是cout导致问题。:)

我应该在clBuildProgram或内核代码(#pragma)中打开一些东西,以便能够在内核端使用64位?

我在内核代码中做了一个轻微的修改,以计算global_id(0)的位数,它似乎总是32,而不是以上!

"  int i = 0; for(; i<64 && gid;++i, gid>>=1);"
"  out[lid] = val < i ? i : val;n" // Not atomic!

所以,看起来global_id(0)返回一个32位的值!

更新2

我将内核代码修改为size_t gid = get_local_id(0)+get_local_size(0)*get_group_id(0);而不是size_t gid = get_global_id(0);

结果变成:

[0:9999999744][1:9999999745]<...>[253:9999998973][254:9999998974][255:8008]{10000000000}
size_t:8, cl_ulong:8

看起来好多了!

我还做了一个测试,以避免竞争条件使用原子比较和交换更迂腐:

"  size_t gid = get_global_id(0);n"
"  size_t lid = get_local_id(0);n"
"  //out[lid] = gid;n" // Not atomic!
"  ulong val_new, val_org = out[lid];n"
"  do {n"
"    val_new = val_org > gid ? val_org : gid;n"
"  } while (!atomic_compare_exchange_strong(out + lid, &val_org, val_new));n"

结果相同(坏):

[0:4294967040][1:4294967041]<...>[253:4294967293][254:4294967294][255:8008]{10000000000}

简而言之:支持64位寻址,如CL_DEVICE_ADDRESS_BITS: 64所示。一般OpenCL器件都支持64位整数(c++中的unsigned long long int, OpenCL C中的ulong), Intel HD 520甚至支持FP64双精度。

问题是你在内核中有一个竞争条件,因为你没有使用原子。许多线程试图同时写入out[lid],并且哪个线程获胜完全是随机的。

这是Nvidia GPU和Intel GPU的输出。对于英特尔GPU,每次执行的行为是随机的,但我偶尔会得到大于4294966784的值。

CL_DEVICE_NAME [24]: NVIDIA GeForce GTX 960M
CL_DEVICE_VERSION [16]: OpenCL 3.0 CUDA
CL_DRIVER_VERSION [7]: 511.79
CL_DEVICE_EXTENSIONS [606]: cl_khr_global_int32_base_atomics cl_khr_global_int32_extended_atomics cl_khr_local_int32_base_atomics cl_khr_local_int32_extended_atomics cl_khr_fp64 cl_khr_3d_image_writes cl_khr_byte_addressable_store cl_khr_icd cl_khr_gl_sharing cl_nv_compiler_options cl_nv_device_attribute_query cl_nv_pragma_unroll cl_nv_d3d10_sharing cl_khr_d3d10_sharing cl_nv_d3d11_sharing cl_nv_copy_opts cl_nv_create_buffer cl_khr_int64_base_atomics cl_khr_int64_extended_atomics cl_khr_device_uuid cl_khr_pci_bus_info cl_khr_external_semaphore cl_khr_external_memory cl_khr_external_semaphore_win32 cl_khr_external_memory_win32
CL_DEVICE_ADDRESS_BITS: 64
CL_DEVICE_MAX_COMPUTE_UNITS: 5
CL_DEVICE_MAX_WORK_GROUP_SIZE: 1024
CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS: 3
CL_DEVICE_MAX_WORK_ITEM_SIZES: 1024 1024 64
[0:18446744073709546496][1:18446744073709546497]...[254:18446744073709549822][255:8008]{10000000000}
[0:-5120][1:-5119]...[254:-1794][255:8008]{10000000000}
size_t:8, cl_ulong:8
CL_DEVICE_NAME [26]: Intel(R) HD Graphics 4600
CL_DEVICE_VERSION [12]: OpenCL 1.2
CL_DRIVER_VERSION [14]: 20.19.15.4624
CL_DEVICE_EXTENSIONS [616]: cl_intel_accelerator cl_intel_advanced_motion_estimation cl_intel_ctz cl_intel_d3d11_nv12_media_sharing cl_intel_dx9_media_sharing cl_intel_motion_estimation cl_intel_simultaneous_sharing cl_intel_subgroups cl_khr_3d_image_writes cl_khr_byte_addressable_store cl_khr_d3d10_sharing cl_khr_d3d11_sharing cl_khr_depth_images cl_khr_dx9_media_sharing cl_khr_gl_depth_images cl_khr_gl_event cl_khr_gl_msaa_sharing cl_khr_global_int32_base_atomics cl_khr_global_int32_extended_atomics cl_khr_gl_sharing cl_khr_icd cl_khr_image2d_from_buffer cl_khr_local_int32_base_atomics cl_khr_local_int32_extended_atomics cl_khr_spir
CL_DEVICE_ADDRESS_BITS: 64
CL_DEVICE_MAX_COMPUTE_UNITS: 20
CL_DEVICE_MAX_WORK_GROUP_SIZE: 512
CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS: 3
CL_DEVICE_MAX_WORK_ITEM_SIZES: 512 512 512
[0:4294966784][1:2154343490416]...[254:4294966526][255:8008]{10000000000}
[0:4294966784][1:2154343490416]...[254:4294966526][255:8008]{10000000000}
size_t:8, cl_ulong:8

为了简化OpenCL开发,考虑这个OpenCL- wrapper。这样,您的代码(不修复竞争条件错误)就会明显缩短,可读性也会更好:

int main() {
const ulong N = 10000000000ull;
Device device(select_device_with_most_flops()); // compile OpenCL C code for the fastest available device
Memory<ulong> image(device, 64u); // allocate memory on both host and device
Kernel kernel(device, N, "render", image); // kernel that runs on the device
kernel.run(); // run add_kernel on the device
image.read_from_device(); // copy data from device memory to host memory
for(int i=0; i<256; i++) print("["+to_string(i)+":"+to_string(image[i])+"]");
println("{"+to_string(N)+"}");
println("size_t:"+to_string(sizeof(size_t))+", cl_ulong:"+to_string(sizeof(cl_ulong)));
wait();
return 0;
}
#include "kernel.hpp" // note: unbalanced round brackets () are not allowed and string literals can't be arbitrarily long, so periodically interrupt with )+R(
string opencl_c_container() { return R( // ########################## begin of OpenCL C code ####################################################################
__kernel void render(__global ulong * out) {
size_t gid = get_global_id(0);
size_t lid = get_local_id(0);
ulong val = out[lid];
out[lid] = val < gid ? gid : val; // RACE CONDITION here
if (lid == 255) out[lid] = sizeof(size_t) * 1000 + sizeof(ulong); // another race condition here
}
);} // ############################################################### end of OpenCL C code #####################################################################

我已经向英特尔报告了这个问题。他们在这里回答。答案很简单:

提供给全局ID计算的一些硬件计数器被限制为32位,特别是工作组ID。…

如果全局大小与局部大小可整除,则解决方案

// If you need the global offset:
size_t global_id_0 = get_group_id(0) * get_local_size(0) + 
get_global_offset(0) + get_local_id(0);
// If you do not need the global offset:
size_t global_id_0 = get_group_id(0) * get_local_size(0) + 
get_local_id(0);

如果不可整除,则使用get_enqueued_local_size代替get_local_size。

相关内容

最新更新