如何提高此 OpenCL 缩减内核代码的性能?



我已经编写了负责对大量数据执行归约的代码。虽然代码在逻辑上似乎是正确的,但事实证明它比对相同数据进行简单的 std::accumulate 或 std::max_element 调用还要慢。我正在寻找任何关于我在哪里破坏了此代码性能的见解。

这些是我得到的结果。请注意,即使是执行内核的原始时间也比简单的 CPU 减少数据慢。

Select which Device to use: 
0:                Cedar (AMD Accelerated P... - OpenCL 1.2 AMD-AP...)
1:                Cedar (AMD Accelerated P... - OpenCL 1.2 AMD-AP...)
2:         Intel(R) ... (AMD Accelerated P... - OpenCL 1.2 AMD-AP...)
3:         Intel(R) ... (Experimental Open... - OpenCL 2.0 (Build...)
Device: Cedar
Platform: AMD Accelerated Parallel Processing
Num of compute units: 8
Work Group Size: 128
i = 9419918
Internal Duration:    95609555ns //Time to run just the kernel, no setup
Num of Work Groups to sum up: 78125
Reduced Value was detected to be:    -5.06886
(Index):                             1008460
Value at index is:                   -5.06886
Kernel Duration:     153748214ns //Includes copying of data, excludes building of kernel
Counting manually, Reduced Value is: -5.06886
(Index of):                          1008460
Value at index is:                   -5.06886
Manual Duration:      48173322ns //CPU runtime using std::max_element.
Press any key to continue . . . 

内核代码是通过连接所有以下四个文件来构造的:

expand.cl

R"D(
/* Instantiate the Scalar/VectorN typedefs for a given element type.
   The backslash line continuations are essential: without them the
   #define is empty and the typedefs below it reference the bare token
   `type`, which does not name a type. */
#define EXPAND(type)             \
typedef     type        Scalar;  \
typedef     type ## 2   Vector2; \
typedef     type ## 4   Vector4; \
typedef     type ## 8   Vector8; \
typedef     type ## 16  Vector16;
)D"

float.cl

R"D(
EXPAND(float)
/* No trailing semicolons on object-like macros: the ';' would be pasted
   into every expression that uses them (e.g. as a function argument)
   and break compilation.  The negative identity is parenthesized so it
   survives expansion inside larger expressions. */
#define SCALAR_MAXIMUM INFINITY
#define SCALAR_MINIMUM (-INFINITY)
#define SCALAR_ZERO 0
)D"

max.cl

R"D(
constant Scalar IDENTITY = SCALAR_MINIMUM;
/* Keep the larger of (a, b) and the index it came from.
   Backslash continuations are required -- without them only the first
   line belongs to the macro.  Wrapped in do { } while(0) so the macro
   acts as a single statement and the trailing ';' at the call site in
   the kernel is harmless.  Arguments are parenthesized for hygiene. */
#define REDUCE_IMPL(a, b, indexa, indexb, reduced_value, reduced_index) \
do {                                                                    \
    if((a) > (b)) {                                                     \
        (reduced_value) = (a);                                          \
        (reduced_index) = (indexa);                                     \
    } else {                                                            \
        (reduced_value) = (b);                                          \
        (reduced_index) = (indexb);                                     \
    }                                                                   \
} while(0)
)D"

Reduce Main.cl

R"D(
/* One-stage parallel reduction with argmax tracking.
 * Each work item loads ONE element of `a` into local memory; the work group
 * tree-reduces its lsize elements; work item 0 writes the group's winning
 * value and source index to output[wid] / index_output[wid].  The host then
 * reduces the per-group partials.
 * NOTE(review): one element per work item is the main performance problem --
 * global-memory bandwidth is not amortized.  Vectorized loads (vload4/vload8)
 * and several elements per item before touching local memory should help;
 * see the advice at the end of the post. */
kernel void reduce(global Scalar * a, global Scalar * output, local Scalar * scratch, global long * index_output, local long * index_scratch, long size) {
size_t gid = get_global_id(0);
size_t lid = get_local_id(0);
size_t wid = get_group_id(0);
size_t gsize = get_global_size(0); /* unused */
size_t lsize = get_local_size(0);
size_t wsize = get_num_groups(0); /* unused */
/* Load one element per work item; padding items past `size` contribute the
 * reduction identity (with sentinel index -1) so they can never win.
 * NOTE(review): gid is size_t (unsigned) while size is long (signed); the
 * comparison promotes size to unsigned -- fine as long as size >= 0. */
if(gid < size) {
scratch[lid] = a[gid];
index_scratch[lid] = gid;
} else {
scratch[lid] = IDENTITY;
index_scratch[lid] = -1;
}
barrier(CLK_LOCAL_MEM_FENCE);
/* Classic tree reduction in local memory: halve the active-thread count each
 * round; the barrier makes every round see the previous round's stores. */
for(size_t offset = lsize / 2; offset > 0; offset >>= 1) {
if(lid < offset) {
size_t indexa = index_scratch[lid];
size_t indexb = index_scratch[lid + offset];
Scalar a = scratch[lid];
Scalar b = scratch[lid + offset];
Scalar reduced_value;
size_t reduced_index;
/* REDUCE_IMPL (max.cl) keeps the larger value and its index. */
REDUCE_IMPL(a, b, indexa, indexb, reduced_value, reduced_index);
scratch[lid] = reduced_value;
index_scratch[lid] = reduced_index;
}
barrier(CLK_LOCAL_MEM_FENCE);
}
/* Work item 0 publishes the group's partial result. */
if(lid == 0) {
output[wid] = scratch[0];
index_output[wid] = index_scratch[0];
}
}
)D"

CL Reduction.h 中的 perform_reduction:

// Asynchronously reduces `values` on the device and returns a future for the
// (reduced_value, reduced_index) result.  The input copy is enqueued on the
// caller's thread; buffer setup, kernel launch, readback and the final CPU
// pass run inside a std::async task.
std::future<result> perform_reduction(std::vector<T> const& values) {
cl_long size = values.size();
// Round the NDRange up to a whole number of work groups.
uint64_t num_of_work_groups = size / work_group_size;
int64_t global_size = work_group_size * num_of_work_groups;
if (global_size < size) {
num_of_work_groups++;
global_size = work_group_size * num_of_work_groups;
}
cl::Buffer input_buffer(context, CL_MEM_READ_ONLY, global_size * sizeof(T), nullptr);
std::vector<cl::Event> write_events(1);
// Non-blocking copy: `values` must stay alive until the event completes
// (it does here -- the caller's vector outlives future.get()).
queue.enqueueWriteBuffer(input_buffer, false, 0, size * sizeof(T), values.data(), nullptr, &write_events.back());
if (global_size != size) {
write_events.emplace_back();
// Pad the tail with the reduction identity so the extra (padding) work
// items cannot affect the result.
queue.enqueueFillBuffer(input_buffer, reduction::identity<T>(), size * sizeof(T), (global_size - size) * sizeof(T), nullptr, &write_events.back());
}
// NOTE(review): the task captures `this` and mutates the shared `kernel`
// object via setArg -- not safe if two reductions are ever in flight at
// once.  Confirm there is only one outstanding call per context.
return std::async([size, num_of_work_groups, global_size, input_buffer, write_events, this] {
cl::Buffer output_buffer( context, CL_MEM_WRITE_ONLY, num_of_work_groups * sizeof(T) );
cl::Buffer output_index_buffer(context, CL_MEM_WRITE_ONLY, num_of_work_groups * sizeof(cl_long));
kernel.setArg(0, input_buffer);
kernel.setArg(1, output_buffer);
kernel.setArg(2, sizeof(T) * work_group_size, nullptr); // local Scalar scratch
kernel.setArg(3, output_index_buffer);
kernel.setArg(4, sizeof(cl_long) * work_group_size, nullptr); // local index scratch
kernel.setArg(5, size);
std::vector<cl::Event> kernel_event;
kernel_event.emplace_back();
queue.enqueueNDRangeKernel(kernel, {}, { uint64_t(global_size) }, { work_group_size }, &write_events, &kernel_event.back());
// One partial (value, index) pair per work group comes back to the host.
std::vector<T> results;
std::vector<int64_t> indexes;
results.resize(num_of_work_groups);
indexes.resize(num_of_work_groups);
queue.enqueueReadBuffer(output_buffer, false, 0, num_of_work_groups * sizeof(T), results.data(), &kernel_event);
queue.enqueueReadBuffer(output_index_buffer, false, 0, num_of_work_groups * sizeof(cl_long), indexes.data(), &kernel_event);
queue.finish();
std::cout << "Internal Duration: " << std::setw(11) << (kernel_event[0].getProfilingInfo<CL_PROFILING_COMMAND_END>() - kernel_event[0].getProfilingInfo<CL_PROFILING_COMMAND_START>()) << "ns" << std::endl;
std::cout << "Num of Work Groups to sum up: " << num_of_work_groups << std::endl;
// Serial CPU pass over the per-group partials (78k of them in the shown
// run).  NOTE(review): with one element per kernel work item this tail is
// large; more work per item would shrink it dramatically.
result t{ reduction::identity<T>(), 0 };
for (size_t i = 0; i < results.size(); i++) {
T const& val = results[i];
// NOTE(review): size_t const& bound to an int64_t element binds to a
// converted temporary, not to the vector element itself.
size_t const& index = indexes[i];
t = reduction::reduce(t.reduced_value, val, t.reduced_index, index);
}
return t;
});
}

Reduce Main.cpp:

#define _HAS_AUTO_PTR_ETC 1
#include <algorithm>
#include <chrono>
#include <cstdlib>
#include <fstream>
#include <iomanip>
#include <iostream>
#include <limits>
#include <list>
#include <memory>
#include <numeric>
#include <random>
#include <stdexcept>
#include <string>
#include <utility>
#include <vector>
#include "CL Reduction.h"
// Truncates `string` to at most `limit` characters, replacing the removed
// tail with "..." when truncation occurs.
// Fixes two defects in the original:
//  - a string of exactly `limit` characters was truncated even though it fit;
//  - `limit - 3` underflowed size_t when limit < 3, producing a huge substr.
std::string limit(std::string string, size_t limit) {
    if (string.size() <= limit) return string;      // fits: return unchanged
    if (limit < 3) return string.substr(0, limit);  // no room for the ellipsis
    return string.substr(0, limit - 3) + "...";
}
// Enumerates every device on every OpenCL platform, prints a numbered menu,
// and returns the device the user selects.  Re-prompts on malformed or
// out-of-range input.  Throws std::runtime_error if no device exists.
cl::Device choose_device() {
    std::vector<cl::Device> all_devices;
    std::vector<cl::Platform> platforms;
    cl::Platform::get(&platforms);
    for (cl::Platform const& platform : platforms) {
        std::vector<cl::Device> devices;
        platform.getDevices(CL_DEVICE_TYPE_ALL, &devices);
        all_devices.insert(all_devices.end(), devices.begin(), devices.end());
    }
    if (all_devices.empty())
        throw std::runtime_error("No OpenCL devices available");
    std::cout << "Select which Device to use: " << std::endl;
    for (size_t i = 0; i < all_devices.size(); i++) {
        cl::Device const& device = all_devices[i];
        std::cout << i;
        std::cout << ": ";
        std::cout << std::setw(20) << limit(device.getInfo<CL_DEVICE_NAME>(), 20);
        std::cout << " (";
        std::cout << std::setw(20) << limit(cl::Platform{ device.getInfo<CL_DEVICE_PLATFORM>() }.getInfo<CL_PLATFORM_NAME>(), 20);
        std::cout << " - ";
        std::cout << std::setw(20) << limit(device.getInfo<CL_DEVICE_VERSION>(), 20);
        std::cout << ")";
        std::cout << std::endl;
    }
    // BUGFIX: the original read `chosen` unchecked -- non-numeric input left
    // it uninitialized (pre-C++11-style UB on MSVC it stays garbage) and an
    // out-of-range index made all_devices[chosen] undefined behavior.
    size_t chosen;
    while (!(std::cin >> chosen) || chosen >= all_devices.size()) {
        std::cin.clear();
        std::cin.ignore(std::numeric_limits<std::streamsize>::max(), '\n');
        std::cout << "Invalid selection, enter a number from the list: ";
    }
    return all_devices[chosen];
}
// Benchmark driver: reduces 10M shuffled, pairwise-distinct floats on a
// user-selected OpenCL device and compares result + timing against a plain
// std::max_element pass on the CPU.
int main() {
    using type = float;
    using reduction_type = cl_reduction_type::reduction_type<cl_reduction_type::type::maximum>;
    using datatype = cl_datatype::datatype<type>;
    using context_t = cl_reduction::reduction_context<datatype, reduction_type>;
    std::ofstream err_log{ "err.txt" };
    cl::Device device = choose_device();
    try {
        cl_reduction::reduction_context<datatype, reduction_type> context{ { device }, err_log };
        std::vector<type> values;
        auto last_ping = std::chrono::steady_clock::now();
        std::default_random_engine engine{ std::random_device{}() };
        std::uniform_real_distribution<type> distribution{ -100.f, 100.f };
        //std::uniform_int_distribution<type> distribution(1, 500);
        values.resize(10'000'000ull);
        //values.resize(10'000);
        // Fill with strictly increasing floats from a random start so every
        // element is unique (the maximum and its index are unambiguous),
        // then shuffle so the maximum sits at a random position.
        type start = distribution(engine);
        for (size_t i = 0; i < values.size(); i++) {
            values[i] = start;
            start = std::nextafter(start, std::numeric_limits<type>::infinity());
            if (std::chrono::steady_clock::now() - last_ping > std::chrono::seconds(1)) {
                // BUGFIX: was `<< 'r'` which printed a literal 'r'; '\r'
                // returns to column 0 so the progress counter overwrites
                // itself, and std::flush is needed because '\r' alone does
                // not flush the stream.
                std::cout << "i = " << i << '\r' << std::flush;
                last_ping += std::chrono::seconds(1);
            }
        }
        std::shuffle(values.begin(), values.end(), engine);
        // Timed GPU path: includes copy-in/out, excludes kernel build.
        auto begin = std::chrono::steady_clock::now();
        auto future = context.perform_reduction(values);
        context_t::result t;
        try {
            t = future.get();
        }
        catch (cl::Error const& e) {
            err_log << e.what() << std::endl;
            err_log << e.err() << std::endl;
        }
        auto end = std::chrono::steady_clock::now();
        std::cout << "Reduced Value was detected to be:    " << t.reduced_value << std::endl;
        std::cout << "(Index):                             " << t.reduced_index << std::endl;
        std::cout << "Value at index is:                   " << values[t.reduced_index] << std::endl;
        std::cout << "Kernel Duration:   " << std::setw(11) << (end - begin).count() << "ns" << std::endl;
        // Timed CPU baseline.
        begin = std::chrono::steady_clock::now();
        //auto value = std::accumulate(values.begin(), values.end(), type(0));
        auto it = std::max_element(values.begin(), values.end());
        auto index = std::distance(values.begin(), it);
        auto value = values[index];
        end = std::chrono::steady_clock::now();
        std::cout << "Counting manually, Reduced Value is: " << value << std::endl;
        std::cout << "(Index of):                          " << index << std::endl;
        std::cout << "Value at index is:                   " << values[index] << std::endl;
        std::cout << "Manual Duration:   " << std::setw(11) << (end - begin).count() << "ns" << std::endl;
    }
    catch (cl::Error const& e) {
        std::cerr << e.what() << ':' << e.err() << std::endl;
        if (e.err() == CL_INVALID_BUFFER_SIZE)
            std::cerr << device.getInfo<CL_DEVICE_MAX_MEM_ALLOC_SIZE>() << std::endl;
    }
    system("pause"); // Windows-only; keeps the console window open.
    return 0;
}

我在这里包含了整个代码库,其中包括使用的三个标头和 main 函数。 ("CL Datatype.h"、"Cl Reduce Type.h"、"CL Reduction.h"、"Reduce Main.cpp")。我只在这篇文章中包含了我认为相关的代码,但如果你认为问题出在其他方面,你可以在 Github 存储库中指出它。

使用 Vector4 a = vload4(...) 读取您的输入,并通过 .xyzw 访问各分量。您也可以尝试使用 vload8 按 8 进行矢量化。

代替 a > b,请使用 isgreater(a, b),并与 any、all 或 select 一起使用。

每个循环执行多次缩减以将其保留在寄存器中并减少本地内存的带宽。对于工作组大小为 128 且矢量大小为 4,第一个线程将使用 512-515 减少 0-3,然后使用 1024-1027 等,然后使用vstore4写入本地内存。尝试不同的内环尺寸。

尽可能,您不希望线程无所事事。内核应该只是从全局内存减少到寄存器一次,存储到本地内存,然后在一个线程从内核的本地值减少到单个值并将其存储在全局内存中之前同步线程。最后,您可以在 CPU 上执行最后一个相对较小的降低级别。此级别将仅包含每个工作组中的一个值:total_size / (work_group_size = 128) / (vector_size = 4) / (inner_loop_size = 16)

最新更新