c-高斯消除程序不能并行工作——OpenCL



一段时间以来,我一直在尝试并行实现高斯消去过程。内核似乎忽略了设置的障碍,执行了它能执行的所有操作,然后让下一个内核完成它的工作。但我需要他们一起反复地工作。我的输入A是修改后的矩阵,最后一列是形象的"输出"。换句话说:每个内核对单独的第j行执行行缩减,以使第i列中的元素为零。障碍冲洗并重复。最后会有一个单位矩阵。执行第j行的内核也将x[j]分配给最后一列中的值。

主程序创建一个以2D形式组织的指定矩阵,然后使其成为内核的1D矩阵。它自己串行求解,并将串行结果与返回的opencl结果进行比较。

Main:

// Entry point
int main(){
// Initialize OpenCL.
if(!init_opencl()){return -1;}
// Request user for size of parallel operation
kr=k_input();
// Initialize the problem data. Requires number of devices to be known
init_problem();
// Run the kernel.
run();
// Free the resources allocated
cleanup();
return 0;
}
// Initializes the OpenCL objects
bool init_opencl(){
cl_int status;
printf("Initializing OpenCLn");
if(!setCwdToExeDir()) {
return false;       }
// Get the OpenCL platform.
platform = findPlatform("Intel"); // CHANGE TO Altera FOR FINAL
if(platform == NULL) {
printf("ERROR: Unable to find Intel OpenCL platform.n");
return false;      }
// Query the available OpenCL device.
device.reset(getDevices(platform, CL_DEVICE_TYPE_ALL, &num_devices));
printf("Platform: %sn", getPlatformName(platform).c_str());
printf("Using %d device(s)n", num_devices);
for(unsigned i = 0; i < num_devices; ++i) {
printf("  %snn", getDeviceName(device[i]).c_str()); // ANOTHER LINE FOR PRINTF DEBUGGING
}
// Create the context.
context = clCreateContext(NULL, num_devices, device, NULL, NULL, &status);
checkError(status, "Failed to create context");
// Create the program for all device. Use the first device as the
// representative device (assuming all device are of the same type).
std::string binary_file = getBoardBinaryFile("GaussianEl", device[0]);
printf("Using AOCX: %sn", binary_file.c_str());
program = createProgramFromBinary(context, binary_file.c_str(), device, num_devices);
// Build the program that was just created.
status = clBuildProgram(program, 0, NULL, "", NULL, NULL);
checkError(status, "Failed to build program");
// Create per-device objects.
queue.reset(num_devices);
kernel.reset(num_devices);
n_per_device.reset(num_devices);
input_a_buf.reset(num_devices);
input_b_buf.reset(num_devices);
output_buf.reset(num_devices);
n = Make_Matrix(A);N=n;
for (int j = 0; j < n ; j++)
{
for (int k = 0 ; k < n+1 ; k++)
{
parallelA[k+j*(1+n)]=A[j][k];
}
}
for(unsigned i = 0; i < num_devices; ++i) {
// Command queue.
queue[i] = clCreateCommandQueue(context, device[i], CL_QUEUE_PROFILING_ENABLE, &status);
checkError(status, "Failed to create command queue");
// Kernel.
const char *kernel_name = "GaussianEl";
kernel[i] = clCreateKernel(program, kernel_name, &status);
checkError(status, "Failed to create kernel");
// Determine the number of elements processed by this device.
n_per_device[i] = N / num_devices; // Maybe change to n ?
// Spread out the remainder of the elements over the first
// N % num_devices.
if(i < (N % num_devices)) {
n_per_device[i]++;      }
// Input buffers. // A, x, n
input_a_buf[i] = clCreateBuffer(context, CL_MEM_READ_WRITE,
n_per_device[i]*(n_per_device[i]+1)*sizeof(float), NULL, &status);
checkError(status, "Failed to create buffer for input A");
input_b_buf[i] = clCreateBuffer(context, CL_MEM_READ_ONLY,
sizeof(int), NULL, &status);
checkError(status, "Failed to create buffer for input B");
// Output buffer.
output_buf[i] = clCreateBuffer(context, CL_MEM_READ_WRITE,
n_per_device[i] * sizeof(float), NULL, &status);
checkError(status, "Failed to create buffer for output");
}
return true;
}
// Request value for k
long int k_input(){
int ch;
char *p;
char buffer[16];
while ( ( ch=getchar() ) != 'n' && (ch != EOF) ){} // clear input stream
printf ( "In base 2, ranging from 1 to 1024, enter the number of parallel threads to execute: " );
INPUT:
if( fgets( buffer,sizeof(buffer),stdin ) != NULL ) // waits for input
{
long int kr = strtol(buffer,&p,10);
if( buffer[0] != 'n' && (*p == 'n' || *p == ''))
{
float x = log2(kr);
if( fmodf(x,1) == 0 )
{
if( kr<=1024 )
{
printf("%ld kernals will be launched, good job!nn",kr);
return kr;
}
else
{
printf("Too many kernals, must be from 1 to 1024.n");
goto INPUT;
}
}
else
printf("%ld is not a power of 2, try again..n",kr);
goto INPUT;
}
else //
{
for( int i = 0;i<strlen(buffer);i++ )
{
if( buffer[i]=='n' )
buffer[i]='';
}
printf("Received %ld. Invalid input: "%s". Fatal Error.n", kr, p);
return 0;
}
}
else // fgets is NULL due to error or EOF condition
printf ( "Error reading input.n" );
return 0;
}
// Initialize the data for the problem. Requires num_devices to be known
void init_problem(){
if(num_devices == 0)            {
checkError(-1, "No devices");   }
input_a.reset(num_devices);
input_b.reset(num_devices);
output.reset(num_devices);
ref_output.reset(num_devices);
// Generate input vectors A and B and the reference output consisting
// of a total of N elements.
// We create separate arrays for each device so that each device has an
// aligned buffer.
for(unsigned i = 0; i < num_devices; ++i) {
input_a[i].reset(n_per_device[i]*(n_per_device[i]+1));
input_b[i].reset(1);
output[i].reset(n_per_device[i]);
ref_output[i].reset(n_per_device[i]);
input_a[i]=parallelA;
input_b[i][0]=n;
// Serial Code
start_time = getCurrentTimestamp();
Solve_Matrix(n,A,x);
ref_output[i]=x; // ref_output[1]
end_time = getCurrentTimestamp();
}
Print_Solution(n,x);
}
// Run kernel ///////////////////////////////////
void run(){
cl_int status;
// Launch the problem for each device.
scoped_array<cl_event> kernel_event(num_devices);
scoped_array<cl_event> finish_event(num_devices);
for(unsigned i = 0; i < num_devices; ++i) {
// Transfer inputs to each device. Each of the host buffers supplied to
// clEnqueueWriteBuffer here is already aligned to ensure that DMA is used
// for the host-to-device transfer.
//status = clEnqueueMapBuffer(queue[i],input_a_buf[i], CL_TRUE, ,
//    0, n_per_device[i]*(n_per_device[i]+1)*sizeof(float), 0, NULL, &write_event[0]);
cl_event write_event[2];
status = clEnqueueWriteBuffer(queue[i], input_a_buf[i], CL_TRUE,
0, n_per_device[i]*(n_per_device[i]+1)*sizeof(float), input_a[i], 0, NULL, &write_event[0]);
checkError(status, "Failed to transfer input A");
// WRITTEN FROM input_a // CL_FALSE
status = clEnqueueWriteBuffer(queue[i], input_b_buf[i], CL_FALSE,
0, sizeof(int), input_b[i], 0, NULL, &write_event[1]);
checkError(status, "Failed to transfer input B");
// WRITTEN FROM input_b
// Set kernel arguments.
unsigned argi = 0;
status = clSetKernelArg(kernel[i], argi++, sizeof(cl_mem), &input_a_buf[i]);
checkError(status, "Failed to set argument %d", argi - 1);
status = clSetKernelArg(kernel[i], argi++, sizeof(cl_mem), &input_b_buf[i]);
checkError(status, "Failed to set argument %d", argi - 1);
status = clSetKernelArg(kernel[i], argi++, sizeof(cl_mem), &output_buf[i]);
checkError(status, "Failed to set argument %d", argi - 1);
// Enqueue kernel.
// Use a global work size corresponding to the number of elements to add
// for this device.
//
// We don't specify a local work size and let the runtime choose
// (it'll choose to use one work-group with the same size as the global
// work-size).
//
// Events are used to ensure that the kernel is not launched until
// the writes to the input buffers have completed.
const size_t global_work_size = kr; // kr = (1024, 512, 256, etc)
printf("nLaunching for device %d (%lu elements)n", i, global_work_size);
status = clEnqueueNDRangeKernel(queue[i], kernel[i], 1, NULL,
&global_work_size, NULL, 2, write_event, &kernel_event[i]);
checkError(status, "Failed to launch kernel");
// Read the result. This the final operation.
status = clEnqueueReadBuffer(queue[i], output_buf[i], CL_FALSE,
0, n_per_device[i]*sizeof(float), output[i], 1, &kernel_event[i], &finish_event[i]);
// READ INTO output WHICH IS parallelx
// Release local events.
clReleaseEvent(write_event[0]);
clReleaseEvent(write_event[1]);
}
// Wait for all devices to finish.
clWaitForEvents(num_devices, finish_event);
//parallelx=*output[1]; /////////////////////////////////// not required for pass
//Print_Solution(n,parallelx);
// Wall-clock time taken.
Serial_Time = (end_time - start_time) * 1e3;
printf("nSerial Time: %0.3f msn", Serial_Time);
// Get kernel times using the OpenCL event profiling API.
for(unsigned i = 0; i < num_devices; ++i) {
cl_ulong time_ns = getStartEndTime(kernel_event[i]);
printf("Kernel time (device %d): %0.3f msn", i, double(time_ns) * 1e-6);
printf("Speed-up is: %0.3f", Serial_Time/(double(time_ns)*1e-6));
}
// Release all events.
for(unsigned i = 0; i < num_devices; ++i) {
clReleaseEvent(kernel_event[i]);
clReleaseEvent(finish_event[i]);
}
// Verify results.
bool pass = true;
for(unsigned i = 0; i < num_devices && pass; ++i) {
for(unsigned j = 0; j < n_per_device[i] && pass; ++j) {
if(fabsf(output[i][j] - ref_output[i][j]) > 1.0e-5f) {
printf("Failed verification @ device %d, index %dnOutput: %fnReference: %fn",
i, j, output[i][j], ref_output[i][j]);
pass = false;
}
}
}
printf("nVerification: %sn", pass ? "PASS" : "FAIL");
}
// Free the resources allocated during initialization
void cleanup(){
for(unsigned i = 0; i < num_devices; ++i) {
if(kernel && kernel[i]) {
clReleaseKernel(kernel[i]);
}
if(queue && queue[i]) {
clReleaseCommandQueue(queue[i]);
}
if(input_a_buf && input_a_buf[i]) {
clReleaseMemObject(input_a_buf[i]);
}
if(input_b_buf && input_b_buf[i]) {
clReleaseMemObject(input_b_buf[i]);
}
if(output_buf && output_buf[i]) {
clReleaseMemObject(output_buf[i]);
}
}
if(program) {
clReleaseProgram(program);
}
if(context) {
clReleaseContext(context);
}
}

内核:

__kernel void GaussianEl(__global float *A,
__global int *y,
__global float *restrict z)
{
unsigned index = get_global_id(0);  // get index of the work item
unsigned kr = get_global_size(0);   // number of global work items
unsigned n = *y;                    // dimension of matrix
float scale;            // initialize scale
int a,b,i,j,k,h;        // variables
if(index==0) { // Show received info is good
printf("----------------------nThe dimension size is: %dnThe kr is: %dnThe original matrix is:n",n,kr); //
print_matrix(n, index, A);
}
for(i=0; i<n; i++) //* LOOP FOR UPPER TRIANGULAR MATRIX
{
if( A[i+i*(n+1)] == 0 ){printf("Error, divide by zero encountered.n");} // do something for if divide by zero for re-calculating matrix?
for(j=0; j<n; j++) // THIS FOR REPRESENTS THE jTH ROW OF THE iTH COLUMN PARALLELIZED
{                  // BARRIER MUST BE PLACED AFTER THE iTH COLUMN ELEMENT IN A ROW IS ZEROED
h=j/kr;        // h=1/2=0, h=2/2=1 E.G VALUES FOR 3X3 KR=2
if(j==i && index==(j-h*kr))
{
scale=A[i+i*(n+1)];
printf("kernel=%d, i=%d, j=%d, h=%d, scale=%.1f/%.1f, Value=%dn", index, i, j, h, A[i+j*(n+1)], A[i+i*(n+1)],(j-h*kr) );
for(k=i; k<=n; k++)
{
A[k+j*(n+1)]=A[k+j*(n+1)]/scale;
}
print_matrix(n, index, A);
}
if( (j>i) && (index==(j-h*kr)) )
{
scale=A[i+j*(n+1)]/A[i+i*(n+1)];
printf("kernel=%d, i=%d, j=%d, h=%d, scale=%.1f/%.1f, Value=%dn", index, i, j, h, A[i+j*(n+1)], A[i+i*(n+1)],(j-h*kr) );
for(k=0; k<=n; k++)
{
A[k+j*(n+1)]=A[k+j*(n+1)]-scale*A[k+i*(n+1)];
}
print_matrix(n, index, A);
}
}
barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE);
}
... // 1 more loop after this to finish upper half

结果:

The serial/parallel solution is:
x0=3.0
x1=4.0
x2=-2.0
Launching for device 0 (2 elements)
----------------------
The dimension size is: 3
The kr is: 2
The original matrix is:
1.0  1.0  1.0  5.0
2.0  3.0  5.0  8.0
4.0  0.0  5.0  2.0
kernel=0, i=0, j=0, h=0, scale=1.0/1.0, Value=0
1.0  1.0  1.0  5.0
2.0  3.0  5.0  8.0
4.0  0.0  5.0  2.0
kernel=0, i=0, j=2, h=1, scale=4.0/1.0, Value=0
1.0  1.0  1.0  5.0
2.0  3.0  5.0  8.0
0.0  -4.0  1.0  -18.0
kernel=0, i=1, j=2, h=1, scale=-4.0/3.0, Value=0
1.0  1.0  1.0  5.0
2.0  3.0  5.0  8.0
2.7  0.0  7.7  -7.3
kernel=0, i=2, j=2, h=1, scale=7.7/7.7, Value=0
1.0  1.0  1.0  5.0
2.0  3.0  5.0  8.0
2.7  0.0  1.0  -1.0
..and now the rest...
test 1
test 2: i=0,j=0, h=0
test 2: i=0,j=1, h=0
test 2: i=0,j=2, h=1
test 1
test 2: i=1,j=0, h=0
kernel=0, i=1, j=0, h=0, scale=1.0/3.0, Value=0
0.3  0.0  -0.7  2.3
2.0  3.0  5.0  8.0
2.7  0.0  1.0  -1.0
test 2: i=1,j=1, h=0
test 2: i=1,j=2, h=1
test 1
test 2: i=2,j=0, h=0
kernel=0, i=2, j=0, h=0, scale=-0.7/1.0, Value=0
2.1  0.0  0.0  1.7
2.0  3.0  5.0  8.0
2.7  0.0  1.0  -1.0

test 2: i=2,j=1, h=0
test 2: i=2,j=2, h=1
The value of x0 is: 1.7
The value of x1 is: 8.0
The value of x2 is: -1.0
kernel=1, i=0, j=1, h=0, scale=2.0/2.1, Value=1
kernel=1, i=1, j=1, h=0, scale=3.0/3.0, Value=1
test 1
test 2: i=0,j=0, h=0
test 2: i=0,j=1, h=0
test 2: i=0,j=2, h=1
test 1
test 2: i=1,j=0, h=0
test 2: i=1,j=1, h=0
test 2: i=1,j=2, h=1
test 1
test 2: i=2,j=0, h=0
test 2: i=2,j=1, h=0
test 2: i=2,j=2, h=1
The value of x0 is: 1.7
The value of x1 is: 2.1
The value of x2 is: -1.0

Serial Time: 0.570 ms
Kernel time (device 0): 1.442 ms
Speed-up is: 0.395Failed verification @ device 0, index 0
Output: 1.695652
Reference: 3.000000
Verification: FAIL

出现这个问题似乎是因为我使用了API,并且以以前实验室中的方式调度内核。问题是,在以前的实验室中,内核执行各自的部分,然后返回。当我在内核代码中调试打印语句时,命令窗口会一个接一个地显示它们。这很好,它很有效,我可以看到它是如何工作的,因为我可以看到加速(当我上传代码到Altera板时,当然不是在模拟过程中(。然而,现在我需要在内核完成之前从全局意义上返回结果,这需要多次发生,并且所有内核都需要能够读取其他内核执行的矩阵的这种更改。我一直在使用clEnqueueMapBuffer,但我甚至不确定这是否是正确的API来使用或替换clEnqueueWriteBuffer。今晚我将按照教授的要求,把这个问题去掉。

首先,您应该从一开始就提到它是关于Altera FPGA的,因为OpenCL在那里的工作方式与在GPU上的工作方式有点不同。

根据我的经验,尤其是在使用FPGA时,你需要一步一步地完成。因此:

  1. 从更简单的开始,使用1个设备
  2. 使用阻塞调用而不是事件,例如:clEnqueueWriteBuffer、clEnqueueNDRangeKernel、clEnQueue ReadBuffer
  3. 不信任环境,在调度内核时也设置local_work_size=global_work_sze(clEnqueueNDRangeKernel(
  4. 从1个工作项内核开始,如果需要,稍后迁移到许多工作项(如果我记得很清楚,你可能不需要迁移到fpga上的许多工作项,因为性能会相同(
  5. 首先在CPU上测试内核的正确性,可以在Altera上使用调试模式,也可以在CPU上直接使用OpenCL,但您需要稍微调整主机代码才能做到这一点(例如,您不会读取aoxc文件,而是构建内核JIT(。对我来说,后者效果更好
  6. 在您需要类似屏障的情况下,将内核拆分为多个内核可能是更好的主意。缓冲区不需要重新分配,也不需要读回,只需直接通过它们到达下一个内核即可
  7. 熟悉Altera/Intel FPGA OpenCL手册、最佳实践等

这或多或少是我为Altera FPGA开发内核时所记得的全部内容。

最新更新