我正在使用OpenCL
在Android平台中开发面部检测应用程序。面部检测算法基于中提琴琼斯算法。我试图制作级联分类步骤内核代码。我将CASCADE阶段1的classifier data
设置为local memory(__local)
,因为分类器数据用于所有工作项目。
但是,内核分析时间不使用本地mem(使用全局mem)比使用本地内存更快。
编辑:
我上传了完整的代码。
使用本地内存版本
__kernel void CASCADE(__read_only image2d_t input_image, __write_only image2d_t output_image,__constant float* classifierMem,__constant int* idxNumValStageArray, int numTotStage, __constant int* vecSkin){
int cascadeLocalSize = get_local_size(0);
__local float localS1F1[42];
int localIdx = get_local_id(1)*cascadeLocalSize + get_local_id(0);
if(localIdx<42)
{
int stage1Idx = localIdx + idxNumValStageArray[0]+4;
localS1F1[localIdx] = classifierMem[stage1Idx];
}
barrier(CLK_LOCAL_MEM_FENCE);
float resizeFactor = 1.0;
int2 im_dim = get_image_dim(input_image);
unsigned int srcWidth = im_dim.x*(float)resizeFactor;
unsigned int srcHeight = im_dim.y*(float)resizeFactor;
int gx = get_global_id(0);
int gy = get_global_id(1);
int skinX=0;
int skinY=0;
int coordi=vecSkin[512*gy+gx];
skinX = coordi%im_dim.x;
skinY = coordi/im_dim.x;
if( skinX >= 10 && skinY >= 10 )
{
skinX -= 10;
skinY -= 10;
}
int type = gx%3;
unsigned int windowWidth = classifierMem[0];
unsigned int windowHeight = classifierMem[1];
unsigned int stageIndex;
float stageThres;
float numFeatures;
unsigned int featureIndex;
float featureValue;
if(skinX<srcWidth-windowWidth-1 && skinY<srcHeight-windowHeight-1){
bool stagePass = true;
unsigned int index = 0;
for(unsigned int i=numTotStage; i>0;i--){
if(stagePass){
if(index == 0){
stageIndex = idxNumValStageArray[0];
stageThres = classifierMem[stageIndex+2];
numFeatures = classifierMem[stageIndex+3];
featureIndex = 0;
featureValue = 0.0;
}
else{
stageIndex = idxNumValStageArray[index];
stageThres = classifierMem[stageIndex+2];
numFeatures = classifierMem[stageIndex+3];
featureIndex = stageIndex+4;
featureValue = 0.0;
}
float featureThres;
float succVal;
float failVal;
unsigned int numRegions;
float regionValue;
if(type ==0 && index==0)
{
for(unsigned int j=numFeatures; j>0;j--){
if(stagePass){
featureThres=localS1F1[featureIndex++]*(windowWidth*windowHeight);
succVal=localS1F1[featureIndex++];
failVal=localS1F1[featureIndex++];
numRegions = localS1F1[featureIndex++];
regionValue =0.0;
float4 rectValue;
int4 regionP;
for(unsigned int k=numRegions; k>0;k--){
regionP.x=(int)(localS1F1[featureIndex])+skinX;
regionP.y=(int)(localS1F1[featureIndex+1])+skinY;
regionP.z=(int)(localS1F1[featureIndex+2])+regionP.x;
regionP.w=(int)(localS1F1[featureIndex+3])+regionP.y;
rectValue.x = read_imagef(input_image, sampler, regionP.xy).x;
rectValue.y = read_imagef(input_image, sampler, regionP.zy).x;
rectValue.z = read_imagef(input_image, sampler, regionP.xw).x;
rectValue.w = read_imagef(input_image, sampler, regionP.zw).x;
regionValue += dot(rectValue, (float4)(1.0f, -1.0f, -1.0f, 1.0f))*localS1F1[featureIndex+4];
featureIndex+=5;
}
featureValue += (regionValue < featureThres)?failVal:succVal;
if(j*2 == (unsigned int)numFeatures && featureValue*2 < stageThres) stagePass =false;
}// end of if(stagePass)
}// end of for(unsigned int j=numFeatures; j>0;j--)
index++;
if(featureValue < stageThres) stagePass =false;
else if(index==numTotStage) write_imagef(output_image, (int2)(skinX, skinY), (0.1));
}
else if(type ==1 && index ==0)
{
featureIndex +=14;
for(unsigned int j=numFeatures; j>0;j--){
if(stagePass){
if(j==1)
featureIndex -= 42;
featureThres=localS1F1[featureIndex++]*(windowWidth*windowHeight);
succVal=localS1F1[featureIndex++];
failVal=localS1F1[featureIndex++];
numRegions = localS1F1[featureIndex++];
regionValue =0.0;
float4 rectValue;
int4 regionP;
for(unsigned int k=numRegions; k>0;k--){
regionP.x=(int)(localS1F1[featureIndex])+skinX;
regionP.y=(int)(localS1F1[featureIndex+1])+skinY;
regionP.z=(int)(localS1F1[featureIndex+2])+regionP.x;
regionP.w=(int)(localS1F1[featureIndex+3])+regionP.y;
rectValue.x = read_imagef(input_image, sampler, regionP.xy).x;
rectValue.y = read_imagef(input_image, sampler, regionP.zy).x;
rectValue.z = read_imagef(input_image, sampler, regionP.xw).x;
rectValue.w = read_imagef(input_image, sampler, regionP.zw).x;
regionValue += dot(rectValue, (float4)(1.0f, -1.0f, -1.0f, 1.0f))*localS1F1[featureIndex+4];
featureIndex+=5;
}
featureValue += (regionValue < featureThres)?failVal:succVal;
if(j*2 == (unsigned int)numFeatures && featureValue*2 < stageThres) stagePass =false;
}
}
index++;
if(featureValue < stageThres) stagePass =false;
else if(index==numTotStage) write_imagef(output_image, (int2)(skinX, skinY), (0.1));
}
else if(index == 0)
{
featureIndex +=28;
for(unsigned int j=numFeatures; j>0;j--){
if(stagePass){
if(j==2) featureIndex -= 42;
featureThres=localS1F1[featureIndex++]*(windowWidth*windowHeight);
succVal=localS1F1[featureIndex++];
failVal=localS1F1[featureIndex++];
numRegions = localS1F1[featureIndex++];
regionValue =0.0;
float4 rectValue;
int4 regionP;
for(unsigned int k=numRegions; k>0;k--){
regionP.x=(int)(localS1F1[featureIndex])+skinX;
regionP.y=(int)(localS1F1[featureIndex+1])+skinY;
regionP.z=(int)(localS1F1[featureIndex+2])+regionP.x;
regionP.w=(int)(localS1F1[featureIndex+3])+regionP.y;
rectValue.x = read_imagef(input_image, sampler, regionP.xy).x;
rectValue.y = read_imagef(input_image, sampler, regionP.zy).x;
rectValue.z = read_imagef(input_image, sampler, regionP.xw).x;
rectValue.w = read_imagef(input_image, sampler, regionP.zw).x;
regionValue += dot(rectValue, (float4)(1.0f, -1.0f, -1.0f, 1.0f))*localS1F1[featureIndex+4];
featureIndex+=5;
}// end of for(unsigned int k=numRegions; k>0;k--)
featureValue += (regionValue < featureThres)?failVal:succVal;
if(j*2 == (unsigned int)numFeatures && featureValue*2 < stageThres) stagePass =false;
}// end of if(stagePass)
}//end of for(unsigned int j=numFeatures; j>0;j--)
index++;
if(featureValue < stageThres) stagePass =false;
else if(index==numTotStage) write_imagef(output_image, (int2)(skinX, skinY), (0.1));
}
//stage
else{
for(unsigned int j=numFeatures; j>0;j--){
if(stagePass){
featureThres=classifierMem[featureIndex++]*(windowWidth*windowHeight);
succVal=classifierMem[featureIndex++];
failVal=classifierMem[featureIndex++];
numRegions = classifierMem[featureIndex++];
regionValue =0.0;
float4 rectValue;
int4 regionP;
for(unsigned int k=numRegions; k>0;k--){
regionP.x=(int)(classifierMem[featureIndex])+skinX;
regionP.y=(int)(classifierMem[featureIndex+1])+skinY;
regionP.z=(int)(classifierMem[featureIndex+2])+regionP.x;
regionP.w=(int)(classifierMem[featureIndex+3])+regionP.y;
rectValue.x = read_imagef(input_image, sampler, regionP.xy).x;
rectValue.y = read_imagef(input_image, sampler, regionP.zy).x;
rectValue.z = read_imagef(input_image, sampler, regionP.xw).x;
rectValue.w = read_imagef(input_image, sampler, regionP.zw).x;
regionValue += dot(rectValue, (float4)(1.0f, -1.0f, -1.0f, 1.0f))*classifierMem[featureIndex+4];
featureIndex+=5;
}
featureValue += (regionValue < featureThres)?failVal:succVal;
if(j*2 == (unsigned int)numFeatures && featureValue*2 < stageThres) stagePass =false;
}
}
index++;
if(featureValue < stageThres) stagePass =false;
else if(index==numTotStage) write_imagef(output_image, (int2)(skinX, skinY), (0.1));
}
}
}
}else return;
}
原始版本(无本地mem)
__kernel void CASCADE(__read_only image2d_t input_image, __write_only image2d_t output_image,__constant float* classifierMem,__constant int* idxNumValStageArray, int numTotStage, __constant int* vecSkin){
float resizeFactor = 1.0;
int2 im_dim = get_image_dim(input_image);
unsigned int srcWidth = im_dim.x*(float)resizeFactor;
unsigned int srcHeight = im_dim.y*(float)resizeFactor;
int gx = get_global_id(0);
int gy = get_global_id(1);
int skinX=0;
int skinY=0;
int coordi=vecSkin[512*gy+gx];
skinX = coordi%im_dim.x;
skinY = coordi/im_dim.x;
if( skinX >= 10 && skinY >= 10 )
{
skinX -= 10;
skinY -= 10;
}
unsigned int windowWidth = classifierMem[0];
unsigned int windowHeight = classifierMem[1];
if(gx<srcWidth-windowWidth-1 && gy<srcHeight-windowHeight-1){
bool stagePass = true;
unsigned int index = 0;
for(unsigned int i=numTotStage; i>0;i--){
if(stagePass){
unsigned int stageIndex = idxNumValStageArray[index++];
float stageThres = classifierMem[stageIndex+2];
float numFeatures = classifierMem[stageIndex+3];
unsigned int featureIndex = stageIndex+4;
float featureValue = 0.0;
for(unsigned int j=numFeatures; j>0;j--){
if(stagePass){
float featureThres=classifierMem[featureIndex++]*(windowWidth*windowHeight);
float succVal=classifierMem[featureIndex++];
float failVal=classifierMem[featureIndex++];
unsigned int numRegions = classifierMem[featureIndex++];
float regionValue =0.0;
for(unsigned int k=numRegions; k>0;k--){
float4 rectValue;
int4 regionP;
regionP.x=(int)(classifierMem[featureIndex])+skinX;
regionP.y=(int)(classifierMem[featureIndex+1])+skinY;
regionP.z=(int)(classifierMem[featureIndex+2])+regionP.x;
regionP.w=(int)(classifierMem[featureIndex+3])+regionP.y;
rectValue.x = read_imagef(input_image, sampler, regionP.xy).x;
rectValue.y = read_imagef(input_image, sampler, regionP.zy).x;
rectValue.z = read_imagef(input_image, sampler, regionP.xw).x;
rectValue.w = read_imagef(input_image, sampler, regionP.zw).x;
regionValue += dot(rectValue, (float4)(1.0f, -1.0f, -1.0f, 1.0f))*classifierMem[featureIndex+4];
featureIndex+=5;
}
featureValue += (regionValue < featureThres)?failVal:succVal;
if(j*2 == (unsigned int)numFeatures && featureValue*2 < stageThres) stagePass =false;
}
}
if(featureValue < stageThres) stagePass =false;
else if(index==numTotStage) write_imagef(output_image, (int2)(skinX, skinY), (0.1));
}
}
}else return;
}
分析时间:原始版本(无本地内存):24ms修改版本(与本地MEM):28ms
编辑:实际上,localworksize null null gosasue globalworksize始终会因矢量大小而变化,而矢量大小将ndrangekernel构成。当将特定的localworksize放置时,面部检测率下降...因此我试图将localworksize null放置,然后面部检测率良好。所以我想解决原因。
这是主机代码:
//localWorkSize[0] = 16;
//localWorkSize[1] = 12;
numThreadsX=512;
globalWorkSize[0] = numThreadsX;
globalWorkSize[1] = vecCoordinate.size()% numThreadsX == 0 ? vecCoordinate.size()/ numThreadsX :(vecCoordinate.size()/ numThreadsX) + 1;
errNum = clEnqueueWriteBuffer(commandQueue,classifierMem,CL_TRUE,0,sizeof(float)*cntValArray,stageValArray,0,NULL,NULL);
errNum |= clEnqueueWriteBuffer(commandQueue,idxStageMem,CL_TRUE,0,sizeof(int)*haar.numStages,idxNumValStageArray,0,NULL,NULL);
errNum |= clSetKernelArg(kHaar_Cascade, 0, sizeof(memObjBuffer_Haar22), &memObjBuffer_Haar22);
errNum |= clSetKernelArg(kHaar_Cascade, 1, sizeof(memObjBuffer22), &memObjBuffer22);
errNum |= clSetKernelArg(kHaar_Cascade, 2, sizeof(cl_mem), &classifierMem);
errNum |= clSetKernelArg(kHaar_Cascade, 3, sizeof(cl_mem), &idxStageMem);
errNum |= clSetKernelArg(kHaar_Cascade, 4, sizeof(cl_int), &haar.numStages);
errNum |= clSetKernelArg(kHaar_Cascade, 5, sizeof(cl_mem), &memVecCoordi);
errNum = clEnqueueNDRangeKernel(commandQueue, kHaar_Cascade, 2, NULL,globalWorkSize, NULL,0, NULL, &event[3]);
有多种原因:
- 内存负载的线程差异
- 障碍不是免费的
- 更多说明
- 在大多数平台上,常数内存与共享内存几乎相同,因此额外的工作没有收益。
- 我们不知道共享内存版本正在保存多少读 - 请告诉我们工作组的大小。它越小,我们节省的非本地记忆的读取就越少。这是否是恒定的内存并不重要。
配置了它们两个,看看profilter的输出是否在最大利用任何事物时显示出任何内容,并向我们指示。
另外,我不相信您的localidx和stage1idx是有道理的,它们可能会脱离数组的界限并引起奇怪的行为。至少对于给定的GX/GY,您看起来好像正在使用分类中的不同索引。
您有五个浮点值(20个字节),每个工作项目都在读取。大多数现代的GPU都将具有硬件管理的缓存,这些缓存位置与用户管理的本地内存相似。在您不使用本地内存的版本中,这20个字节会很乐意坐在靠近Alus的缓存中,并提供您所希望的所有数据恢复性能优势。您使用本地内存的版本只是明确地做同一件事,但也添加了一堆额外的开销,以手动启动副本和工作项目之间的一些额外同步,这必定会减慢事物的速度。
>当仔细应用某些情况下,本地记忆可以在某些情况下带来性能优势。但是,在许多情况下,硬件缓存将做得更好,并为您提供更简单的代码。