Metal - 优化内存访问



这个问题有两个部分,但它们密切相关:

问题1

Metal 是否提供了一种利用共享线程组内存的方法?

例如,在 CUDA 中,您可以像这样将数据从设备内存显式加载到共享内存中:

__shared__ float example1

Metal 是否提供此类功能?似乎所有缓冲区访问都是从全局内存加载的,除非幕后发生了一些隐藏的魔法。

问题2

这可能不是 Metal 独有的,所以任何 GPU 大师都可能会有所帮助。Apple 在这里提供了一个矩阵乘法示例 - 我将粘贴下面的内核以供参考:

// Sizes and byte strides handed to the MatrixMultiply kernel via buffer(3).
typedef struct
{
// m, k   : upper bounds the kernel checks against the x / y origin of a tile.
// n      : multiplied by qbytes in the kernel to locate the end of B.
// pbytes : byte stride the kernel uses to step through A.
// qbytes : byte stride the kernel uses to step through B and between rows of C.
ushort m, k, n, pbytes, qbytes;
} MetalMatrixDim;

// Computes one 8x8 tile of C per thread by accumulating outer products.
//
// A, B : read-only input buffers (buffer 0 and buffer 1).
// C    : output buffer (buffer 2).
// dims : sizes and byte strides (buffer 3).
// gid  : this thread's position in the grid; scaled by 8 to get the tile origin.
//
// Each loop iteration reads 8 consecutive floats from A and 8 from B, adds
// their 8x8 outer product to the accumulators, then advances A by pbytes and
// B by qbytes. The loop ends when the B pointer reaches B + qbytes * n bytes.
// NOTE(review): this access pattern suggests A is stored with the reduction
// index as its row index (i.e. transposed relative to a plain C = A * B
// layout) - confirm against the host code that fills the buffers.
kernel void MatrixMultiply(const device float*       A    [[ buffer(0) ]],
const device float*       B    [[ buffer(1) ]],
device float*             C    [[ buffer(2) ]],
constant MetalMatrixDim&  dims [[ buffer(3) ]],
ushort2                   gid  [[ thread_position_in_grid ]])
{
ushort m = dims.m;
ushort k = dims.k;
ushort n = dims.n;
ushort pbytes = dims.pbytes;
ushort qbytes = dims.qbytes;
// Shift by 3 == multiply by 8: each thread owns an 8x8 tile.
ushort2 gidIn = ushort2(gid.x << 3, gid.y << 3);
// Only the tile origin is bounds-checked; the loads and stores below always
// touch 8 elements in each direction.
// NOTE(review): presumably the buffers are padded to multiples of 8 - verify.
if (gidIn.x >= m || gidIn.y >= k) return;
// Start of this tile's 8 floats within the first row of A and of B.
// NOTE(review): the float4 casts assume suitably aligned addresses - confirm.
const device float4* a = (const device float4*)(A + gidIn.x);
const device float4* b = (const device float4*)(B + gidIn.y);
// Move C down gidIn.x rows (qbytes bytes per row), then gidIn.y floats across.
C = (device float*)((device char*)C + gidIn.x*qbytes);
device float4* c = (device float4*)(C + gidIn.y);
// Loop sentinel: n strides of qbytes past the start of B.
const device float4* Bend = (const device float4*)((const device char*)B + qbytes*n);
// 16 float4 accumulators = 8 rows x 8 columns.
// Row r of the tile lives in s(2r) (columns 0-3) and s(2r+1) (columns 4-7).
float4 s0  = 0.0f, s1  = 0.0f, s2  = 0.0f, s3  = 0.0f;
float4 s4  = 0.0f, s5  = 0.0f, s6  = 0.0f, s7  = 0.0f;
float4 s8  = 0.0f, s9  = 0.0f, s10 = 0.0f, s11 = 0.0f;
float4 s12 = 0.0f, s13 = 0.0f, s14 = 0.0f, s15 = 0.0f;
// do/while: the body runs at least once, before b is first compared to Bend.
do
{
// 8 values from A and 8 values from B for the current step.
float4 aCurr0 = a[0];
float4 aCurr1 = a[1];
float4 bCurr0 = b[0];
float4 bCurr1 = b[1];
// Rows 0-3 of the tile (scalars from aCurr0), left half (bCurr0).
s0   += (aCurr0.x * bCurr0);
s2   += (aCurr0.y * bCurr0);
s4   += (aCurr0.z * bCurr0);
s6   += (aCurr0.w * bCurr0);
// Rows 0-3, right half (bCurr1).
s1   += (aCurr0.x * bCurr1);
s3   += (aCurr0.y * bCurr1);
s5   += (aCurr0.z * bCurr1);
s7   += (aCurr0.w * bCurr1);
// Rows 4-7 of the tile (scalars from aCurr1), left half.
s8   += (aCurr1.x * bCurr0);
s10  += (aCurr1.y * bCurr0);
s12  += (aCurr1.z * bCurr0);
s14  += (aCurr1.w * bCurr0);
// Rows 4-7, right half.
s9   += (aCurr1.x * bCurr1);
s11  += (aCurr1.y * bCurr1);
s13  += (aCurr1.z * bCurr1);
s15  += (aCurr1.w * bCurr1);
// Step both inputs by their byte strides.
a = (device float4*)((device char*)a + pbytes);
b = (device float4*)((device char*)b + qbytes);
} while(b < Bend);
// Write the tile out one row at a time: two float4 stores per row, then
// advance c by qbytes to the next row of C.
c[0] = s0;  c[1] = s1;  c = (device float4*)((device char*)c + qbytes);
c[0] = s2;  c[1] = s3;  c = (device float4*)((device char*)c + qbytes);
c[0] = s4;  c[1] = s5;  c = (device float4*)((device char*)c + qbytes);
c[0] = s6;  c[1] = s7;  c = (device float4*)((device char*)c + qbytes);
c[0] = s8;  c[1] = s9;  c = (device float4*)((device char*)c + qbytes);
c[0] = s10; c[1] = s11; c = (device float4*)((device char*)c + qbytes);
c[0] = s12; c[1] = s13; c = (device float4*)((device char*)c + qbytes);
c[0] = s14; c[1] = s15;
}

问题:对于每个线程,此内核计算输出C的 8 x 8 扇区。这是什么原因呢?为什么不允许每个线程计算C的单个元素,这将消除 8 倍数的大小限制并为较小的矩阵提供更好的并行化?

我假设这个实现必定以某种方式做了优化,我猜它与线程同步和内存访问有关——这就是我将其与问题 1 放在一起提问的原因。有什么想法吗?

我看不出你的两个问题之间有什么关系。关于问题 1:是的,Metal 在计算函数中提供共享线程组内存。只需在变量声明上指定threadgroup地址空间限定符即可。例如:

threadgroup float example1;

还可以将线程组缓冲区指定为计算函数的输入参数。

// Skeleton of a compute function that takes threadgroup memory as an argument
// (the "..." placeholders stand for the remaining parameters and the body).
kernel void my_func(...,
// Pointer in the threadgroup address space, bound at threadgroup index 0.
threadgroup float *example2 [[threadgroup(0)]],
...)
{
...
}

缓冲区由设备分配。缓冲区的大小是使用计算命令编码器的-setThreadgroupMemoryLength:atIndex:方法设置的。

最新更新