// Reduce each thread's partial sum across the block with CUB's BlockReduce;
// per CUB's contract only thread 0 receives the complete block-wide sum,
// so only its quotient is the true column mean.
let column_mean_partial = blockReducer.Reduce(temp_storage, acc, fun a b -> a + b) / (float32 num_rows)
if threadIdx.x = 0 then
    // Thread 0 stores the mean to global memory and stashes it in a
    // shared-memory cell so the rest of the block can pick it up.
    means.[col] <- column_mean_partial
    column_mean_shared := column_mean_partial
// Make thread 0's shared-memory write visible to the whole block.
__syncthreads()
// Every thread now reads the broadcast mean (needed for the variance pass).
let column_mean = !column_mean_shared
上面的代码片段计算二维矩阵每列的平均值。由于块内只有线程0持有完整的归约结果,我先将它存入共享内存变量column_mean_shared,调用__syncthreads(),然后把该值广播给块中的所有线程——因为后续计算方差时所有线程都需要这个均值。
是否有更好的广播方式,还是上面的做法已经足够高效了?
当我发布这个问题时,我并没有期望太多,但事实证明,对于大型矩阵,例如128x10000,确实有更好的方法。我编写了一个warp归约(warp reduce)内核,它的块大小为32,这使得它可以只用shuffle xor指令完成整个归约。
对于128x100000的100次迭代,第一个版本每个网格使用64个块(每个块32个线程),耗时0.5s。对于CUB行减少,需要0.25s。
当我将每个网格的块数增加到256时,我得到了近4倍的加速,达到1.5s左右。在每个网格384个块时,它需要1.1s,而进一步增加块的数量似乎不再提高性能。
对于我感兴趣的问题规模,改进几乎没有那么显著。
对于128x1024和128x512情况,10000次迭代:
对于1024:1秒vs 0.82秒,warp归约版本占优。对于512:0.914秒vs 0.873秒,warp归约版本占优。
对于小矩阵,任何由并行性带来的加速似乎都会被内核启动时间所抵消。
对于256:0.94秒vs 0.78秒,warp归约版本占优。对于160:0.88秒vs 0.77秒,warp归约版本占优。
使用GTX 970进行测试。
对于Kepler及更早的NVIDIA显卡,数字可能会有所不同,因为Maxwell把每个SM可常驻的线程块数量上限从16提高到了32,从而提高了多处理器占用率。
我对此很满意,因为性能改进很好,而且在使用CUB块归约之前,我实际上无法写出正确使用共享内存的内核。我忘了CUDA有时有多痛苦。
不使用共享内存的版本如此有竞争力,真是令人惊讶。下面是我测试的两个模块:
// Computes the per-column mean and (population, 1/N) standard deviation of a
// column-major num_rows x num_cols matrix. One thread block handles one
// column at a time, striding over columns by gridDim.x, and uses CUB's
// BlockReduce for the intra-block sums.
type rowModule(target) =
    inherit GPUModule(target)
    let grid_size = 64
    let block_size = 128
    // NOTE(review): `worker` is not defined in this snippet — presumably
    // provided by GPUModule or the enclosing scope; confirm against the
    // full file.
    let blockReducer = BlockReduce.RakingCommutativeOnly<float32>(dim3(block_size,1,1),worker.Device.Arch)
    [<Kernel;ReflectedDefinition>]
    member this.Kernel (num_rows:int) (num_cols:int) (x:deviceptr<float32>) (means:deviceptr<float32>) (stds:deviceptr<float32>) =
        // Point block_start to where the column starts in the array.
        let mutable col = blockIdx.x
        let temp_storage = blockReducer.TempStorage.AllocateShared()
        // Shared-memory cell used to broadcast the column mean from thread 0
        // to the rest of the block.
        let column_mean_shared = __shared__.Variable()
        while col < num_cols do
            // i is the row index
            let mutable row = threadIdx.x
            let mutable acc = 0.0f
            // First pass: each thread accumulates a strided partial sum of
            // the column (threads step by blockDim.x over the rows).
            while row < num_rows do
                // idx is the absolute index in the array (column-major:
                // consecutive rows of a column are contiguous).
                let idx = row + col * num_rows
                acc <- acc + x.[idx]
                // Increment the row index
                row <- row + blockDim.x
            // Per CUB's contract, only thread 0 receives the complete
            // block-wide sum; the division happens on all threads but only
            // thread 0's quotient is the true mean.
            let column_mean_partial = blockReducer.Reduce(temp_storage, acc, fun a b -> a + b) / (float32 num_rows)
            if threadIdx.x = 0 then
                means.[col] <- column_mean_partial
                // Stash the mean so the whole block can read it below.
                column_mean_shared := column_mean_partial
            // Publish thread 0's shared-memory write to the whole block
            // (also separates the two uses of the CUB temp storage).
            __syncthreads()
            let column_mean = !column_mean_shared
            row <- threadIdx.x
            acc <- 0.0f
            // Second pass: accumulate squared deviations from the mean.
            while row < num_rows do
                // idx is the absolute index in the array
                let idx = row + col * num_rows
                // Accumulate the variances.
                acc <- acc + (x.[idx]-column_mean)*(x.[idx]-column_mean)
                // Increment the row index
                row <- row + blockDim.x
            // Biased (1/N) variance; sqrt gives the population std deviation.
            let variance_sum = blockReducer.Reduce(temp_storage, acc, fun a b -> a + b) / (float32 num_rows)
            if threadIdx.x = 0 then stds.[col] <- sqrt(variance_sum)
            // Stride the whole grid to the next batch of columns.
            col <- col + gridDim.x
    // Launches the kernel over the device matrix; means/stds receive one
    // value per column.
    member this.Apply((dmat: dM), (means: dM), (stds: dM)) =
        let lp = LaunchParam(grid_size, block_size)
        this.GPULaunch <@ this.Kernel @> lp dmat.num_rows dmat.num_cols dmat.dArray.Ptr means.dArray.Ptr stds.dArray.Ptr
// Same computation as rowModule (per-column mean and population std of a
// column-major matrix), but each block is a single warp (32 threads), so the
// whole reduction is done with XOR shuffles — no shared memory and no
// __syncthreads() needed.
type rowWarpModule(target) =
    inherit GPUModule(target)
    let grid_size = 384
    let block_size = 32
    [<Kernel;ReflectedDefinition>]
    member this.Kernel (num_rows:int) (num_cols:int) (x:deviceptr<float32>) (means:deviceptr<float32>) (stds:deviceptr<float32>) =
        // Point block_start to where the column starts in the array.
        let mutable col = blockIdx.x
        while col < num_cols do
            // i is the row index
            let mutable row = threadIdx.x
            let mutable acc = 0.0f
            // Each lane accumulates a strided partial sum of the column.
            while row < num_rows do
                // idx is the absolute index in the array
                let idx = row + col * num_rows
                acc <- acc + x.[idx]
                // Increment the row index
                row <- row + blockDim.x
            // Butterfly (XOR) warp reduction over lane distances 16,8,4,2,1;
            // after these five steps every lane holds the full warp sum, so
            // no separate broadcast step is required.
            let inline butterflyWarpReduce (value:float32) =
                let v1 = value + __shfl_xor value 16 32
                let v2 = v1 + __shfl_xor v1 8 32
                let v3 = v2 + __shfl_xor v2 4 32
                let v4 = v3 + __shfl_xor v3 2 32
                v4 + __shfl_xor v4 1 32
            // All 32 lanes compute the same mean here.
            let column_mean = (butterflyWarpReduce acc) / (float32 num_rows)
            row <- threadIdx.x
            acc <- 0.0f
            // Second pass: accumulate squared deviations from the mean.
            while row < num_rows do
                // idx is the absolute index in the array
                let idx = row + col * num_rows
                // Accumulate the variances.
                acc <- acc + (x.[idx]-column_mean)*(x.[idx]-column_mean)
                // Increment the row index
                row <- row + blockDim.x
            // Biased (1/N) variance, same convention as rowModule.
            let variance_sum = (butterflyWarpReduce acc) / (float32 num_rows)
            if threadIdx.x = 0
            then stds.[col] <- sqrt(variance_sum)
                 // NOTE(review): indentation was lost in the paste; the means
                 // write is placed inside the lane-0 guard to match rowModule.
                 // Every lane holds the same column_mean after the butterfly
                 // reduce, so an unguarded write would store the same value.
                 means.[col] <- column_mean
            col <- col + gridDim.x
    // Launches the kernel over the device matrix; means/stds receive one
    // value per column.
    member this.Apply((dmat: dM), (means: dM), (stds: dM)) =
        let lp = LaunchParam(grid_size, block_size)
        this.GPULaunch <@ this.Kernel @> lp dmat.num_rows dmat.num_cols dmat.dArray.Ptr means.dArray.Ptr stds.dArray.Ptr