What is an efficient way to broadcast a value to all the threads after using Cuda Unbound's blockReduce?


let column_mean_partial = blockReducer.Reduce(temp_storage, acc, fun a b -> a + b) / (float32 num_rows)
if threadIdx.x = 0 then 
    means.[col] <- column_mean_partial
    column_mean_shared := column_mean_partial
__syncthreads()
let column_mean = !column_mean_shared

The snippet above computes the mean of every column of a 2D matrix. Since only thread 0 of the block has the complete value after the reduce, I store it into the shared-memory variable column_mean_shared, call __syncthreads(), and then read it back on every thread, because all the threads need the mean in order to compute the variance.

Is there a better way to broadcast the value, or is the above already efficient enough?
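
For reference, the same pattern in CUB's native CUDA C++ interface looks roughly like the sketch below (a minimal illustration, not the code benchmarked in this post; CUB documents that the aggregate returned by BlockReduce is only valid on thread 0, which is why the round trip through shared memory is needed at all):

#include <cub/cub.cuh>

template <int BLOCK_SIZE>
__global__ void column_mean_kernel(const float* x, float* means,
                                   int num_rows, int num_cols)
{
    typedef cub::BlockReduce<float, BLOCK_SIZE> BlockReduce;
    __shared__ typename BlockReduce::TempStorage temp_storage;
    __shared__ float column_mean_shared;

    int col = blockIdx.x;  // one block per column, for simplicity
    float acc = 0.0f;
    for (int row = threadIdx.x; row < num_rows; row += blockDim.x)
        acc += x[row + col * num_rows];  // column-major indexing

    float partial = BlockReduce(temp_storage).Sum(acc);  // valid on thread 0 only
    if (threadIdx.x == 0) {
        column_mean_shared = partial / (float)num_rows;
        means[col] = column_mean_shared;
    }
    __syncthreads();  // makes the write visible; also required before temp_storage is reused
    float column_mean = column_mean_shared;  // every thread now holds the mean
    // ... a second pass over the column can now compute the variance ...
}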

When I posted this question I did not expect much, but it turns out that for large matrices, such as 128x10000, there is a better way. I wrote a warpReduce kernel with a block size of 32, which allows it to do the entire reduction using shuffle xor.
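
In CUDA C++ terms, the xor-shuffle butterfly such a kernel performs looks roughly like this (a sketch; __shfl_xor_sync is the sync-qualified form of the __shfl_xor intrinsic that the Alea binding below exposes). The key property is that after the five steps every lane of the warp holds the full sum, so no shared-memory broadcast is needed at all:

__device__ float butterflyWarpReduce(float value)
{
    // Butterfly (xor) reduction over a full 32-lane warp.
    for (int mask = 16; mask > 0; mask >>= 1)
        value += __shfl_xor_sync(0xffffffffu, value, mask, 32);
    return value;  // identical on every lane
}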

For 100 iterations on a 128x100000 matrix, the first version, which used 64 blocks per grid (and 32 threads per block), took 0.5s. The CUB row reduce took 0.25s.

When I raised the blocks per grid to 256, I got a nearly 4x speedup, down to roughly 0.15s. At 384 blocks per grid it takes 0.11s, and increasing the number of blocks further does not seem to improve performance.

For the problem sizes I am actually interested in, the improvement is nowhere near as dramatic.

For the 128x1024 and 128x512 cases, with 10000 iterations:

For 1024: 1s vs 0.82s in favor of warpReduce. For 512: 0.914s vs 0.873s in favor of warpReduce.

For small matrices, any speedup from the extra parallelism seems to get eaten by the kernel launch time.

For 256: 0.94s vs 0.78s in favor of warpReduce. For 160: 0.88s vs 0.77s in favor of warpReduce.

Tested on a GTX 970.

The figures would likely be different on Kepler and earlier nVidia cards, since Maxwell raised the limit on resident blocks per SM (from 16 to 32), which improves multiprocessor occupancy.

I am satisfied with this, as the performance improvement is nice, and before reaching for the Cub block reduce I had actually been unable to write a kernel that used shared memory correctly. I had forgotten how painful Cuda can be at times.

It is surprising how competitive the version that uses no shared memory is; the shuffle instructions keep every intermediate value in registers, so the reduction never touches shared memory at all.

Here are the two modules that I tested:

type rowModule(target) =
    inherit GPUModule(target)
    let grid_size = 64
    let block_size = 128
    let blockReducer = BlockReduce.RakingCommutativeOnly<float32>(dim3(block_size,1,1),worker.Device.Arch)
    [<Kernel;ReflectedDefinition>]
    member this.Kernel (num_rows:int) (num_cols:int) (x:deviceptr<float32>) (means:deviceptr<float32>) (stds:deviceptr<float32>) =
        // Each block starts on its own column and strides across the matrix by gridDim.x.
        let mutable col = blockIdx.x
        let temp_storage = blockReducer.TempStorage.AllocateShared()
        let column_mean_shared = __shared__.Variable()
        while col < num_cols do
            // row is this thread's starting row index
            let mutable row = threadIdx.x
            let mutable acc = 0.0f
            while row < num_rows do
                // idx is the absolute index in the array
                let idx = row + col * num_rows
                acc <- acc + x.[idx]
                // Increment the row index
                row <- row + blockDim.x
            let column_mean_partial = blockReducer.Reduce(temp_storage, acc, fun a b -> a + b) / (float32 num_rows)
            if threadIdx.x = 0 then 
                means.[col] <- column_mean_partial
                column_mean_shared := column_mean_partial
            __syncthreads()
            let column_mean = !column_mean_shared
            row <- threadIdx.x
            acc <- 0.0f
            while row < num_rows do
                // idx is the absolute index in the array
                let idx = row + col * num_rows
                // Accumulate the variances.
                acc <- acc + (x.[idx]-column_mean)*(x.[idx]-column_mean)
                // Increment the row index
                row <- row + blockDim.x
            let variance_sum = blockReducer.Reduce(temp_storage, acc, fun a b -> a + b) / (float32 num_rows)
            if threadIdx.x = 0 then stds.[col] <- sqrt(variance_sum)
            // Sync before the next iteration reuses temp_storage for the next column's reduce.
            __syncthreads()
            col <- col + gridDim.x
    member this.Apply((dmat: dM), (means: dM), (stds: dM)) =
        let lp = LaunchParam(grid_size, block_size)
        this.GPULaunch <@ this.Kernel @> lp dmat.num_rows dmat.num_cols dmat.dArray.Ptr means.dArray.Ptr stds.dArray.Ptr
type rowWarpModule(target) =
    inherit GPUModule(target)
    let grid_size = 384
    let block_size = 32
    [<Kernel;ReflectedDefinition>]
    member this.Kernel (num_rows:int) (num_cols:int) (x:deviceptr<float32>) (means:deviceptr<float32>) (stds:deviceptr<float32>) =
        // Each block starts on its own column and strides across the matrix by gridDim.x.
        let mutable col = blockIdx.x
        while col < num_cols do
            // row is this thread's starting row index
            let mutable row = threadIdx.x
            let mutable acc = 0.0f
            while row < num_rows do
                // idx is the absolute index in the array
                let idx = row + col * num_rows
                acc <- acc + x.[idx]
                // Increment the row index
                row <- row + blockDim.x
            let inline butterflyWarpReduce (value:float32) = 
                let v1 = value + __shfl_xor value 16 32
                let v2 = v1 + __shfl_xor v1 8 32
                let v3 = v2 + __shfl_xor v2 4 32
                let v4 = v3 + __shfl_xor v3 2 32
                v4 + __shfl_xor v4 1 32
            let column_mean = (butterflyWarpReduce acc) / (float32 num_rows)
            row <- threadIdx.x
            acc <- 0.0f
            while row < num_rows do
                // idx is the absolute index in the array
                let idx = row + col * num_rows
                // Accumulate the variances.
                acc <- acc + (x.[idx]-column_mean)*(x.[idx]-column_mean)
                // Increment the row index
                row <- row + blockDim.x
            let variance_sum = (butterflyWarpReduce acc) / (float32 num_rows)
            if threadIdx.x = 0 
            then stds.[col] <- sqrt(variance_sum)
                 means.[col] <- column_mean
            col <- col + gridDim.x
    member this.Apply((dmat: dM), (means: dM), (stds: dM)) =
        let lp = LaunchParam(grid_size, block_size)
        this.GPULaunch <@ this.Kernel @> lp dmat.num_rows dmat.num_cols dmat.dArray.Ptr means.dArray.Ptr stds.dArray.Ptr
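
For readers who do not use Alea, a rough CUDA C++ rendering of the rowWarpModule kernel might look like the sketch below (an illustration under the same assumptions as the F# code: column-major storage and one 32-thread block per column stripe; butterflyWarpReduce is the helper from the earlier sketch):

__global__ void column_mean_std_kernel(const float* x, float* means, float* stds,
                                       int num_rows, int num_cols)
{
    // Grid-stride loop over columns, mirroring the while loop above.
    for (int col = blockIdx.x; col < num_cols; col += gridDim.x) {
        // First pass: accumulate the column sum and reduce to the mean.
        float acc = 0.0f;
        for (int row = threadIdx.x; row < num_rows; row += blockDim.x)
            acc += x[row + col * num_rows];
        float column_mean = butterflyWarpReduce(acc) / (float)num_rows;

        // Second pass: every lane already holds column_mean, so the
        // variance can be accumulated without any broadcast step.
        acc = 0.0f;
        for (int row = threadIdx.x; row < num_rows; row += blockDim.x) {
            float d = x[row + col * num_rows] - column_mean;
            acc += d * d;
        }
        float variance = butterflyWarpReduce(acc) / (float)num_rows;

        if (threadIdx.x == 0) {
            means[col] = column_mean;
            stds[col]  = sqrtf(variance);
        }
    }
}

// Launched as, e.g., column_mean_std_kernel<<<384, 32>>>(x, means, stds, num_rows, num_cols),
// mirroring grid_size = 384 and block_size = 32 in the module above.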
