What is an efficient way to broadcast a value to all the threads after using Cuda Unbound's blockReduce?


let column_mean_partial = blockReducer.Reduce(temp_storage, acc, fun a b -> a + b) / (float32 num_rows)
if threadIdx.x = 0 then 
    means.[col] <- column_mean_partial
    column_mean_shared := column_mean_partial
__syncthreads()
let column_mean = !column_mean_shared

The snippet above computes the mean of every column of a 2D matrix. Since only thread 0 of the block has the complete value after the reduce, I store it into the shared-memory variable column_mean_shared, call __syncthreads(), and then read it back on every thread, because all the threads need the mean in order to compute the variance.

Is there a better way to broadcast the value, or is the above already efficient enough?
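
For reference, the same pattern in CUB's native CUDA C++ interface looks roughly like the sketch below (a minimal illustration, not the code benchmarked in this post; CUB documents that the aggregate returned by BlockReduce is only valid on thread 0, which is why the round trip through shared memory is needed at all):

#include <cub/cub.cuh>

template <int BLOCK_SIZE>
__global__ void column_mean_kernel(const float* x, float* means,
                                   int num_rows, int num_cols)
{
    typedef cub::BlockReduce<float, BLOCK_SIZE> BlockReduce;
    __shared__ typename BlockReduce::TempStorage temp_storage;
    __shared__ float column_mean_shared;

    int col = blockIdx.x;  // one block per column, for simplicity
    float acc = 0.0f;
    for (int row = threadIdx.x; row < num_rows; row += blockDim.x)
        acc += x[row + col * num_rows];  // column-major indexing

    float partial = BlockReduce(temp_storage).Sum(acc);  // valid on thread 0 only
    if (threadIdx.x == 0) {
        column_mean_shared = partial / (float)num_rows;
        means[col] = column_mean_shared;
    }
    __syncthreads();  // makes the write visible; also required before temp_storage is reused
    float column_mean = column_mean_shared;  // every thread now holds the mean
    // ... a second pass over the column can now compute the variance ...
}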

When I posted this question I did not expect much, but it turns out that for large matrices, such as 128x10000, there is a better way. I wrote a warpReduce kernel with a block size of 32, which allows it to do the entire reduction using shuffle xor.
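
In CUDA C++ terms, the xor-shuffle butterfly such a kernel performs looks roughly like this (a sketch; __shfl_xor_sync is the sync-qualified form of the __shfl_xor intrinsic that the Alea binding below exposes). The key property is that after the five steps every lane of the warp holds the full sum, so no shared-memory broadcast is needed at all:

__device__ float butterflyWarpReduce(float value)
{
    // Butterfly (xor) reduction over a full 32-lane warp.
    for (int mask = 16; mask > 0; mask >>= 1)
        value += __shfl_xor_sync(0xffffffffu, value, mask, 32);
    return value;  // identical on every lane
}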

For 100 iterations on a 128x100000 matrix, the first version, which used 64 blocks per grid (and 32 threads per block), took 0.5s. The CUB row reduce took 0.25s.

When I raised the blocks per grid to 256, I got a nearly 4x speedup, down to roughly 0.15s. At 384 blocks per grid it takes 0.11s, and increasing the number of blocks further does not seem to improve performance.

For the problem sizes I am actually interested in, the improvement is nowhere near as dramatic.

For the 128x1024 and 128x512 cases, with 10000 iterations:

For 1024: 1s vs 0.82s in favor of warpReduce. For 512: 0.914s vs 0.873s in favor of warpReduce.

For small matrices, any speedup from the extra parallelism seems to get eaten by the kernel launch time.

For 256: 0.94s vs 0.78s in favor of warpReduce. For 160: 0.88s vs 0.77s in favor of warpReduce.

Tested on a GTX 970.

The figures would likely be different on Kepler and earlier nVidia cards, since Maxwell raised the limit on resident blocks per SM (from 16 to 32), which improves multiprocessor occupancy.

I am satisfied with this, as the performance improvement is nice, and before reaching for the Cub block reduce I had actually been unable to write a kernel that used shared memory correctly. I had forgotten how painful Cuda can be at times.

It is surprising how competitive the version that uses no shared memory is; the shuffle instructions keep every intermediate value in registers, so the reduction never touches shared memory at all.

Here are the two modules that I tested:

type rowModule(target) =
    inherit GPUModule(target)
    let grid_size = 64
    let block_size = 128
    let blockReducer = BlockReduce.RakingCommutativeOnly<float32>(dim3(block_size,1,1),worker.Device.Arch)
    [<Kernel;ReflectedDefinition>]
    member this.Kernel (num_rows:int) (num_cols:int) (x:deviceptr<float32>) (means:deviceptr<float32>) (stds:deviceptr<float32>) =
        // Each block starts on its own column and strides across the matrix by gridDim.x.
        let mutable col = blockIdx.x
        let temp_storage = blockReducer.TempStorage.AllocateShared()
        let column_mean_shared = __shared__.Variable()
        while col < num_cols do
            // row is this thread's starting row index
            let mutable row = threadIdx.x
            let mutable acc = 0.0f
            while row < num_rows do
                // idx is the absolute index in the array
                let idx = row + col * num_rows
                acc <- acc + x.[idx]
                // Increment the row index
                row <- row + blockDim.x
            let column_mean_partial = blockReducer.Reduce(temp_storage, acc, fun a b -> a + b) / (float32 num_rows)
            if threadIdx.x = 0 then 
                means.[col] <- column_mean_partial
                column_mean_shared := column_mean_partial
            __syncthreads()
            let column_mean = !column_mean_shared
            row <- threadIdx.x
            acc <- 0.0f
            while row < num_rows do
                // idx is the absolute index in the array
                let idx = row + col * num_rows
                // Accumulate the variances.
                acc <- acc + (x.[idx]-column_mean)*(x.[idx]-column_mean)
                // Increment the row index
                row <- row + blockDim.x
            let variance_sum = blockReducer.Reduce(temp_storage, acc, fun a b -> a + b) / (float32 num_rows)
            if threadIdx.x = 0 then stds.[col] <- sqrt(variance_sum)
            // Sync before the next iteration reuses temp_storage for the next column's reduce.
            __syncthreads()
            col <- col + gridDim.x
    member this.Apply((dmat: dM), (means: dM), (stds: dM)) =
        let lp = LaunchParam(grid_size, block_size)
        this.GPULaunch <@ this.Kernel @> lp dmat.num_rows dmat.num_cols dmat.dArray.Ptr means.dArray.Ptr stds.dArray.Ptr
type rowWarpModule(target) =
    inherit GPUModule(target)
    let grid_size = 384
    let block_size = 32
    [<Kernel;ReflectedDefinition>]
    member this.Kernel (num_rows:int) (num_cols:int) (x:deviceptr<float32>) (means:deviceptr<float32>) (stds:deviceptr<float32>) =
        // Each block starts on its own column and strides across the matrix by gridDim.x.
        let mutable col = blockIdx.x
        while col < num_cols do
            // row is this thread's starting row index
            let mutable row = threadIdx.x
            let mutable acc = 0.0f
            while row < num_rows do
                // idx is the absolute index in the array
                let idx = row + col * num_rows
                acc <- acc + x.[idx]
                // Increment the row index
                row <- row + blockDim.x
            let inline butterflyWarpReduce (value:float32) = 
                let v1 = value + __shfl_xor value 16 32
                let v2 = v1 + __shfl_xor v1 8 32
                let v3 = v2 + __shfl_xor v2 4 32
                let v4 = v3 + __shfl_xor v3 2 32
                v4 + __shfl_xor v4 1 32
            let column_mean = (butterflyWarpReduce acc) / (float32 num_rows)
            row <- threadIdx.x
            acc <- 0.0f
            while row < num_rows do
                // idx is the absolute index in the array
                let idx = row + col * num_rows
                // Accumulate the variances.
                acc <- acc + (x.[idx]-column_mean)*(x.[idx]-column_mean)
                // Increment the row index
                row <- row + blockDim.x
            let variance_sum = (butterflyWarpReduce acc) / (float32 num_rows)
            if threadIdx.x = 0 
            then stds.[col] <- sqrt(variance_sum)
                 means.[col] <- column_mean
            col <- col + gridDim.x
    member this.Apply((dmat: dM), (means: dM), (stds: dM)) =
        let lp = LaunchParam(grid_size, block_size)
        this.GPULaunch <@ this.Kernel @> lp dmat.num_rows dmat.num_cols dmat.dArray.Ptr means.dArray.Ptr stds.dArray.Ptr
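
For readers who do not use Alea, a rough CUDA C++ rendering of the rowWarpModule kernel might look like the sketch below (an illustration under the same assumptions as the F# code: column-major storage and one 32-thread block per column stripe; butterflyWarpReduce is the helper from the earlier sketch):

__global__ void column_mean_std_kernel(const float* x, float* means, float* stds,
                                       int num_rows, int num_cols)
{
    // Grid-stride loop over columns, mirroring the while loop above.
    for (int col = blockIdx.x; col < num_cols; col += gridDim.x) {
        // First pass: accumulate the column sum and reduce to the mean.
        float acc = 0.0f;
        for (int row = threadIdx.x; row < num_rows; row += blockDim.x)
            acc += x[row + col * num_rows];
        float column_mean = butterflyWarpReduce(acc) / (float)num_rows;

        // Second pass: every lane already holds column_mean, so the
        // variance can be accumulated without any broadcast step.
        acc = 0.0f;
        for (int row = threadIdx.x; row < num_rows; row += blockDim.x) {
            float d = x[row + col * num_rows] - column_mean;
            acc += d * d;
        }
        float variance = butterflyWarpReduce(acc) / (float)num_rows;

        if (threadIdx.x == 0) {
            means[col] = column_mean;
            stds[col]  = sqrtf(variance);
        }
    }
}

// Launched as, e.g., column_mean_std_kernel<<<384, 32>>>(x, means, stds, num_rows, num_cols),
// mirroring grid_size = 384 and block_size = 32 in the module above.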
