在 iOS 上使用 Metal 快速并行计算数组的和



import UIKit
import Metal
/// Benchmarks several ways of summing a large integer array: a sequential CPU
/// loop, a four-way threaded CPU sum, and single- and multi-pass Metal compute sums.
class ViewController: UIViewController {
    /// Element type of the summed array; has to be the same as in the shader.
    typealias DataType = CInt

    /// Keeps alive the Metal buffer behind the pointer most recently returned by
    /// `sumParralel1`, so that pointer does not dangle as soon as the call returns.
    private var retainedResultsBuffer: MTLBuffer?

    /// Runs every summation variant once over the same data and prints each
    /// result together with the elapsed wall-clock time.
    override func viewDidLoad() {
        super.viewDidLoad()

        let data = (0..<10000000).map { _ in DataType(200) } // Our data: ten million copies of 200

        var start, end: UInt64
        var result: DataType = 0

        // Baseline: sequential sum on the CPU.
        start = mach_absolute_time()
        data.withUnsafeBufferPointer { buffer in
            for elem in buffer {
                result += elem
            }
        }
        end = mach_absolute_time()
        print("CPU result: \(result), time: \(elapsedSeconds(start, end: end))")

        // Four block operations on the CPU (the label is kept as in the original output).
        result = 0
        start = mach_absolute_time()
        result = sumParallel4(data)
        end = mach_absolute_time()
        print("Metal result: \(result), time: \(elapsedSeconds(start, end: end))")

        // One GPU pass, partial sums added up on the CPU.
        result = 0
        start = mach_absolute_time()
        result = sumParralel(data)
        end = mach_absolute_time()
        print("Metal result: \(result), time: \(elapsedSeconds(start, end: end))")

        // Repeated GPU passes until fewer than 100 partial sums remain.
        result = 0
        start = mach_absolute_time()
        result = sumParallel3(data)
        end = mach_absolute_time()
        print("Metal result: \(result), time: \(elapsedSeconds(start, end: end))")
    }

    /// Converts a pair of `mach_absolute_time()` readings into elapsed seconds.
    ///
    /// Mach ticks are not nanoseconds on every device, so the difference is
    /// scaled by the timebase reported by `mach_timebase_info` first.
    private func elapsedSeconds(start: UInt64, end: UInt64) -> Double {
        var timebase = mach_timebase_info_data_t()
        mach_timebase_info(&timebase)
        let nanoseconds = Double(end - start) * Double(timebase.numer) / Double(timebase.denom)
        return nanoseconds / Double(NSEC_PER_SEC)
    }

    /// Runs the `parsum` kernel once over `data` and blocks until it has finished.
    ///
    /// - Parameter data: Values to sum; must not be empty.
    /// - Returns: The buffer holding the partial sums and the number of partial
    ///   sums stored in it.
    private func dispatchPartialSums(data: [DataType]) -> (buffer: MTLBuffer, count: Int) {
        let count = data.count
        // Callers reject empty input, so `count >= 1` and `elementsPerSum >= 1` here.
        let elementsPerSum: Int = Int(sqrt(Double(count)))
        let device = MTLCreateSystemDefaultDevice()!
        let parsum = device.newDefaultLibrary()!.newFunctionWithName("parsum")!
        let pipeline = try! device.newComputePipelineStateWithFunction(parsum)

        var dataCount = CUnsignedInt(count)
        var elementsPerSumC = CUnsignedInt(elementsPerSum)
        let resultsCount = (count + elementsPerSum - 1) / elementsPerSum // Number of individual results = count / elementsPerSum (rounded up)

        let dataBuffer = device.newBufferWithBytes(data, length: strideof(DataType) * count, options: []) // Our data in a buffer (copied)
        let resultsBuffer = device.newBufferWithLength(strideof(DataType) * resultsCount, options: []) // A buffer for individual results (zero initialized)

        let queue = device.newCommandQueue()
        let cmds = queue.commandBuffer()
        let encoder = cmds.computeCommandEncoder()
        encoder.setComputePipelineState(pipeline)
        encoder.setBuffer(dataBuffer, offset: 0, atIndex: 0)
        encoder.setBytes(&dataCount, length: sizeofValue(dataCount), atIndex: 1)
        encoder.setBuffer(resultsBuffer, offset: 0, atIndex: 2)
        encoder.setBytes(&elementsPerSumC, length: sizeofValue(elementsPerSumC), atIndex: 3)

        // We have to calculate the sum `resultsCount` times => amount of threadgroups is `resultsCount` / `threadExecutionWidth` (rounded up) because each threadgroup will process `threadExecutionWidth` threads
        let threadgroupsPerGrid = MTLSize(width: (resultsCount + pipeline.threadExecutionWidth - 1) / pipeline.threadExecutionWidth, height: 1, depth: 1)
        // Each threadgroup processes `threadExecutionWidth` threads; for performance this number should be a multiple of `threadExecutionWidth` (here 1 times)
        let threadsPerThreadgroup = MTLSize(width: pipeline.threadExecutionWidth, height: 1, depth: 1)
        encoder.dispatchThreadgroups(threadgroupsPerGrid, threadsPerThreadgroup: threadsPerThreadgroup)

        // Without these three calls the work is never submitted and the results stay zero.
        encoder.endEncoding()
        cmds.commit()
        cmds.waitUntilCompleted()

        return (resultsBuffer, resultsCount)
    }

    /// Sums `data` with one GPU pass followed by a CPU sum of the partial results.
    ///
    /// - Parameter data: Values to sum.
    /// - Returns: The sum of all elements, or 0 for an empty array.
    func sumParralel(data: [DataType]) -> DataType {
        guard !data.isEmpty else { return 0 }

        let partial = dispatchPartialSums(data)
        var result: DataType = 0
        // Keep the buffer alive for as long as the raw pointer into it is being read.
        withExtendedLifetime(partial.buffer) {
            let results = UnsafeBufferPointer<DataType>(start: UnsafePointer(partial.buffer.contents()), count: partial.count)
            for elem in results {
                result += elem
            }
        }
        return result
    }

    /// Runs one GPU pass over `data` and returns the partial sums.
    ///
    /// - Parameter data: Values to sum.
    /// - Returns: A view of the partial sums (empty for empty input). The view is
    ///   only valid until the next call to this method, which replaces the
    ///   backing buffer.
    func sumParralel1(data: [DataType]) -> UnsafeBufferPointer<DataType> {
        guard !data.isEmpty else {
            retainedResultsBuffer = nil
            return UnsafeBufferPointer<DataType>(start: nil, count: 0)
        }

        let partial = dispatchPartialSums(data)
        retainedResultsBuffer = partial.buffer
        return UnsafeBufferPointer<DataType>(start: UnsafePointer(partial.buffer.contents()), count: partial.count)
    }

    /// Sums `data` by running GPU passes until fewer than 100 partial sums
    /// remain, then adds those up on the CPU.
    ///
    /// - Parameter data: Values to sum.
    /// - Returns: The sum of all elements.
    func sumParallel3(data: [DataType]) -> DataType {
        var results = sumParralel1(data)
        repeat {
            // `Array(results)` copies the partial sums before the next pass replaces their buffer.
            results = sumParralel1(Array(results))
        } while results.count >= 100

        var result: DataType = 0
        for elem in results {
            result += elem
        }
        return result
    }

    /// Sums `data` on the CPU by splitting it into four ranges, each handled by
    /// its own block operation.
    ///
    /// - Parameter data: Values to sum.
    /// - Returns: The sum of all elements.
    func sumParallel4(data: [DataType]) -> DataType {
        let queue = NSOperationQueue()
        queue.maxConcurrentOperationCount = 4
        // Each operation writes only to its own accumulator.
        var a0: DataType = 0
        var a1: DataType = 0
        var a2: DataType = 0
        var a3: DataType = 0
        let op0 = NSBlockOperation(block: {
            for i in 0..<(data.count/4) {
                a0 = a0 + data[i]
            }
        })
        let op1 = NSBlockOperation(block: {
            for i in (data.count/4)..<(data.count/2) {
                a1 = a1 + data[i]
            }
        })
        let op2 = NSBlockOperation(block: {
            for i in (data.count/2)..<(3 * data.count/4) {
                a2 = a2 + data[i]
            }
        })
        let op3 = NSBlockOperation(block: {
            for i in (3 * data.count/4)..<(data.count) {
                a3 = a3 + data[i]
            }
        })

        queue.suspended = false
        // The operations must be enqueued and finished before the accumulators are read.
        queue.addOperations([op0, op1, op2, op3], waitUntilFinished: true)
        let aaa: DataType = a0 + a1 + a2 + a3
        return aaa
    }
}


// Computes one partial sum per thread: thread `resultIndex` adds up to
// `elementsPerSum` consecutive elements of `data`, starting at
// `resultIndex * elementsPerSum` and clamped to `dataLength`, into
// `sums[resultIndex]`.
kernel void parsum(const device DataType* data [[ buffer(0) ]],
               const device uint& dataLength [[ buffer(1) ]],
               device DataType* sums [[ buffer(2) ]],
               const device uint& elementsPerSum [[ buffer(3) ]],
               const uint tgPos [[ threadgroup_position_in_grid ]],
               const uint tPerTg [[ threads_per_threadgroup ]],
               const uint tPos [[ thread_position_in_threadgroup ]]) {
    uint resultIndex = tgPos * tPerTg + tPos; // This is the index of the individual result, this var is unique to this thread
    uint dataIndex = resultIndex * elementsPerSum; // Where the summation should begin
    uint endIndex = dataIndex + elementsPerSum < dataLength ? dataIndex + elementsPerSum : dataLength; // The index where summation should end

    // Threads with nothing to sum must not touch `sums` at all.
    if (dataIndex >= endIndex)
        return;

    // Accumulate in a local and write back once instead of updating device memory on every iteration.
    DataType sum = sums[resultIndex];
    for (; dataIndex < endIndex; dataIndex++)
        sum += data[dataIndex];
    sums[resultIndex] = sum;
}


这是为什么呢?为什么 sumParallel3 并不比 sumParralel 快多少?为什么 sumParallel4 明明是在 CPU 上计算的,却是最快的?

如何使用 posix_memalign 改进我的 GPU 函数?我知道这样应该会更快,因为内存可以在 GPU 和 CPU 之间共享,但我不知道应该用这种方式分配哪个数组(data 还是 results);而且如果 data 是作为参数传入函数的,我又该如何用 posix_memalign 来分配它?

在 iPhone 6 上运行这些测试时,我看到 Metal 版本的运行速度相对于朴素的 CPU 求和,从慢 3 倍到快 2 倍不等。经过我在下面描述的修改之后,它始终更快。

我发现运行 Metal 版本的大量成本不仅可以归因于缓冲区的分配(尽管这很重要),还可以归因于首次创建设备和计算管道状态。这些操作通常在应用程序初始化时执行一次,因此将它们包含在计时中并不完全公平。

还应该注意的是,如果您在启用 Metal 验证层和 GPU 帧捕获的情况下通过 Xcode 运行这些测试,则会产生显着的运行时成本,并且会使结果偏向 CPU 的方向。


// Allocate page-aligned storage for `dataCount` elements, rounded up to a whole
// number of pages, and fill the first `dataCount` elements with 200.
let dataCount = 1_000_000
let dataSize = dataCount * strideof(DataType)
let pageSize = Int(getpagesize())
let pageCount = (dataSize + (pageSize - 1)) / pageSize // dataSize / pageSize, rounded up
var dataPointer: UnsafeMutablePointer<Void> = nil
// posix_memalign reports failure through its return value; do not continue with a nil pointer.
if posix_memalign(&dataPointer, pageSize, pageCount * pageSize) != 0 {
    fatalError("posix_memalign failed to allocate \(pageCount * pageSize) bytes")
}
// `data.count` covers the whole rounded-up allocation, so it can be larger than `dataCount`.
let data = UnsafeMutableBufferPointer(start: UnsafeMutablePointer<DataType>(dataPointer),
                                      count: (pageCount * pageSize) / strideof(DataType))
for i in 0..<dataCount {
    data[i] = 200
}

这确实需要data成为UnsafeMutableBufferPointer<DataType>,而不是[DataType],因为 Swift 的Array分配了自己的后备存储。您还需要传递要操作的数据项计数,因为可变缓冲区指针的count已向上舍入以使缓冲区页面对齐。

若要实际创建由这块数据支持的 MTLBuffer,请使用 newBufferWithBytesNoCopy(_:length:options:deallocator:) API。同样,您提供的长度必须是页面大小的倍数,这一点至关重要;否则此方法会返回 nil。

// Byte length of the buffer pointer's full element count.
let roundedUpDataSize = data.count * strideof(DataType)
// Wrap the existing allocation in a Metal buffer without copying it; no
// deallocator is supplied here.
let dataBuffer = device.newBufferWithBytesNoCopy(
    data.baseAddress,
    length: roundedUpDataSize,
    options: [],
    deallocator: nil)

在这里,我们没有提供释放回调(deallocator),因此您应该在用完这块内存之后,将缓冲区指针的 baseAddress 传给 free() 来手动释放它。
