视图上的融合运算符比Julia中的嵌套循环慢得多



我正在尝试制作一个函数,通过使用view和融合运算符尽可能快地计算扩散核。第二个函数能和第一个函数一样快吗?目前,diff需要59.6ms,而diff_view需要384.3ms。

using BenchmarkTools

function diff(
at::Array{Float64, 3}, a::Array{Float64, 3},
visc::Float64, dxidxi::Float64, dyidyi::Float64, dzidzi::Float64,
itot::Int64, jtot::Int64, ktot::Int64)
for k in 2:ktot-1
for j in 2:jtot-1
@simd for i in 2:itot-1
@inbounds at[i, j, k] += visc * (
(a[i-1, j  , k  ] - 2. * a[i, j, k] + a[i+1, j  , k  ]) * dxidxi +
(a[i  , j-1, k  ] - 2. * a[i, j, k] + a[i  , j+1, k  ]) * dyidyi +
(a[i  , j  , k-1] - 2. * a[i, j, k] + a[i  , j  , k+1]) * dzidzi )
end
end
end
end

function diff_view(
at::Array{Float64, 3}, a::Array{Float64, 3},
visc::Float64, dxidxi::Float64, dyidyi::Float64, dzidzi::Float64,
itot::Int64, jtot::Int64, ktot::Int64)
at_c = view(at, 2:itot-1, 2:jtot-1, 2:ktot-1)
a_c = view(a, 2:itot-1, 2:jtot-1, 2:ktot-1)
a_w = view(a, 1:itot-2, 2:jtot-1, 2:ktot-1)
a_e = view(a, 3:itot  , 2:jtot-1, 2:ktot-1)
a_s = view(a, 2:itot-1, 1:jtot-2, 2:ktot-1)
a_n = view(a, 2:itot-1, 3:jtot  , 2:ktot-1)
a_b = view(a, 2:itot-1, 2:jtot-1, 1:ktot-2)
a_t = view(a, 2:itot-1, 2:jtot-1, 3:ktot  )
at_c .+= visc .* ( (a_w .- 2. .* a_c .+ a_e) .* dxidxi .+
(a_s .- 2. .* a_c .+ a_n) .* dyidyi .+
(a_b .- 2. .* a_c .+ a_n) .* dzidzi )
end

itot = 384
jtot = 384
ktot = 384
a = rand(Float64, (itot, jtot, ktot))
at = zeros(Float64, (itot, jtot, ktot))
visc = 0.1
dxidxi = 0.1
dyidyi = 0.1
dzidzi = 0.1
@btime diff(
at, a,
visc, dxidxi, dyidyi, dzidzi,
itot, jtot, ktot)
@btime diff_view(
at, a,
visc, dxidxi, dyidyi, dzidzi,
itot, jtot, ktot)

您可以使用LoopVectorization.jl的@turbo宏来实现这一点,该宏将确保广播尽可能编译为高效的SIMD指令。

using LoopVectorization
function diff_view_lv!(
at::Array{Float64, 3}, a::Array{Float64, 3},
visc::Float64, dxidxi::Float64, dyidyi::Float64, dzidzi::Float64,
itot::Int64, jtot::Int64, ktot::Int64)
at_c = view(at, 2:itot-1, 2:jtot-1, 2:ktot-1)
a_c = view(a, 2:itot-1, 2:jtot-1, 2:ktot-1)
a_w = view(a, 1:itot-2, 2:jtot-1, 2:ktot-1)
a_e = view(a, 3:itot  , 2:jtot-1, 2:ktot-1)
a_s = view(a, 2:itot-1, 1:jtot-2, 2:ktot-1)
a_n = view(a, 2:itot-1, 3:jtot  , 2:ktot-1)
a_b = view(a, 2:itot-1, 2:jtot-1, 1:ktot-2)
a_t = view(a, 2:itot-1, 2:jtot-1, 3:ktot  )
@turbo at_c .+= visc .* ( (a_w .- 2. .* a_c .+ a_e) .* dxidxi .+
(a_s .- 2. .* a_c .+ a_n) .* dyidyi .+
(a_b .- 2. .* a_c .+ a_n) .* dzidzi )
# Could also use @turbo @. to apply the broadcast to every operator, so you don't have to type `.` before each one.
end

撇开风格不谈,由于所有这些函数都会变异at,因此它们的名称应该以!结尾,以表示它们会变异其自变量。

而且,正如评论所指出的,我们希望确保使用$将任何全局变量插入到基准中。但除此之外,使用与上面问题中相同的设置(在看起来稍微慢一点的CPU上(:

julia> @benchmark diff!(
$at, $a,
$visc, $dxidxi, $dyidyi, $dzidzi,
$itot, $jtot, $ktot)
BenchmarkTools.Trial: 50 samples with 1 evaluation.
Range (min … max):  100.575 ms … 101.855 ms  ┊ GC (min … max): 0.00% … 0.00%
Time  (median):     100.783 ms               ┊ GC (median):    0.00%
Time  (mean ± σ):   100.798 ms ± 173.505 μs  ┊ GC (mean ± σ):  0.00% ± 0.00%
▆▁▁█▄
▄▄▄▄▄▆▇█████▇▆▄▆▁▁▁▄▄▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▄ ▁
101 ms           Histogram: frequency by time          102 ms <
Memory estimate: 0 bytes, allocs estimate: 0.
julia> @benchmark diff_view!(
$at, $a,
$visc, $dxidxi, $dyidyi, $dzidzi,
$itot, $jtot, $ktot)
BenchmarkTools.Trial: 13 samples with 1 evaluation.
Range (min … max):  397.203 ms … 397.800 ms  ┊ GC (min … max): 0.00% … 0.00%
Time  (median):     397.427 ms               ┊ GC (median):    0.00%
Time  (mean ± σ):   397.436 ms ± 173.079 μs  ┊ GC (mean ± σ):  0.00% ± 0.00%
▁ ▁    ▁     ▁  ▁  ▁  ▁ ▁ █                ▁  ▁             ▁
█▁█▁▁▁▁█▁▁▁▁▁█▁▁█▁▁█▁▁█▁█▁█▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁█▁▁█▁▁▁▁▁▁▁▁▁▁▁▁▁█ ▁
397 ms           Histogram: frequency by time          398 ms <
Memory estimate: 0 bytes, allocs estimate: 0.
julia> @benchmark diff_view_lv!(
$at, $a,
$visc, $dxidxi, $dyidyi, $dzidzi,
$itot, $jtot, $ktot)
BenchmarkTools.Trial: 61 samples with 1 evaluation.
Range (min … max):  82.226 ms …  83.015 ms  ┊ GC (min … max): 0.00% … 0.00%
Time  (median):     82.364 ms               ┊ GC (median):    0.00%
Time  (mean ± σ):   82.395 ms ± 115.205 μs  ┊ GC (mean ± σ):  0.00% ± 0.00%
▁▄  ▁▁▁▁▄▄▄█▁  ▄  ▁█ ▁▁ ▁ ▁      ▁
▆▁▁▆▁▁▁██▆▁█████████▆▆█▆▆██▁██▁█▆█▁▁▁▁▁▁█▆▆▆▁▁▁▆▁▁▁▁▁▁▁▁▁▁▁▆ ▁
82.2 ms         Histogram: frequency by time         82.7 ms <
Memory estimate: 1008 bytes, allocs estimate: 41.

有了这个,广播版本现在比原来的循环版本更快了!然而,正如评论所指出的,简单的循环方法可以说是更干净、更可读的,并且(正如你可能从名称中猜到的(你也可以将LoopVectorization应用于循环版本:

using LoopVectorization
function diff_lv!(
at::Array{Float64, 3}, a::Array{Float64, 3},
visc::Float64, dxidxi::Float64, dyidyi::Float64, dzidzi::Float64,
itot::Int64, jtot::Int64, ktot::Int64)
@turbo for k in 2:ktot-1
for j in 2:jtot-1
for i in 2:itot-1
at[i, j, k] += visc * (
(a[i-1, j  , k  ] - 2. * a[i, j, k] + a[i+1, j  , k  ]) * dxidxi +
(a[i  , j-1, k  ] - 2. * a[i, j, k] + a[i  , j+1, k  ]) * dyidyi +
(a[i  , j  , k-1] - 2. * a[i, j, k] + a[i  , j  , k+1]) * dzidzi )
end
end
end
end
julia> @benchmark diff_lv!(
$at, $a,
$visc, $dxidxi, $dyidyi, $dzidzi,
$itot, $jtot, $ktot)
BenchmarkTools.Trial: 56 samples with 1 evaluation.
Range (min … max):  89.489 ms …  90.166 ms  ┊ GC (min … max): 0.00% … 0.00%
Time  (median):     89.657 ms               ┊ GC (median):    0.00%
Time  (mean ± σ):   89.660 ms ± 103.127 μs  ┊ GC (mean ± σ):  0.00% ± 0.00%
▁    ▁ ▁       █▃   ▆ ▁
▄▁▁▄▁▁▄█▄▁▁▄█▄█▄▁▄▇▇▇▇██▄▄▇█▇█▁▁▁▄▄▁▁▁▁▄▁▄▄▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▄ ▁
89.5 ms         Histogram: frequency by time         89.9 ms <
Memory estimate: 0 bytes, allocs estimate: 0.

最后,如果您想要多线程,您可以在宏的名称中添加另一个t(@tturbo而不是@turbo(

julia> @benchmark diff_lvt!(
$at, $a,
$visc, $dxidxi, $dyidyi, $dzidzi,
$itot, $jtot, $ktot)
BenchmarkTools.Trial: 106 samples with 1 evaluation.
Range (min … max):  47.225 ms … 47.560 ms  ┊ GC (min … max): 0.00% … 0.00%
Time  (median):     47.434 ms              ┊ GC (median):    0.00%
Time  (mean ± σ):   47.432 ms ± 67.185 μs  ┊ GC (mean ± σ):  0.00% ± 0.00%
▁ ▁▁ ▄▂       █  ▂
▃▁▁▃▁▁▅▁▁▅▁▁▁▁▁▁▁▁▃▃▃▃▃▃▁▅▅▃▅▅▃█▃██▃██▃▃▆▃▃▅▆█▆▅██▆▆▅▅▃▃▁▃▆ ▃
47.2 ms         Histogram: frequency by time        47.5 ms <
Memory estimate: 0 bytes, allocs estimate: 0.

只要您已经用多个线程启动Julia,它就应该提供一些额外的加速。

最新更新