如何使用实时编译 (JIT) 提高 Julia 的性能



我一直在试用 JAX(Python 中的自动微分库)和 Zygote(Julia 中的自动微分库),用它们实现高斯-牛顿最小化方法。我在 JAX 中用到了 @jit 装饰器:使用它运行我的 Python 代码大约需要 0.6 秒,而不使用 @jit 的版本需要 60 秒。Julia 运行同样的代码用了 40 秒。在 Julia 或 Zygote 中是否有与 @jit 等价的机制,可以获得更好的性能?

下面是我使用的代码:

Python

from jax import grad, jit, jacfwd
import jax.numpy as jnp
import numpy as np
import time
def gaussian(x, params):
    """Evaluate a Gaussian amp/(|sigma|*sqrt(2*pi)) * exp(-((x-mu)/sigma)^2 / 2).

    `params` is a length-3 sequence: [amp, mu, sigma]. Works element-wise when
    `x` is an array. (Indentation restored — the pasted code had lost it.)
    """
    amp = params[0]
    mu = params[1]
    sigma = params[2]
    amplitude = amp / (jnp.abs(sigma) * jnp.sqrt(2 * np.pi))
    arg = (x - mu) / sigma
    return amplitude * jnp.exp(-0.5 * (arg ** 2))
def myjacobian(x, params):
    # Forward-mode Jacobian of `gaussian` w.r.t. the parameter vector
    # (argnums=1). Forward mode suits the 3-parameter case well.
    # (Indentation restored — the pasted code had lost it.)
    return jacfwd(gaussian, argnums=1)(x, params)
def op(jac):
    # Gauss-Newton pseudo-inverse (J^T J)^-1 J^T of the Jacobian.
    # (Indentation restored — the pasted code had lost it.)
    return jnp.matmul(
        jnp.linalg.inv(jnp.matmul(jnp.transpose(jac), jac)),
        jnp.transpose(jac))

def res(x, data, params):
    # Residuals: observed data minus the model prediction at `params`.
    # (Indentation restored — the pasted code had lost it.)
    return data - gaussian(x, params)
@jit
def step(x, data, params):
    """One Gauss-Newton update: params + (J^T J)^-1 J^T r.

    `@jit` traces and compiles the whole update (including the nested
    jacfwd) with XLA, which is where the ~100x speedup comes from.
    (Indentation restored — the pasted code had lost it.)
    """
    residuals = res(x, data, params)
    jacobian_operation = op(myjacobian(x, params))
    temp = jnp.matmul(jacobian_operation, residuals)
    return params + temp
N = 2000
x = np.linspace(start=-100, stop=100, num=N)
data = gaussian(x, [5.65, 25.5, 37.23])  # synthetic target from known parameters
ini = jnp.array([0.9, 5., 5.0])          # initial guess
t1 = time.time()
# Loop body indentation restored — the pasted code had lost it.
for i in range(5000):
    ini = step(x, data, ini)
t2 = time.time()
print('t2-t1: ', t2 - t1)
ini  # final fitted parameters (only displayed in a REPL/notebook)

Julia

using Zygote
function gaussian(x::Union{Vector{Float64}, Float64}, params::Vector{Float64})
    # Normalised Gaussian amp/(|σ|√(2π)) · exp(-((x-μ)/σ)²/2),
    # broadcast over `x` when it is a vector.
    amp, mu, sigma = params[1], params[2], params[3]
    amplitude = amp / (abs(sigma) * sqrt(2 * pi))
    arg = (x .- mu) ./ sigma
    return amplitude .* exp.(-0.5 .* arg .^ 2)
end
function myjacobian(x::Vector{Float64}, params::Vector{Float64})
    # Jacobian built row by row: one reverse-mode (Zygote) gradient call per
    # data point. Simple but slow — each call re-traces the closure.
    jac = zeros(length(x), length(params))
    for (row, xi) in enumerate(x)
        jac[row, :] = collect(gradient(p -> gaussian(xi, p), params))[1]
    end
    return jac
end
function op(jac::Matrix{Float64})
    # Gauss-Newton pseudo-inverse (JᵀJ)⁻¹Jᵀ. Solving with `\` instead of
    # forming inv(JᵀJ) explicitly is numerically more stable and avoids
    # materialising the inverse.
    return (jac' * jac) \ jac'
end
# Residual vector: observed data minus the Gaussian model evaluated at `x`.
res(x::Vector{Float64}, data::Vector{Float64}, params::Vector{Float64}) = data - gaussian(x, params)
function step(x::Vector{Float64}, data::Vector{Float64}, params::Vector{Float64})
    # One Gauss-Newton iteration: params + (JᵀJ)⁻¹Jᵀ·r.
    r = res(x, data, params)
    pinv_jac = op(myjacobian(x, params))
    return params + pinv_jac * r
end
N = 2000
x = collect(range(start = -100, stop = 100, length = N))
params = [5.65, 25.5, 37.23]  # true parameters used to synthesise the data
data = gaussian(x, params)
ini = [0.9, 5.0, 5.0]         # initial guess
@time for i in 1:5000
    # `global` is required here: in a non-interactive script, assigning to
    # `ini` inside a top-level `for` loop is soft-scope and creates a NEW
    # local, so the right-hand side would hit an undefined variable.
    global ini = step(x, data, ini)
end
ini  # final fitted parameters (displayed in a REPL; no effect in a script)

您的Julia代码做了许多不习惯的事情,并且正在恶化您的性能。这不是一个完整的概述,但它应该给你一个开始的好主意。

第一件事是把 params 作为 Vector 传递是一个坏主意。这意味着它必须在堆上分配,而且编译器不知道它的长度。改用 Tuple 可以启用更多的优化。其次,不要让 gaussian 直接接受 Vector 类型的 x;应该编写标量版本,然后通过广播把它应用到整个向量。具体来说,做了这些更改之后,代码如下:

function gaussian(x::Number, params::NTuple{3, Float64})
    # Scalar Gaussian taking its parameters as a stack-allocated NTuple,
    # so the compiler knows the length and element types statically.
    amp, mu, sigma = params

    # The normalisation does not depend on `x` and could be hoisted out of
    # this function; it is kept here to mirror the original code.
    amplitude = amp / (abs(sigma) * sqrt(2 * pi))
    z = (x - mu) / sigma
    return amplitude * exp(-0.5 * z^2)
end

加快速度的一个直接方法是使用 ForwardDiff 而不是 Zygote,因为这里是对一个长度为 3 的向量反复求梯度。在我这里,这一改动把运行时间从 16 秒减少到 3.5 秒,其中最后约 2 倍的提升来自用 Chunk(3) 提高类型稳定性。也许还可以进一步改进。

function myjacobian(x::Vector, params)
    # Row-by-row Jacobian: one forward-mode gradient per data point.
    # Fixing the chunk size at 3 (the number of parameters) keeps the
    # ForwardDiff configuration type-stable.
    # Timings measured for the whole program:
    #   no gradient at all (rand placeholder)     ≈ 0.5 s
    #   Zygote.gradient per row (original)        ≈ 16 s
    #   ForwardDiff.gradient, default config      ≈ 7 s
    #   ForwardDiff.gradient + Chunk(3) (below)   ≈ 3.5 s
    out = zeros(eltype(x), length(x), length(params))
    cfg = ForwardDiff.GradientConfig(nothing, params, ForwardDiff.Chunk(3))
    for (row, xi) in enumerate(x)
        g = ForwardDiff.gradient(p -> gaussian(xi, p), params, cfg)
        # copyto! into a view also works when params is a Tuple.
        copyto!(view(out, row, :), g)
    end
    return out
end
# To apply this to many x with the same params, broadcast with
# gaussian.(x, Ref(params)) elsewhere.
function gaussian(x::Real, params)
    # Indexed access (rather than `amp, mu, sigma = params`) was measured
    # faster with params::Vector in the original answer (16 s vs 19 s).
    amp = params[1]
    mu = params[2]
    sigma = params[3]
    # Convert pi to the type of x so a Float64 constant does not promote
    # dual/other number types passed in for `x`.
    T = typeof(x)
    amplitude = amp / (abs(sigma) * sqrt(2 * T(pi)))
    z = (x - mu) / sigma
    return amplitude * exp(-z^2 / 2)
end

然而,这仍然是在循环中计算许多小的梯度数组。它可以很容易地计算一个大的梯度数组。

虽然Julia通常很乐意将循环编译为快速的东西,但生成单个数组的循环往往是一个坏主意。对于Zygote来说尤其如此,它在类似matlab的全数组代码上是最快的。

下面是它的样子,这一改动使整个程序的运行时间降到了 1 秒以内:

function gaussian(x::Real, amp::Real, mu::Real, sigma::Real)
    # Scalar Gaussian with each parameter as a separate argument, so a
    # caller can broadcast it over whole arrays of amp/mu/sigma at once.
    T = typeof(x)  # keep 2π in the type of x to avoid unwanted promotion
    norm = amp / (abs(sigma) * sqrt(2 * T(pi)))
    z = (x - mu) / sigma
    return norm * exp(-z^2 / 2)
end
function myjacobian2(x::Vector, params)  # with this, 0.9s
    # One big reverse-mode gradient instead of a loop of small ones:
    # replicate each parameter down the rows, then differentiate the summed
    # broadcast w.r.t. all three replicated vectors in a single Zygote call.
    n = length(x)
    amps = fill(params[1], n)
    mus = fill(params[2], n)
    sigmas = fill(params[3], n)  # same sigma in every row, different x per row
    grads = gradient((a, m, s) -> sum(gaussian.(x, a, m, s)), amps, mus, sigmas)
    return hcat(grads...)
end
# Sanity check: the vectorised Jacobian matches the row-by-row one.
myjacobian2(x, params) ≈ myjacobian(x, params)

虽然这对速度影响不大,但我认为你可能还想把 op 写成 `op(jac::Matrix) = Hermitian(jac'*jac) \ jac'`,用左除求解代替显式的 `inv`。

最新更新