我希望答案是别担心,编译器会处理的但是我不确定
当我在fortran中自定义类型/类中创建一些方法时,与仅使用a(i) = b(i) + c(i)
等数组相比,是否存在由于引用/解引用对象(如this%a(i) = this%b(i) + this%c(i)
)的字段而导致的性能下降
更复杂的例子:
例如,我有这个函数,它应该在3D网格上插入一个值,这是真正的性能关键(它将在其他3D数组的三重循环中调用)。所以我在想,如果它是更好的(性能)是使用类的方法,或者更确切地说,使一个正常的子程序,以数组为参数。type grid3D ! 3D grid maps of observables
real, dimension (3) :: Rmin, Rmax, Rspan, step ! grid size and spacing (x,y,z)
integer, dimension (3) :: N ! dimension in x,y,z
real, dimension (3,:, :, :), allocatable :: f ! array storing values of othe observable
contains
procedure :: interpolate => grid3D_interpolate
end type grid3D
function grid3D_interpolate(this, R ) result(ff)
implicit none
! variables
class (grid3D) :: this
real, dimension (3), intent (in) :: R
real :: ff
integer ix0,iy0,iz0
integer ix1,iy1,iz1
real dx,dy,dz
real mx,my,mz
! function body
ix0 = int( (R(1)/this%step(1)) + fastFloorOffset ) - fastFloorOffset
iy0 = int( (R(2)/this%step(2)) + fastFloorOffset ) - fastFloorOffset
iz0 = int( (R(3)/this%step(3)) + fastFloorOffset ) - fastFloorOffset
dx = R(1) - x0*this%step(1)
dy = R(2) - y0*this%step(2)
dz = R(3) - z0*this%step(3)
ix0 = modulo( x0 , this%N(1) )+1
iy0 = modulo( y0 , this%N(2) )+1
iz0 = modulo( z0 , this%N(3) )+1
ix1 = modulo( x0+1 , this%N(1) )+1
iy1 = modulo( y0+1 , this%N(2) )+1
iz1 = modulo( z0+1 , this%N(3) )+1
mx=1.0-dx
my=1.0-dy
mz=1.0-dz
ff = mz*(my*(mx*this%f(ix0,iy0,iz0) &
+dx*this%f(ix1,iy0,iz0)) &
+dy*(mx*this%f(ix0,iy1,iz0) &
+dx*this%f(ix1,iy1,iz0))) &
+dz*(my*(mx*this%f(ix0,iy0,iz1) &
+dx*this%f(ix1,iy0,iz1)) &
+dy*(mx*this%f(ix0,iy1,iz1) &
+dx*this%f(ix1,iy1,iz1)))
end if
end function grid3D_interpolate
end module T_grid3Dvec
不完全是。
- 只要你的代码结构非常清晰(对编译器来说),它可以很容易地优化它。
- 一旦您的OOP结构变得太复杂,或者解引用的级别变得太大,您可能会从手动解引用方案中获得一些改进。(我经常使用它,尽管通常是为了保持我的代码可读性。但是我曾经在这里有过一点改进,但是使用了>5级解引用的代码。)
下面是一些例子:
module vec_mod
implicit none
type t_vector
real :: x = 0.
real :: y = 0.
real :: z = 0.
end type
type t_group
type(t_vector),allocatable :: vecs(:)
end type
contains
subroutine sum_vec( vecs, res )
implicit none
type(t_vector),intent(in) :: vecs(:)
type(t_vector),intent(out) :: res
integer :: i
res%x = 0. ; res%y = 0. ; res%z = 0.
do i=1,size(vecs)
res%x = res%x + vecs(i)%x
res%y = res%y + vecs(i)%y
res%z = res%z + vecs(i)%z
enddo
end subroutine
subroutine sum_vec_ptr( vecs, res )
implicit none
type(t_vector),intent(in),target :: vecs(:)
type(t_vector),intent(out) :: res
integer :: i
type(t_vector),pointer :: curVec
res%x = 0. ; res%y = 0. ; res%z = 0.
do i=1,size(vecs)
curVec => vecs(i)
res%x = res%x + curVec%x
res%y = res%y + curVec%y
res%z = res%z + curVec%z
enddo
end subroutine
subroutine sum_vecGrp( vecGrp, res )
implicit none
type(t_group),intent(in) :: vecGrp
type(t_vector),intent(out) :: res
integer :: i
res%x = 0. ; res%y = 0. ; res%z = 0.
do i=1,size(vecGrp%vecs)
res%x = res%x + vecGrp%vecs(i)%x
res%y = res%y + vecGrp%vecs(i)%y
res%z = res%z + vecGrp%vecs(i)%z
enddo
end subroutine
subroutine sum_vecGrp_ptr( vecGrp, res )
implicit none
type(t_group),intent(in),target :: vecGrp
type(t_vector),intent(out) :: res
integer :: i
type(t_vector),pointer :: curVec, vecs(:)
res%x = 0. ; res%y = 0. ; res%z = 0.
vecs => vecGrp%vecs
do i=1,size(vecs)
curVec => vecs(i)
res%x = res%x + curVec%x
res%y = res%y + curVec%y
res%z = res%z + curVec%z
enddo
end subroutine
end module
program test
use omp_lib
use vec_mod
use,intrinsic :: ISO_Fortran_env
implicit none
type(t_vector),allocatable :: vecs(:)
type(t_vector) :: res
type(t_group) :: vecGrp
integer,parameter :: N=100000000
integer :: i, stat
real(REAL64) :: t1, t2
allocate( vecs(N), vecGrp%vecs(N), stat=stat )
if (stat /= 0) stop 'Cannot allocate memory'
do i=1,N
call random_number(vecs(i)%x)
call random_number(vecs(i)%y)
call random_number(vecs(i)%z)
enddo
print *,''
print *,'1 Level'
t1 = omp_get_wtime()
call sum_vec( vecs, res )
print *,res
t2 = omp_get_wtime()
print *,'Normal [s]:', t2-t1
t1 = omp_get_wtime()
call sum_vec_ptr( vecs, res )
print *,res
t2 = omp_get_wtime()
print *,'Pointer [s]:', t2-t1
print *,''
print *,'2 Levels'
vecGrp%vecs = vecs
t1 = omp_get_wtime()
call sum_vecGrp( vecGrp, res )
print *,res
t2 = omp_get_wtime()
print *,'Normal [s]:', t2-t1
t1 = omp_get_wtime()
call sum_vecGrp_ptr( vecGrp, res )
print *,res
t2 = omp_get_wtime()
print *,'Pointer [s]:', t2-t1
end program
使用默认选项(gfortran test.F90 -fopenmp
)编译,3是手动解引用的一个小好处,特别是对于两个级别的解引用:
OMP_NUM_THREADS=1 ./a.out
1 Level
16777216.0 16777216.0 16777216.0
Normal [s]: 0.69216769299237058
16777216.0 16777216.0 16777216.0
Pointer [s]: 0.67321390099823475
2 Levels
16777216.0 16777216.0 16777216.0
Normal [s]: 0.84902219301147852
16777216.0 16777216.0 16777216.0
Pointer [s]: 0.71247501399193425
一旦您打开优化(gfortran test.F90 -fopenmp -O3
),您可以看到编译器实际上自动地做了更好的工作:
OMP_NUM_THREADS=1 ./a.out
1 Level
16777216.0 16777216.0 16777216.0
Normal [s]: 0.13888958499592263
16777216.0 16777216.0 16777216.0
Pointer [s]: 0.19099253200693056
2 Levels
16777216.0 16777216.0 16777216.0
Normal [s]: 0.13436777899914887
16777216.0 16777216.0 16777216.0
Pointer [s]: 0.21104205500159878