fortran中解引用类属性的速度



我希望答案是别担心,编译器会处理的但是我不确定

当我在fortran中自定义类型/类中创建一些方法时,与仅使用a(i) = b(i) + c(i)等数组相比,是否存在由于引用/解引用对象(如this%a(i) = this%b(i) + this%c(i))的字段而导致的性能下降

更复杂的例子:

例如,我有这个函数,它应该在3D网格上插入一个值,这是真正的性能关键(它将在其他3D数组的三重循环中调用)。所以我在想,如果它是更好的(性能)是使用类的方法,或者更确切地说,使一个正常的子程序,以数组为参数。
type grid3D                                         ! 3D grid maps of observables
  real,    dimension (3) :: Rmin, Rmax, Rspan, step ! grid size and spacing (x,y,z)
  integer, dimension (3) :: N                       ! dimension in x,y,z
  real, dimension (3,:, :, :), allocatable :: f     ! array storing values of othe observable
  contains
    procedure :: interpolate => grid3D_interpolate
end type grid3D
function grid3D_interpolate(this, R ) result(ff)
 implicit none
  ! variables
  class (grid3D) :: this
  real, dimension (3), intent (in)   :: R
  real :: ff
  integer ix0,iy0,iz0
  integer ix1,iy1,iz1
  real dx,dy,dz
  real mx,my,mz
  ! function body
  ix0 = int( (R(1)/this%step(1)) + fastFloorOffset ) - fastFloorOffset
  iy0 = int( (R(2)/this%step(2)) + fastFloorOffset ) - fastFloorOffset
  iz0 = int( (R(3)/this%step(3)) + fastFloorOffset ) - fastFloorOffset
  dx = R(1) - x0*this%step(1)
  dy = R(2) - y0*this%step(2)
  dz = R(3) - z0*this%step(3)
  ix0 = modulo( x0   , this%N(1) )+1
  iy0 = modulo( y0   , this%N(2) )+1
  iz0 = modulo( z0   , this%N(3) )+1
  ix1 = modulo( x0+1 , this%N(1) )+1
  iy1 = modulo( y0+1 , this%N(2) )+1
  iz1 = modulo( z0+1 , this%N(3) )+1
  mx=1.0-dx
  my=1.0-dy
  mz=1.0-dz
  ff =    mz*(my*(mx*this%f(ix0,iy0,iz0)     &
                 +dx*this%f(ix1,iy0,iz0))    &
             +dy*(mx*this%f(ix0,iy1,iz0)     &
                 +dx*this%f(ix1,iy1,iz0)))   &
         +dz*(my*(mx*this%f(ix0,iy0,iz1)     &
                 +dx*this%f(ix1,iy0,iz1))    &
             +dy*(mx*this%f(ix0,iy1,iz1)     &
                 +dx*this%f(ix1,iy1,iz1)))
  end if
end function grid3D_interpolate
end module T_grid3Dvec

不完全是。

  • 只要你的代码结构非常清晰(对编译器来说),它可以很容易地优化它。
  • 一旦您的OOP结构变得太复杂,或者解引用的级别变得太大,您可能会从手动解引用方案中获得一些改进。(我经常使用它,尽管通常是为了保持我的代码可读性。但是我曾经在这里有过一点改进,但是使用了>5级解引用的代码。)

下面是一些例子:

module vec_mod
  implicit none
  type t_vector
    real :: x = 0.
    real :: y = 0.
    real :: z = 0.
  end type
  type t_group
    type(t_vector),allocatable :: vecs(:)
  end type
contains
  subroutine sum_vec( vecs, res )
    implicit none
    type(t_vector),intent(in)   :: vecs(:)
    type(t_vector),intent(out)  :: res
    integer                     :: i
    res%x = 0. ; res%y = 0. ; res%z = 0.
    do i=1,size(vecs)
      res%x = res%x + vecs(i)%x
      res%y = res%y + vecs(i)%y
      res%z = res%z + vecs(i)%z
    enddo
  end subroutine
  subroutine sum_vec_ptr( vecs, res )
    implicit none
    type(t_vector),intent(in),target   :: vecs(:)
    type(t_vector),intent(out)         :: res
    integer                            :: i
    type(t_vector),pointer             :: curVec
    res%x = 0. ; res%y = 0. ; res%z = 0.
    do i=1,size(vecs)
      curVec => vecs(i)
      res%x = res%x + curVec%x
      res%y = res%y + curVec%y
      res%z = res%z + curVec%z
    enddo
  end subroutine
  subroutine sum_vecGrp( vecGrp, res )
    implicit none
    type(t_group),intent(in)    :: vecGrp
    type(t_vector),intent(out)  :: res
    integer                     :: i
    res%x = 0. ; res%y = 0. ; res%z = 0.
    do i=1,size(vecGrp%vecs)
      res%x = res%x + vecGrp%vecs(i)%x
      res%y = res%y + vecGrp%vecs(i)%y
      res%z = res%z + vecGrp%vecs(i)%z
    enddo
  end subroutine
  subroutine sum_vecGrp_ptr( vecGrp, res )
    implicit none
    type(t_group),intent(in),target    :: vecGrp
    type(t_vector),intent(out)         :: res
    integer                            :: i
    type(t_vector),pointer             :: curVec, vecs(:)
    res%x = 0. ; res%y = 0. ; res%z = 0.
    vecs => vecGrp%vecs
    do i=1,size(vecs)
      curVec => vecs(i)
      res%x = res%x + curVec%x
      res%y = res%y + curVec%y
      res%z = res%z + curVec%z
    enddo
  end subroutine
end module
program test
  use omp_lib
  use vec_mod
  use,intrinsic :: ISO_Fortran_env
  implicit none
  type(t_vector),allocatable :: vecs(:)
  type(t_vector)             :: res
  type(t_group)              :: vecGrp
  integer,parameter          :: N=100000000
  integer                    :: i, stat
  real(REAL64)               :: t1, t2
  allocate( vecs(N), vecGrp%vecs(N), stat=stat )
  if (stat /= 0) stop 'Cannot allocate memory'
  do i=1,N
    call random_number(vecs(i)%x)
    call random_number(vecs(i)%y)
    call random_number(vecs(i)%z)
  enddo
  print *,''
  print *,'1 Level'
  t1 = omp_get_wtime()
  call sum_vec( vecs, res )
  print *,res
  t2 = omp_get_wtime()
  print *,'Normal  [s]:', t2-t1
  t1 = omp_get_wtime()
  call sum_vec_ptr( vecs, res )
  print *,res
  t2 = omp_get_wtime()
  print *,'Pointer [s]:', t2-t1
  print *,''
  print *,'2 Levels'
  vecGrp%vecs = vecs
  t1 = omp_get_wtime()
  call sum_vecGrp( vecGrp, res )
  print *,res
  t2 = omp_get_wtime()
  print *,'Normal  [s]:', t2-t1
  t1 = omp_get_wtime()
  call sum_vecGrp_ptr( vecGrp, res )
  print *,res
  t2 = omp_get_wtime()
  print *,'Pointer [s]:', t2-t1
end program

使用默认选项(gfortran test.F90 -fopenmp)编译,3是手动解引用的一个小好处,特别是对于两个级别的解引用:

OMP_NUM_THREADS=1 ./a.out 
 1 Level
   16777216.0       16777216.0       16777216.0    
 Normal  [s]:  0.69216769299237058     
   16777216.0       16777216.0       16777216.0    
 Pointer [s]:  0.67321390099823475     
 2 Levels
   16777216.0       16777216.0       16777216.0    
 Normal  [s]:  0.84902219301147852     
   16777216.0       16777216.0       16777216.0    
 Pointer [s]:  0.71247501399193425   

一旦您打开优化(gfortran test.F90 -fopenmp -O3),您可以看到编译器实际上自动地做了更好的工作:

OMP_NUM_THREADS=1 ./a.out 
 1 Level
   16777216.0       16777216.0       16777216.0    
 Normal  [s]:  0.13888958499592263     
   16777216.0       16777216.0       16777216.0    
 Pointer [s]:  0.19099253200693056     
 2 Levels
   16777216.0       16777216.0       16777216.0    
 Normal  [s]:  0.13436777899914887     
   16777216.0       16777216.0       16777216.0    
 Pointer [s]:  0.21104205500159878   

相关内容

  • 没有找到相关文章

最新更新