下面的代码是在“OpenMP 与 BLAS”示例的基础上扩展而来的：
!> Benchmark for the batched product C(:,:,d) = A * B(:,:,d), d = 1..nd.
!>
!> The last dimension is cut into one contiguous slab per OpenMP thread and
!> every thread multiplies its slab with a single DGEMM call, so the program
!> is meant to be linked against a sequential (non-threaded) BLAS.
!> It reads the requested thread count and the sizes na, nb, nc, nd from
!> standard input and prints the wall-clock time of one sweep, averaged over
!> m_iter repetitions.
program bench_dgemm
  use, intrinsic :: iso_fortran_env, only: li => int64, error_unit
  use omp_lib, only: omp_get_num_threads, omp_get_thread_num, &
                     omp_set_num_threads
  implicit none

  integer, parameter :: dp = selected_real_kind(15, 307)
  integer, parameter :: m_iter = 30        ! number of timed repetitions

  real(dp), allocatable :: a(:, :)         ! A(na, nb)
  real(dp), allocatable :: b(:, :, :)      ! B(nb, nc, nd)
  real(dp), allocatable :: c(:, :, :)      ! C(na, nc, nd)
  integer :: na, nb, nc, nd, m
  integer(li) :: start, finish, rate
  integer :: numthreads
  integer :: ithr, nthr, istart, iend, ncols
  real(dp) :: sum_time
  external :: dgemm                        ! BLAS routine, implicit interface

  write (*, *) 'numthreads'
  read (*, *) numthreads
  if (numthreads < 1) then
    write (error_unit, *) 'bench_dgemm: numthreads must be >= 1'
    error stop 1
  end if
  call omp_set_num_threads(numthreads)

  write (*, *) 'na, nb, nc, nd ?'
  read (*, *) na, nb, nc, nd
  ! DGEMM requires lda/ldb/ldc >= 1, so empty problems are rejected up front.
  if (min(na, nb, nc, nd) < 1) then
    write (error_unit, *) 'bench_dgemm: na, nb, nc, nd must all be >= 1'
    error stop 1
  end if

  allocate (a(1:na, 1:nb))
  allocate (b(1:nb, 1:nc, 1:nd))
  allocate (c(1:na, 1:nc, 1:nd))

  ! A[a,b] * B[b,c,d] = C[a,c,d]
  call random_number(a)
  call random_number(b)
  c = 0.0_dp

  write (*, *) 'm_iter average', m_iter
  write (*, *) 'numthreads', numthreads

  ! The tick rate does not change between calls; query it once and make sure
  ! a clock is actually available before dividing by it.
  call system_clock(count_rate=rate)
  if (rate <= 0_li) then
    write (error_unit, *) 'bench_dgemm: no usable system clock'
    error stop 1
  end if

  sum_time = 0.0_dp
  do m = 1, m_iter
    call system_clock(start)

    !$omp parallel default(none) shared(a, b, c, na, nb, nc, nd) &
    !$omp private(ithr, nthr, istart, iend, ncols)
    ithr = omp_get_thread_num()
    ! Partition by the team size actually granted: the runtime may provide
    ! fewer threads than requested, and dividing by the requested count would
    ! then leave the trailing slabs of c uncomputed.
    nthr = omp_get_num_threads()
    istart = (ithr * nd) / nthr
    iend = ((ithr + 1) * nd) / nthr
    ! Slabs istart+1..iend are contiguous in memory, so they can be treated
    ! as one nb x (nc * nslabs) matrix and multiplied with a single call.
    ncols = nc * (iend - istart)
    if (ncols > 0) then      ! threads without work (nthr > nd) skip the call
      call dgemm('N', 'N', na, ncols, nb, 1.0_dp, a, na, &
                 b(1, 1, 1 + istart), size(b, dim=1), &
                 0.0_dp, c(1, 1, 1 + istart), size(c, dim=1))
    end if
    !$omp end parallel

    call system_clock(finish)
    sum_time = sum_time + real(finish - start, dp) / real(rate, dp)
  end do

  write (*, *) 'Time for dgemm', sum_time / m_iter
end program bench_dgemm
假设该文件名为 bench.f90。我先用 ifort bench.f90 -o bench -qopenmp -mkl=sequential 编译，然后运行 bench。
取 na=nb=nc=nd=200，并让 numthreads 依次取 1、2、4、8、16、32，得到如下结果（第一列为线程数）：
1 Time for dgemm 4.053670000000001E-002
2 Time for dgemm 2.087716666666666E-002
4 Time for dgemm 1.082136666666667E-002
8 Time for dgemm 5.819133333333333E-003
16 Time for dgemm 4.304533333333333E-003
32 Time for dgemm 5.269366666666666E-003
我还尝试了 gfortran bench.f90 -o bench -fopenmp -lopenblas，得到了（第一列同样为线程数）：
1 Time for dgemm 0.13534268956666665
2 Time for dgemm 6.9672616866666662E-002
4 Time for dgemm 3.5927094433333334E-002
8 Time for dgemm 1.8583297666666668E-002
16 Time for dgemm 1.1969903900000000E-002
32 Time for dgemm 1.9136184166666667E-002
OpenMP 版本在 32 个线程时（机器为 Intel(R) Xeon(R) Gold 6148 CPU @ 2.40GHz，双路，因此共有 40 个核心）速度反而下降了。我认为这里被拆分的指标是矩阵乘法中的外层指标：与 A[a,b] B[b,c] 类似，该代码把 c 拆分为若干段，因此应该可以直接并行。那么，为什么性能不能一直提升到 32 核？（如果 bench.f90 中 c 的维度只有 30，我可以想象 32 核不会有帮助。）
与 OpenMP 以及理想的加速比相比，MPI 是否会有更好的性能？
我们在自己的环境中运行了您分享的示例代码，结果如下。请尝试执行 "setenv OMP_PROC_BIND true"（或在所用的 shell 中以 export 方式设置同一环境变量），这对您的情况应该会有所帮助。
 numthreads
1
 na, nb, nc, nd ?
200 200 200 200
 m_iter average 30
 numthreads 1
MKL_VERBOSE oneMKL 2022.0 Product build 20211112 for Intel(R) 64 architecture Intel(R) ……（此处原文被截断）
MKL_VERBOSE DGEMM(N,N,200,40000,200,0x4b1490,0x1488e659b240,200,0x1488d7cfa280,200,0x4b1498,0x1488d3cf92c0,200) 113.23ms CNR:OFF Dyn:1 FastMM:1
MKL_VERBOSE DGEMM(N,N,200,40000,200,0x4b1490,0x1488e659b240,200,0x1488d7cfa280,200,0x4b1498,0x1488d3cf92c0,200) 113.50ms CNR:OFF Dyn:1 FastMM:1
MKL_VERBOSE DGEMM(N,N,200,40000,200,0x4b1490,0x1488e659b240,200,0x1488d7cfa280,200,0x4b1498,0x1488d3cf92c0,200) 113.66ms CNR:OFF Dyn:1 FastMM:1
MKL_VERBOSE DGEMM(N,N,200,40000,200,0x4b1490,0x1488e659b240,200,0x1488d7cfa280,200,0x4b1498,0x1488d3cf92c0,200) 113.68ms CNR:OFF Dyn:1 FastMM:1
MKL_VERBOSE DGEMM(N,N,200,40000,200,0x4b1490,0x1488e659b240,200,0x1488d7cfa280,200,0x4b1498,0x1488d3cf92c0,200) 113.64ms CNR:OFF Dyn:1 FastMM:1
MKL_VERBOSE DGEMM(N,N,200,40000,200,0x4b1490,0x1488e659b240,200,0x1488d7cfa280,200,0x4b1498,0x1488d3cf92c0,200) 113.63ms CNR:OFF Dyn:1 FastMM:1
MKL_VERBOSE DGEMM(N,N,200,40000,200,0x4b1490,0x1488e659b240,200,0x1488d7cfa280,200,0x4b1498,0x1488d3cf92c0,200) 113.67ms CNR:OFF Dyn:1 FastMM:1
MKL_VERBOSE DGEMM(N,N,200,40000,200,0x4b1490,0x1488e659b240,200,0x1488d7cfa280,200,0x4b1498,0x1488d3cf92c0,200) 113.71ms CNR:OFF Dyn:1 FastMM:1
MKL_VERBOSE DGEMM(N,N,200,40000,200,0x4b1490,0x1488e659b240,200,0x1488d7cfa280,200,0x4b1498,0x1488d3cf92c0,200) 113.74ms CNR:OFF Dyn:1 FastMM:1
MKL_VERBOSE DGEMM(N,N,200,40000,200,0x4b1490,0x1488e659b240,200,0x1488d7cfa280,200,0x4b1498,0x1488d3cf92c0,200) 113.68ms CNR:OFF Dyn:1 FastMM:1
MKL_VERBOSE DGEMM(N,N,200,40000,200,0x4b1490,0x1488e659b240,200,0x1488d7cfa280,200,0x4b1498,0x1488d3cf92c0,200) 113.65ms CNR:OFF Dyn:1 FastMM:1
MKL_VERBOSE DGEMM(N,N,200,40000,200,0x4b1490,0x1488e659b240,200,0x1488d7cfa280,200,0x4b1498,0x1488d3cf92c0,200) 113.71ms CNR:OFF Dyn:1 FastMM:1
MKL_VERBOSE DGEMM(N,N,200,40000,200,0x4b1490,0x1488e659b240,200,0x1488d7cfa280,200,0x4b1498,0x1488d3cf92c0,200) 113.68ms CNR:OFF Dyn:1 FastMM:1
MKL_VERBOSE DGEMM(N,N,200,40000,200,0x4b1490,0x1488e659b240,200,0x1488d7cfa280,200,0x4b1498,0x1488d3cf92c0,200) 113.67ms CNR:OFF Dyn:1 FastMM:1
MKL_VERBOSE DGEMM(N,N,200,40000,200,0x4b1490,0x1488e659b240,200,0x1488d7cfa280,200,0x4b1498,0x1488d3cf92c0,200) 116.28ms CNR:OFF Dyn:1 FastMM:1
MKL_VERBOSE DGEMM(N,N,200,40000,200,0x4b1490,0x1488e659b240,200,0x1488d7cfa280,200,0x4b1498,0x1488d3cf92c0,200) 143.58ms CNR:OFF Dyn:1 FastMM:1
MKL_VERBOSE DGEMM(N,N,200,40000,200,0x4b1490,0x1488e659b240,200,0x1488d7cfa280,200,0x4b1498,0x1488d3cf92c0,200) 105.96ms CNR:OFF Dyn:1 FastMM:1
MKL_VERBOSE DGEMM(N,N,200,40000,200,0x4b1490,0x1488e659b240,200,0x1488d7cfa280,200,0x4b1498,0x1488d3cf92c0,200) 105.98ms CNR:OFF Dyn:1 FastMM:1
MKL_VERBOSE DGEMM(N,N,200,40000,200,0x4b1490,0x1488e659b240,200,0x1488d7cfa280,200,0x4b1498,0x1488d3cf92c0,200) 106.06ms CNR:OFF Dyn:1 FastMM:1
MKL_VERBOSE DGEMM(N,N,200,40000,200,0x4b1490,0x1488e659b240,200,0x1488d7cfa280,200,0x4b1498,0x1488d3cf92c0,200) 105.9ms CNR:OFF Dyn:1 FastMM:1
MKL_VERBOSE DGEMM(N,N,200,40000,200,0x4b1490,0x1488e659b240,200,0x1488d7cfa280,200,0x4b1498,0x1488d3cf92c0,200) 106.12ms CNR:OFF Dyn:1 FastMM:1
MKL_VERBOSE DGEMM(N,N,200,40000,200,0x4b1490,0x1488e659b240,200,0x1488d7cfa280,200,0x4b1498,0x1488d3cf92c0,200) 106.06ms CNR:OFF Dyn:1 FastMM:1
MKL_VERBOSE DGEMM(N,N,200,40000,200,0x4b1490,0x1488e659b240,200,0x1488d7cfa280,200,0x4b1498,0x1488d3cf92c0,200) 106.01ms CNR:OFF Dyn:1 FastMM:1
MKL_VERBOSE DGEMM(N,N,200,40000,200,0x4b1490,0x1488e659b240,200,0x1488d7cfa280,200,0x4b1498,0x1488d3cf92c0,200) 105.93ms CNR:OFF Dyn:1 FastMM:1
MKL_VERBOSE DGEMM(N,N,200,40000,200,0x4b1490,0x1488e659b240,200,0x1488d7cfa280,200,0x4b1498,0x1488d3cf92c0,200) 106.08ms CNR:OFF Dyn:1 FastMM:1
MKL_VERBOSE DGEMM(N,N,200,40000,200,0x4b1490,0x1488e659b240,200,0x1488d7cfa280,200,0x4b1498,0x1488d3cf92c0,200) 106.07ms CNR:OFF Dyn:1 FastMM:1
MKL_VERBOSE DGEMM(N,N,200,40000,200,0x4b1490,0x1488e659b240,200,0x1488d7cfa280,200,0x4b1498,0x1488d3cf92c0,200) 106.09ms CNR:OFF Dyn:1 FastMM:1
MKL_VERBOSE DGEMM(N,N,200,40000,200,0x4b1490,0x1488e659b240,200,0x1488d7cfa280,200,0x4b1498,0x1488d3cf92c0,200) 106.10ms CNR:OFF Dyn:1 FastMM:1
MKL_VERBOSE DGEMM(N,N,200,40000,200,0x4b1490,0x1488e659b240,200,0x1488d7cfa280,200,0x4b1498,0x1488d3cf92c0,200) 106.03ms CNR:OFF Dyn:1 FastMM:1
MKL_VERBOSE DGEMM(N,N,200,40000,200,0x4b1490,0x1488e659b240,200,0x1488d7cfa280,200,0x4b1498,0x1488d3cf92c0,200) 106.05ms CNR:OFF Dyn:1 FastMM:1
 Time for dgemm 0.116057933333333