Hi all,
I made a simple comparison between three derived types that wrap, respectively: 1) a static array, 2) an allocatable array, and 3) a pointer array.
For the pointer array I avoided the side effect, so it is very similar to the allocatable one (although their performance is not identical).
The code used is the following:
module derived_static
  implicit none

  ! Wrapper whose array component has a fixed (static) size of three
  ! elements, accompanied by one integer component.
  type :: wrapper_s
    real(8), dimension(3) :: array
    integer :: scalar
  end type wrapper_s

  ! "lhs = rhs" between two wrapper_s values goes through static_assignment.
  interface assignment(=)
    module procedure static_assignment
  end interface

  ! "x + y" between two wrapper_s values goes through static_add.
  interface operator(+)
    module procedure static_add
  end interface

contains

  ! Defined assignment: copy both components of b into a.
  !   a : destination, overwritten component by component
  !   b : source, left untouched
  subroutine static_assignment(a, b)
    type(wrapper_s), intent(inout) :: a
    type(wrapper_s), intent(in)    :: b

    a%scalar   = b%scalar
    a%array(:) = b%array(:)
  end subroutine static_assignment

  ! Defined addition: returns a wrapper whose array is the element-wise sum
  ! of the operands' arrays and whose scalar is the sum of their scalars.
  function static_add(a, b) result(total)
    type(wrapper_s), intent(in) :: a, b
    type(wrapper_s) :: total

    total%scalar   = a%scalar + b%scalar
    total%array(:) = a%array(:) + b%array(:)
  end function static_add

end module derived_static
module derived_allocatable
  implicit none

  ! Wrapper whose array component is allocatable (size chosen at run time),
  ! accompanied by one integer component.
  type :: wrapper_a
    real(8), dimension(:), allocatable :: array
    integer :: scalar
  end type wrapper_a

  ! "lhs = rhs" between two wrapper_a values goes through
  ! allocatable_assignment.
  interface assignment(=)
    module procedure allocatable_assignment
  end interface

  ! "x + y" between two wrapper_a values goes through allocatable_add.
  interface operator(+)
    module procedure allocatable_add
  end interface

contains

  ! Defined assignment: make a a deep copy of b.
  !   a : destination; its array is (re)allocated here when it is
  !       unallocated or has a different size from b's array
  !   b : source, left untouched
  !
  ! The allocation is done explicitly instead of relying on automatic
  ! reallocation of the left-hand side, because some compilers only provide
  ! that Fortran 2003 behaviour when a compiler option is enabled.
  ! An unallocated source leaves the destination array unallocated, which
  ! mirrors intrinsic derived-type assignment.
  subroutine allocatable_assignment(a, b)
    type(wrapper_a), intent(inout) :: a
    type(wrapper_a), intent(in)    :: b

    if (allocated(b%array)) then
      if (allocated(a%array)) then
        if (size(a%array) /= size(b%array)) deallocate(a%array)
      end if
      if (.not. allocated(a%array)) then
        allocate(a%array(lbound(b%array, 1):ubound(b%array, 1)))
      end if
      ! Shapes are guaranteed to match here; (:) suppresses any
      ! reallocation check on the left-hand side.
      a%array(:) = b%array
    else if (allocated(a%array)) then
      deallocate(a%array)
    end if
    a%scalar = b%scalar
  end subroutine allocatable_assignment

  ! Defined addition: returns a wrapper whose array is the element-wise sum
  ! of the operands' arrays and whose scalar is the sum of their scalars.
  ! Both operands must have allocated arrays of the same size; the result
  ! array is allocated with the size of a's array.
  function allocatable_add(a, b) result(c)
    type(wrapper_a), intent(in) :: a, b
    type(wrapper_a) :: c

    allocate(c%array(size(a%array)))
    c%array(:) = a%array + b%array
    c%scalar = a%scalar + b%scalar
  end function allocatable_add

end module derived_allocatable
module derived_pointer
  implicit none

  ! Wrapper whose array component is a pointer (initially disassociated),
  ! accompanied by one integer component.
  type :: wrapper_p
    real(8), dimension(:), pointer :: array => null ()
    integer :: scalar
  end type wrapper_p

  ! "lhs = rhs" between two wrapper_p values goes through pointer_assignment.
  interface assignment(=)
    module procedure pointer_assignment
  end interface

  ! "x + y" between two wrapper_p values goes through pointer_add.
  interface operator(+)
    module procedure pointer_add
  end interface

contains

  ! Defined assignment: copy the VALUES of b's array into a's array (no
  ! pointer association is shared, so a and b never alias afterwards).
  !   a : destination; if its array pointer is not associated, fresh storage
  !       of the source's size is allocated first, so that assigning to a
  !       default-initialised wrapper does not write through a null pointer
  !   b : source; its array pointer must be associated
  ! When a's array is already associated it must have the same size as b's.
  subroutine pointer_assignment(a, b)
    type(wrapper_p), intent(inout) :: a
    type(wrapper_p), intent(in)    :: b

    if (.not. associated(a%array)) then
      allocate(a%array(size(b%array)))
    end if
    a%array = b%array
    a%scalar = b%scalar
  end subroutine pointer_assignment

  ! Defined addition: returns a wrapper whose array is the element-wise sum
  ! of the operands' arrays and whose scalar is the sum of their scalars.
  ! Both operands must have associated arrays of the same size.
  !
  ! NOTE: the storage allocated here for the result is never released by
  ! this module. In "x = y + z" the defined assignment copies the values out
  ! of the temporary result, and nothing deallocates the temporary's pointer
  ! target afterwards, so each evaluation of "+" leaks one array.
  function pointer_add(a, b) result(c)
    type(wrapper_p), intent(in) :: a, b
    type(wrapper_p) :: c

    allocate(c%array(size(a%array)))
    c%array = a%array + b%array
    c%scalar = a%scalar + b%scalar
  end function pointer_add

end module derived_pointer
program main
  ! Benchmark: times "c = a + b" over N wrapper objects, repeated R times,
  ! for four variants:
  !   1. static-array wrapper, components added explicitly (no operators)
  !   2. static-array wrapper, overloaded = and +
  !   3. allocatable-array wrapper, overloaded = and +
  !   4. pointer-array wrapper, overloaded = and +
  ! For each variant it prints the average CPU time of one sweep over the N
  ! objects, followed by the first element of the a, b and c arrays.
  use derived_static
  use derived_allocatable
  use derived_pointer
  implicit none
  type(wrapper_s), dimension(:), allocatable :: sa, sb, sc
  type(wrapper_a), dimension(:), allocatable :: aa, ab, ac
  type(wrapper_p), dimension(:), allocatable :: pa, pb, pc
  real :: partial(1:2)                    ! CPU_TIME readings: start, stop
  ! N: number of wrapper objects, M: array length per object, R: repetitions
  integer, parameter :: N=100000, M=3, R=500
  integer :: i, j

  ! initializing
  allocate (sa(N), sb(N), sc(N))
  allocate (aa(N), ab(N), ac(N))
  allocate (pa(N), pb(N), pc(N))
  do i = 1, N
    allocate (aa(i)%array(M), ab(i)%array(M), ac(i)%array(M))
    allocate (pa(i)%array(M), pb(i)%array(M), pc(i)%array(M))
    ! array
    sa(i)%array = [1._8*i, 2._8*i, 3._8*i]
    sb(i)%array = [4._8*i, 5._8*i, 6._8*i]
    aa(i)%array = [1._8*i, 2._8*i, 3._8*i]
    ab(i)%array = [4._8*i, 5._8*i, 6._8*i]
    pa(i)%array = [1._8*i, 2._8*i, 3._8*i]
    pb(i)%array = [4._8*i, 5._8*i, 6._8*i]
    ! scalar
    sa(i)%scalar = i
    sb(i)%scalar = 2*i
    aa(i)%scalar = i
    ab(i)%scalar = 2*i
    pa(i)%scalar = i
    pb(i)%scalar = 2*i
  end do

  ! testing static without overloaded operators
  call cpu_time(partial(1))
  do j = 1, R
    do i = 1, N
      sc(i)%array = sa(i)%array + sb(i)%array
      sc(i)%scalar = sa(i)%scalar + sb(i)%scalar
    end do
  end do
  call cpu_time(partial(2))
  call report_timing('Static without overloaded operators', partial(1), partial(2))
  call print_sample(sa(1)%array, sa(1)%scalar)
  call print_sample(sb(1)%array, sb(1)%scalar)
  call print_sample(sc(1)%array, sc(1)%scalar)

  ! testing static
  call cpu_time(partial(1))
  do j = 1, R
    do i = 1, N
      sc(i) = sa(i) + sb(i)
    end do
  end do
  call cpu_time(partial(2))
  call report_timing('Static', partial(1), partial(2))
  call print_sample(sa(1)%array, sa(1)%scalar)
  call print_sample(sb(1)%array, sb(1)%scalar)
  call print_sample(sc(1)%array, sc(1)%scalar)

  ! testing allocatable
  call cpu_time(partial(1))
  do j = 1, R
    do i = 1, N
      ac(i) = aa(i) + ab(i)
    end do
  end do
  call cpu_time(partial(2))
  call report_timing('Allocatable', partial(1), partial(2))
  call print_sample(aa(1)%array, aa(1)%scalar)
  call print_sample(ab(1)%array, ab(1)%scalar)
  call print_sample(ac(1)%array, ac(1)%scalar)

  ! testing pointer
  call cpu_time(partial(1))
  do j = 1, R
    do i = 1, N
      pc(i) = pa(i) + pb(i)
    end do
  end do
  call cpu_time(partial(2))
  call report_timing('Pointer', partial(1), partial(2))
  call print_sample(pa(1)%array, pa(1)%scalar)
  call print_sample(pb(1)%array, pb(1)%scalar)
  call print_sample(pc(1)%array, pc(1)%scalar)

  ! Pointer targets are not released automatically when the enclosing
  ! allocatable arrays go away, so free the ones allocated during
  ! initialization explicitly. (The temporaries allocated inside the
  ! overloaded "+" for wrapper_p are not reachable from here.)
  do i = 1, N
    deallocate (pa(i)%array, pb(i)%array, pc(i)%array)
  end do

  stop

contains

  ! Print the variant's label followed by the average CPU time per
  ! repetition, i.e. (t_stop - t_start) divided by R.
  subroutine report_timing(label, t_start, t_stop)
    character(len=*), intent(in) :: label
    real, intent(in) :: t_start, t_stop

    print '(A)', label
    print '(E13.6)', (t_stop - t_start)/R
  end subroutine report_timing

  ! Print one wrapper's array and scalar on a single line. Used only for
  ! element 1, whose scalars are single-digit, hence the I1 edit descriptor.
  subroutine print_sample(array, scalar)
    real(8), intent(in) :: array(:)
    integer, intent(in) :: scalar

    print '(3F8.1,1X,I1)', array, scalar
  end subroutine print_sample

end program main
I have tested Intel fortran 12.1.5 and GNU gfortran 4.6.3. The results on my workstation are (I omit the output for the side-effect check, reporting only the time values):
Intel compiler
Intel (-O0 compiling):
Static without overloaded operators
0.276817E-02
Static
0.384024E-02
Allocatable
0.183371E-01
Pointer
0.207293E-01
Intel (-O3 compiling):
Static without overloaded operators
0.000000E+00
Static
0.888056E-03
Allocatable
0.839252E-02
Pointer
0.101606E-01
It seems that the optimization, at least with a generic -O3, performs (slightly) better with the allocatable array than with the pointer array. The static version is always about one order of magnitude faster than the others, and when the overhead of the overloaded operators is eliminated the sum is so fast that the loop is not long enough for its time to be measured with CPU_TIME.
For the GNU compiler the comments are almost identical:
gfortran (-O0):
Static without overloaded operators
0.140009E-02
Static
0.255216E-02
Allocatable
0.103927E-01
Pointer
0.116887E-01
gfortran (-O3):
Static without overloaded operators
0.848054E-03
Static
0.880056E-03
Allocatable
0.696043E-02
Pointer
0.705644E-02
Note that gfortran's -O3 optimization is not as effective as ifort's for the static case without overloaded operators.
My conclusions are:
1) when the side effect is used, a pointer array inside a derived type is efficient, but it makes the code confusing and unsafe;
2) allocatable and pointer (without side effect) arrays inside a derived type are about one order of magnitude slower than the static version;
3) even with the static-array derived type, the overloaded operators have a non-negligible overhead.
I am now thinking of avoiding overloaded operators completely in favor of explicit component-wise computations: less clear, but more efficient.
See you soon.