On 13/10/2018 8:21 PM, Dominik Gronkiewicz wrote:
Hello,
Here is another variant of the program without "do concurrent" loops.
In case you care about portability, the loop that prints the last (3d)
line is perhaps compact but doesn't look portable or i have an outdated
version (18.4) of the PGI compiler.
Regards,
Ev. Drikos
----------------------------------------------------------
$ gfortran7 -fopenmp -O3 do-concurrent-4.f90
$ ./a.out
size(a)= 1024000
102.71127629280090 passed 1138000
102.71127629280090 passed 750000
102.71127629280090 passed 399000
$ pgfortran -mp=allcores -O3 do-concurrent-4.f90
$ ./a.out
size(a)= 1024000
367.0113028883934 passed 0
367.0113028883934 passed 1
241.2957104444504 passed 0
$ cat do-concurrent-4.f90
use omp_lib
use, intrinsic :: iso_fortran_env
integer , parameter :: threads=4
integer(int64) :: t0, t1
real*4 :: a(1024000)
real*8 :: s=0, s1=0, s2=0, s3=0, s4=0, v(threads)
integer :: i, i1, i2, i3, i4
call random_seed()
call random_number(a)
print *, "size(a)=", size(a)
!-----------------------------------------------
! s = 0
! call system_clock(t0)
! do concurrent (i = 1:1024000)
! s = s + (2 * a(i) - 1)
! end do
! call system_clock(t1)
! print *, s , " passed " , t1 - t0
!-----------------------------------------------
call system_clock(t0)
s = 0
do i = 1, 1024000
s = s + (2 * a(i) - 1)
end do
call system_clock(t1)
print *, s , " passed " , t1 - t0
!-----------------------------------------------
! s=0
! call system_clock(t0)
! s = 2 * sum(a,1) - 1024000 !rounding errors here?
!
! call system_clock(t1)
! print *, s , " passed " , t1 - t0
call system_clock(t0)
!$omp parallel num_threads(4)
select case( omp_get_thread_num() )
case ( 0 )
s1=0
DO i1 = 1,1024000,4
s1 = s1 + (2 * a(i1) - 1)
end do
case ( 1 )
s2=0
DO i2 = 2,1024000,4
s2 = s2 + (2 * a(i2) - 1)
end do
case ( 2 )
s3=0
DO i3 = 3,1024000,4
s3 = s3 + (2 * a(i3) - 1)
end do
case ( 3 )
s4=0
DO i4 = 4,1024000,4
s4 = s4 + (2 * a(i4) - 1)
end do
case default
print *, "error thread number"
end select
!$omp end parallel
s=s1+s2+s3+s4
call system_clock(t1)
print *, s , " passed " , t1 - t0
!-----------------------------------------------
call system_clock(t0)
!$omp parallel num_threads(threads)
associate ( p => v( omp_get_thread_num() + 1 ) )
p = 0
DO i1 = omp_get_thread_num()+1,1024000,threads
p = p + (2 * a(i1) - 1)
end do
end associate
!$omp end parallel
s=sum(v)
call system_clock(t1)
print *, s , " passed " , t1 - t0
!-----------------------------------------------
end