I have a problem that goes far beyond my skills...
My program consists of a general loop that first performs a kind of
bisection (which is also a loop) and then writes the results to multiple
files.
If this general loop is of length 100, there is no problem and my
program quits normally. If I set the length to 1000, the program exits
after a while (after 661 iterations of the loop, to be precise) before
finishing and prints "Segmentation fault" (just that).
The problem doesn't seem to be in my own code; it occurs when opening
the files (at the end of the loop). I use ifort and I wonder if anyone
has had the same problem.
Here is the stack after the segfault (given by gdb) :
-------------------------------------------------------------
Program received signal SIGSEGV, Segmentation fault.
0x000000000045835d in _intel_fast_memcpy.A ()
(gdb) bt full
#0 0x000000000045835d in _intel_fast_memcpy.A ()
No symbol table info available.
#1 0x0000000000414925 in for__compute_filename ()
No symbol table info available.
#2 0x00000000004160b1 in for__open_proc ()
No symbol table info available.
#3 0x0000000000412b62 in for_open ()
No symbol table info available.
#4 0x000000000040ae57 in scprob_muvar () at scprob_muvar1.4g.f:625
nichar = Invalid C/C++ type code 13 in symbol table.
-------------------------------------------------------------
and the stack given by idb :
-------------------------------------------------------------
Program received signal SIGBUS
_pthread_self () in /usr/lib/libSystem.B.dylib
(idb) where
#0 0x00000001028db79e in _pthread_self () in /usr/lib/libSystem.B.dylib
#1 0x0000003000000018
(idb) bt full
#0 0x00000001028db79e in _pthread_self () in /usr/lib/libSystem.B.dylib
No locals.
#1 0x0000003000000018
(idb) bt
#0 0x00000001028db79e in _pthread_self () in /usr/lib/libSystem.B.dylib
#1 0x0000003000000018
-------------------------------------------------------------
I use this version of ifort :
-------------------------------------------------------------
Intel(R) Fortran Compiler for Intel(R) EM64T-based applications, Version
9.1 Build 20060707 Package ID: l_fc_c_9.1.036
-------------------------------------------------------------
Here is the portion of code where the error occurs (it is a loop
included in the more general loop I mentioned above) :
-------------------------------------------------------------
c     Data processing: for each index n, write the results for the
c     current xf to the per-n output files.  Files are created when
c     xf equals c-b and opened in append mode otherwise.
c     nichar (length 4) and amuchar (length 3) form the file names.
c     NOTE(review): xf.ne.c-b is an exact floating-point comparison;
c     if xf never equals c-b exactly, the status='old' opens fail.
c     NOTE(review): no OPEN has IOSTAT= or ERR=, so a failed open
c     is not reported by this code -- consider adding one.
      do n=1,nmax
c        Zero-padded 4-digit decimal tag of n (assumes n <= 9999).
c        Replaces the hand-written quad-precision digit extraction.
         write(nichar,'(i4.4)') n
         print*, nichar,n
         print*, 'Opening info files...'
         if (xf.ne.c-b) then
            open(13,file='inters'//nichar//'.mu='//amuchar//'.dat'
     $           ,status='old',position='append')
            open(14,file='acti'//nichar//'.mu='//amuchar//'.dat'
     $           ,status='old',position='append')
            open(15,file='deti'//nichar//'.mu='//amuchar//'.dat'
     $           ,status='old',position='append')
            open(16,file='contrib.cl'//nichar//'.mu='//amuchar//'.dat'
     $           ,status='old',position='append')
            open(17,file='contrib.r'//nichar//'.mu='//amuchar//'.dat'
     $           ,status='old',position='append')
            open(18,file='contrib.c'//nichar//'.mu='//amuchar//'.dat'
     $           ,status='old',position='append')
            open(19,file='cl'//nichar//'.mu='//amuchar//'.dat'
     $           ,status='old',position='append')
         else
            open(13,file='inters'//nichar//'.mu='//amuchar//'.dat'
     $           ,status='new')
            open(14,file='acti'//nichar//'.mu='//amuchar//'.dat'
     $           ,status='new')
            open(15,file='deti'//nichar//'.mu='//amuchar//'.dat'
     $           ,status='new')
            open(16,file='contrib.cl'//nichar//'.mu='//amuchar//'.dat'
     $           ,status='new')
            open(17,file='contrib.r'//nichar//'.mu='//amuchar//'.dat'
     $           ,status='new')
            open(18,file='contrib.c'//nichar//'.mu='//amuchar//'.dat'
     $           ,status='new')
            open(19,file='cl'//nichar//'.mu='//amuchar//'.dat'
     $           ,status='new')
         endif
c        Accumulated sums (Pr, Pc, Pcl), largest single contribution
c        seen so far (P*mc) and the index i where it occurred (i*mc;
c        0 means no contribution of that kind was found).
         Pr=0.q0
         Pc=0.q0
         Pcl=0.q0
         Prmc=0.q0
         Pcmc=0.q0
         Pclmc=0.q0
         iclmc=0
         icmc=0
         irmc=0
         print*, 'Computing contributions and probabilities'
c        nint is a user array or function indexed by n here, not the
c        NINT intrinsic -- presumably the intersection count for n.
         do i=1,nint(n)
            print*, 'for intersection',i
            if (di(n,i).gt.0.q0) then
c              One value feeds both the classical and the real sums.
               Pcont=(1./qsqrt(2*cpi*eps*di(n,i)))
     $              *qexp(-Wi(n,i)/eps)
               if (cl(n,i)) then
                  print*, 'classical'
                  Pcl=Pcl+Pcont
                  write(16,*) dbleq(xf),dbleq(Pcont),i
                  if (Pcont.gt.Pclmc) then
                     Pclmc=Pcont
                     iclmc=i
                  endif
               endif
               print*, 'real'
               Pr=Pr+Pcont
               write(17,*) dbleq(xf),dbleq(Pcont),i
               if (Pcont.gt.Prmc) then
                  Prmc=Pcont
                  irmc=i
               endif
            elseif (di(n,i).lt.0.) then
               print*, 'complex'
c              Sign of di is flipped so the square root stays real.
               Pcont=(1./qsqrt(-2*cpi*eps*di(n,i)))
     $              *qexp(-Wi(n,i)/eps)
               Pc=Pc+Pcont
               write(18,*) dbleq(xf),dbleq(Pcont),i
               if (Pcont.gt.Pcmc) then
                  Pcmc=Pcont
                  icmc=i
               endif
            endif
c           Per-intersection records, written for every i.
            write(13,*) dbleq(xf),dbleq(yi(n,i)),i
            write(14,*) dbleq(xf),dbleq(Wi(n,i)),i
            write(15,*) dbleq(xf),dbleq(di(n,i)),i
            write(19,*) dbleq(xf),i,cl(n,i)
         enddo
         print*, 'Computing finished'
         close(13)
         close(14)
         close(15)
         close(16)
         close(17)
         close(18)
         close(19)
         print*, 'Info files closed'
         print*, 'Opening contributions files...'
c        Units 13-15 are reused for the largest-contribution files.
         if (xf.ne.c-b) then
            open(13,file='tclmc'//nichar//'.mu='//amuchar//'.dat'
     $           ,status='old',position='append')
            open(14,file='trmc'//nichar//'.mu='//amuchar//'.dat'
     $           ,status='old',position='append')
            open(15,file='tcmc'//nichar//'.mu='//amuchar//'.dat'
     $           ,status='old',position='append')
         else
            open(13,file='tclmc'//nichar//'.mu='//amuchar//'.dat'
     $           ,status='new')
            open(14,file='trmc'//nichar//'.mu='//amuchar//'.dat'
     $           ,status='new')
            open(15,file='tcmc'//nichar//'.mu='//amuchar//'.dat'
     $           ,status='new')
         endif
         print*, 'Writing contributions'
c        A record is written only when a contribution was found.
         if (iclmc.ne.0) write(13,*) dbleq(xf),dbleq(yi(n,iclmc))
         if (irmc.ne.0) write(14,*) dbleq(xf),dbleq(yi(n,irmc))
         if (icmc.ne.0) write(15,*) dbleq(xf),dbleq(yi(n,icmc))
         close(13)
         close(14)
         close(15)
         print*, 'Contributions files closed'
         print*, 'Opening probabilities files and writing...'
c        Progress labels now name the file actually being opened
c        (scprobr = real, scprobc = complex, scprobcl = classic);
c        they were previously shifted by one file.
         if (xf.ne.c-b) then
            print*, 'real'
            open(12,file='scprobr'//nichar//'.mu='//amuchar//'.dat'
     $           ,status='old',position='append')
         else
            open(12,file='scprobr'//nichar//'.mu='//amuchar//'.dat'
     $           ,status='new')
         endif
         write(12,*) dbleq(xf),dbleq(Pr)
         close(12)
         if (xf.ne.c-b) then
            print*, 'complex'
            open(12,file='scprobc'//nichar//'.mu='//amuchar//'.dat'
     $           ,status='old',position='append')
         else
            open(12,file='scprobc'//nichar//'.mu='//amuchar//'.dat'
     $           ,status='new')
         endif
         write(12,*) dbleq(xf),dbleq(Pc)
         close(12)
         if (xf.ne.c-b) then
            print*, 'classic'
            open(12,file='scprobcl'//nichar//'.mu='//amuchar//'.dat'
     $           ,status='old',position='append')
         else
            open(12,file='scprobcl'//nichar//'.mu='//amuchar//'.dat'
     $           ,status='new')
         endif
         write(12,*) dbleq(xf),dbleq(Pcl)
         close(12)
         print*, 'Writing probabilities finished'
      enddo
-------------------------------------------------------------
amuchar and nichar are character strings of length 3 and 4, respectively.
I must add that the program doesn't crash on the opening of one specific
file.
If you have an idea or a track to follow, I'm interested!
Thank you in advance,
Jonathan D.
You have given only a fragment and not the whole code; therefore, one
may only guess the reason for the abnormal end of program. Since you
create thousands of files (OPEN(...,STATUS='NEW')) without checking
IOSTAT or providing ERR=nnn, the first thing to try would be to check
IOSTAT on new file opens, and look up the IOSTAT code for the loop
iteration in which the program aborts.
-- mecej4
Older versions of ifort had problems with creating stack temps in loops
where the temp was not removed until the end of the loop. We've done a
lot of work to address that problem and I'd guess that if you recompiled
using a current version (10.1) that you would not see it.
We also added a -heap-arrays option which causes the compiler to use the
heap instead of the stack for temps. I don't remember exactly when this
was introduced - sometime during 9.1 I think. You could also see if
that works for you.
If you need further help, see the links below.
idb gave me a hint about the problem:
--------------------------------------------------------------
Program received signal SIGSEGV
getcwd () in /lib64/libc-2.5.so
(idb) bt full
#0 0x00002b1c1d0b8c0d in getcwd () in /lib64/libc-2.5.so
No locals.
#1 0x000000000043e048 in for__compute_filename () in
/home/jon/scprob/contrib/test/scprob_muvar1.4g.x
No locals.
#2 0x000000000047b6cc in _intel_fast_memcpy.A () in
/home/jon/scprob/contrib/test/scprob_muvar1.4g.x
No locals.
#3 0x000000000043d734 in for__compute_filename () in
/home/jon/scprob/contrib/test/scprob_muvar1.4g.x
No locals.
#4 0x2e392e313d756d2e
2008-04-15 23:31:27.126039: idb : ALLO 0..0x342fb030d5fb7f40 :
mallocCnt = 127277
Internal Error: out of memory
Heap exhausted with 11214848 bytes.
Intel(R) Debugger for applications running on Intel(R) 64, Version
10.1-35 caught signal "Aborted" (6).
This is an unexpected condition and may indicate the presence of a defect.
If you wish to report this, please include the stack trace that follows.
/lib64/libc.so.6 [0x2b90e8d035b0]
/lib64/libc.so.6(gsignal+0x35) [0x2b90e8d03535]
/lib64/libc.so.6(abort+0x110) [0x2b90e8d04990]
/usr/lib64/libstdc++.so.5 [0x2b90e8a95996]
/usr/lib64/libstdc++.so.5 [0x2b90e8a959c3]
/usr/lib64/libstdc++.so.5 [0x2b90e8a959d6]
/usr/lib64/libstdc++.so.5(__cxa_call_unexpected+0x48) [0x2b90e8a953e8]
/usr/lib64/libstdc++.so.5 [0x2b90e8a95c82]
/usr/lib64/libstdc++.so.5(_Znam+0x9) [0x2b90e8a95d19]
/opt/intel/idbe/10.1.015/bin/idb-e(_ZN4Bits10initializeEmPKc+0x36)
[0xae2628]
/opt/intel/idbe/10.1.015/bin/idb-e(_ZN22LocativeAbsoluteMemory4bitsEv+0x4f)
[0xad2c0f]
/opt/intel/idbe/10.1.015/bin/idb-e(_ZN5Value4bitsEb+0x5a) [0xb80b78]
/opt/intel/idbe/10.1.015/bin/idb-e(_ZN15FrameDescriptor11printLocalsERSobbbbSs+0x7df)
[0xc4e74d]
/opt/intel/idbe/10.1.015/bin/idb-e [0xc52832]
/opt/intel/idbe/10.1.015/bin/idb-e(_ZN12CmdBacktrace5do_itER19CmdExecutionContextRN10BaseForCmd9CmdResultE+0x1cb)
[0xc51ab3]
/opt/intel/idbe/10.1.015/bin/idb-e(_ZN10BaseForCmd7executeEb+0xcbf)
[0xc05241]
/opt/intel/idbe/10.1.015/bin/idb-e [0xcafd8f]
/opt/intel/idbe/10.1.015/bin/idb-e(_Z15ProcessCommandsv+0x47) [0xcaf46f]
/opt/intel/idbe/10.1.015/bin/idb-e(_Z7idbMainiPPKcS1_+0x159) [0xcaee29]
/opt/intel/idbe/10.1.015/bin/idb-e(main+0x3c) [0xb87a98]
/lib64/libc.so.6(__libc_start_main+0xf4) [0x2b90e8cf0ae4]
/opt/intel/idbe/10.1.015/bin/idb-e(_ZStlsISt11char_traitsIcEERSt13basic_ostreamIcT_ES5_c+0x5a)
[0x7ba02a]
--------------------------------------------------------------
particularly the lines :
--------------------------------------------------------------
2008-04-15 23:31:27.126039: idb : ALLO 0..0x342fb030d5fb7f40 :
mallocCnt = 127277
Internal Error: out of memory
Heap exhausted with 11214848 bytes.
--------------------------------------------------------------
After updating to ifort 10.1 and using the option -heap-arrays, my
program now exits normally.
Thank you very much...
Jonathan D.