Google Groups no longer supports new Usenet posts or subscriptions. Historical content remains viewable.
Dismiss

PVM 3.2.6 problem on SP1 : messages sent to wrong processes

0 views
Skip to first unread message

Francois PELLEGRINI

unread,
Aug 27, 1994, 4:51:07 PM8/27/94
to

Hello.

I have a rather strange problem when running the public-domain
PVM 3.2.6 on an IBM SP1 machine. I have compiled PVM on an IBM 990 machine,
and the PV machine is made of the nodes of the SP1.
As far as I can see, it seems that processes receive messages that are
not sent to them.

To illustrate this erroneous behavior, I have put below a simple
Fortran + PVM program, as well as its resulting log files. The goal of the
program is to perform data exchange between pair of processes, e.g.
processes 1<-->2, 3<-->4, and so on. The messages contain the number of
their sender, and the message tags are set to 600 plus this number.
The received messages are coherent, but sometimes they are received by
the wrong process.

I have thought that I may have made a Fortran mistake which would
result in garbling PVM, but I have been unable to see it. Either it's too
big and I am so shortsighted that I do not see it, or there is something
strange happening...

Any help would be strongly appreciated.

Thanks in advance.

f.p.
##
## Here is the Fortran program
##
PROGRAM PRINCIPAL
IMPLICIT NONE
INCLUDE 'fpvm3.h'
INTEGER COMTIDS(1024)
INTEGER COMNBR
INTEGER COMNUM,COMTID
INTEGER COMPEER
INTEGER RCVBUF,RCVSIZ,RCVTAG,RCVTID,RCVVAL
INTEGER COMREP
INTEGER I

PRINT *,'X *** HELLO ***'

CALL PVMFMYTID (COMTID)
IF ( COMTID.LT.0 ) THEN
PRINT *,'X --- pvmd unreachable'
STOP
END IF
CALL PVMFJOINGROUP ('COMGROUP',COMNUM)
IF ( COMNUM.LT.0 ) THEN
PRINT *,'X --- Cannot access group',COMNUM
CALL PVMFEXIT (COMREP)
STOP
END IF
COMNUM = COMNUM + 1

IF ( COMNUM.EQ.1 ) THEN

C We are the initial process

100 PRINT *,'1 Enter number of processes (multiple of 2) '
READ (*,*) COMNBR
IF ( COMNBR.LT.1 ) THEN
print*,'1 Incorrect number of processes. Retry.'
GOTO 100
END IF

C Launch the other processes

COMTIDS(1) = COMTID
CALL PVMFSPAWN ('testpvm',PVMDEFAULT,'*',
& COMNBR-1,COMTIDS(2),COMREP)
IF ( COMREP.LT.(COMNBR-1) ) THEN
PRINT *,'1 --- Cannot launch',COMREP
CALL PVMFEXIT (COMREP)
STOP
END IF

C Broadcast the TID array

CALL PVMFINITSEND (PVMDEFAULT,COMREP)
CALL PVMFPACK (INTEGER4,COMNBR,1,1,COMREP)
CALL PVMFPACK (INTEGER4,COMTIDS,COMNBR,1,COMREP)
CALL PVMFMCAST (COMNBR-1,COMTIDS(2),0,COMREP)
ELSE

C We are the launched processes

CALL PVMFGETTID ('COMGROUP',0,COMTIDS(1))
CALL PVMFRECV (COMTIDS(1),0,COMREP)
CALL PVMFUNPACK (INTEGER4,COMNBR,1,1,COMREP)
CALL PVMFUNPACK (INTEGER4,COMTIDS,COMNBR,1,COMREP)
END IF

PRINT *,COMNUM,'*** TID LIST ***'
DO I=1,COMNBR
PRINT *,COMNUM,' COMNBRC=',I,' TID=',COMTIDS(I)
END DO
PRINT *,COMNUM,'*** BEFORE TEST ***'

CALL PVMFBARRIER ('COMGROUP',COMNBR,COMREP)

IF ( (COMNUM / 2).NE.((COMNUM - 1) / 2) ) THEN

C Even instance number (2, 4, 6, ...)

COMPEER = COMNUM - 1

CALL PVMFINITSEND (PVMDEFAULT,COMREP)
CALL PVMFPACK (INTEGER4,COMNUM,1,1,COMREP)
CALL PVMFSEND (COMTIDS(COMPEER),600+COMNUM,COMREP)
CALL PVMFRECV (-1,-1,RCVBUF)
CALL PVMFBUFINFO (RCVBUF,RCVSIZ,RCVTAG,RCVTID,COMREP)
CALL PVMFUNPACK (INTEGER4,RCVVAL,1,1,COMREP)
ELSE

C Odd instance number (1, 3, 5, ...)

COMPEER = COMNUM + 1

CALL PVMFRECV (-1,-1,RCVBUF)
CALL PVMFBUFINFO (RCVBUF,RCVSIZ,RCVTAG,RCVTID,COMREP)
CALL PVMFUNPACK (INTEGER4,RCVVAL,1,1,COMREP)
CALL PVMFINITSEND (PVMDEFAULT,COMREP)
CALL PVMFPACK (INTEGER4,COMNUM,1,1,COMREP)
CALL PVMFSEND (COMTIDS(COMPEER),600+COMNUM,COMREP)
END IF

IF ( RCVBUF.LT.1 ) THEN
PRINT *,COMNUM,'--- Bad receive buffer'
END IF

PRINT *,COMNUM,' COMPEER=',COMPEER,' RCVTAG=',RCVTAG,
& ' RCVVAL=',RCVVAL

IF ( RCVTAG.NE.(RCVVAL+600) ) THEN
PRINT *,COMNUM,'--- Incoherent message',
& RCVTAG,RCVVAL
END IF
IF ( COMPEER.NE.RCVVAL ) THEN
PRINT *,COMNUM,'--- Misreceived message (1)',
& COMPEER,RCVVAL
END IF
IF ( COMTIDS(COMPEER).NE.RCVTID ) THEN
PRINT *,COMNUM,'--- Misreceived message (2)',
& COMTIDS(COMPEER),RCVTID
END IF

CALL PVMFBARRIER ('COMGROUP',COMNBR,COMREP)

PRINT *,COMNUM,'*** AFTER TEST ***'

CALL PVMFLVGROUP ('COMGROUP',COMREP)
CALL PVMFEXIT (COMREP)

END
##
## The resulting text console output
##

./testpvm
X *** HELLO ***
1 Enter number of processes (multiple of 2)
8
1 *** TID LIST ***
1 COMNBRC= 1 TID= 262147
1 COMNBRC= 2 TID= 1048577
1 COMNBRC= 3 TID= 1310721
1 COMNBRC= 4 TID= 1572865
1 COMNBRC= 5 TID= 1835009
1 COMNBRC= 6 TID= 2097153
1 COMNBRC= 7 TID= 2359297
1 COMNBRC= 8 TID= 2621441
1 *** BEFORE TEST ***
1 COMPEER= 2 RCVTAG= 602 RCVVAL= 2
1 --- Misreceived message (2) 1048577 2097153

##
## The resulting PVMD log file
##

[t80040000] ready Fri Aug 26 14:30:51 1994
[t80040000] dm_addhost() already adding new hosts, oops
[t80040000] dm_addhost() already adding new hosts, oops
[t80040000] dm_addhost() already adding new hosts, oops
[t80040000] dm_addhost() already adding new hosts, oops
[t80040000] [t100001] X *** HELLO ***
[t80040000] [t200001] X *** HELLO ***
[t80040000] [t1c0001] X *** HELLO ***
[t80040000] [t140001] X *** HELLO ***
[t80040000] [t240001] X *** HELLO ***
[t80040000] [t280001] X *** HELLO ***
[t80040000] [t180001] X *** HELLO ***
[t80040000] [t200001] 2 *** TID LIST ***
[t80040000] [t200001] 2 COMNBRC= 1 TID= 262147
[t80040000] [t100001] 3 *** TID LIST ***
[t80040000] [t100001] 3 COMNBRC= 1 TID= 262147
[t80040000] [t200001] 2 COMNBRC= 2 TID= 1048577
[t80040000] [t100001] 3 COMNBRC= 2 TID= 1048577
[t80040000] [t200001] 2 COMNBRC= 3 TID= 1310721
[t80040000] [t100001] 3 COMNBRC= 3 TID= 1310721
[t80040000] [t200001] 2 COMNBRC= 4 TID= 1572865
[t80040000] [t100001] 3 COMNBRC= 4 TID= 1572865
[t80040000] [t200001] 2 COMNBRC= 5 TID= 1835009
[t80040000] [t100001] 3 COMNBRC= 5 TID= 1835009
[t80040000] [t200001] 2 COMNBRC= 6 TID= 2097153
[t80040000] [t100001] 3 COMNBRC= 6 TID= 2097153
[t80040000] [t200001] 2 COMNBRC= 7 TID= 2359297
[t80040000] [t140001] 4 *** TID LIST ***
[t80040000] [t140001] 4 COMNBRC= 1 TID= 262147
[t80040000] [t140001] 4 COMNBRC= 2 TID= 1048577
[t80040000] [t140001] 4 COMNBRC= 3 TID= 1310721
[t80040000] [t140001] 4 COMNBRC= 4 TID= 1572865
[t80040000] [t140001] 4 COMNBRC= 5 TID= 1835009
[t80040000] [t140001] 4 COMNBRC= 6 TID= 2097153
[t80040000] [t140001] 4 COMNBRC= 7 TID= 2359297
[t80040000] [t140001] 4 COMNBRC= 8 TID= 2621441
[t80040000] [t140001] 4 *** BEFORE TEST ***
[t80040000] [t100001] 3 COMNBRC= 7 TID= 2359297
[t80040000] [t200001] 2 COMNBRC= 8 TID= 2621441
[t80040000] [t100001] 3 COMNBRC= 8 TID= 2621441
[t80040000] [t200001] 2 *** BEFORE TEST ***
[t80040000] [t100001] 3 *** BEFORE TEST ***
[t80040000] [t180001] 6 *** TID LIST ***
[t80040000] [t180001] 6 COMNBRC= 1 TID= 262147
[t80040000] [t1c0001] 5 *** TID LIST ***
[t80040000] [t1c0001] 5 COMNBRC= 1 TID= 262147
[t80040000] [t1c0001] 5 COMNBRC= 2 TID= 1048577
[t80040000] [t1c0001] 5 COMNBRC= 3 TID= 1310721
[t80040000] [t1c0001] 5 COMNBRC= 4 TID= 1572865
[t80040000] [t1c0001] 5 COMNBRC= 5 TID= 1835009
[t80040000] [t1c0001] 5 COMNBRC= 6 TID= 2097153
[t80040000] [t1c0001] 5 COMNBRC= 7 TID= 2359297
[t80040000] [t1c0001] 5 COMNBRC= 8 TID= 2621441
[t80040000] [t1c0001] 5 *** BEFORE TEST ***
[t80040000] [t180001] 6 COMNBRC= 2 TID= 1048577
[t80040000] [t240001] 7 *** TID LIST ***
[t80040000] [t240001] 7 COMNBRC= 1 TID= 262147
[t80040000] [t240001] 7 COMNBRC= 2 TID= 1048577
[t80040000] [t240001] 7 COMNBRC= 3 TID= 1310721
[t80040000] [t240001] 7 COMNBRC= 4 TID= 1572865
[t80040000] [t240001] 7 COMNBRC= 5 TID= 1835009
[t80040000] [t240001] 7 COMNBRC= 6 TID= 2097153
[t80040000] [t240001] 7 COMNBRC= 7 TID= 2359297
[t80040000] [t240001] 7 COMNBRC= 8 TID= 2621441
[t80040000] [t240001] 7 *** BEFORE TEST ***
[t80040000] [t180001] 6 COMNBRC= 3 TID= 1310721
[t80040000] [t180001] 6 COMNBRC= 4 TID= 1572865
[t80040000] [t180001] 6 COMNBRC= 5 TID= 1835009
[t80040000] [t180001] 6 COMNBRC= 6 TID= 2097153
[t80040000] [t180001] 6 COMNBRC= 7 TID= 2359297
[t80040000] [t180001] 6 COMNBRC= 8 TID= 2621441
[t80040000] [t180001] 6 *** BEFORE TEST ***
[t80040000] [t280001] 8 *** TID LIST ***
[t80040000] [t280001] 8 COMNBRC= 1 TID= 262147
[t80040000] [t280001] 8 COMNBRC= 2 TID= 1048577
[t80040000] [t280001] 8 COMNBRC= 3 TID= 1310721
[t80040000] [t280001] 8 COMNBRC= 4 TID= 1572865
[t80040000] [t280001] 8 COMNBRC= 5 TID= 1835009
[t80040000] [t280001] 8 COMNBRC= 6 TID= 2097153
[t80040000] [t280001] 8 COMNBRC= 7 TID= 2359297
[t80040000] [t280001] 8 COMNBRC= 8 TID= 2621441
[t80040000] [t280001] 8 *** BEFORE TEST ***
[t80040000] [t240001] 7 COMPEER= 8 RCVTAG= 608 RCVVAL= 8
[t80040000] [t1c0001] 5 COMPEER= 6 RCVTAG= 606 RCVVAL= 6
[t80040000] [t1c0001] 5 --- Misreceived message (2) 2097153 1572865
[t80040000] [t100001] 3 COMPEER= 4 RCVTAG= 601 RCVVAL= 1
[t80040000] [t100001] 3 --- Misreceived message (1) 4 1
[t80040000] [t100001] 3 --- Misreceived message (2) 1572865 262147
[t80040000] [t140001] 4 COMPEER= 3 RCVTAG= 604 RCVVAL= 4
[t80040000] [t140001] 4 --- Misreceived message (1) 3 4
[t80040000] [t280001] 8 COMPEER= 7 RCVTAG= 607 RCVVAL= 7
[t80040000] [t200001] 2 COMPEER= 1 RCVTAG= 605 RCVVAL= 5
[t80040000] [t200001] 2 --- Misreceived message (1) 1 5
[t80040000] [t200001] 2 --- Misreceived message (2) 262147 1835009
[t80040000] [t180001] 6 COMPEER= 5 RCVTAG= 603 RCVVAL= 3
[t80040000] [t180001] 6 --- Misreceived message (1) 5 3
[t80040000] [t180001] 6 --- Misreceived message (2) 1835009 1048577
[t80040000] [t280001] 8 *** AFTER TEST ***
[t80040000] [t240001] 7 *** AFTER TEST ***
[t80040000] [t200001] 2 *** AFTER TEST ***
[t80040000] [t1c0001] 5 *** AFTER TEST ***
[t80040000] [t180001] 6 *** AFTER TEST ***
[t80040000] [t140001] 4 *** AFTER TEST ***
[t80040000] [t100001] 3 *** AFTER TEST ***

###
pele...@labri.u-bordeaux.fr

0 new messages