Yes, I meant job 38692. Sorry.
I am still having the problem. I suspect it has something to do with
the GPU configuration, since this does not happen in my partitions of
non-GPU nodes. Also, if I submit non-GPU jobs to the rtx8000 partition,
they use up all the cores on the nodes just fine.
The upshot is that on my 10-GPU nodes I never see more than 6 GPUs in
use, and jobs asking for just 1 or 2 GPUs are left waiting in the queue.
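Since I suspect the config, the select/GRES scheduler settings can be
pulled for comparison with something like the following (I can post the
full slurm.conf and gres.conf if that helps):

  scontrol show config | grep -iE 'SelectType|GresTypes'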
Here is an example. The state of the nodes in the rtx8000 partition
before I submit jobs:
rtx-04
CfgTRES=cpu=32,mem=1546000M,billing=99,gres/gpu=10
AllocTRES=cpu=15,mem=120G,gres/gpu=5
rtx-05
CfgTRES=cpu=32,mem=1546000M,billing=99,gres/gpu=10
AllocTRES=cpu=15,mem=328G,gres/gpu=5
rtx-06
CfgTRES=cpu=32,mem=1546000M,billing=99,gres/gpu=10
AllocTRES=cpu=15,mem=224G,gres/gpu=5
rtx-07
CfgTRES=cpu=32,mem=1546000M,billing=99,gres/gpu=10
AllocTRES=cpu=16,mem=232G,gres/gpu=6
rtx-08
CfgTRES=cpu=32,mem=1546000M,billing=81,gres/gpu=4
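(For reference, these per-node TRES snapshots come from a small loop
along these lines; a sketch from memory:

  for n in rtx-0{4..8}; do
      echo $n
      scontrol show node=$n | grep -E 'CfgTRES|AllocTRES'
  done
)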
I then submit 10 test jobs, each asking for 2 GPUs. The rtx8000 queue
then looks like this:
NODELIST JOBID PARTITION ST TIME_LIMIT TRES_ALLOC TRES_PER
rtx-04 40365 rtx8000 R 7-00:00:00 cpu=3,mem=24G,node=1 gpu:1
rtx-04 38676 rtx8000 R 7-00:00:00 cpu=3,mem=24G,node=1 gpu:1
rtx-04 38673 rtx8000 R 7-00:00:00 cpu=3,mem=24G,node=1 gpu:1
rtx-04 38670 rtx8000 R 7-00:00:00 cpu=3,mem=24G,node=1 gpu:1
rtx-04 38409 rtx8000 R 7-00:00:00 cpu=3,mem=24G,node=1 gpu:1
rtx-05 40214 rtx8000 R 6-10:00:00 cpu=3,mem=128G,node= gpu:1
rtx-05 38677 rtx8000 R 7-00:00:00 cpu=3,mem=24G,node=1 gpu:1
rtx-05 38674 rtx8000 R 7-00:00:00 cpu=3,mem=24G,node=1 gpu:1
rtx-05 37450 rtx8000 R 6-10:00:00 cpu=3,mem=128G,node= gpu:1
rtx-05 37278 rtx8000 R 7-00:00:00 cpu=3,mem=24G,node=1 gpu:1
rtx-06 40366 rtx8000 R 7-00:00:00 cpu=3,mem=24G,node=1 gpu:1
rtx-06 40364 rtx8000 R 6-10:00:00 cpu=3,mem=128G,node= gpu:1
rtx-06 38648 rtx8000 R 7-00:00:00 cpu=3,mem=24G,node=1 gpu:1
rtx-06 38646 rtx8000 R 7-00:00:00 cpu=3,mem=24G,node=1 gpu:1
rtx-06 37267 rtx8000 R 7-00:00:00 cpu=3,mem=24G,node=1 gpu:1
rtx-07 40760 rtx8000 R 50:00 cpu=4,mem=32G,node=1 gpu:2
rtx-07 38675 rtx8000 R 7-00:00:00 cpu=3,mem=24G,node=1 gpu:1
rtx-07 38672 rtx8000 R 7-00:00:00 cpu=3,mem=24G,node=1 gpu:1
rtx-07 38671 rtx8000 R 7-00:00:00 cpu=3,mem=24G,node=1 gpu:1
rtx-07 37451 rtx8000 R 6-10:00:00 cpu=3,mem=128G,node= gpu:1
rtx-08 40785 rtx8000 R 50:00 cpu=4,mem=32G,node=1 gpu:2
rtx-08 40786 rtx8000 R 50:00 cpu=4,mem=32G,node=1 gpu:2
(Priorit 40794 rtx8000 PD 50:00 cpu=4,mem=32G,node=1 gpu:2
(Priorit 40793 rtx8000 PD 50:00 cpu=4,mem=32G,node=1 gpu:2
(Priorit 40792 rtx8000 PD 50:00 cpu=4,mem=32G,node=1 gpu:2
(Priorit 40791 rtx8000 PD 50:00 cpu=4,mem=32G,node=1 gpu:2
(Priorit 40790 rtx8000 PD 50:00 cpu=4,mem=32G,node=1 gpu:2
(Priorit 40789 rtx8000 PD 50:00 cpu=4,mem=32G,node=1 gpu:2
(Priorit 40788 rtx8000 PD 50:00 cpu=4,mem=32G,node=1 gpu:2
(Resourc 40787 rtx8000 PD 50:00 cpu=4,mem=32G,node=1 gpu:2
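The queue listings here come from squeue with a --Format string roughly
like this (from memory, so the field widths may be slightly off):

  squeue -p rtx8000 -O 'nodelist:9,jobid:9,partition:9,statecompact:3,timelimit:11,tres-alloc:21,tres-per-job'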
[root@mlsc-head ~]# scontrol show job=40787
JobId=40787 JobName=sjob_5
UserId=raines(5829) GroupId=raines(5829) MCS_label=N/A
Priority=19836243 Nice=0 Account=sysadm QOS=normal
JobState=PENDING Reason=Resources Dependency=(null)
Requeue=1 Restarts=0 BatchFlag=1 Reboot=0 ExitCode=0:0
RunTime=00:00:00 TimeLimit=00:50:00 TimeMin=N/A
SubmitTime=2021-01-23T12:37:51 EligibleTime=2021-01-23T12:37:51
AccrueTime=2021-01-23T12:37:51
StartTime=2021-01-23T13:08:52 EndTime=2021-01-23T13:58:52 Deadline=N/A
SuspendTime=None SecsPreSuspend=0 LastSchedEval=2021-01-23T12:38:36
Partition=rtx8000 AllocNode:Sid=mlsc-head:1268664
ReqNodeList=(null) ExcNodeList=(null)
NodeList=(null) SchedNodeList=rtx-07
NumNodes=1-2 NumCPUs=4 NumTasks=1 CPUs/Task=4 ReqB:S:C:T=0:0:*:*
TRES=cpu=4,mem=32G,node=1,billing=11,gres/gpu=2
Socks/Node=* NtasksPerN:B:S:C=0:0:*:1 CoreSpec=*
MinCPUsNode=4 MinMemoryNode=32G MinTmpDiskNode=0
Features=(null) DelayBoot=00:00:00
OverSubscribe=OK Contiguous=0 Licenses=(null) Network=(null)
Command=/autofs/cluster/batch/raines/sjob_5
WorkDir=/autofs/cluster/batch/raines
StdErr=/autofs/cluster/batch/raines/sjob_5.err40787
StdIn=/dev/null
StdOut=/autofs/cluster/batch/raines/sjob_5.out40787
Power=
TresPerJob=gpu:2
MailUser=(null) MailType=NONE
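For reference, a script matching the job record above would look like
this (a sketch; the actual sjob_5 may differ):

  #!/bin/bash
  #SBATCH --partition=rtx8000
  #SBATCH --gres=gpu:2
  #SBATCH --cpus-per-task=4
  #SBATCH --mem=32G
  #SBATCH --time=50:00
  sleep 3000   # placeholder; stands in for the actual test workload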
[root@mlsc-head ~]# scontrol show node=rtx-04
NodeName=rtx-04 Arch=x86_64 CoresPerSocket=16
CPUAlloc=15 CPUTot=32 CPULoad=18.21
AvailableFeatures=intel,cascade,rtx8000
ActiveFeatures=intel,cascade,rtx8000
Gres=gpu:quadro_rtx_8000:10(S:0)
NodeAddr=rtx-04 NodeHostName=rtx-04 Version=20.02.3
OS=Linux 4.18.0-193.28.1.el8_2.x86_64 #1 SMP Thu Oct 22 00:20:22 UTC 2020
RealMemory=1546000 AllocMem=122880 FreeMem=1413061 Sockets=2 Boards=1
MemSpecLimit=2048
State=MIXED ThreadsPerCore=1 TmpDisk=6000000 Weight=1 Owner=N/A
MCS_label=N/A Partitions=rtx8000
BootTime=2020-12-29T13:40:45 SlurmdStartTime=2020-12-29T13:44:12
CfgTRES=cpu=32,mem=1546000M,billing=99,gres/gpu=10
AllocTRES=cpu=15,mem=120G,gres/gpu=5
CapWatts=n/a
CurrentWatts=0 AveWatts=0
ExtSensorsJoules=n/s ExtSensorsWatts=0 ExtSensorsTemp=n/s
The state of the nodes after those submissions:
rtx-04
CfgTRES=cpu=32,mem=1546000M,billing=99,gres/gpu=10
AllocTRES=cpu=15,mem=120G,gres/gpu=5
rtx-05
CfgTRES=cpu=32,mem=1546000M,billing=99,gres/gpu=10
AllocTRES=cpu=15,mem=328G,gres/gpu=5
rtx-06
CfgTRES=cpu=32,mem=1546000M,billing=99,gres/gpu=10
AllocTRES=cpu=15,mem=224G,gres/gpu=5
rtx-07
CfgTRES=cpu=32,mem=1546000M,billing=99,gres/gpu=10
AllocTRES=cpu=16,mem=232G,gres/gpu=6
rtx-08
CfgTRES=cpu=32,mem=1546000M,billing=81,gres/gpu=4
AllocTRES=cpu=8,mem=64G,gres/gpu=4
Now rtx-08, which has only 4 GPUs, always seems to get all 4 used.
But the others only ever get half their GPUs used (except rtx-07,
which somehow gets 6 used, so that is another weird thing).
Again, if I submit non-GPU jobs, they end up allocating all the
cores/CPUs on the nodes just fine.
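One bit of arithmetic from the AllocTRES numbers above, for what it is
worth (I am not sure it means anything): the GPU jobs on the stuck
nodes stop right around 15-16 allocated CPUs, which matches
CoresPerSocket=16, and the Gres line reports all the GPUs on socket 0
(S:0).

  # CPUs held by GPU jobs per stuck node, from the listing above
  echo $((5 * 3))      # rtx-04/05/06: five 1-GPU jobs x 3 CPUs = 15
  echo $((4 * 3 + 4))  # rtx-07: four 3-CPU jobs + one 4-CPU job = 16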
I have two nodes with RTX6000s in an rtx6000 partition, and those fill
up all their GPUs just fine:
NODELIST JOBID PARTITION ST TIME_LIMIT TRES_ALLOC TRES_PER
rtx-01 40830 rtx6000 R 50:00 cpu=4,mem=32G,node=1 gpu:2
rtx-01 40831 rtx6000 R 50:00 cpu=4,mem=32G,node=1 gpu:2
rtx-01 40833 rtx6000 R 50:00 cpu=4,mem=32G,node=1 gpu:2
rtx-01 40835 rtx6000 R 50:00 cpu=4,mem=32G,node=1 gpu:2
rtx-02 40832 rtx6000 R 50:00 cpu=4,mem=32G,node=1 gpu:2
rtx-02 40834 rtx6000 R 50:00 cpu=4,mem=32G,node=1 gpu:2
rtx-02 40836 rtx6000 R 50:00 cpu=4,mem=32G,node=1 gpu:2
(Priorit 40839 rtx6000 PD 50:00 cpu=4,mem=32G,node=1 gpu:2
(Priorit 40838 rtx6000 PD 50:00 cpu=4,mem=32G,node=1 gpu:2
(Resourc 40837 rtx6000 PD 50:00 cpu=4,mem=32G,node=1 gpu:2
rtx-01
CfgTRES=cpu=32,mem=1546000M,billing=89,gres/gpu=8
AllocTRES=cpu=16,mem=128G,gres/gpu=8
rtx-02
CfgTRES=cpu=32,mem=1546000M,billing=84,gres/gpu=6
AllocTRES=cpu=12,mem=96G,gres/gpu=6
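In case the partition definitions themselves matter, they can be
dumped for comparison with:

  scontrol show partition=rtx8000
  scontrol show partition=rtx6000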
So maybe it is something odd about those 7-day jobs already running
on the rtx8000 boxes. Here are two examples:
[root@mlsc-head ~]# scontrol show job=40365
JobId=40365 JobName=unet_1
UserId=mu40(4181545) GroupId=mu40(4181545) MCS_label=N/A
Priority=8813 Nice=0 Account=lcn QOS=normal
JobState=RUNNING Reason=None Dependency=(null)
Requeue=1 Restarts=0 BatchFlag=1 Reboot=0 ExitCode=0:0
RunTime=01:50:42 TimeLimit=7-00:00:00 TimeMin=N/A
SubmitTime=2021-01-23T10:56:01 EligibleTime=2021-01-23T10:56:01
AccrueTime=2021-01-23T10:56:01
StartTime=2021-01-23T10:56:02 EndTime=2021-01-30T10:56:02 Deadline=N/A
SuspendTime=None SecsPreSuspend=0 LastSchedEval=2021-01-23T10:56:02
Partition=rtx8000 AllocNode:Sid=mlsc-head:1266838
ReqNodeList=(null) ExcNodeList=(null)
NodeList=rtx-04
BatchHost=rtx-04
NumNodes=1 NumCPUs=3 NumTasks=1 CPUs/Task=3 ReqB:S:C:T=0:0:*:*
TRES=cpu=3,mem=24G,node=1,billing=7,gres/gpu=1
Socks/Node=* NtasksPerN:B:S:C=0:0:*:1 CoreSpec=*
MinCPUsNode=3 MinMemoryNode=24G MinTmpDiskNode=0
Features=(null) DelayBoot=00:00:00
OverSubscribe=OK Contiguous=0 Licenses=(null) Network=(null)
Command=(null)
WorkDir=/homes/9/mu40/l/ge
StdErr=/homes/9/mu40/l/jobs/mlsc-login.40365.log
StdIn=/dev/null
StdOut=/homes/9/mu40/l/jobs/mlsc-login.40365.log
Power=
TresPerJob=gpu:1
MailUser=mu40 MailType=FAIL
[root@mlsc-head ~]# scontrol show job=38676
JobId=38676 JobName=int_sos
UserId=mu40(4181545) GroupId=mu40(4181545) MCS_label=N/A
Priority=96466 Nice=0 Account=lcn QOS=normal
JobState=RUNNING Reason=None Dependency=(null)
Requeue=0 Restarts=0 BatchFlag=1 Reboot=0 ExitCode=0:0
RunTime=2-02:04:59 TimeLimit=7-00:00:00 TimeMin=N/A
SubmitTime=2021-01-21T10:42:01 EligibleTime=2021-01-21T10:42:01
AccrueTime=2021-01-21T10:42:01
StartTime=2021-01-21T10:42:01 EndTime=2021-01-28T10:42:01 Deadline=N/A
SuspendTime=None SecsPreSuspend=0 LastSchedEval=2021-01-21T10:42:01
Partition=rtx8000 AllocNode:Sid=mlsc-head:965521
ReqNodeList=(null) ExcNodeList=(null)
NodeList=rtx-04
BatchHost=rtx-04
NumNodes=1 NumCPUs=3 NumTasks=1 CPUs/Task=3 ReqB:S:C:T=0:0:*:*
TRES=cpu=3,mem=24G,node=1,billing=7,gres/gpu=1
Socks/Node=* NtasksPerN:B:S:C=0:0:*:1 CoreSpec=*
MinCPUsNode=3 MinMemoryNode=24G MinTmpDiskNode=0
Features=(null) DelayBoot=00:00:00
OverSubscribe=OK Contiguous=0 Licenses=(null) Network=(null)
Command=(null)
WorkDir=/homes/9/mu40/l/ge
StdErr=/homes/9/mu40/l/jobs/mlsc-login.38676.log
StdIn=/dev/null
StdOut=/homes/9/mu40/l/jobs/mlsc-login.38676.log
Power=
TresPerJob=gpu:1
MailUser=mu40 MailType=FAIL
I don't see anything obvious here. Is it maybe the 7-day time limit?
But if I submit my jobs with a 7-day limit to the rtx6000 partition,
I don't see the problem.
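That test was just resubmitting the same 2-GPU job with a 7-day limit,
something like:

  sbatch -p rtx6000 --gres=gpu:2 --cpus-per-task=4 --mem=32G --time=7-00:00:00 sjob_5

where the command-line options override the script's own #SBATCH
directives.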
-- Paul Raines (http://help.nmr.mgh.harvard.edu)