If we request 2 GPU's in the job and start two jobs, both jobs will start on the same node fully allocating the node. We are puzzled about is going on and any hints are welcome.
# Example slurm.conf file. Please run configurator.html
# (in doc/html) to build a configuration file customized
# for your environment.
#
#
# slurm.conf file generated by configurator.html.
#
# See the slurm.conf man page for more information.
#
ClusterName=cluster
ControlAddr=10.116.0.11
BackupAddr=10.116.0.17
#
SlurmUser=slurm
#SlurmdUser=root
SlurmctldPort=6817
SlurmdPort=6818
SchedulerPort=7321
RebootProgram="/usr/sbin/reboot"
AuthType=auth/munge
#JobCredentialPrivateKey=
#JobCredentialPublicCertificate=
StateSaveLocation=/var/spool/slurm/ctld
SlurmdSpoolDir=/var/spool/slurm/d
SwitchType=switch/none
MpiDefault=none
SlurmctldPidFile=/var/run/slurmctld.pid
SlurmdPidFile=/var/run/slurmd.pid
ProctrackType=proctrack/pgid
GresTypes=gpu,mps,bandwidth
PrologFlags=x11
#PluginDir=
#FirstJobId=
#MaxJobCount=
#PlugStackConfig=
#PropagatePrioProcess=
#PropagateResourceLimits=
#PropagateResourceLimitsExcept=
#Prolog=
#Epilog=/etc/slurm/slurm.epilog.clean
#SrunProlog=
#SrunEpilog=
#TaskProlog=
#TaskEpilog=
#TaskPlugin=
#TrackWCKey=no
#TreeWidth=50
#TmpFS=
#UsePAM=
#
# TIMERS
SlurmctldTimeout=300
SlurmdTimeout=300
InactiveLimit=0
MinJobAge=300
KillWait=30
Waittime=0
#
# SCHEDULING
SchedulerType=sched/backfill
#bf_interval=10
#SchedulerAuth=
#SelectType=select/linear
# Cores and memory are consumable
#SelectType=select/cons_res
#SelectTypeParameters=CR_Core_Memory
SchedulerParameters=bf_interval=10
SelectType=select/cons_res
SelectTypeParameters=CR_Core
FastSchedule=1
#PriorityType=priority/multifactor
#PriorityDecayHalfLife=14-0
#PriorityUsageResetPeriod=14-0
#PriorityWeightFairshare=100000
#PriorityWeightAge=1000
#PriorityWeightPartition=10000
#PriorityWeightJobSize=1000
#PriorityMaxAge=1-0
#
# LOGGING
SlurmctldDebug=3
SlurmctldLogFile=/var/log/slurmctld.log
SlurmdDebug=3
SlurmdLogFile=/var/log/slurmd.log
JobCompType=jobcomp/none
#JobCompLoc=
#
# ACCOUNTING
#JobAcctGatherType=jobacct_gather/linux
#JobAcctGatherFrequency=30
#
#AccountingStorageType=accounting_storage/slurmdbd
#AccountingStorageHost=
#AccountingStorageLoc=
#AccountingStoragePass=
#AccountingStorageUser=
#
#
#
# Default values
# DefMemPerNode=64000
# DefCpuPerGPU=4
# DefMemPerCPU=4000
# DefMemPerGPU=16000
# OpenHPC default configuration
#TaskPlugin=task/affinity
TaskPlugin=task/affinity,task/cgroup
PropagateResourceLimitsExcept=MEMLOCK
TaskPluginParam=autobind=cores
#AccountingStorageType=accounting_storage/mysql
#StorageLoc=slurm_acct_db
AccountingStorageType=accounting_storage/slurmdbd
#AccountingStorageType=accounting_storage/filetxt
Epilog=/etc/slurm/slurm.epilog.clean
#PartitionName=normal Nodes=c[1-5] Default=YES MaxTime=24:00:00 State=UP
PartitionName=DEFAULT State=UP Default=NO AllowGroups=ALL Priority=10 DisableRootJobs=NO RootOnly=NO Hidden=NO Shared=NO GraceTime=0 PreemptMode=OFF ReqResv=NO AllowAccounts=ALL AllowQos=ALL LLN=NO ExclusiveUser=NO Nodes=nodeamd[009-016],c[1-4],nodehtc[001-025]
# Partitions
# Group Limited Queues
# OIT DEBUG QUEUE
PartitionName=debug Nodes=c[1-4] MaxTime=24:00:00 State=UP AllowGroups=oit-hpc-admin
# RNA CHEM
PartitionName=longq7-rna MinNodes=1 MaxNodes=30 DefaultTime=168:00:00 MaxTime=UNLIMITED Priority=200 Nodes=nodeamd[001-008],nodegpu[021-025] AllowGroups=gpu-rnachem
# V100's
PartitionName=longq7-mri MinNodes=1 MaxNodes=30 DefaultTime=168:00:00 MaxTime=168:00:00 Priority=200 Nodes=nodenviv100[001-016] AllowGroups=gpu-mri
# BIGDATA GRANT
PartitionName=longq-bigdata7 MinNodes=1 MaxNodes=30 DefaultTime=168:00:00 MaxTime=168:00:00 Priority=200 Nodes=node[087-098],nodegpu001 AllowGroups=fau-bigdata,nsf-bigdata
PartitionName=gpu-bigdata7 Default=NO MinNodes=1 Priority=10 AllowAccounts=ALL Nodes=nodegpu001 AllowGroups=fau-bigdata,nsf-bigdata
# CogNeuroLab
PartitionName=CogNeuroLab Default=NO MinNodes=1 MaxNodes=4 MaxTime=7-12:00:00 AllowGroups=cogneurolab Priority=200 State=UP Nodes=node[001-004]
# Standard queues
# OPEN TO ALL
#Short Queue
PartitionName=shortq7 MinNodes=1 MaxNodes=30 DefaultTime=06:00:00 MaxTime=06:00:00 Priority=100 Nodes=nodeamd[001-016],nodenviv100[001-015],nodegpu[001-025],node[001-100],nodehtc[001-025] Default=YES
# Medium Queue
PartitionName=mediumq7 MinNodes=1 MaxNodes=30 DefaultTime=72:00:00 MaxTime=72:00:00 Priority=50 Nodes=nodeamd[009-016],node[004-100]
# Long Queue
PartitionName=longq7 MinNodes=1 MaxNodes=30 DefaultTime=168:00:00 MaxTime=168:00:00 Priority=30 Nodes=nodeamd[009-016],node[004-100]
# Interactive
PartitionName=interactive MinNodes=1 MaxNodes=4 DefaultTime=06:00:00 MaxTime=06:00:00 Priority=101 Nodes=node[001-100] Default=No Hidden=YES
# Nodes
# Test nodes, (vms)
NodeName=c[1-4] Cpus=4 Feature=virtual RealMemory=16000
# AMD Nodes
NodeName=nodeamd[001-016] Procs=64 Boards=1 SocketsPerBoard=8 CoresPerSocket=8 ThreadsPerCore=1 Features=amd,epyc RealMemory=225436
# V100 MRI
NodeName=nodenviv100[001-016] CPUs=64 Boards=1 SocketsPerBoard=2 CoresPerSocket=16 ThreadsPerCore=2 Gres=gpu:v100:4 Feature=v100 RealMemory=192006
# GPU nodes
NodeName=nodegpu001 Procs=40 Boards=1 SocketsPerBoard=2 CoresPerSocket=10 ThreadsPerCore=2 Gres=gpu:k80:8 Feature=k80,intel RealMemory=64000
NodeName=nodegpu002 Procs=40 Boards=1 SocketsPerBoard=2 CoresPerSocket=10 ThreadsPerCore=2 Gres=gpu:gk1:8 Feature=gk1,intel RealMemory=128000
NodeName=nodegpu[003-020] Boards=1 SocketsPerBoard=2 CoresPerSocket=8 ThreadsPerCore=2 Gres=gpu:gk1:8 Feature=gk1,intel RealMemory=128000
NodeName=nodegpu[021-025] Procs=16 Boards=1 SocketsPerBoard=2 CoresPerSocket=8 ThreadsPerCore=1 Gres=gpu:4 Feature=exxact,intel RealMemory=128000
# IvyBridge nodes
NodeName=node[001-021] Procs=20 Boards=1 SocketsPerBoard=2 CoresPerSocket=10 ThreadsPerCore=1 Feature=intel,ivybridge RealMemory=112750
# SandyBridge node(2)
NodeName=node022 Procs=16 Boards=1 SocketsPerBoard=2 CoresPerSocket=8 ThreadsPerCore=1 Feature=intel,sandybridge RealMemory=64000
# IvyBridge
NodeName=node[023-050] Procs=20 Boards=1 SocketsPerBoard=2 CoresPerSocket=10 ThreadsPerCore=1 Feature=intel,ivybridge RealMemory=112750
# Haswell
NodeName=node[051-100] Procs=20 Boards=1 SocketsPerBoard=2 CoresPerSocket=10 ThreadsPerCore=1 Feature=intel,haswell RealMemory=112750
# Node health monitoring
HealthCheckProgram=/usr/sbin/nhc
HealthCheckInterval=300
ReturnToService=2
# Fix for X11 issues
X11Parameters=use_raw_hostname