scontrol output for the pending job (JobId=283):
JobId=283 JobName=cryosparc_P2_J214
UserId=cryosparc(1003) GroupId=cryosparc(1003) MCS_label=N/A
Priority=4294901572 Nice=0 Account=(null) QOS=normal
JobState=PENDING Reason=ReqNodeNotAvail,_UnavailableNodes:node04 Dependency=(null)
Requeue=1 Restarts=0 BatchFlag=1 Reboot=0 ExitCode=0:0
RunTime=00:00:00 TimeLimit=UNLIMITED TimeMin=N/A
SubmitTime=2021-08-20T20:55:00 EligibleTime=2021-08-20T20:55:00
AccrueTime=2021-08-20T20:55:00
StartTime=Unknown EndTime=Unknown Deadline=N/A
SuspendTime=None SecsPreSuspend=0 LastSchedEval=2021-08-20T23:36:14
Partition=CSCluster AllocNode:Sid=headnode:108964
ReqNodeList=(null) ExcNodeList=(null)
NodeList=(null)
NumNodes=1 NumCPUs=4 NumTasks=4 CPUs/Task=1 ReqB:S:C:T=0:0:*:*
TRES=cpu=4,mem=24000M,node=1,billing=4
Socks/Node=* NtasksPerN:B:S:C=0:0:*:* CoreSpec=*
MinCPUsNode=1 MinMemoryNode=24000M MinTmpDiskNode=0
Features=(null) DelayBoot=00:00:00
OverSubscribe=NO Contiguous=0 Licenses=(null) Network=(null)
Command=/data/backups/takeda2/data/cryosparc_projects/P8/J214/queue_sub_script.sh
WorkDir=/ssd/CryoSparc/cryosparc_master
StdErr=/data/backups/takeda2/data/cryosparc_projects/P8/J214/job.log
StdIn=/dev/null
StdOut=/data/backups/takeda2/data/cryosparc_projects/P8/J214/job.log
Power=
TresPerNode=gpu:1
MailUser=cryosparc MailType=NONE
Script:
#SBATCH --job-name cryosparc_P2_J214
#SBATCH -n 4
#SBATCH --gres=gpu:1
#SBATCH -p CSCluster
#SBATCH --mem=24000MB
#SBATCH --output=/data/backups/takeda2/data/cryosparc_projects/P8/J214/job.log
#SBATCH --error=/data/backups/takeda2/data/cryosparc_projects/P8/J214/job.log
available_devs=""
for devidx in $(seq 0 15);
do
    if [[ -z $(nvidia-smi -i $devidx --query-compute-apps=pid --format=csv,noheader) ]] ; then
        if [[ -z "$available_devs" ]] ; then
            available_devs=$devidx
        else
            available_devs=$available_devs,$devidx
        fi
    fi
done
export CUDA_VISIBLE_DEVICES=$available_devs
/ssd/CryoSparc/cryosparc_worker/bin/cryosparcw run --project P2 --job J214 --master_hostname headnode.cm.cluster --master_command_core_port 39002 > /data/backups/takeda2/data/cryosparc_projects/P8/J214/job.log 2>&1
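For reference, the device check this generated script relies on can be run by hand on a compute node to see which GPUs it would treat as free (a minimal sketch, assuming nvidia-smi is on the PATH and indices 0-3 exist on that node):

for devidx in $(seq 0 3); do
    # An empty query result means no compute process is currently bound to this GPU
    if [[ -z $(nvidia-smi -i $devidx --query-compute-apps=pid --format=csv,noheader) ]]; then
        echo "GPU $devidx looks free"
    else
        echo "GPU $devidx is busy"
    fi
done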
slurm.conf (autogenerated node and partition section):
# This section of this file was automatically generated by cmd. Do not edit manually!
# BEGIN AUTOGENERATED SECTION -- DO NOT REMOVE
# Server nodes
SlurmctldHost=headnode
AccountingStorageHost=master
#############################################################################################
#GPU Nodes
#############################################################################################
NodeName=node[02-04] Procs=64 CoresPerSocket=16 RealMemory=257024 Sockets=2 ThreadsPerCore=2 Feature=RTX6000 Gres=gpu:4
NodeName=node01 Procs=64 CoresPerSocket=16 RealMemory=386048 Sockets=2 ThreadsPerCore=2 Feature=RTX3090 Gres=gpu:4
#NodeName=node[05-08] Procs=8 Gres=gpu:4
#
#############################################################################################
# Partitions
#############################################################################################
PartitionName=defq Default=YES MinNodes=1 DefaultTime=UNLIMITED MaxTime=UNLIMITED AllowGroups=ALL PriorityJobFactor=1 PriorityTier=1 OverSubscribe=NO PreemptMode=OFF AllowAccounts=ALL AllowQos=ALL Nodes=node[01-04]
PartitionName=CSLive MinNodes=1 DefaultTime=UNLIMITED MaxTime=UNLIMITED AllowGroups=ALL PriorityJobFactor=1 PriorityTier=1 OverSubscribe=NO PreemptMode=OFF AllowAccounts=ALL AllowQos=ALL Nodes=node01
PartitionName=CSCluster MinNodes=1 DefaultTime=UNLIMITED MaxTime=UNLIMITED AllowGroups=ALL PriorityJobFactor=1 PriorityTier=1 OverSubscribe=NO PreemptMode=OFF AllowAccounts=ALL AllowQos=ALL Nodes=node[02-04]
ClusterName=slurm
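To check scheduling on the CSCluster partition independently of cryoSPARC, a bare-bones submission mirroring the same request can be used (a sketch; the job name and the nvidia-smi payload are just placeholders):

#!/bin/bash
#SBATCH --job-name=gpu_smoke_test
#SBATCH -p CSCluster
#SBATCH -n 4
#SBATCH --gres=gpu:1
#SBATCH --mem=24000MB
nvidia-smi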
gres.conf:
# This section of this file was automatically generated by cmd. Do not edit manually!
# BEGIN AUTOGENERATED SECTION -- DO NOT REMOVE
AutoDetect=NVML
# END AUTOGENERATED SECTION -- DO NOT REMOVE
#Name=gpu File=/dev/nvidia[0-3] Count=4
#Name=mic Count=0
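If NVML autodetection were ever suspect on a node, the explicit per-device form (already shown commented out above) could be used instead, e.g. (a sketch assuming four GPUs at /dev/nvidia0-3 on every GPU node):

NodeName=node[01-04] Name=gpu File=/dev/nvidia[0-3]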
Sinfo:
PARTITION AVAIL TIMELIMIT NODES STATE NODELIST
defq* up infinite 1 down* node04
defq* up infinite 3 idle node[01-03]
CSLive up infinite 1 idle node01
CSCluster up infinite 1 down* node04
CSCluster up infinite 2 idle node[02-03]
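node04 is the node the pending job is waiting on (Reason=ReqNodeNotAvail). Once the node itself is healthy, it can be returned to service from the head node (a sketch; requires root or the SlurmUser):

scontrol show node node04          # the Reason field says why it was marked down
scontrol update NodeName=node04 State=RESUME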
scontrol show node for node01:
NodeName=node01 Arch=x86_64 CoresPerSocket=16
CPUAlloc=0 CPUTot=64 CPULoad=0.04
AvailableFeatures=RTX3090
ActiveFeatures=RTX3090
Gres=gpu:4
NodeAddr=node01 NodeHostName=node01 Version=20.02.6
OS=Linux 3.10.0-1160.11.1.el7.x86_64 #1 SMP Fri Dec 18 16:34:56 UTC 2020
RealMemory=386048 AllocMem=0 FreeMem=16665 Sockets=2 Boards=1
State=IDLE ThreadsPerCore=2 TmpDisk=0 Weight=1 Owner=N/A MCS_label=N/A
Partitions=defq,CSLive
BootTime=2021-08-04T13:59:08 SlurmdStartTime=2021-08-10T09:32:43
CfgTRES=cpu=64,mem=377G,billing=64
AllocTRES=
CapWatts=n/a
CurrentWatts=0 AveWatts=0
ExtSensorsJoules=n/s ExtSensorsWatts=0 ExtSensorsTemp=n/s
scontrol show node for node02 (node03 is configured the same):
NodeName=node02 Arch=x86_64 CoresPerSocket=16
CPUAlloc=0 CPUTot=64 CPULoad=0.48
AvailableFeatures=RTX6000
ActiveFeatures=RTX6000
Gres=gpu:4(S:0-1)
NodeAddr=node02 NodeHostName=node02 Version=20.02.6
OS=Linux 3.10.0-1160.11.1.el7.x86_64 #1 SMP Fri Dec 18 16:34:56 UTC 2020
RealMemory=257024 AllocMem=0 FreeMem=2259 Sockets=2 Boards=1
State=IDLE ThreadsPerCore=2 TmpDisk=0 Weight=1 Owner=N/A MCS_label=N/A
Partitions=defq,CSCluster
BootTime=2021-07-29T20:47:32 SlurmdStartTime=2021-08-10T09:32:55
CfgTRES=cpu=64,mem=251G,billing=64
AllocTRES=
CapWatts=n/a
CurrentWatts=0 AveWatts=0
ExtSensorsJoules=n/s ExtSensorsWatts=0 ExtSensorsTemp=n/s
slurm.conf (remainder of the file):
#
# See the slurm.conf man page for more information.
#
SlurmUser=slurm
#SlurmdUser=root
SlurmctldPort=6817
SlurmdPort=6818
AuthType=auth/munge
#JobCredentialPrivateKey=
#JobCredentialPublicCertificate=
SlurmdSpoolDir=/cm/local/apps/slurm/var/spool
SwitchType=switch/none
MpiDefault=none
SlurmctldPidFile=/var/run/slurmctld.pid
SlurmdPidFile=/var/run/slurmd.pid
#ProctrackType=proctrack/pgid
ProctrackType=proctrack/cgroup
#PluginDir=
CacheGroups=0
#FirstJobId=
ReturnToService=2
#MaxJobCount=
#PlugStackConfig=
#PropagatePrioProcess=
#PropagateResourceLimits=
#PropagateResourceLimitsExcept=
#SrunProlog=
#SrunEpilog=
#TaskProlog=
#TaskEpilog=
TaskPlugin=task/cgroup
#TrackWCKey=no
#TreeWidth=50
#TmpFs=
#UsePAM=
#
# TIMERS
SlurmctldTimeout=300
SlurmdTimeout=300
InactiveLimit=0
MinJobAge=300
KillWait=30
Waittime=0
#
# SCHEDULING
#SchedulerAuth=
#SchedulerPort=
#SchedulerRootFilter=
#PriorityType=priority/multifactor
#PriorityDecayHalfLife=14-0
#PriorityUsageResetPeriod=14-0
#PriorityWeightFairshare=100000
#PriorityWeightAge=1000
#PriorityWeightPartition=10000
#PriorityWeightJobSize=1000
#PriorityMaxAge=1-0
#
# LOGGING
SlurmctldDebug=3
SlurmctldLogFile=/var/log/slurmctld
SlurmdDebug=3
SlurmdLogFile=/var/log/slurmd
#JobCompType=jobcomp/filetxt
#JobCompLoc=/cm/local/apps/slurm/var/spool/job_comp.log
#
# ACCOUNTING
JobAcctGatherType=jobacct_gather/linux
#JobAcctGatherType=jobacct_gather/cgroup
#JobAcctGatherFrequency=30
AccountingStorageType=accounting_storage/slurmdbd
AccountingStorageUser=slurm
# AccountingStorageLoc=slurm_acct_db
# AccountingStoragePass=SLURMDBD_USERPASS
# Scheduler
SchedulerType=sched/backfill
# Statesave
StateSaveLocation=/cm/shared/apps/slurm/var/cm/statesave/slurm
# Generic resources types
GresTypes=gpu
# Epilog/Prolog section
PrologSlurmctld=/cm/local/apps/cmd/scripts/prolog-prejob
Prolog=/cm/local/apps/cmd/scripts/prolog
Epilog=/cm/local/apps/cmd/scripts/epilog
# Power saving section (disabled)
# GPU related plugins
#SelectType=select/cons_tres
#SelectTypeParameters=CR_Core
#AccountingStorageTRES=gres/gpu
# END AUTOGENERATED SECTION -- DO NOT REMOVE
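One thing that stands out in this file is that SelectType is left at its default (the cons_tres lines are commented out), so Slurm falls back to select/linear and hands out whole nodes; that matches the working job below, which shows NumCPUs=64 for a 2-task request. Enabling GPU/core-level scheduling would amount to uncommenting those lines (a sketch based on the commented-out values above; it needs a slurmctld/slurmd restart and testing before production use):

SelectType=select/cons_tres
SelectTypeParameters=CR_Core
AccountingStorageTRES=gres/gpu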
scontrol output for a working 1-GPU job on node01 (JobId=285):
JobId=285 JobName=cryosparc_P2_J232
UserId=cryosparc(1003) GroupId=cryosparc(1003) MCS_label=N/A
Priority=4294901570 Nice=0 Account=(null) QOS=normal
JobState=RUNNING Reason=None Dependency=(null)
Requeue=1 Restarts=0 BatchFlag=1 Reboot=0 ExitCode=0:0
RunTime=00:00:51 TimeLimit=UNLIMITED TimeMin=N/A
SubmitTime=2021-08-21T00:05:30 EligibleTime=2021-08-21T00:05:30
AccrueTime=2021-08-21T00:05:30
StartTime=2021-08-21T00:05:30 EndTime=Unknown Deadline=N/A
SuspendTime=None SecsPreSuspend=0 LastSchedEval=2021-08-21T00:05:30
Partition=CSLive AllocNode:Sid=headnode:108964
ReqNodeList=(null) ExcNodeList=(null)
NodeList=node01
BatchHost=node01
NumNodes=1 NumCPUs=64 NumTasks=2 CPUs/Task=1 ReqB:S:C:T=0:0:*:*
TRES=cpu=64,node=1,billing=64
Socks/Node=* NtasksPerN:B:S:C=0:0:*:* CoreSpec=*
MinCPUsNode=1 MinMemoryNode=24000M MinTmpDiskNode=0
Features=(null) DelayBoot=00:00:00
OverSubscribe=NO Contiguous=0 Licenses=(null) Network=(null)
Command=/data/backups/takeda2/data/cryosparc_projects/P8/J232/queue_sub_script.sh
WorkDir=/ssd/CryoSparc/cryosparc_master
StdErr=/data/backups/takeda2/data/cryosparc_projects/P8/J232/job.log
StdIn=/dev/null
StdOut=/data/backups/takeda2/data/cryosparc_projects/P8/J232/job.log
Power=
TresPerNode=gpu:1
MailUser=cryosparc MailType=NONE
cgroup.conf:
# This section of this file was automatically generated by cmd. Do not edit manually!
# BEGIN AUTOGENERATED SECTION -- DO NOT REMOVE
CgroupMountpoint="/sys/fs/cgroup"
CgroupAutomount=no
TaskAffinity=no
ConstrainCores=no
ConstrainRAMSpace=no
ConstrainSwapSpace=no
ConstrainDevices=no
ConstrainKmemSpace=yes
AllowedRamSpace=100.00
AllowedSwapSpace=0.00
MinKmemSpace=30
MaxKmemPercent=100.00
MaxRAMPercent=100.00
MaxSwapPercent=100.00
MinRAMSpace=30
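As an aside, ConstrainDevices=no means the cgroup plugin does not restrict which /dev/nvidia* devices a job can see, which is presumably why the generated submission script builds CUDA_VISIBLE_DEVICES itself by polling nvidia-smi. Having Slurm confine each job to its allocated GPUs instead would be a one-line change here (a sketch; on this Slurm version it also relies on the GPU device files being known to GRES, e.g. via gres.conf):

ConstrainDevices=yes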