/var/log/slurmctld.log contains logs related to Slurm execution.
The sequence of operations recorded in the log is as follows.
1. Creating Nodes and Assigning Tasks
sched: Allocate JobId=60961_199(60961) NodeList=node-0-999 #CPUs=16 Partition=partition
sched/backfill: _start_job: Started JobId=60961_199(60961) in partition on node-0-999
2-1. 138 nodes running normally
Node node-0-800 now responding
2-2. Resetting due to the failure of the remaining nodes
job_time_limit: Configuration for JobId=60961_43(61005) complete
Resetting JobId=60961_43(61005) start time for node power up
node node-0-976 not resumed by ResumeTimeout(300) - marking down and power_save
requeue job JobId=61243_176(61420) due to failure of node node-0-976
Requeuing JobId=61243_176(61420)
After the computation on some of the 138 nodes completes, the failed nodes are recreated and the job proceeds (similar to using a job array).
I have also found and copied the logs of the VMs that failed and were deleted.
It appears to be a lack of resources in the zone.
No other quota appears to have been exceeded.
{
"protoPayload": {
"status": {
"code": 8,
"message": "ZONE_RESOURCE_POOL_EXHAUSTED",
"details": [
{
"value": {
"zoneResourcePoolExhaustedWithDetails": {
"zoneResource": {
"resourceType": "ZONE",
"resourceName": "asia-northeast3-b",
"project": {
"canonicalProjectId": "748182922348"
},
"scope": {
"scopeType": "GLOBAL",
"scopeName": "global"
}
},
"details": "(resource type:compute)"
}
}
}
]
},
"authenticationInfo": {
},
"requestMetadata": {
"callerIp": "35.216.74.94",
"callerSuppliedUserAgent": "Slurm_GCP_Scripts/1.2 (GPN:SchedMD) (gzip),gzip(gfe)",
"requestAttributes": {},
"destinationAttributes": {}
},
"methodName": "v1.compute.instances.bulkInsert",
"authorizationInfo": [
{
"permission": "compute.instances.create",
"granted": true,
"resourceAttributes": {
"service": "compute",
"name": "projects/test-project/zones/asia-northeast3-b/instances/unusedName",
"type": "compute.instances"
}
},
{
"permission": "compute.disks.create",
"granted": true,
"resourceAttributes": {
"service": "compute",
"name": "projects/test-project/zones/asia-northeast3-b/disks/unusedName",
"type": "compute.disks"
}
},
{
"permission": "compute.subnetworks.use",
"granted": true,
"resourceAttributes": {
"service": "compute",
"name": "projects/test-project/regions/asia-northeast3/subnetworks/node-asia-northeast3",
"type": "compute.subnetworks"
}
},
{
"permission": "compute.subnetworks.useExternalIp",
"granted": true,
"resourceAttributes": {
"service": "compute",
"name": "projects/test-project/regions/asia-northeast3/subnetworks/node-asia-northeast3",
"type": "compute.subnetworks"
}
},
{
"permission": "compute.instances.setMetadata",
"granted": true,
"resourceAttributes": {
"service": "compute",
"name": "projects/test-project/zones/asia-northeast3-b/instances/unusedName",
"type": "compute.instances"
}
},
{
"permission": "compute.instances.setTags",
"granted": true,
"resourceAttributes": {
"service": "compute",
"name": "projects/test-project/zones/asia-northeast3-b/instances/unusedName",
"type": "compute.instances"
}
},
{
"permission": "compute.instances.setServiceAccount",
"granted": true,
"resourceAttributes": {
"service": "compute",
"name": "projects/test-project/zones/asia-northeast3-b/instances/unusedName",
"type": "compute.instances"
}
}
],
"resourceName": "projects/test-project/zones/asia-northeast3-b/instances/node-compute-0-999",
"request": {
}
},
"insertId": "udzude11ki4",
"resource": {
"type": "gce_instance",
"labels": {
"instance_id": "9147678799106660706",
"zone": "asia-northeast3-b",
"project_id": "test-project"
}
},
"timestamp": "2022-06-22T00:51:42.846715Z",
"severity": "ERROR",
"operation": {
"id": "operation-1655859087952-5e1febcbc4ac4-b713c511-7ccfeadb",
"first": true,
"last": true
},
"receiveTimestamp": "2022-06-22T00:51:43.611376754Z"
}