# --- Cluster identity and network placement ---
cluster_name = "test-slurm-1"
project = "our-test-project-name"
zone = "us-central1-a"
# Shared VPC: the network/subnet live in a separate host project, so the
# subnet named below must exist in that project, in this zone's region.
network_name = "our-vpc"
subnetwork_name = "test-subnet"
shared_vpc_host_project = "our-infrastructure-vpc"
# All three node classes are private-IP only. With no public IPs, any
# outbound access (package installs, agent downloads during node setup)
# must go through Cloud NAT or a proxy.
disable_controller_public_ips = true
disable_login_public_ips = true
disable_compute_public_ips = true
# --- Optional version pins (module defaults apply while commented out) ---
# ompi_version = null # e.g. v3.1.x
# slurm_version = "19.05-latest"
# Idle time before dynamically created compute nodes are suspended.
# NOTE(review): confirm units (Slurm's SuspendTime is in seconds) against
# the slurm-gcp module docs before tuning.
suspend_time = 60
# --- Controller instance (commented keys = module defaults) ---
# controller_machine_type = "n1-standard-2"
# controller_disk_type = "pd-standard"
# controller_disk_size_gb = 50
# controller_labels = {
# key1 = "val1"
# key2 = "val2"
# }
# NOTE(review): this looks like a descriptive placeholder, not a valid
# service account — GCP expects an email-style ID
# (e.g. name@project.iam.gserviceaccount.com); a value with spaces will
# be rejected at deploy time. Verify the real account ID.
controller_service_account = "our-compute instances service account"
# Optional external accounting DB; local slurmdbd is used while commented.
# cloudsql = {
# server_ip = "<cloudsql ip>"
# user = "slurm"
# password = "verysecure"
# db_name = "slurm_accounting"
# }
# controller_secondary_disk = false
# controller_secondary_disk_size = 100
# controller_secondary_disk_type = "pd-ssd"
# --- Login node(s) ---
# login_machine_type = "n1-standard-2"
# login_disk_type = "pd-standard"
# login_disk_size_gb = 20
# login_labels = {
# key1 = "val1"
# key2 = "val2"
# }
login_node_count = 1
# NOTE(review): same placeholder issue as controller_service_account above.
login_node_service_account = "our-compute instances service account"
# NOTE(review): an explicitly empty scope list may override the module's
# default scopes (e.g. logging.write, storage read) rather than inherit
# them — worth checking, since missing scopes commonly break the logging
# agent install and node setup scripts on private-IP instances.
login_node_scopes = [
]
# Optional network storage fields
# network_storage is mounted on all instances
# login_network_storage is mounted on controller and login instances
# Cluster-wide NFS mount: remote /test_test is mounted as /home on every
# instance, forced to NFSv3 via mount_options.
network_storage = [{
server_ip = "10.0.0.10"
remote_mount = "/test_test"
local_mount = "/home"
fs_type = "nfs"
mount_options = "nfsvers=3"
}]
#
# Controller/login-only NFS mount (not visible on compute nodes).
login_network_storage = [{
server_ip = "10.0.0.11"
remote_mount = "/shared"
local_mount = "/mnt/shared"
fs_type = "nfs"
mount_options = "nfsvers=3"
}]
# --- Compute image / node defaults (commented keys = module defaults) ---
# compute_image_machine_type = "n1-standard-2"
# compute_image_disk_type = "pd-standard"
# compute_image_disk_size_gb = 20
# compute_image_labels = {
# key1 = "val1"
# key2 = "val2"
# }
# compute_node_service_account = "default"
# compute_node_scopes = [
# ]
# --- Slurm partitions ---
# One active partition ("debug"): fully dynamic (0 static nodes, bursts up
# to 10), preemptible VMs, inheriting the cluster subnet (vpc_subnet = null).
# null values are valid here because this is HCL (Terraform tfvars), not TOML.
partitions = [
{ name = "debug"
machine_type = "n1-standard-2"
static_node_count = 0
max_node_count = 10
zone = "us-central1-a"
compute_disk_type = "pd-standard"
compute_disk_size_gb = 20
compute_labels = {}
cpu_platform = null
gpu_count = 0
gpu_type = null
network_storage = []
preemptible_bursting = true
vpc_subnet = null
},
# Example GPU partition, commented out.
# NOTE(review): this commented block has no trailing "# }," before the
# closing "]" — if uncommented as-is it will not parse; re-add the brace.
# { name = "partition2"
# machine_type = "n1-standard-16"
# static_node_count = 0
# max_node_count = 20
# zone = "us-central1-a"
# compute_disk_type = "pd-ssd"
# compute_disk_size_gb = 20
# compute_labels = {
# key1 = "val1"
# key2 = "val2"
# }
# cpu_platform = "Intel Skylake"
# gpu_count = 8
# gpu_type = "nvidia-tesla-v100"
# network_storage = [{
# server_ip = "none"
# remote_mount = "<gcs bucket name>"
# local_mount = "/data"
# fs_type = "gcsfuse"
# mount_options = "file_mode=664,dir_mode=775,allow_other"
# }]
# preemptible_bursting = true
# vpc_subnet = null
]
We've managed to find a manual workaround for the fluentd install issue (https://cloud.google.com/logging/docs/agent/logging/installation), but the failure of the controller's setup.py script is perplexing. Everything worked when we ran the basic.tfvars as originally provided, but it needed network configuration — and after adding that, it broke. (The instances can reach the internet through Cloud NAT, but they have no public IPs.)