[COMMIT scylla-cluster-tests master] refactor(sct_runner.py): generalize *SctRunner classes

0 views
Skip to first unread message

Commit Bot

unread,
Sep 27, 2021, 8:13:02 AMSep 27
to scylla...@googlegroups.com, Evgeniy Naydanov
From: Evgeniy Naydanov <evgeniy....@scylladb.com>
Committer: Bentsi <ben...@scylladb.com>
Branch: master

refactor(sct_runner.py): generalize *SctRunner classes

- Change type of VERSION from float to str
- Add get_current_datetime_formatted() function
- Add region_az() method
- Move instance_type() to base class
- Convert key_pair() to property
- Bump sct-runner verion to 1.5 (because of JRE version)
- Set fs.aio-max-nr to recommended value for docker backend
- Fix problem with login user in install_prereq()
- Use apt-get instead of apt
- Remove redundant sudo
- Move create_image() to base class and smaller hooks
- Block project-wide SSH keys for GCE instances
- add get_sct_runner() factory function

---
diff --git a/sct.py b/sct.py
--- a/sct.py
+++ b/sct.py
@@ -37,7 +37,7 @@
from sdcm.remote import LOCALRUNNER
from sdcm.results_analyze import PerformanceResultsAnalyzer
from sdcm.sct_config import SCTConfiguration
-from sdcm.sct_runner import AwsSctRunner, GceSctRunner
+from sdcm.sct_runner import AwsSctRunner, GceSctRunner, get_sct_runner
from sdcm.utils.cloud_monitor import cloud_report, cloud_qa_report
from sdcm.utils.common import (
all_aws_regions,
@@ -1140,13 +1140,7 @@ def create_runner_image(cloud_provider, region, availability_zone):
if cloud_provider == "aws" and availability_zone != "":
assert len(availability_zone) == 1, f"Invalid AZ: {availability_zone}, availability-zone is one-letter a-z."
add_file_logger()
- if cloud_provider == 'aws':
- sct_runner = AwsSctRunner(region_name=region,
- availability_zone=availability_zone)
- elif cloud_provider == 'gce':
- sct_runner = GceSctRunner(datacenter=region, availability_zone=availability_zone)
- else:
- raise Exception('Unsupported Cloud provider')
+ sct_runner = get_sct_runner(cloud_provider=cloud_provider, region_name=region, availability_zone=availability_zone)
sct_runner.create_image()


@@ -1163,29 +1157,19 @@ def create_runner_instance(cloud_provider, region, availability_zone, instance_t
add_file_logger()
sct_runner_ip_path = Path("sct_runner_ip")
sct_runner_ip_path.unlink(missing_ok=True)
- if cloud_provider == 'aws':
- sct_runner = AwsSctRunner(region_name=region,
- availability_zone=availability_zone)
- elif cloud_provider == 'gce':
- sct_runner = GceSctRunner(datacenter=region,
- availability_zone=availability_zone)
- else:
- raise Exception('Unsupported Cloud provider')
-
+ sct_runner = get_sct_runner(cloud_provider=cloud_provider, region_name=region, availability_zone=availability_zone)
instance = sct_runner.create_instance(
instance_type=instance_type,
test_id=test_id,
test_duration=duration,
- region_az=region + availability_zone,
)
- if cloud_provider == 'aws':
- runner_public_ip = instance.public_ip_address
- elif cloud_provider == 'gce':
- runner_public_ip = instance.public_ips[0]
+ if not instance:
+ sys.exit(1)
+
LOGGER.info("Verifying SSH connectivity...")
+ runner_public_ip = sct_runner.get_instance_public_ip(instance=instance)
remoter = sct_runner.get_remoter(host=runner_public_ip, connect_timeout=120)
- result = remoter.run("true", timeout=100, verbose=False, ignore_status=True)
- if result.exit_status == 0:
+ if remoter.run("true", timeout=100, verbose=False, ignore_status=True).ok:
LOGGER.info("Successfully connected the SCT Runner. Public IP: %s", runner_public_ip)
with sct_runner_ip_path.open("w") as sct_runner_ip_file:
sct_runner_ip_file.write(runner_public_ip)
diff --git a/sdcm/sct_runner.py b/sdcm/sct_runner.py
--- a/sdcm/sct_runner.py
+++ b/sdcm/sct_runner.py
@@ -1,26 +1,36 @@
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as published by
+# the Free Software Foundation; either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See LICENSE for more details.
+#
+# Copyright (c) 2021 ScyllaDB
+
import logging
-import random
-import sys
import tempfile
import time
import datetime
from enum import Enum
-from functools import lru_cache, cached_property
+from functools import cached_property
from math import ceil
-from textwrap import dedent
-from typing import Optional
+from typing import Optional, Any
from abc import ABC, abstractmethod
-import pytz

-import boto3
-from libcloud.common.google import ResourceNotFoundError
+import pytz
+from libcloud.common.google import ResourceNotFoundError as GoogleResourceNotFoundError

from sdcm.keystore import KeyStore, SSHKey
-from sdcm.remote import RemoteCmdRunnerBase
+from sdcm.remote import RemoteCmdRunnerBase, shell_script_cmd
from sdcm.utils.aws_utils import ec2_instance_wait_public_ip, ec2_ami_get_root_device_name
from sdcm.utils.get_username import get_username
from sdcm.utils.aws_region import AwsRegion
from sdcm.utils.gce_utils import get_gce_service
+from sdcm.cluster_docker import AIO_MAX_NR_RECOMMENDED_VALUE


LOGGER = logging.getLogger(__name__)
@@ -31,29 +41,44 @@ class ImageType(Enum):
GENERAL = "general"


+def get_current_datetime_formatted():
+ return datetime.datetime.now(tz=pytz.utc).strftime("%B %d, %Y, %H:%M:%S")
+
+
class SctRunner(ABC):
- """Provisions and configures the SCT runner"""
- VERSION = 1.4 # Version of the Image
+ """Provision and configure the SCT runner."""
+ VERSION = "1.5" # Version of the Image
NODE_TYPE = "sct-runner"
RUNNER_NAME = "SCT-Runner"
LOGIN_USER = "ubuntu"
IMAGE_DESCRIPTION = f"SCT runner image {VERSION}"

- def __init__(self, cloud_provider: str, region_name: str):
+ BASE_IMAGE: Any
+ SOURCE_IMAGE_REGION: str
+ IMAGE_BUILDER_INSTANCE_TYPE: str
+ REGULAR_TEST_INSTANCE_TYPE: str
+ LONGTERM_TEST_INSTANCE_TYPE: str
+
+ def __init__(self, cloud_provider: str, region_name: str, availability_zone: str = ""):
self.cloud_provider = cloud_provider
self.region_name = region_name
+ self.availability_zone = availability_zone
self._ssh_pkey_file = None

- @cached_property
@abstractmethod
- def image_name(self) -> str:
+ def region_az(self, region_name: str, availability_zone: str) -> str:
...

- @staticmethod
+ @property
@abstractmethod
- def instance_type(test_duration) -> str:
+ def image_name(self) -> str:
...

+ def instance_type(self, test_duration) -> str:
+ if test_duration > 7 * 60:
+ return self.LONGTERM_TEST_INSTANCE_TYPE
+ return self.REGULAR_TEST_INSTANCE_TYPE
+
@staticmethod
def instance_root_disk_size(test_duration) -> int:
disk_size = 80 # GB
@@ -62,185 +87,285 @@ def instance_root_disk_size(test_duration) -> int:
return disk_size + 40
return disk_size

- @staticmethod
+ @property
@abstractmethod
- @lru_cache(maxsize=None)
- def key_pair() -> SSHKey:
+ def key_pair(self) -> SSHKey:
...

def get_remoter(self, host, connect_timeout: Optional[float] = None) -> RemoteCmdRunnerBase:
self._ssh_pkey_file = tempfile.NamedTemporaryFile(mode="w", delete=False) # pylint: disable=consider-using-with
- self._ssh_pkey_file.write(self.key_pair().private_key.decode())
+ self._ssh_pkey_file.write(self.key_pair.private_key.decode())
self._ssh_pkey_file.flush()
return RemoteCmdRunnerBase.create_remoter(hostname=host, user=self.LOGIN_USER,
key_file=self._ssh_pkey_file.name, connect_timeout=connect_timeout)

def install_prereqs(self, public_ip: str, connect_timeout: Optional[int] = None) -> None:
- prereqs_script = dedent(f"""
- echo "fs.aio-max-nr = 65536" >> /etc/sysctl.conf
- echo "ubuntu soft nofile 4096" >> /etc/security/limits.conf
+ LOGGER.info("Connecting instance...")
+ remoter = self.get_remoter(host=public_ip, connect_timeout=connect_timeout)
+
+ LOGGER.info("Installing required packages...")
+ login_user = self.LOGIN_USER
+ public_key = self.key_pair.public_key.decode()
+ result = remoter.sudo(shell_script_cmd(quote="'", cmd=f"""\
+ echo "fs.aio-max-nr = {AIO_MAX_NR_RECOMMENDED_VALUE}" >> /etc/sysctl.conf
+ echo "{login_user} soft nofile 4096" >> /etc/security/limits.conf
echo "jenkins soft nofile 4096" >> /etc/security/limits.conf
echo "root soft nofile 4096" >> /etc/security/limits.conf
- sudo -u ubuntu mkdir -p /home/ubuntu/.ssh || true
- echo "{self.key_pair().public_key.decode()}" >> /home/ubuntu/.ssh/authorized_keys
- chmod 600 /home/ubuntu/.ssh/authorized_keys
- mkdir -p -m 777 /home/ubuntu/sct-results
- echo "cd ~/sct-results" >> /home/ubuntu/.bashrc
- chown -R ubuntu:ubuntu /home/ubuntu/
- apt clean
- apt update
- apt install -y python3-pip htop screen tree
+ sudo -u {login_user} mkdir -p /home/{login_user}/.ssh || true
+ echo "{public_key}" >> /home/{login_user}/.ssh/authorized_keys
+ chmod 600 /home/{login_user}/.ssh/authorized_keys
+ mkdir -p -m 777 /home/{login_user}/sct-results
+ echo "cd ~/sct-results" >> /home/{login_user}/.bashrc
+ chown -R {login_user}:{login_user} /home/{login_user}/
+ apt-get clean
+ apt-get update
+ apt-get install -y python3-pip htop screen tree
pip3 install awscli
- # docker
+
+ # Install Docker.
apt-get install -y apt-transport-https ca-certificates curl gnupg-agent software-properties-common
- curl -fsSL https://download.docker.com/linux/ubuntu/gpg | sudo apt-key add -
+ curl -fsSL https://download.docker.com/linux/ubuntu/gpg | apt-key add -
apt-key fingerprint 0EBFCD88
add-apt-repository "deb [arch=amd64] https://download.docker.com/linux/ubuntu $(lsb_release -cs) stable"
- apt update
- apt install -y docker-ce docker-ce-cli containerd.io
- usermod -aG docker {self.LOGIN_USER}
- # add kubectl
+ apt-get update
+ apt-get install -y docker-ce docker-ce-cli containerd.io
+ usermod -aG docker {login_user}
+
+ # Install kubectl.
curl -LO "https://dl.k8s.io/release/$(curl -L -s https://dl.k8s.io/release/stable.txt)/bin/linux/amd64/kubectl"
- sudo install -o root -g root -m 0755 kubectl /usr/local/bin/kubectl
- usermod -aG docker ubuntu || true
- # configure Jenkins user
- apt install -y openjdk-14-jre-headless
+ install -o root -g root -m 0755 kubectl /usr/local/bin/kubectl
+
+ # Configure Jenkins user.
+ apt-get install -y openjdk-11-jre-headless # https://www.jenkins.io/doc/administration/requirements/java/
adduser --disabled-password --gecos "" jenkins || true
usermod -aG docker jenkins
mkdir -p /home/jenkins/.ssh
- echo "{self.key_pair().public_key.decode()}" >> /home/jenkins/.ssh/authorized_keys
+ echo "{public_key}" >> /home/jenkins/.ssh/authorized_keys
chmod 600 /home/jenkins/.ssh/authorized_keys
chown -R jenkins:jenkins /home/jenkins
echo "jenkins ALL=(ALL) NOPASSWD: ALL" > /etc/sudoers.d/jenkins
- # Jenkins pipelines run /bin/sh for some reason
- unlink /bin/sh
- ln -s /bin/bash /bin/sh
- """)
- LOGGER.info("Connecting instance...")
- remoter = self.get_remoter(host=public_ip, connect_timeout=connect_timeout)
- LOGGER.info("Installing required packages...")
- result = remoter.run(f"sudo bash -cxe '{prereqs_script}'", ignore_status=True)

+ # Jenkins pipelines run /bin/sh for some reason.
+ ln -sf /bin/bash /bin/sh
+ """), ignore_status=True)
remoter.stop()
- if result.exit_status == 0:
+ if result.ok:
LOGGER.info("All packages successfully installed.")
else:
- raise Exception("Unable to install required packages:\n%s" % (result.stdout + result.stderr))
+ raise Exception(f"Unable to install required packages:\n{result.stdout}{result.stderr}")

@abstractmethod
- def _image(self, image_type=ImageType.SOURCE):
+ def _image(self, image_type: ImageType = ImageType.SOURCE) -> Any:
...

@property
- def source_image(self):
+ def source_image(self) -> Any:
return self._image(image_type=ImageType.SOURCE)

@property
- def image(self):
+ def image(self) -> Any:
return self._image(image_type=ImageType.GENERAL)

+ @cached_property
+ def image_tags(self) -> dict[str, str]:
+ return {
+ "Name": self.image_name,
+ "Version": self.VERSION,
+ }
+
@abstractmethod
# pylint: disable=too-many-arguments
- def _create_instance(self, instance_type, base_image, tags_list, instance_name=None, region_az="", test_duration=None):
+ def _create_instance(self,
+ instance_type: str,
+ base_image: Any,
+ tags: dict[str, str],
+ instance_name: str,
+ region_az: str = "",
+ test_duration: Optional[int] = None) -> Any:
...

@abstractmethod
- def create_image(self) -> None:
+ def _stop_image_builder_instance(self, instance: Any) -> None:
...

@abstractmethod
- def _get_base_image(self, image=None):
+ def _terminate_image_builder_instance(self, instance: Any) -> None:
...

- def create_instance(self, test_id: str, test_duration: int, region_az: str, instance_type: str = ''):
- """
- :param test_duration: used to set keep-alive flags, measured in MINUTES
+ @abstractmethod
+ def _get_instance_id(self, instance: Any) -> Any:
+ ...
+
+ @abstractmethod
+ def get_instance_public_ip(self, instance: Any) -> str:
+ ...
+
+ @abstractmethod
+ def _create_image(self, instance: Any) -> Any:
+ ...
+
+ @abstractmethod
+ def _get_image_id(self, image: Any) -> Any:
+ ...
+
+ @abstractmethod
+ def _copy_source_image_to_region(self) -> None:
+ ...
+
+ def create_image(self) -> None:
+ """Create SCT Runner image in specified region.
+
+ If the image exists in SOURCE_REGION it will be copied to the destination region.
+
+ WARNING: this can't run in parallel!
"""
+ LOGGER.info("Looking for source SCT Runner Image in `%s'...", self.SOURCE_IMAGE_REGION)
+ source_image = self.source_image
+ if not source_image:
+ LOGGER.info("Source SCT Runner Image not found. Creating...")
+ instance = self._create_instance(
+ instance_type=self.IMAGE_BUILDER_INSTANCE_TYPE,
+ base_image=self.BASE_IMAGE,
+ tags={
+ "keep": "1",
+ "keep_action": "terminate",
+ "Version": self.VERSION,
+ },
+ instance_name=f"{self.image_name}-builder-{self.SOURCE_IMAGE_REGION}",
+ region_az=self.region_az(
+ region_name=self.SOURCE_IMAGE_REGION,
+ availability_zone=self.availability_zone,
+ ),
+ )
+ self.install_prereqs(public_ip=self.get_instance_public_ip(instance=instance), connect_timeout=120)
+
+ LOGGER.info("Stopping the SCT Image Builder instance...")
+ self._stop_image_builder_instance(instance=instance)
+
+ LOGGER.info("SCT Image Builder instance stopped.\nCreating image...")
+ self._create_image(instance=instance)
+
+ builder_instance_id = self._get_instance_id(instance=instance)
+ try:
+ LOGGER.info("Terminating SCT Image Builder instance `%s'...", builder_instance_id)
+ self._terminate_image_builder_instance(instance=instance)
+ except Exception as ex: # pylint: disable=broad-except
+ LOGGER.warning("Was not able to terminate `%s': %s\nPlease terminate manually!!!",
+ builder_instance_id, ex)
+ else:
+ LOGGER.info("SCT Runner image exists in the source region `%s'! ID: %s",
+ self.SOURCE_IMAGE_REGION, self._get_image_id(image=source_image))
+
+ if self.region_name != self.SOURCE_IMAGE_REGION and self.image is None:
+ LOGGER.info("Copying %s to %s...\nNOTE: it can take 5-15 minutes.",
+ self.image_name, self.region_name)
+ self._copy_source_image_to_region()
+ LOGGER.info("Done.")
+ else:
+ LOGGER.info("No need to copy SCT Runner image since it already exists in `%s'", self.region_name)
+
+ @abstractmethod
+ def _get_base_image(self, image: Optional[Any] = None) -> Any:
+ ...
+
+ def create_instance(self, test_id: str, test_duration: int, instance_type: str = "") -> Any:
LOGGER.info("Creating SCT Runner instance...")
image = self.image
if not image:
LOGGER.error("SCT Runner image was not found in %s! "
- "Use hydra create-runner-image --cloud-privider %s --region %s",
+ "Use `hydra create-runner-image --cloud-provider %s --region %s'",
self.region_name, self.cloud_provider, self.region_name)
- sys.exit(1)
- lt_datetime = datetime.datetime.now(tz=pytz.utc)
+ return None
return self._create_instance(
instance_type=instance_type or self.instance_type(test_duration=test_duration),
base_image=self._get_base_image(self.image),
- tags_list=[
- {"Key": "Name", "Value": self.RUNNER_NAME},
- {"Key": "TestId", "Value": test_id},
- {"Key": "NodeType", "Value": self.NODE_TYPE},
- {"Key": "RunByUser", "Value": get_username()},
- {"Key": "keep", "Value": str(ceil(test_duration / 60) + 6)},
- {"Key": "keep_action", "Value": "terminate"},
- {"Key": "launch_time", "Value": lt_datetime.strftime("%B %d, %Y, %H:%M:%S")},
- ],
+ tags={
+ "TestId": test_id,
+ "NodeType": self.NODE_TYPE,
+ "RunByUser": get_username(),
+ "keep": str(ceil(test_duration / 60) + 6), # keep SCT Runner for 6h more than test_duration
+ "keep_action": "terminate",
+ },
instance_name=f"{self.image_name}-instance-{test_id[:8]}",
- region_az=region_az,
+ region_az=self.region_az(
+ region_name=self.region_name,
+ availability_zone=self.availability_zone,
+ ),
test_duration=test_duration,
)


class AwsSctRunner(SctRunner):
- """Provisions and configures the SCT runner on AWS"""
+ """Provision and configure the SCT Runner on AWS."""
BASE_IMAGE = "ami-0ffac660dd0cb2973" # ubuntu/images/hvm-ssd/ubuntu-focal-20.04-amd64-server-20200609
SOURCE_IMAGE_REGION = "eu-west-2" # where the source Runner image will be created and copied to other regions
+ IMAGE_BUILDER_INSTANCE_TYPE = "t2.small"
+ REGULAR_TEST_INSTANCE_TYPE = "t3.large" # has 7h 12m CPU burst
+ LONGTERM_TEST_INSTANCE_TYPE = "r5.large"

- def __init__(self, region_name: str, availability_zone: str, cloud_provider: str = 'aws'):
- super().__init__(cloud_provider=cloud_provider, region_name=region_name)
- self.availability_zone = availability_zone
- self.ec2_client = boto3.client("ec2", region_name=region_name)
- self.ec2_client_source = boto3.client("ec2", region_name=self.SOURCE_IMAGE_REGION)
- self.ec2_resource = boto3.resource("ec2", region_name=region_name)
- self.ec2_resource_source = boto3.resource("ec2", region_name=self.SOURCE_IMAGE_REGION)
+ def __init__(self, region_name: str, availability_zone: str, cloud_provider: str = "aws"):
+ super().__init__(cloud_provider=cloud_provider, region_name=region_name, availability_zone=availability_zone)
+ self.aws_region = AwsRegion(region_name=region_name)
+ self.aws_region_source = AwsRegion(region_name=self.SOURCE_IMAGE_REGION)
+
+ def region_az(self, region_name: str, availability_zone: str) -> str:
+ return f"{region_name}{availability_zone}"

@cached_property
def image_name(self) -> str:
return f"sct-runner-{self.VERSION}"

- @staticmethod
- def instance_type(test_duration) -> str:
- if test_duration > 7 * 60:
- return "r5.large"
- return "t3.large" # has 7h 12m CPU burst
-
- @staticmethod
- @lru_cache(maxsize=None)
- def key_pair() -> SSHKey:
- ks = KeyStore()
- return ks.get_ec2_ssh_key_pair()
+ @cached_property
+ def key_pair(self) -> SSHKey:
+ return KeyStore().get_ec2_ssh_key_pair()

- def _image(self, image_type=ImageType.SOURCE):
+ def _image(self, image_type: ImageType = ImageType.SOURCE) -> Any:
if image_type == ImageType.SOURCE:
- client = self.ec2_client_source
+ aws_region = self.aws_region_source
elif image_type == ImageType.GENERAL:
- client = self.ec2_client
+ aws_region = self.aws_region
else:
raise ValueError("Unknown Image type")
- amis = client.describe_images(Owners=["self"],
- Filters=[{"Name": "tag:Name", "Values": [self.image_name]},
- {"Name": "tag:Version", "Values": [str(self.VERSION)]}])
+
+ amis = aws_region.client.describe_images(
+ Owners=["self"],
+ Filters=[
+ {"Name": "tag:Name", "Values": [self.image_name]},
+ {"Name": "tag:Version", "Values": [self.VERSION]},
+ ],
+ )
LOGGER.debug("Found SCT Runner AMIs: %s", amis)
- existing_amis = amis.get("Images", [])
- if len(existing_amis) == 0:
+
+ existing_amis = amis.get("Images")
+
+ if not existing_amis:
return None
+
assert len(existing_amis) == 1, \
f"More than 1 SCT Runner AMI with {self.image_name}:{self.VERSION} " \
f"found in {self.region_name}: {existing_amis}"
- return self.ec2_resource.Image(existing_amis[0]["ImageId"]) # pylint: disable=no-member
+
+ return aws_region.resource.Image(existing_amis[0]["ImageId"])

# pylint: disable=too-many-arguments
- def _create_instance(self, instance_type, base_image, tags_list, instance_name=None, region_az="", test_duration=None):
- region = region_az[:-1]
- aws_region = AwsRegion(region_name=region)
- region_az = region_az if region_az else random.choice(aws_region.availability_zones)
+ def _create_instance(self,
+ instance_type: str,
+ base_image: Any,
+ tags: dict[str, str],
+ instance_name: str,
+ region_az: str = "",
+ test_duration: Optional[int] = None) -> Any:
+ if region_az.startswith(self.SOURCE_IMAGE_REGION):
+ aws_region = self.aws_region_source
+ else:
+ aws_region = self.aws_region
subnet = aws_region.sct_subnet(region_az=region_az)
assert subnet, f"No SCT subnet found in the source region. " \
- f"Use hydra prepare-region --region-name '{self.region_name}' to create cloud env!"
+ f"Use `hydra prepare-region --cloud-provider aws --region-name {aws_region.region_name}' " \
+ f"to create cloud env!"
+
LOGGER.info("Creating instance...")
- ec2_resource = boto3.resource("ec2", region_name=region)
- result = ec2_resource.create_instances(
+ result = aws_region.resource.create_instances(
ImageId=base_image,
InstanceType=instance_type,
MinCount=1,
@@ -253,96 +378,80 @@ def _create_instance(self, instance_type, base_image, tags_list, instance_name=N
"Groups": [aws_region.sct_security_group.group_id],
"DeleteOnTermination": True,
}],
- TagSpecifications=[{"ResourceType": "instance", "Tags": tags_list}],
+ TagSpecifications=[{
+ "ResourceType": "instance",
+ "Tags": [{"Key": key, "Value": value} for key, value in tags.items()] +
+ [{"Key": "Name", "Value": instance_name}],
+ }],
BlockDeviceMappings=[{
- "DeviceName": ec2_ami_get_root_device_name(image_id=base_image, region=region),
+ "DeviceName": ec2_ami_get_root_device_name(image_id=base_image, region=aws_region.region_name),
"Ebs": {
"VolumeSize": self.instance_root_disk_size(test_duration),
"VolumeType": "gp2"
}
}]
)
instance = result[0]
+
LOGGER.info("Instance created. Waiting until it becomes running... ")
instance.wait_until_running()
- LOGGER.info("Instance '%s' is running. Waiting for public IP...", instance.instance_id)
+
+ LOGGER.info("Instance `%s' is running. Waiting for public IP...", instance.instance_id)
ec2_instance_wait_public_ip(instance=instance)
+
LOGGER.info("Got public IP: %s", instance.public_ip_address)
+
return instance

+ def _stop_image_builder_instance(self, instance: Any) -> None:
+ instance.stop()
+ instance.wait_until_stopped()
+
+ def _terminate_image_builder_instance(self, instance: Any) -> None:
+ instance.terminate()
+
+ def _get_instance_id(self, instance: Any) -> Any:
+ return instance.instance_id
+
+ def get_instance_public_ip(self, instance: Any) -> str:
+ return instance.public_ip_address
+
def tag_image(self, image_id, image_type) -> None:
if image_type == ImageType.SOURCE:
- ec2_resource = self.ec2_resource_source
+ aws_region = self.aws_region_source
elif image_type == ImageType.GENERAL:
- ec2_resource = self.ec2_resource
- image_tags = [
- {"Key": "Name", "Value": self.image_name},
- {"Key": "Version", "Value": str(self.VERSION)},
- ]
- runer_image = ec2_resource.Image(image_id)
- runer_image.wait_until_exists()
+ aws_region = self.aws_region
+ else:
+ raise ValueError("Unknown Image type")
+
+ runner_image = aws_region.resource.Image(image_id)
+ runner_image.wait_until_exists()
+
LOGGER.info("Image '%s' exists and ready. Tagging...", image_id)
- runer_image.create_tags(Tags=image_tags)
+ runner_image.create_tags(Tags=[{"Key": key, "Value": value} for key, value in self.image_tags.items()])
LOGGER.info("Tagging completed.")
- LOGGER.info("SCT Runner image created in '%s'. Id [%s].", self.region_name, image_id)

- def create_image(self) -> None:
- """
- Create an Image for SCT Runner in specified region. If the Image exists in SOURCE_REGION
- it will be copied to the destination region.
- Warning: this can't run in parallel!
- """
- LOGGER.info("Looking for source SCT Runner Image in '%s'...", self.SOURCE_IMAGE_REGION)
- source_image = self.source_image
- if not source_image:
- LOGGER.info("Source SCT Runner Image not found. Creating...")
- instance = self._create_instance(
- instance_type="t3.small",
- base_image=self.BASE_IMAGE,
- tags_list=[{"Key": "Name", "Value": "sct-image-builder"},
- {"Key": "keep", "Value": "1"},
- {"Key": "keep_action", "Value": "terminate"},
- {"Key": "Version", "Value": str(self.VERSION)},
- ],
- region_az=self.SOURCE_IMAGE_REGION + self.availability_zone # pylint: disable=no-member
- )
- self.install_prereqs(public_ip=instance.public_ip_address)
- LOGGER.info("Stopping the SCT Image Builder instance...")
- instance.stop()
- instance.wait_until_stopped()
- LOGGER.info("SCT Image Builder instance stopped.\nCreating image...")
- result = self.ec2_client_source.create_image(
- Description=self.IMAGE_DESCRIPTION,
- InstanceId=instance.instance_id,
- Name=self.image_name,
- NoReboot=False
- )
- self.tag_image(image_id=result["ImageId"], image_type=ImageType.SOURCE)
- try:
- LOGGER.info("Terminating image builder instance '%s'...", instance.instance_id)
- instance.terminate()
- except Exception as ex: # pylint: disable=broad-except
- LOGGER.warning("Was not able to terminate '%s': %s\n"
- "Please terminate manually!!!", instance.instance_id, ex)
+ def _create_image(self, instance: Any) -> Any:
+ result = self.aws_region_source.client.create_image(
+ Description=self.IMAGE_DESCRIPTION,
+ InstanceId=instance.instance_id,
+ Name=self.image_name,
+ NoReboot=False
+ )
+ self.tag_image(image_id=result["ImageId"], image_type=ImageType.SOURCE)

- else:
- LOGGER.info("SCT Runner image exists in the source region '%s'! "
- "ID: %s", self.SOURCE_IMAGE_REGION, source_image.image_id)
+ def _get_image_id(self, image: Any) -> Any:
+ return image.image_id

- if self.region_name != self.SOURCE_IMAGE_REGION and self.image is None:
- LOGGER.info("Copying %s to %s...\nNote: It can take 5-15 minutes.",
- self.image_name, self.region_name)
- result = self.ec2_client.copy_image( # pylint: disable=no-member
- Description=self.IMAGE_DESCRIPTION,
- Name=self.image_name,
- SourceImageId=self.source_image.image_id,
- SourceRegion=self.SOURCE_IMAGE_REGION
- )
- LOGGER.info("Image copied, id: '%s'.", result["ImageId"])
- self.tag_image(image_id=result["ImageId"], image_type=ImageType.GENERAL)
- LOGGER.info("Done.")
- else:
- LOGGER.info("No need to copy SCT Runner image since it already exists in '%s'.", self.region_name)
+ def _copy_source_image_to_region(self) -> None:
+ result = self.aws_region.client.copy_image( # pylint: disable=no-member
+ Description=self.IMAGE_DESCRIPTION,
+ Name=self.image_name,
+ SourceImageId=self.source_image.image_id,
+ SourceRegion=self.SOURCE_IMAGE_REGION
+ )
+ LOGGER.info("Image copied, id: `%s'.", result["ImageId"])
+ self.tag_image(image_id=result["ImageId"], image_type=ImageType.GENERAL)

def _get_base_image(self, image=None):
if image is None:
@@ -351,160 +460,138 @@ def _get_base_image(self, image=None):


class GceSctRunner(SctRunner):
- """Provisions and configures the SCT runner on GCE"""
+ """Provision and configure the SCT runner on GCE."""
BASE_IMAGE = "https://www.googleapis.com/compute/v1/projects/ubuntu-os-cloud/global/images/family/ubuntu-2004-lts"
SOURCE_IMAGE_REGION = "us-east1" # where the source Runner image will be created and copied to other regions
+ IMAGE_BUILDER_INSTANCE_TYPE = "e2-standard-2"
+ REGULAR_TEST_INSTANCE_TYPE = "e2-standard-4" # 2 vcpus, 16G
+ LONGTERM_TEST_INSTANCE_TYPE = "e2-standard-2" # 2 vcpus, 8G
+
FAMILY = "sct-runner-image"
- LOGIN_USER = "scylla-test"
+ SCT_NETWORK = "qa-vpc"

- def __init__(self, datacenter: str, availability_zone: str, cloud_provider: str = 'gce'):
- super().__init__(cloud_provider=cloud_provider, region_name=datacenter)
- self.availability_zone = availability_zone
- self.gce_service = get_gce_service(datacenter)
- self.gce_service_source = get_gce_service(self.SOURCE_IMAGE_REGION)
- self._project = self.gce_service.ex_get_project()
- self.project_name = self._project.name
+ def __init__(self, region_name: str, availability_zone: str, cloud_provider: str = "gce"):
+ super().__init__(cloud_provider=cloud_provider, region_name=region_name, availability_zone=availability_zone)
+ self.gce_service = get_gce_service(region=region_name)
+ self.gce_service_source = get_gce_service(region=self.SOURCE_IMAGE_REGION)
+ self.project_name = self.gce_service.ex_get_project().name
+
+ def region_az(self, region_name: str, availability_zone: str) -> str:
+ if availability_zone:
+ return f"{region_name}-{availability_zone}"
+ return region_name

@cached_property
def image_name(self) -> str:
- return f"sct-runner-{str(self.VERSION).replace('.', '-')}"
+ return f"sct-runner-{self.VERSION.replace('.', '-')}"

- @staticmethod
- def instance_type(test_duration) -> str:
- if test_duration > 7 * 60:
- return "e2-standard-4" # 2 vcpus, 16G
- return "e2-standard-2" # 2 vcpus, 8G
+ @cached_property
+ def key_pair(self) -> SSHKey:
+ return KeyStore().get_gce_ssh_key_pair() # scylla-test

- def _get_image_url(self, image_id) -> str:
- return f"https://www.googleapis.com/compute/alpha/projects/{self.project_name}/global/images/{image_id}"
+ def _image(self, image_type: ImageType = ImageType.SOURCE) -> Any:
+ if image_type == ImageType.SOURCE:
+ gce_service = self.gce_service_source
+ elif image_type == ImageType.GENERAL:
+ gce_service = self.gce_service
+ else:
+ raise ValueError("Unknown Image type")
+ try:
+ return gce_service.ex_get_image(self.image_name)
+ except GoogleResourceNotFoundError:
+ return None

@staticmethod
- @lru_cache(maxsize=None)
- def key_pair() -> SSHKey:
- ks = KeyStore()
- return ks.get_gce_ssh_key_pair() # scylla-test
+ def tags_to_labels(tags: dict[str, str]) -> dict[str, str]:
+ return {key.lower(): value.lower().replace(".", "_") for key, value in tags.items()}

- def create_image(self) -> None:
- """
- Create an Image for SCT Runner in specified region. If the Image exists in SOURCE_REGION
- it will be copied to the destination region.
- Warning: this can't run in parallel!
- """
- LOGGER.info("Looking for source SCT Runner Image in %s ...", self.SOURCE_IMAGE_REGION)
- source_image = self.source_image
- if not source_image:
- # GCE doesn't allow repeat name in multiple datacenter
- instance_name = f"{self.image_name}-builder-{self.SOURCE_IMAGE_REGION}"
- LOGGER.info("Source SCT Runner Image not found. Creating...")
- if self.availability_zone != "":
- region_az = f"{self.SOURCE_IMAGE_REGION}-{self.availability_zone}"
- else:
- region_az = self.SOURCE_IMAGE_REGION
- lt_datetime = datetime.datetime.now(tz=pytz.utc)
- instance = self._create_instance(
- instance_type="e2-standard-2",
- base_image=self.BASE_IMAGE,
- tags_list=[{"Key": "Name", "Value": "sct-image-builder"},
- {"Key": "keep", "Value": "1"},
- {"Key": "keep_action", "Value": "terminate"},
- {"Key": "Version", "Value": str(self.VERSION)},
- {"Key": "launch_time", "Value": lt_datetime.strftime("%B %d, %Y, %H:%M:%S")},
- ],
- instance_name=instance_name,
- region_az=region_az
- )
- time.sleep(30) # wait until the public ips are available.
- self.install_prereqs(public_ip=instance.public_ips[0], connect_timeout=120)
+ # pylint: disable=too-many-arguments
+ def _create_instance(self,
+ instance_type: str,
+ base_image: Any,
+ tags: dict[str, str],
+ instance_name: str,
+ region_az: str = "",
+ test_duration: Optional[int] = None) -> Any:
+ LOGGER.info("Creating instance...")
+ if region_az.startswith(self.SOURCE_IMAGE_REGION):
+ gce_service = self.gce_service_source
+ else:
+ gce_service = self.gce_service
+ instance = gce_service.create_node(
+ name=instance_name,
+ size=instance_type,
+ image=base_image,
+ ex_network=self.SCT_NETWORK,
+ ex_disks_gce_struct=[{
+ "type": "PERSISTENT",
+ "deviceName": f"{instance_name}-root-pd-ssd",
+ "initializeParams": {
+ "diskName": f"{instance_name}-root-pd-ssd",
+ "diskType": f"projects/{self.project_name}/zones/{gce_service.zone.name}/diskTypes/pd-ssd",
+ "diskSizeGb": self.instance_root_disk_size(test_duration),
+ "sourceImage": base_image,
+ },
+ "boot": True,
+ "autoDelete": True,
+ }],
+ ex_metadata=tags | {
+ "launch_time": get_current_datetime_formatted(),
+ "block-project-ssh-keys": "true",
+ "ssh-keys": f"{self.LOGIN_USER}:{self.key_pair.public_key.decode()}",
+ },
+ )
+ time.sleep(30) # wait until the public IPs are available.

- LOGGER.info("Stopping the SCT Image Builder instance...")
- self.gce_service_source.ex_stop_node(instance)
- LOGGER.info("SCT Image Builder instance stopped.\nCreating image...")
- source_volume = self.gce_service_source.ex_get_volume(f"{instance_name}-root-pd-ssd")
- new_image = self.gce_service_source.ex_create_image(self.image_name,
- source_volume,
- description=self.IMAGE_DESCRIPTION,
- family=self.FAMILY,
- ex_labels={"name": self.image_name,
- "version": str(self.VERSION).replace('.', '_')})
- try:
- LOGGER.info("Terminating image builder instance '%s'...", instance.id)
- self.gce_service_source.destroy_node(instance)
- except Exception as ex: # pylint: disable=broad-except
- LOGGER.warning("Was not able to terminate '%s': %s\n"
- "Please terminate manually!!!", instance.id, str(ex))
+ LOGGER.info("Got public IP: %s", instance.public_ips[0])

- else:
- LOGGER.info("SCT Runner image exists in the source region '%s'! "
- "ID: %s", self.SOURCE_IMAGE_REGION, source_image.id)
+ return instance

- if self.region_name != self.SOURCE_IMAGE_REGION and self.image is None:
- LOGGER.info("Copying %s to %s ...\nNote: It can take 5-15 minutes.", self.image_name, self.region_name)
- new_image = self.gce_service.ex_copy_image( # pylint: disable=no-member
- self.image_name,
- self._get_image_url(self.source_image.id),
- description=self.IMAGE_DESCRIPTION,
- family=self.FAMILY
- )
- LOGGER.info("Image copied, id: '%s'.", new_image.id)
- LOGGER.info("Done.")
- else:
- LOGGER.info("No need to copy SCT Runner image since it's available for '%s'.", self.region_name)
+ def _stop_image_builder_instance(self, instance: Any) -> None:
+ self.gce_service_source.ex_stop_node(node=instance)

- def _get_base_image(self, image=None):
- """
- GCE needs image object in creating instance
- """
+ def _terminate_image_builder_instance(self, instance: Any) -> None:
+ self.gce_service_source.destroy_node(instance)
+
+ def _get_instance_id(self, instance: Any) -> Any:
+ return instance.id
+
+ def get_instance_public_ip(self, instance: Any) -> str:
+ return instance.public_ips[0]
+
+ def _create_image(self, instance: Any) -> Any:
+ return self.gce_service_source.ex_create_image(
+ name=self.image_name,
+ volume=self.gce_service_source.ex_get_volume(f"{instance.name}-root-pd-ssd"),
+ description=self.IMAGE_DESCRIPTION,
+ family=self.FAMILY,
+ ex_labels=self.tags_to_labels(tags=self.image_tags),
+ )
+
+ def _get_image_id(self, image: Any) -> Any:
+ return image.id
+
+ def _get_image_url(self, image_id) -> str:
+ return f"https://www.googleapis.com/compute/alpha/projects/{self.project_name}/global/images/{image_id}"
+
+ def _copy_source_image_to_region(self) -> None:
+ image = self.gce_service.ex_copy_image(
+ self.image_name,
+ self._get_image_url(self.source_image.id),
+ description=self.IMAGE_DESCRIPTION,
+ family=self.FAMILY,
+ )
+ LOGGER.info("Image copied, id: `%s'.", image.id)
+
+ def _get_base_image(self, image: Optional[Any] = None) -> Any:
if image is None:
image = self.image
return self._get_image_url(image.id)

- # pylint: disable=too-many-arguments
- def _create_instance(self, instance_type, base_image, tags_list, instance_name=None, region_az="", test_duration=None):
- if instance_name is None:
- instance_name = f"{self.image_name}-instance"
- ex_disks_gce_struct = [{"type": "PERSISTENT",
- "deviceName": f"{instance_name}-root-pd-ssd",
- "initializeParams": {
- "diskName": f"{instance_name}-root-pd-ssd",
- "diskType": f"projects/{self.project_name}/zones/{self.gce_service_source.zone.name}/diskTypes/pd-ssd",
- "diskSizeGb": self.instance_root_disk_size(test_duration),
- "sourceImage": base_image,
- },
- "boot": True,
- "autoDelete": True},
- ]
- labels = {}
- metadata = {}
- for tag_dict in tags_list:
- if tag_dict['Key'] != 'launch_time':
- labels[tag_dict['Key'].lower()] = str(tag_dict['Value']).lower().replace('.', '_')
- metadata[tag_dict['Key']] = tag_dict['Value']
- LOGGER.debug("Create node (%s) by image (%s)", instance_name, base_image)
- return self.gce_service_source.create_node(name=instance_name, size=instance_type,
- image=base_image,
- ex_network='qa-vpc',
- ex_disks_gce_struct=ex_disks_gce_struct,
- ex_labels=labels,
- ex_metadata=metadata)
-
- def _image(self, image_type=ImageType.SOURCE):
- if image_type == ImageType.SOURCE:
- driver = self.gce_service_source
- elif image_type == ImageType.GENERAL:
- driver = self.gce_service
- else:
- raise ValueError("Unknown Image type")
-
- try:
- return driver.ex_get_image(self.image_name)
- except ResourceNotFoundError as ex: # pylint: disable=unused-variable
- return None
-

-if __name__ == "__main__":
- TEST_REGION = "eu-west-2"
- TEST_ZONE = "a"
- SCT_RUNNER = AwsSctRunner(region_name=TEST_REGION, availability_zone=TEST_ZONE)
- SCT_RUNNER.create_image()
- SCT_RUNNER.create_instance(
- instance_type="", test_id="byakabuka", test_duration=60, region_az=f"{TEST_REGION}{TEST_ZONE}")
+def get_sct_runner(cloud_provider: str, region_name: str, availability_zone: str = "") -> SctRunner:
+ if cloud_provider == "aws":
+ return AwsSctRunner(region_name=region_name, availability_zone=availability_zone)
+ if cloud_provider == "gce":
+ return GceSctRunner(region_name=region_name, availability_zone=availability_zone)
+ raise Exception(f'Unsupported Cloud provider: `{cloud_provider}')
Reply all
Reply to author
Forward
0 new messages