[COMMIT scylla-cluster-tests branch-6.0] fix(validator): fix various issues with sstable scrub validator

0 views
Skip to first unread message

Commit Bot

<bot@cloudius-systems.com>
unread,
Jun 28, 2024, 1:38:23 AM
to scylladb-dev@googlegroups.com, Lukasz Sojka
From: Lukasz Sojka <lukasz...@scylladb.com>
Committer: Israel Fruchter <israel....@gmail.com>
Branch: branch-6.0

fix(validator): fix various issues with sstable scrub validator

From time to time we hit issues with the sstable scrub validator, such as
delayed logs or validation running on a node that has not finished bootstrapping.

fixes: https://github.com/scylladb/scylla-cluster-tests/issues/7440
(cherry picked from commit 8fe625d3dd9210fe342496ad2c25273accad5073)

---
diff --git a/sdcm/teardown_validators/sstables.py b/sdcm/teardown_validators/sstables.py
--- a/sdcm/teardown_validators/sstables.py
+++ b/sdcm/teardown_validators/sstables.py
@@ -1,7 +1,9 @@
import logging
from functools import partial

+from sdcm import wait
from sdcm.cluster import BaseNode
+from sdcm.exceptions import WaitForTimeoutError
from sdcm.sct_events import Severity
from sdcm.sct_events.teardown_validators import ValidatorEvent, ScrubValidationErrorEvent
from sdcm.teardown_validators.base import TeardownValidator
@@ -31,7 +33,12 @@ def _upload_corrupted_files(self, node: BaseNode, quarantine_log_lines):
return s3_link

def _run_nodetool_scrub(self, node: BaseNode, keyspace: str, table: str, timeout=1200):
- node.wait_db_up(timeout=300)
+ try:
+ node.wait_db_up(timeout=300)
+ except WaitForTimeoutError as ex:
+ # sometimes node can boot very long after last nemesis (e.g. bootstrap new node).
+ LOGGER.error("Error waiting for node %s to be up in sstable validator: %s\nskipping validation", node.name, ex)
+ return
finish_scrub_follower = node.follow_system_log(patterns=['Finished scrubbing in validate mode'])
quarantine_lines = node.follow_system_log(patterns=['sstable - Moving sstable'], start_from_beginning=True)
result = node.run_nodetool(sub_cmd='scrub', args=f"--mode VALIDATE --no-snapshot {keyspace} {table}".strip(),
@@ -40,7 +47,10 @@ def _run_nodetool_scrub(self, node: BaseNode, keyspace: str, table: str, timeout
ValidatorEvent(
message=f'Error running nodetool scrub on node {node.name}: {result.stdout}\n{result.stderr}',
severity=Severity.ERROR).publish()
- scrub_finish_lines = list(finish_scrub_follower)
+ # sometimes logs might be delayed, so we need to wait for them
+ scrub_finish_lines = wait.wait_for(func=lambda: list(finish_scrub_follower), step=10,
+ text="Waiting for 'Finished scrubbing in validate mode' logs",
+ timeout=300, throw_exc=False)
if not scrub_finish_lines:
ValidatorEvent(
message=f'No scrubbing validation message found in db logs on node: {node.name}', severity=Severity.ERROR).publish()
Reply all
Reply to author
Forward
0 new messages