From: Lukasz Sojka <lukasz...@scylladb.com>
Committer: Israel Fruchter <israel....@gmail.com>
Branch: branch-6.0
fix(validator): fix various issues with sstable scrub validator
From time to time we hit issues with the sstable scrub validator, such as
delayed log messages or validation running on a node that has not finished bootstrapping.
Fixes: https://github.com/scylladb/scylla-cluster-tests/issues/7440
(cherry picked from commit 8fe625d3dd9210fe342496ad2c25273accad5073)
---
diff --git a/sdcm/teardown_validators/sstables.py b/sdcm/teardown_validators/sstables.py
--- a/sdcm/teardown_validators/sstables.py
+++ b/sdcm/teardown_validators/sstables.py
@@ -1,7 +1,9 @@
import logging
from functools import partial
+from sdcm import wait
from sdcm.cluster import BaseNode
+from sdcm.exceptions import WaitForTimeoutError
from sdcm.sct_events import Severity
from sdcm.sct_events.teardown_validators import ValidatorEvent, ScrubValidationErrorEvent
from sdcm.teardown_validators.base import TeardownValidator
@@ -31,7 +33,12 @@ def _upload_corrupted_files(self, node: BaseNode, quarantine_log_lines):
return s3_link
def _run_nodetool_scrub(self, node: BaseNode, keyspace: str, table: str, timeout=1200):
- node.wait_db_up(timeout=300)
+ try:
+ node.wait_db_up(timeout=300)
+ except WaitForTimeoutError as ex:
+ # sometimes node can boot very long after last nemesis (e.g. bootstrap new node).
+ LOGGER.error("Error waiting for node %s to be up in sstable validator: %s\nskipping validation",
node.name, ex)
+ return
finish_scrub_follower = node.follow_system_log(patterns=['Finished scrubbing in validate mode'])
quarantine_lines = node.follow_system_log(patterns=['sstable - Moving sstable'], start_from_beginning=True)
result = node.run_nodetool(sub_cmd='scrub', args=f"--mode VALIDATE --no-snapshot {keyspace} {table}".strip(),
@@ -40,7 +47,10 @@ def _run_nodetool_scrub(self, node: BaseNode, keyspace: str, table: str, timeout
ValidatorEvent(
message=f'Error running nodetool scrub on node {
node.name}: {result.stdout}\n{result.stderr}',
severity=Severity.ERROR).publish()
- scrub_finish_lines = list(finish_scrub_follower)
+ # sometimes logs might be delayed, so we need to wait for them
+ scrub_finish_lines = wait.wait_for(func=lambda: list(finish_scrub_follower), step=10,
+ text="Waiting for 'Finished scrubbing in validate mode' logs",
+ timeout=300, throw_exc=False)
if not scrub_finish_lines:
ValidatorEvent(
message=f'No scrubbing validation message found in db logs on node: {
node.name}', severity=Severity.ERROR).publish()