group0: stop group0 before draining storage service during shutdown
Currently the storage service is drained while group0 is still active.
Draining stops the commitlogs, so no more writes are possible after this
point. If group0 is still active, it may try to apply commands that
perform writes; those writes fail and cause group0 state machine errors.
This is benign since we are shutting down anyway, but it is better to
fix the shutdown order to keep the logs clean.
Fixes scylladb/scylladb#19665
---
CI:
https://jenkins.scylladb.com/job/scylla-master/job/scylla-ci/11365/
diff --git a/main.cc b/main.cc
index 9aab704749..36ac65f07f 100644
--- a/main.cc
+++ b/main.cc
@@ -1915,11 +1915,6 @@ To start the scylla server proper, simply invoke as: scylla server (or just scyl
ss.local().uninit_address_map().get();
});
- // Need to make sure storage service does not use group0 before running group0_service.abort()
- auto stop_group0_usage_in_storage_service = defer_verbose_shutdown("group 0 usage in local storage", [&ss] {
- ss.local().wait_for_group0_stop().get();
- });
-
// Setup group0 early in case the node is bootstrapped already and the group exists.
// Need to do it before allowing incoming messaging service connections since
// storage proxy's and migration manager's verbs may access group0.
diff --git a/service/storage_service.cc b/service/storage_service.cc
index ffbc04d2e1..e676c6c5ff 100644
--- a/service/storage_service.cc
+++ b/service/storage_service.cc
@@ -1086,12 +1086,15 @@ future<> storage_service::sstable_cleanup_fiber(raft::server& server, sharded<se
break;
}
rtlogger.debug("cleanup flag cleared");
- } catch (const seastar::abort_requested_exception &) {
+ } catch (const seastar::abort_requested_exception&) {
rtlogger.info("cleanup fiber aborted");
break;
} catch (raft::request_aborted&) {
rtlogger.info("cleanup fiber aborted");
break;
+ } catch (const seastar::broken_condition_variable&) {
+ rtlogger.info("cleanup fiber aborted");
+ break;
} catch (...) {
rtlogger.error("cleanup fiber got an error: {}", std::current_exception());
err = true;
@@ -4582,6 +4585,8 @@ future<> storage_service::drain() {
future<> storage_service::do_drain() {
co_await stop_transport();
+ co_await wait_for_group0_stop();
+
co_await tracing::tracing::tracing_instance().invoke_on_all(&tracing::tracing::shutdown);
co_await get_batchlog_manager().invoke_on_all([] (auto& bm) {
--
Gleb.