From: Kamil Braun <kbr...@scylladb.com>
Committer: Kamil Braun <kbr...@scylladb.com>
Branch: next
service/raft: raft_group0: introduce `setup_group0`
Contains all logic for deciding to join (or not join) group 0.
Prepare for the case where we don't want to join group 0 immediately on
startup - the upgrade scenario (will be implemented in a follow-up).
Move the group 0 setup step earlier in `storage_service::join_cluster`.
`join_group0()` is now a private member of `raft_group0`. Some more
comments were written.
---
diff --git a/service/raft/raft_group0.cc b/service/raft/raft_group0.cc
--- a/service/raft/raft_group0.cc
+++ b/service/raft/raft_group0.cc
@@ -240,12 +240,7 @@ future<> raft_group0::start_server_for_group0(raft::group_id group0_id) {
future<> raft_group0::join_group0() {
assert(this_shard_id() == 0);
- // do nothing either if raft group registry is not enabled or we've already
- // finished joining some existing group0, so that subsequent calls
- // to the function are safe.
- if (!_raft_gr.is_enabled() || std::holds_alternative<raft::group_id>(_group0)) {
- co_return;
- }
+ assert(!joined_group0());
auto group0_id = raft::group_id{co_await db::system_keyspace::get_raft_group0_id()};
if (group0_id) {
@@ -307,10 +302,41 @@ future<> raft_group0::join_group0() {
rslog.info("{} joined group 0 with id {}",
my_addr.id, group0_id);
}
-future<> raft_group0::become_voter() {
+future<> raft_group0::setup_group0(db::system_keyspace& sys_ks) { // Startup step: skip, restart the existing group 0 server, or join group 0, depending on local state.
+ assert(this_shard_id() == 0); // Must run on shard 0.
+
if (!_raft_gr.is_enabled()) {
+
rslog.info("raft_group0::setup_group0(): local RAFT feature disabled, skipping group 0 setup.");
+ // Note: if the local feature was enabled by every node earlier, that would enable the cluster
+ // SUPPORTS_RAFT feature, and the node should then refuse to start during feature check
+ // (because if the local feature is disabled, then the cluster feature - enabled in the cluster - is 'unknown' to us).
+ co_return; // Raft registry disabled: nothing is started or joined.
+ }
+
+ if (sys_ks.bootstrap_complete()) { // This node has bootstrapped before, so this is a restart (or an upgrade).
+ auto group0_id = raft::group_id{co_await db::system_keyspace::get_raft_group0_id()}; // Presumably the group 0 ID persisted by an earlier successful join; null if none.
+ if (group0_id) {
+ // Group 0 ID is present => we've already joined group 0 earlier.
+
rslog.info("raft_group0::setup_group0(): group 0 ID present. Starting existing Raft server.");
+ co_await start_server_for_group0(group0_id); // Only starts the server; does not attempt to join again.
+ } else {
+ // Scylla has bootstrapped earlier but group 0 ID not present. This means we're upgrading.
+ // TODO: Prepare for upgrade.
+ }
+
co_return; // An already-bootstrapped node never falls through to `join_group0()` below.
}
+
+
rslog.info("raft_group0::setup_group0(): joining group 0...");
+ co_await join_group0(); // Bootstrap path: `join_group0()` asserts that `joined_group0()` is still false.
+
rslog.info("raft_group0::setup_group0(): successfully joined group 0.");
+}
+
+future<> raft_group0::become_voter() {
+ if (!_raft_gr.is_enabled() || !joined_group0()) {
+ co_return;
+ }
+
auto my_addr = co_await load_my_addr();
assert(std::holds_alternative<raft::group_id>(_group0));
auto& gid = std::get<raft::group_id>(_group0);
@@ -361,6 +387,10 @@ future<> raft_group0::leave_group0(std::optional<gms::inet_address> node) {
co_return co_await ser::group0_rpc_verbs::send_group0_modify_config(&_ms, peer, timeout, g0_info.group0_id, {}, {remove_addr});
}
+bool raft_group0::joined_group0() const { // True iff the `_group0` variant currently holds a `raft::group_id`.
+ return std::holds_alternative<raft::group_id>(_group0);
+}
+
future<group0_peer_exchange> raft_group0::peer_exchange(discovery::peer_list peers) {
return std::visit([this, peers = std::move(peers)] (auto&& d) mutable -> future<group0_peer_exchange> {
using T = std::decay_t<decltype(d)>;
diff --git a/service/raft/raft_group0.hh b/service/raft/raft_group0.hh
--- a/service/raft/raft_group0.hh
+++ b/service/raft/raft_group0.hh
@@ -12,6 +12,8 @@
namespace cql3 { class query_processor; }
+namespace db { class system_keyspace; }
+
namespace gms { class gossiper; }
namespace service {
@@ -87,14 +89,20 @@ public:
future<> abort();
- // Join this node to the cluster-wide Raft group
- // Called during bootstrap. Is idempotent - it
- // does nothing if already done, or resumes from the
- // unifinished state if aborted. The result is that
- // raft service has group 0 running.
- future<> join_group0();
+ // Call during the startup procedure.
+ //
+ // If the local RAFT feature is enabled, does one of the following:
+ // - join group 0 (if we're bootstrapping),
+ // - start existing group 0 server (if we bootstrapped before),
+ // - (TODO: not implemented yet) prepare us for the upgrade procedure, which will create group 0 later.
+ //
+ // Cannot be called twice.
+ //
+ // TODO: specify dependencies on other services: where during startup should we set up group 0?
+ future<> setup_group0(db::system_keyspace&);
// After successful bootstrapping, make this node a voting member.
+ // Precondition: `setup_group0` successfully finished earlier.
future<> become_voter();
// Remove the node from the cluster-wide raft group.
@@ -107,6 +115,8 @@ private:
void init_rpc_verbs();
future<> uninit_rpc_verbs();
+ bool joined_group0() const;
+
// Handle peer_exchange RPC
future<group0_peer_exchange> peer_exchange(discovery::peer_list peers);
@@ -133,11 +143,28 @@ private:
// See 'raft-in-scylla.md', 'Establishing group 0 in a fresh cluster'.
future<group0_info> discover_group0(raft::server_address my_addr);
+ // Start a Raft server for the cluster-wide group 0 and join it to the group.
+ // Called during bootstrap or upgrade.
+ //
+ // The Raft server may already exist on disk (e.g. because we initialized it earlier but then crashed),
+ // but it doesn't have to. It may also already be part of group 0, but if not, we will attempt
+ // to discover and join it.
+ //
+ // Persists group 0 ID on disk at the end so subsequent restarts of Scylla process can detect that group 0
+ // has already been joined and the server initialized.
+ //
+ // Preconditions: Raft local feature enabled
+ // and we haven't initialized group 0 yet after last Scylla start (`joined_group0()` is false).
+ // Postcondition: `joined_group0()` is true.
+ future<> join_group0();
+
// Start an existing Raft server for the cluster-wide group 0.
// Assumes the server was already added to the group earlier so we don't attempt to join it again.
//
- // `group0_id` must be non-null and equal to the ID of group 0 that we joined earlier.
- // The existing group 0 server must not have been started yet after the last restart of Scylla.
+ // Preconditions: `group0_id` must be non-null and equal to the ID of group 0 that we joined earlier.
+ // The existing group 0 server must not have been started yet after the last restart of Scylla
+ // (`joined_group0()` is false).
+ // Postcondition: `joined_group0()` is true.
//
// XXX: perhaps it would be good to make this function callable multiple times,
// if we want to handle crashes of the group 0 server without crashing the entire Scylla process
diff --git a/service/storage_service.cc b/service/storage_service.cc
--- a/service/storage_service.cc
+++ b/service/storage_service.cc
@@ -389,6 +389,10 @@ future<> storage_service::join_token_ring(cdc::generation_service& cdc_gen_servi
auto broadcast_rpc_address = utils::fb_utilities::get_broadcast_rpc_address();
// Ensure we know our own actual Schema UUID in preparation for updates
co_await db::schema_tables::recalculate_schema_version(_sys_ks, proxy, _feature_service);
+
+ assert(_group0);
+ co_await _group0->setup_group0(_sys_ks.local());
+
app_states.emplace(gms::application_state::NET_VERSION, versioned_value::network_version());
app_states.emplace(gms::application_state::HOST_ID, versioned_value::host_id(local_host_id));
app_states.emplace(gms::application_state::RPC_ADDRESS, versioned_value::rpcaddress(broadcast_rpc_address));
@@ -424,18 +428,6 @@ future<> storage_service::join_token_ring(cdc::generation_service& cdc_gen_servi
auto advertise = gms::advertise_myself(!replacing_a_node_with_same_ip);
co_await _gossiper.start_gossiping(generation_number, app_states, advertise);
- // Raft group0 can be joined before we wait for gossip to settle
- // if one of the following applies:
- // - it's a fresh node start (in a fresh cluster)
- // - it's a restart of an existing node, which have already joined some group0
- const bool can_join_with_raft =
- _db.local().get_config().check_experimental(db::experimental_features_t::feature::RAFT) && (
- _sys_ks.local().bootstrap_needed() ||
- !(co_await _sys_ks.local().get_raft_group0_id()).is_null());
- if (can_join_with_raft) {
- co_await _group0->join_group0();
- }
-
auto schema_change_announce = _db.local().observable_schema_version().observe([this] (utils::UUID schema_version) mutable {
_migration_manager.local().passive_announce(std::move(schema_version));
});
@@ -444,8 +436,6 @@ future<> storage_service::join_token_ring(cdc::generation_service& cdc_gen_servi
set_mode(mode::JOINING);
- co_await _group0->join_group0();
-
// We bootstrap if we haven't successfully bootstrapped before, as long as we are not a seed.
// If we are a seed, or if the user manually sets auto_bootstrap to false,
// we'll skip streaming data from other nodes and jump directly into the ring.