One downside of Raft’s leader election algorithm is that a server that
has been partitioned from the cluster is likely to cause a disruption
when it regains connectivity. When a server is partitioned, it will
not receive heartbeats. It will soon increment its term to start
an election, although it won’t be able to collect enough votes to
become leader. When the server regains connectivity sometime later, its
larger term number will propagate to the rest of the cluster (either
through the server’s RequestVote requests or through its AppendEntries
response). This will force the cluster leader to step down, and a new
election will have to take place to select a new leader.
Prevoting stage is addressing that. In the Prevote algorithm, a
candidate only increments its term if it first learns from a majority of
the cluster that they would be willing to grant the candidate their votes
(if the candidate’s log is sufficiently up-to-date, and the voters have
not received heartbeats from a valid leader for at least a baseline
election timeout).
The Prevote algorithm solves the issue of a partitioned server disrupting
the cluster when it rejoins. While a server is partitioned, it won’t
be able to increment its term, since it can’t receive permission
from a majority of the cluster. Then, when it rejoins the cluster, it
still won’t be able to increment its term, since the other servers
will have been receiving regular heartbeats from the leader. Once the
server receives a heartbeat from the leader itself, it will return to
the follower state(in the same term).
---
No unit tests yet, but judging by the amount of problems replicate_test
was able to find it stresses the algorithm quite nicely.
diff --git a/idl/raft.idl.hh b/idl/raft.idl.hh
index 97f26ad82a..5517dbf295 100644
--- a/idl/raft.idl.hh
+++ b/idl/raft.idl.hh
@@ -62,11 +62,13 @@ struct vote_request {
raft::internal::tagged_uint64<raft::term_tag> current_term;
raft::internal::tagged_uint64<raft::index_tag> last_log_idx;
raft::internal::tagged_uint64<raft::term_tag> last_log_term;
+ bool prevote;
};
struct vote_reply {
raft::internal::tagged_uint64<raft::term_tag> current_term;
bool vote_granted;
+ bool prevote;
};
struct install_snapshot {
diff --git a/raft/fsm.hh b/raft/fsm.hh
index 17fb348284..a45f5bff97 100644
--- a/raft/fsm.hh
+++ b/raft/fsm.hh
@@ -48,6 +48,8 @@ struct fsm_config {
// is configured by the snapshot, otherwise the state
// machine will deadlock.
size_t max_log_length;
+ // If set to true will enable prevoting stage during election
+ bool enable_prevoting;
};
// 3.4 Leader election
@@ -69,6 +71,7 @@ static constexpr logical_clock::duration ELECTION_TIMEOUT = logical_clock::durat
// leader.
class follower {};
class candidate {};
+class prevote {};
class leader {};
// Raft protocol finite state machine
@@ -110,7 +113,7 @@ class fsm {
// id of the current leader
server_id _current_leader;
// What state the server is in. The default is follower.
- std::variant<follower, candidate, leader> _state;
+ std::variant<follower, candidate, leader, prevote> _state;
// _current_term, _voted_for && _log are persisted in persistence
// The latest term the server has seen.
term_t _current_term;
@@ -221,10 +224,14 @@ class fsm {
}
}
+ void campain(bool prevote);
+
void become_leader();
void become_candidate();
+ void become_prevote();
+
void become_follower(server_id leader);
// Controls whether the follower has been responsive recently,
@@ -250,6 +257,8 @@ class fsm {
// Tick implementation on a leader
void tick_leader();
+ void reset_election_timeout();
+
public:
explicit fsm(server_id id, term_t current_term, server_id voted_for, log log,
failure_detector& failure_detector, fsm_config conf);
@@ -263,6 +272,9 @@ class fsm {
bool is_candidate() const {
return std::holds_alternative<candidate>(_state);
}
+ bool is_prevote() const {
+ return std::holds_alternative<prevote>(_state);
+ }
index_t log_last_idx() const {
return _log.last_idx();
}
@@ -336,6 +348,24 @@ class fsm {
friend std::ostream& operator<<(std::ostream& os, const fsm& f);
};
+namespace internal {
+static inline bool need_update_term(const vote_request& r) {
+ return !r.prevote;
+}
+static inline bool need_update_term(const vote_reply& r) {
+ // We send pre-vote requests with a term in our future. If the
+ // pre-vote is granted, we will increment our term when we get a
+ // quorum. If it is not, the term comes from the node that
+ // rejected our vote so we should become a follower at the new
+ // term.
+ return !r.prevote || !r.vote_granted;
+}
+template<typename T>
+static inline bool need_update_term(const T& r) {
+ return true;
+}
+}
+
template <typename Message>
void fsm::step(server_id from, Message&& msg) {
static_assert(std::is_rvalue_reference<decltype(msg)>::value, "must be rvalue");
@@ -353,11 +383,13 @@ void fsm::step(server_id from, Message&& msg) {
// follower state. If a server receives a request with
// a stale term number, it rejects the request.
if (msg.current_term > _current_term) {
+ server_id leader{};
+
logger.trace("{} [term: {}] received a message with higher term from {} [term: {}]",
_my_id, _current_term, from, msg.current_term);
if constexpr (std::is_same_v<Message, append_request>) {
- become_follower(from);
+ leader = from;
} else {
if constexpr (std::is_same_v<Message, vote_request>) {
if (_current_leader != server_id{} && election_elapsed() < ELECTION_TIMEOUT) {
@@ -366,15 +398,17 @@ void fsm::step(server_id from, Message&& msg) {
// within the minimum election timeout of
// hearing from a current leader, it does not
// update its term or grant its vote.
- logger.trace("{} [term: {}] not granting a vote within a minimum election timeout, elapsed {}",
- _my_id, _current_term, election_elapsed());
+ logger.trace("{} [term: {}] not granting a vote within a minimum election timeout, elapsed {} (current leader = {})",
+ _my_id, _current_term, election_elapsed(), _current_leader);
return;
}
}
- become_follower(server_id{});
}
- update_current_term(msg.current_term);
-
+ // Do not update term or become follower on prevote
+ if (internal::need_update_term(msg)) {
+ become_follower(leader);
+ update_current_term(msg.current_term);
+ }
} else if (msg.current_term < _current_term) {
if constexpr (std::is_same_v<Message, append_request>) {
// Instructs the leader to step down.
@@ -392,7 +426,7 @@ void fsm::step(server_id from, Message&& msg) {
} else /* _current_term == msg.current_term */ {
if constexpr (std::is_same_v<Message, append_request> ||
std::is_same_v<Message, install_snapshot>) {
- if (is_candidate()) {
+ if (is_candidate() || is_prevote()) {
// 3.4 Leader Election
// While waiting for votes, a candidate may receive an AppendEntries
// RPC from another server claiming to be leader. If the
@@ -427,8 +461,8 @@ void fsm::step(server_id from, Message&& msg) {
} else if constexpr (std::is_same_v<Message, vote_request>) {
request_vote(from, std::move(msg));
} else if constexpr (std::is_same_v<Message, vote_reply>) {
- if constexpr (!std::is_same_v<State, candidate>) {
- // Ignore stray reply if we're not a candidate.
+ if constexpr (!std::is_same_v<State, candidate> && !std::is_same_v<State, prevote>) {
+ // Ignore stray reply if we're not a candidate or prevote
return;
}
request_vote_reply(from, std::move(msg));
diff --git a/raft/raft.hh b/raft/raft.hh
index 79c1b594aa..4524bdc666 100644
--- a/raft/raft.hh
+++ b/raft/raft.hh
@@ -259,6 +259,8 @@ struct vote_request {
index_t last_log_idx;
// The term of the candidate's last log entry.
term_t last_log_term;
+ // True if this is prevote request
+ bool prevote;
};
struct vote_reply {
@@ -266,6 +268,8 @@ struct vote_reply {
term_t current_term;
// True means the candidate received a vote.
bool vote_granted;
+ // True if it is a reply to prevote request
+ bool prevote;
};
struct install_snapshot {
diff --git a/raft/server.hh b/raft/server.hh
index 2be5156c8d..7c47ea677f 100644
--- a/raft/server.hh
+++ b/raft/server.hh
@@ -46,6 +46,8 @@ class server {
// is configured by the snapshot, otherwise the state
// machine will deadlock on attempt to submit a new entry.
size_t max_log_length = 5000;
+ // If set to true will enable prevoting stage during election
+ bool enable_prevoting = true;
};
virtual ~server() {}
diff --git a/raft/fsm.cc b/raft/fsm.cc
index fb55875b20..4f38413ecc 100644
--- a/raft/fsm.cc
+++ b/raft/fsm.cc
@@ -118,13 +118,11 @@ void fsm::update_current_term(term_t current_term)
assert(_current_term < current_term);
_current_term = current_term;
_voted_for = server_id{};
+}
+void fsm::reset_election_timeout() {
static thread_local std::default_random_engine re{std::random_device{}()};
static thread_local std::uniform_int_distribution<> dist(1, ELECTION_TIMEOUT.count());
- // Reset the randomized election timeout on each term
- // change, even if we do not plan to campaign during this
- // term: the main purpose of the timeout is to avoid
- // starting our campaign simultaneously with other followers.
_randomized_election_timeout = ELECTION_TIMEOUT + logical_clock::duration{dist(re)};
}
@@ -138,6 +136,7 @@ void fsm::become_leader() {
_log_limiter_semaphore.emplace(this);
_log_limiter_semaphore->sem.consume(_log.length());
_last_election_time = _clock.now();
+ reset_election_timeout();
// a new leader needs to commit at lease one entry to make sure that
// all existing entries in its log are commited as well. Also it should
// send append entries rpc as soon as possible to establish its leqdership
@@ -155,13 +154,20 @@ void fsm::become_follower(server_id leader) {
_tracker = std::nullopt;
_log_limiter_semaphore = std::nullopt;
_votes = std::nullopt;
+ reset_election_timeout();
if (_current_leader) {
_last_election_time = _clock.now();
}
}
-void fsm::become_candidate() {
- _state = candidate{};
+void fsm::campain(bool do_prevote) {
+ // When starting a campain we need to reset current leader otherwise
+ // distruptive server prevention will stall an election if qurum of nodes
+ // start election together since each one will ignore vote requests from others
+ _current_leader = {};
+
+ reset_election_timeout();
+
_tracker = std::nullopt;
_log_limiter_semaphore = std::nullopt;
// 3.4 Leader election
@@ -177,34 +183,55 @@ void fsm::become_candidate() {
_votes.emplace(_log.get_configuration());
const auto& voters = _votes->voters();
- if (voters.find(server_address{_my_id}) == voters.end()) {
+ if (!voters.contains(server_address{_my_id})) {
// If the server is not part of the current configuration,
// revert to the follower state without increasing
// the current term.
become_follower(server_id{});
return;
}
- update_current_term(term_t{_current_term + 1});
+
+ term_t term{_current_term + 1};
+ if (!do_prevote) {
+ update_current_term(term);
+ }
// Replicate RequestVote
for (const auto& server : voters) {
if (
server.id == _my_id) {
// Vote for self.
_votes->register_vote(
server.id, true);
- _voted_for = _my_id;
+ if (!do_prevote) {
+ // Only record real votes
+ _voted_for = _my_id;
+ }
// Already signaled _sm_events in update_current_term()
continue;
}
- logger.trace("{} [term: {}, index: {}, last log term: {}] sent vote request to {}",
- _my_id, _current_term, _log.last_idx(), _log.last_term(),
server.id);
+ logger.trace("{} [term: {}, index: {}, last log term: {}{}] sent vote request to {}",
+ _my_id, term, _log.last_idx(), _log.last_term(), do_prevote ? ", prevote" : "",
server.id);
- send_to(
server.id, vote_request{_current_term, _log.last_idx(), _log.last_term()});
+ send_to(
server.id, vote_request{term, _log.last_idx(), _log.last_term(), do_prevote});
}
if (_votes->tally_votes() == vote_result::WON) {
// A single node cluster.
- become_leader();
+ if (!do_prevote) {
+ become_leader();
+ } else {
+ become_candidate();
+ }
}
}
+void fsm::become_candidate() {
+ _state = candidate{};
+ campain(false);
+}
+
+void fsm::become_prevote() {
+ _state = prevote{};
+ campain(true);
+}
+
future<fsm_output> fsm::poll_output() {
logger.trace("fsm::poll_output() {} stable index: {} last index: {}",
_my_id, _log.stable_idx(), _log.last_idx());
@@ -406,7 +433,11 @@ void fsm::tick() {
} else if (is_past_election_timeout()) {
logger.trace("tick[{}]: becoming a candidate, last election: {}, now: {}", _my_id,
_last_election_time, _clock.now());
- become_candidate();
+ if (_config.enable_prevoting) {
+ become_prevote();
+ } else {
+ become_candidate();
+ }
}
}
@@ -535,13 +566,15 @@ void fsm::request_vote(server_id from, vote_request&& request) {
// We can cast a vote in any state. If the candidate's term is
// lower than ours, we ignore the request. Otherwise we first
// update our current term and convert to a follower.
- assert(_current_term == request.current_term);
+ assert(request.prevote || _current_term == request.current_term);
bool can_vote =
// We can vote if this is a repeat of a vote we've already cast...
_voted_for == from ||
// ...we haven't voted and we don't think there's a leader yet in this term...
- (_voted_for == server_id{} && _current_leader == server_id{});
+ (_voted_for == server_id{} && _current_leader == server_id{}) ||
+ // ...this is prevote for a future term...
+ (request.prevote && request.current_term > _current_term);
// ...and we believe the candidate is up to date.
if (can_vote && _log.is_up_to_date(request.last_log_idx, request.last_log_term)) {
@@ -550,28 +583,38 @@ void fsm::request_vote(server_id from, vote_request&& request) {
"voted for {} [log_term: {}, log_index: {}]",
_my_id, _current_term, _log.last_idx(), _log.last_term(), _voted_for,
from, request.last_log_term, request.last_log_idx);
- // If a server grants a vote, it must reset its election
- // timer. See Raft Summary.
- _last_election_time = _clock.now();
- _voted_for = from;
-
- send_to(from, vote_reply{_current_term, true});
+ if (!request.prevote) { // Only record real votes
+ // If a server grants a vote, it must reset its election
+ // timer. See Raft Summary.
+ _last_election_time = _clock.now();
+ _voted_for = from;
+ }
+ // When responding to Msg{Pre,}Vote messages we include the term
+ // from the message, not the local term. To see why, consider the
+ // case where a single node was previously partitioned away and
+ // it's local term is now out of date. If we include the local term
+ // (recall that for pre-votes we don't update the local term), the
+ // (pre-)campaigning node on the other end will proceed to ignore
+ // the message (it ignores all out of date messages).
+ // The term in the original message and current local term are the
+ // same in the case of regular votes, but different for pre-votes.
+ send_to(from, vote_reply{request.current_term, true, request.prevote});
} else {
// If a vote is not granted, this server is a potential
// viable candidate, so it should not reset its election
// timer, to avoid election disruption by non-viable
// candidates.
logger.trace("{} [term: {}, index: {}, log_term: {}, voted_for: {}] "
- "rejected vote for {} [log_term: {}, log_index: {}]",
+ "rejected vote for {} [current_term: {}, log_term: {}, log_index: {}, prevote: {}]",
_my_id, _current_term, _log.last_idx(), _log.last_term(), _voted_for,
- from, request.last_log_term, request.last_log_idx);
+ from, request.current_term, request.last_log_term, request.last_log_idx, request.prevote);
- send_to(from, vote_reply{_current_term, false});
+ send_to(from, vote_reply{_current_term, false, request.prevote});
}
}
void fsm::request_vote_reply(server_id from, vote_reply&& reply) {
- assert(is_candidate());
+ assert(is_candidate() || is_prevote());
logger.trace("{} received a {} vote from {}", _my_id, reply.vote_granted ? "yes" : "no", from);
@@ -581,7 +624,11 @@ void fsm::request_vote_reply(server_id from, vote_reply&& reply) {
case vote_result::UNKNOWN:
break;
case vote_result::WON:
- become_leader();
+ if (is_candidate()) {
+ become_leader();
+ } else {
+ become_candidate();
+ }
break;
case vote_result::LOST:
become_follower(server_id{});
diff --git a/raft/server.cc b/raft/server.cc
index 38ecb59e75..f4927faf81 100644
--- a/raft/server.cc
+++ b/raft/server.cc
@@ -198,7 +198,8 @@ future<> server_impl::start() {
_fsm = std::make_unique<fsm>(_id, term, vote, std::move(log), *_failure_detector,
fsm_config {
.append_request_threshold = _config.append_request_threshold,
- .max_log_length = _config.max_log_length
+ .max_log_length = _config.max_log_length,
+ .enable_prevoting = _config.enable_prevoting
});
assert(_fsm->get_current_term() != term_t(0));
@@ -604,7 +605,7 @@ void server_impl::register_metrics() {
}
future<> server_impl::elect_me_leader() {
- while (!_fsm->is_candidate() && !_fsm->is_leader()) {
+ while (_fsm->is_follower()) {
_fsm->tick();
}
do {
--
Gleb.