[PATCH] raft: implement prevoting stage in leader election

425 views
Skip to first unread message

Gleb Natapov

<gleb@scylladb.com>
unread,
Feb 25, 2021, 10:43:29 AM2/25/21
to scylladb-dev@googlegroups.com
One downside of Raft’s leader election algorithm is that a server that
has been partitioned from the cluster is likely to cause a disruption
when it regains connectivity. When a server is partitioned, it will
not receive heartbeats. It will soon increment its term to start
an election, although it won’t be able to collect enough votes to
become leader. When the server regains connectivity sometime later, its
larger term number will propagate to the rest of the cluster (either
through the server’s RequestVote requests or through its AppendEntries
response). This will force the cluster leader to step down, and a new
election will have to take place to select a new leader.

Prevoting stage is addressing that. In the Prevote algorithm, a
candidate only increments its term if it first learns from a majority of
the cluster that they would be willing to grant the candidate their votes
(if the candidate’s log is sufficiently up-to-date, and the voters have
not received heartbeats from a valid leader for at least a baseline
election timeout).

The Prevote algorithm solves the issue of a partitioned server disrupting
the cluster when it rejoins. While a server is partitioned, it won’t
be able to increment its term, since it can’t receive permission
from a majority of the cluster. Then, when it rejoins the cluster, it
still won’t be able to increment its term, since the other servers
will have been receiving regular heartbeats from the leader. Once the
server receives a heartbeat from the leader itself, it will return to
the follower state (in the same term).

---

No unit tests yet, but judging by the number of problems replicate_test
was able to find, it stresses the algorithm quite nicely.

diff --git a/idl/raft.idl.hh b/idl/raft.idl.hh
index 97f26ad82a..5517dbf295 100644
--- a/idl/raft.idl.hh
+++ b/idl/raft.idl.hh
@@ -62,11 +62,13 @@ struct vote_request {
raft::internal::tagged_uint64<raft::term_tag> current_term;
raft::internal::tagged_uint64<raft::index_tag> last_log_idx;
raft::internal::tagged_uint64<raft::term_tag> last_log_term;
+ bool prevote;
};

struct vote_reply {
raft::internal::tagged_uint64<raft::term_tag> current_term;
bool vote_granted;
+ bool prevote;
};

struct install_snapshot {
diff --git a/raft/fsm.hh b/raft/fsm.hh
index 17fb348284..a45f5bff97 100644
--- a/raft/fsm.hh
+++ b/raft/fsm.hh
@@ -48,6 +48,8 @@ struct fsm_config {
// is configured by the snapshot, otherwise the state
// machine will deadlock.
size_t max_log_length;
+ // If set to true will enable prevoting stage during election
+ bool enable_prevoting;
};

// 3.4 Leader election
@@ -69,6 +71,7 @@ static constexpr logical_clock::duration ELECTION_TIMEOUT = logical_clock::durat
// leader.
class follower {};
class candidate {};
+class prevote {};
class leader {};

// Raft protocol finite state machine
@@ -110,7 +113,7 @@ class fsm {
// id of the current leader
server_id _current_leader;
// What state the server is in. The default is follower.
- std::variant<follower, candidate, leader> _state;
+ std::variant<follower, candidate, leader, prevote> _state;
// _current_term, _voted_for && _log are persisted in persistence
// The latest term the server has seen.
term_t _current_term;
@@ -221,10 +224,14 @@ class fsm {
}
}

+ void campain(bool prevote);
+
void become_leader();

void become_candidate();

+ void become_prevote();
+
void become_follower(server_id leader);

// Controls whether the follower has been responsive recently,
@@ -250,6 +257,8 @@ class fsm {
// Tick implementation on a leader
void tick_leader();

+ void reset_election_timeout();
+
public:
explicit fsm(server_id id, term_t current_term, server_id voted_for, log log,
failure_detector& failure_detector, fsm_config conf);
@@ -263,6 +272,9 @@ class fsm {
bool is_candidate() const {
return std::holds_alternative<candidate>(_state);
}
+ bool is_prevote() const {
+ return std::holds_alternative<prevote>(_state);
+ }
index_t log_last_idx() const {
return _log.last_idx();
}
@@ -336,6 +348,24 @@ class fsm {
friend std::ostream& operator<<(std::ostream& os, const fsm& f);
};

+namespace internal {
+static inline bool need_update_term(const vote_request& r) {
+ return !r.prevote;
+}
+static inline bool need_update_term(const vote_reply& r) {
+ // We send pre-vote requests with a term in our future. If the
+ // pre-vote is granted, we will increment our term when we get a
+ // quorum. If it is not, the term comes from the node that
+ // rejected our vote so we should become a follower at the new
+ // term.
+ return !r.prevote || !r.vote_granted;
+}
+template<typename T>
+static inline bool need_update_term(const T& r) {
+ return true;
+}
+}
+
template <typename Message>
void fsm::step(server_id from, Message&& msg) {
static_assert(std::is_rvalue_reference<decltype(msg)>::value, "must be rvalue");
@@ -353,11 +383,13 @@ void fsm::step(server_id from, Message&& msg) {
// follower state. If a server receives a request with
// a stale term number, it rejects the request.
if (msg.current_term > _current_term) {
+ server_id leader{};
+
logger.trace("{} [term: {}] received a message with higher term from {} [term: {}]",
_my_id, _current_term, from, msg.current_term);

if constexpr (std::is_same_v<Message, append_request>) {
- become_follower(from);
+ leader = from;
} else {
if constexpr (std::is_same_v<Message, vote_request>) {
if (_current_leader != server_id{} && election_elapsed() < ELECTION_TIMEOUT) {
@@ -366,15 +398,17 @@ void fsm::step(server_id from, Message&& msg) {
// within the minimum election timeout of
// hearing from a current leader, it does not
// update its term or grant its vote.
- logger.trace("{} [term: {}] not granting a vote within a minimum election timeout, elapsed {}",
- _my_id, _current_term, election_elapsed());
+ logger.trace("{} [term: {}] not granting a vote within a minimum election timeout, elapsed {} (current leader = {})",
+ _my_id, _current_term, election_elapsed(), _current_leader);
return;
}
}
- become_follower(server_id{});
}
- update_current_term(msg.current_term);
-
+ // Do not update term or become follower on prevote
+ if (internal::need_update_term(msg)) {
+ become_follower(leader);
+ update_current_term(msg.current_term);
+ }
} else if (msg.current_term < _current_term) {
if constexpr (std::is_same_v<Message, append_request>) {
// Instructs the leader to step down.
@@ -392,7 +426,7 @@ void fsm::step(server_id from, Message&& msg) {
} else /* _current_term == msg.current_term */ {
if constexpr (std::is_same_v<Message, append_request> ||
std::is_same_v<Message, install_snapshot>) {
- if (is_candidate()) {
+ if (is_candidate() || is_prevote()) {
// 3.4 Leader Election
// While waiting for votes, a candidate may receive an AppendEntries
// RPC from another server claiming to be leader. If the
@@ -427,8 +461,8 @@ void fsm::step(server_id from, Message&& msg) {
} else if constexpr (std::is_same_v<Message, vote_request>) {
request_vote(from, std::move(msg));
} else if constexpr (std::is_same_v<Message, vote_reply>) {
- if constexpr (!std::is_same_v<State, candidate>) {
- // Ignore stray reply if we're not a candidate.
+ if constexpr (!std::is_same_v<State, candidate> && !std::is_same_v<State, prevote>) {
+ // Ignore stray reply if we're not a candidate or prevote
return;
}
request_vote_reply(from, std::move(msg));
diff --git a/raft/raft.hh b/raft/raft.hh
index 79c1b594aa..4524bdc666 100644
--- a/raft/raft.hh
+++ b/raft/raft.hh
@@ -259,6 +259,8 @@ struct vote_request {
index_t last_log_idx;
// The term of the candidate's last log entry.
term_t last_log_term;
+ // True if this is prevote request
+ bool prevote;
};

struct vote_reply {
@@ -266,6 +268,8 @@ struct vote_reply {
term_t current_term;
// True means the candidate received a vote.
bool vote_granted;
+ // True if it is a reply to prevote request
+ bool prevote;
};

struct install_snapshot {
diff --git a/raft/server.hh b/raft/server.hh
index 2be5156c8d..7c47ea677f 100644
--- a/raft/server.hh
+++ b/raft/server.hh
@@ -46,6 +46,8 @@ class server {
// is configured by the snapshot, otherwise the state
// machine will deadlock on attempt to submit a new entry.
size_t max_log_length = 5000;
+ // If set to true will enable prevoting stage during election
+ bool enable_prevoting = true;
};

virtual ~server() {}
diff --git a/raft/fsm.cc b/raft/fsm.cc
index fb55875b20..4f38413ecc 100644
--- a/raft/fsm.cc
+++ b/raft/fsm.cc
@@ -118,13 +118,11 @@ void fsm::update_current_term(term_t current_term)
assert(_current_term < current_term);
_current_term = current_term;
_voted_for = server_id{};
+}

+void fsm::reset_election_timeout() {
static thread_local std::default_random_engine re{std::random_device{}()};
static thread_local std::uniform_int_distribution<> dist(1, ELECTION_TIMEOUT.count());
- // Reset the randomized election timeout on each term
- // change, even if we do not plan to campaign during this
- // term: the main purpose of the timeout is to avoid
- // starting our campaign simultaneously with other followers.
_randomized_election_timeout = ELECTION_TIMEOUT + logical_clock::duration{dist(re)};
}

@@ -138,6 +136,7 @@ void fsm::become_leader() {
_log_limiter_semaphore.emplace(this);
_log_limiter_semaphore->sem.consume(_log.length());
_last_election_time = _clock.now();
+ reset_election_timeout();
// a new leader needs to commit at lease one entry to make sure that
// all existing entries in its log are commited as well. Also it should
// send append entries rpc as soon as possible to establish its leqdership
@@ -155,13 +154,20 @@ void fsm::become_follower(server_id leader) {
_tracker = std::nullopt;
_log_limiter_semaphore = std::nullopt;
_votes = std::nullopt;
+ reset_election_timeout();
if (_current_leader) {
_last_election_time = _clock.now();
}
}

-void fsm::become_candidate() {
- _state = candidate{};
+void fsm::campain(bool do_prevote) {
+ // When starting a campain we need to reset current leader otherwise
+ // distruptive server prevention will stall an election if qurum of nodes
+ // start election together since each one will ignore vote requests from others
+ _current_leader = {};
+
+ reset_election_timeout();
+
_tracker = std::nullopt;
_log_limiter_semaphore = std::nullopt;
// 3.4 Leader election
@@ -177,34 +183,55 @@ void fsm::become_candidate() {
_votes.emplace(_log.get_configuration());

const auto& voters = _votes->voters();
- if (voters.find(server_address{_my_id}) == voters.end()) {
+ if (!voters.contains(server_address{_my_id})) {
// If the server is not part of the current configuration,
// revert to the follower state without increasing
// the current term.
become_follower(server_id{});
return;
}
- update_current_term(term_t{_current_term + 1});
+
+ term_t term{_current_term + 1};
+ if (!do_prevote) {
+ update_current_term(term);
+ }
// Replicate RequestVote
for (const auto& server : voters) {
if (server.id == _my_id) {
// Vote for self.
_votes->register_vote(server.id, true);
- _voted_for = _my_id;
+ if (!do_prevote) {
+ // Only record real votes
+ _voted_for = _my_id;
+ }
// Already signaled _sm_events in update_current_term()
continue;
}
- logger.trace("{} [term: {}, index: {}, last log term: {}] sent vote request to {}",
- _my_id, _current_term, _log.last_idx(), _log.last_term(), server.id);
+ logger.trace("{} [term: {}, index: {}, last log term: {}{}] sent vote request to {}",
+ _my_id, term, _log.last_idx(), _log.last_term(), do_prevote ? ", prevote" : "", server.id);

- send_to(server.id, vote_request{_current_term, _log.last_idx(), _log.last_term()});
+ send_to(server.id, vote_request{term, _log.last_idx(), _log.last_term(), do_prevote});
}
if (_votes->tally_votes() == vote_result::WON) {
// A single node cluster.
- become_leader();
+ if (!do_prevote) {
+ become_leader();
+ } else {
+ become_candidate();
+ }
}
}

+void fsm::become_candidate() {
+ _state = candidate{};
+ campain(false);
+}
+
+void fsm::become_prevote() {
+ _state = prevote{};
+ campain(true);
+}
+
future<fsm_output> fsm::poll_output() {
logger.trace("fsm::poll_output() {} stable index: {} last index: {}",
_my_id, _log.stable_idx(), _log.last_idx());
@@ -406,7 +433,11 @@ void fsm::tick() {
} else if (is_past_election_timeout()) {
logger.trace("tick[{}]: becoming a candidate, last election: {}, now: {}", _my_id,
_last_election_time, _clock.now());
- become_candidate();
+ if (_config.enable_prevoting) {
+ become_prevote();
+ } else {
+ become_candidate();
+ }
}
}

@@ -535,13 +566,15 @@ void fsm::request_vote(server_id from, vote_request&& request) {
// We can cast a vote in any state. If the candidate's term is
// lower than ours, we ignore the request. Otherwise we first
// update our current term and convert to a follower.
- assert(_current_term == request.current_term);
+ assert(request.prevote || _current_term == request.current_term);

bool can_vote =
// We can vote if this is a repeat of a vote we've already cast...
_voted_for == from ||
// ...we haven't voted and we don't think there's a leader yet in this term...
- (_voted_for == server_id{} && _current_leader == server_id{});
+ (_voted_for == server_id{} && _current_leader == server_id{}) ||
+ // ...this is prevote for a future term...
+ (request.prevote && request.current_term > _current_term);

// ...and we believe the candidate is up to date.
if (can_vote && _log.is_up_to_date(request.last_log_idx, request.last_log_term)) {
@@ -550,28 +583,38 @@ void fsm::request_vote(server_id from, vote_request&& request) {
"voted for {} [log_term: {}, log_index: {}]",
_my_id, _current_term, _log.last_idx(), _log.last_term(), _voted_for,
from, request.last_log_term, request.last_log_idx);
- // If a server grants a vote, it must reset its election
- // timer. See Raft Summary.
- _last_election_time = _clock.now();
- _voted_for = from;
-
- send_to(from, vote_reply{_current_term, true});
+ if (!request.prevote) { // Only record real votes
+ // If a server grants a vote, it must reset its election
+ // timer. See Raft Summary.
+ _last_election_time = _clock.now();
+ _voted_for = from;
+ }
+ // When responding to Msg{Pre,}Vote messages we include the term
+ // from the message, not the local term. To see why, consider the
+ // case where a single node was previously partitioned away and
+ // it's local term is now out of date. If we include the local term
+ // (recall that for pre-votes we don't update the local term), the
+ // (pre-)campaigning node on the other end will proceed to ignore
+ // the message (it ignores all out of date messages).
+ // The term in the original message and current local term are the
+ // same in the case of regular votes, but different for pre-votes.
+ send_to(from, vote_reply{request.current_term, true, request.prevote});
} else {
// If a vote is not granted, this server is a potential
// viable candidate, so it should not reset its election
// timer, to avoid election disruption by non-viable
// candidates.
logger.trace("{} [term: {}, index: {}, log_term: {}, voted_for: {}] "
- "rejected vote for {} [log_term: {}, log_index: {}]",
+ "rejected vote for {} [current_term: {}, log_term: {}, log_index: {}, prevote: {}]",
_my_id, _current_term, _log.last_idx(), _log.last_term(), _voted_for,
- from, request.last_log_term, request.last_log_idx);
+ from, request.current_term, request.last_log_term, request.last_log_idx, request.prevote);

- send_to(from, vote_reply{_current_term, false});
+ send_to(from, vote_reply{_current_term, false, request.prevote});
}
}

void fsm::request_vote_reply(server_id from, vote_reply&& reply) {
- assert(is_candidate());
+ assert(is_candidate() || is_prevote());

logger.trace("{} received a {} vote from {}", _my_id, reply.vote_granted ? "yes" : "no", from);

@@ -581,7 +624,11 @@ void fsm::request_vote_reply(server_id from, vote_reply&& reply) {
case vote_result::UNKNOWN:
break;
case vote_result::WON:
- become_leader();
+ if (is_candidate()) {
+ become_leader();
+ } else {
+ become_candidate();
+ }
break;
case vote_result::LOST:
become_follower(server_id{});
diff --git a/raft/server.cc b/raft/server.cc
index 38ecb59e75..f4927faf81 100644
--- a/raft/server.cc
+++ b/raft/server.cc
@@ -198,7 +198,8 @@ future<> server_impl::start() {
_fsm = std::make_unique<fsm>(_id, term, vote, std::move(log), *_failure_detector,
fsm_config {
.append_request_threshold = _config.append_request_threshold,
- .max_log_length = _config.max_log_length
+ .max_log_length = _config.max_log_length,
+ .enable_prevoting = _config.enable_prevoting
});
assert(_fsm->get_current_term() != term_t(0));

@@ -604,7 +605,7 @@ void server_impl::register_metrics() {
}

future<> server_impl::elect_me_leader() {
- while (!_fsm->is_candidate() && !_fsm->is_leader()) {
+ while (_fsm->is_follower()) {
_fsm->tick();
}
do {
--
Gleb.

Konstantin Osipov

<kostja@scylladb.com>
unread,
Feb 25, 2021, 2:35:37 PM2/25/21
to Gleb Natapov, scylladb-dev@googlegroups.com
* Gleb Natapov <gl...@scylladb.com> [21/02/25 18:45]:
> One downside of Raft’s leader election algorithm is that a server that
> has been partitioned from the cluster is likely to cause a disruption
> when it regains connectivity. When a server is partitioned, it will
> not receive heartbeats. It will soon increment its term to start
> an election, although it won’t be able to collect enough votes to
> become leader. When the server regains connectivity sometime later, its
> larger term number will propagate to the rest of the cluster (either
> through the server’s RequestVote requests or through its AppendEntries
> response). This will force the cluster leader to step down, and a new
> election will have to take place to select a new leader.

The circumstances will be a bit different because of sticky
leadership: upon receiving a RequestVote request within Election
Timeout of hearing from a stable leader, a server does not update
its term and denies the vote (4.2.3 of the paper).

However, at some point AppendEntries from the leader will reach
the troublemaker server with higher term, and this server will
respond with AppendReply "Rejected" and its higher term.

Upon receiving AppendReply "Rejected" with a higher term, the
leader will switch to the follower. An election will ensue, in
which the old leader will very likely win, but it could be avoided
completely indeed.

> --- a/idl/raft.idl.hh
> +++ b/idl/raft.idl.hh
> @@ -62,11 +62,13 @@ struct vote_request {
> raft::internal::tagged_uint64<raft::term_tag> current_term;
> raft::internal::tagged_uint64<raft::index_tag> last_log_idx;
> raft::internal::tagged_uint64<raft::term_tag> last_log_term;
> + bool prevote;

I think it's better to use a separate request type for reasons
described below.

I also don't quite see why you need a separate pre-vote state.
One option is to add std::optional<votes> to the follower state. Its
presence would indicate pre-voting round in progress, and it could
be cleared on every incoming RPC different from PreVoteReply.
Now we need to keep track of an extra state in a lot of places.

> diff --git a/raft/fsm.cc b/raft/fsm.cc
> index fb55875b20..4f38413ecc 100644
> --- a/raft/fsm.cc
> +++ b/raft/fsm.cc
> @@ -118,13 +118,11 @@ void fsm::update_current_term(term_t current_term)
> assert(_current_term < current_term);
> _current_term = current_term;
> _voted_for = server_id{};
> +}
>
> +void fsm::reset_election_timeout() {
> static thread_local std::default_random_engine re{std::random_device{}()};
> static thread_local std::uniform_int_distribution<> dist(1, ELECTION_TIMEOUT.count());
> - // Reset the randomized election timeout on each term
> - // change, even if we do not plan to campaign during this
> - // term: the main purpose of the timeout is to avoid
> - // starting our campaign simultaneously with other followers.
> _randomized_election_timeout = ELECTION_TIMEOUT + logical_clock::duration{dist(re)};
> }

I don't understand the change and the destruction of the comment.

It would be nice to put into a separate commit with an own
rationale.

> -void fsm::become_candidate() {
> - _state = candidate{};
> +void fsm::campain(bool do_prevote) {
> + // When starting a campain we need to reset current leader otherwise
> + // distruptive server prevention will stall an election if qurum of nodes
> + // start election together since each one will ignore vote requests from others

It seems the circumstances described in this comment are the exact
consequence of tweaking the reset of election timeout. There are
other reasons why a concurrent campaign may bottleneck, so it's
unclear how the change helps.
E.g. each candidate will vote for self to begin with. If there is
a reason to reset the current leader, I don't understand it from
reading the comment.

> + _current_leader = {};
> +
> + reset_election_timeout();
> +
> _tracker = std::nullopt;
> _log_limiter_semaphore = std::nullopt;
> // 3.4 Leader election
> @@ -177,34 +183,55 @@ void fsm::become_candidate() {
> _votes.emplace(_log.get_configuration());
>
> const auto& voters = _votes->voters();
> - if (voters.find(server_address{_my_id}) == voters.end()) {
> + if (!voters.contains(server_address{_my_id})) {

This is an unrelated change. I personally don't care, but it seems
to be a common habit to punish for unrelated changes during reviews :)

> // If the server is not part of the current configuration,
> // revert to the follower state without increasing
> // the current term.
> become_follower(server_id{});
> return;
> }
> - update_current_term(term_t{_current_term + 1});
> +
> + term_t term{_current_term + 1};

I wonder if we should play this game with using the term from the future
similarly to etcd, or rather avoid incrementing the current term
when receiving the pre-vote message. We can treat PreVoteRequest message
explicitly before looking and acting upon msg.current_term in fsm::step().
Looks like become_candidate() became more complex, and also
campaign() is ridden with do_prevote checks. The code could look
more clean if become_candidate() is left as is and prevote() was
doing the parts it needs explicitly.

> + if (!request.prevote) { // Only record real votes
> + // If a server grants a vote, it must reset its election
> + // timer. See Raft Summary.
> + _last_election_time = _clock.now();
> + _voted_for = from;
> + }
> + // When responding to Msg{Pre,}Vote messages we include the term
> + // from the message, not the local term. To see why, consider the
> + // case where a single node was previously partitioned away and
> + // it's local term is now out of date. If we include the local term
> + // (recall that for pre-votes we don't update the local term), the
> + // (pre-)campaigning node on the other end will proceed to ignore
> + // the message (it ignores all out of date messages).
> + // The term in the original message and current local term are the
> + // same in the case of regular votes, but different for pre-votes.
> + send_to(from, vote_reply{request.current_term, true, request.prevote});

Here as well it seems that following etcd impl makes the
existing logic harder to grasp. Apparently in etcd adding pre-voting
happened after the fact. If pre-votes requests are handled separately
in fsm::step() then the logic to respond to them could be in own
request_prevote(), clear and explicit.

Using an own PreVoteRequest and PreVoteReply RPC also seems to be
safer than re-using an existing message.

--
Konstantin Osipov, Moscow, Russia
https://scylladb.com

Gleb Natapov

<gleb@scylladb.com>
unread,
Feb 26, 2021, 2:32:46 AM2/26/21
to Konstantin Osipov, scylladb-dev@googlegroups.com
On Thu, Feb 25, 2021 at 10:35:33PM +0300, Konstantin Osipov wrote:
> * Gleb Natapov <gl...@scylladb.com> [21/02/25 18:45]:
> > One downside of Raft’s leader election algorithm is that a server that
> > has been partitioned from the cluster is likely to cause a disruption
> > when it regains connectivity. When a server is partitioned, it will
> > not receive heartbeats. It will soon increment its term to start
> > an election, although it won’t be able to collect enough votes to
> > become leader. When the server regains connectivity sometime later, its
> > larger term number will propagate to the rest of the cluster (either
> > through the server’s RequestVote requests or through its AppendEntries
> > response). This will force the cluster leader to step down, and a new
> > election will have to take place to select a new leader.
>
> The circumstances will be a bit different because of sticky
> leadership: upon receiving a RequestVote request within Election
> Timeout of hearing from a stable leader a server does not request
> its term and denies the vote (4.2.3 of the paper).
>
> However, at some point AppendEntries from the leader will reach
> the troublemaker server with higher term, and this server will
> respond with AppendReply "Rejected" and its higher term.
>
That is exactly what is written above :) I can drop the RequestVote
part, but the commit message is a verbatim copy of the PhD, and I prefer it to
stay that way. Who knows, maybe one day we will make sticky leadership
optional.

> Upon receiving AppendReply "Rejected" with a higher term, the
> leader will switch to the follower. An election will ensue, in
> which the old leader will very likely win, but it could be avoided
> completely indeed.
>
> > --- a/idl/raft.idl.hh
> > +++ b/idl/raft.idl.hh
> > @@ -62,11 +62,13 @@ struct vote_request {
> > raft::internal::tagged_uint64<raft::term_tag> current_term;
> > raft::internal::tagged_uint64<raft::index_tag> last_log_idx;
> > raft::internal::tagged_uint64<raft::term_tag> last_log_term;
> > + bool prevote;
>
> I think it's better to use a separate request type for reasons
> described below.
>
This is how I started. It did not turn out nicer than having a flag
(on the contrary, most checks for vote messages had to be amended)
and required extending the rpc interface by something like 25%.

> I also don't quite see why you need a separate pre-vote state.
> One option is to add std::optional<votes> to the follower state. Its
> presence would indicate pre-voting round in progress, and it could
> be cleared on every incoming RPC different from PreVoteReply.
> Now we need to keep track of an extra state in a lot of places.
All the same places that check the new state will have to check this
std::optional<votes>. It does not necessarily have to be a new state; it
may be a variant of the candidate state indicated by a flag, but definitely
not a follower.

>
> > diff --git a/raft/fsm.cc b/raft/fsm.cc
> > index fb55875b20..4f38413ecc 100644
> > --- a/raft/fsm.cc
> > +++ b/raft/fsm.cc
> > @@ -118,13 +118,11 @@ void fsm::update_current_term(term_t current_term)
> > assert(_current_term < current_term);
> > _current_term = current_term;
> > _voted_for = server_id{};
> > +}
> >
> > +void fsm::reset_election_timeout() {
> > static thread_local std::default_random_engine re{std::random_device{}()};
> > static thread_local std::uniform_int_distribution<> dist(1, ELECTION_TIMEOUT.count());
> > - // Reset the randomized election timeout on each term
> > - // change, even if we do not plan to campaign during this
> > - // term: the main purpose of the timeout is to avoid
> > - // starting our campaign simultaneously with other followers.
> > _randomized_election_timeout = ELECTION_TIMEOUT + logical_clock::duration{dist(re)};
> > }
>
> I don't understand the change and the destruction of the comment.
The comment is no longer true. The election timer is now reset even when
the term does not change, since prevote does not change it.

> It would be nice to put into a separate commit with an own
> rationale.
Without the prevote it is not needed.

>
> > -void fsm::become_candidate() {
> > - _state = candidate{};
> > +void fsm::campain(bool do_prevote) {
> > + // When starting a campain we need to reset current leader otherwise
> > + // distruptive server prevention will stall an election if qurum of nodes
> > + // start election together since each one will ignore vote requests from others
>
> It seems the circumstances described in this comment are the exact
> consequence of tweaking the reset of election timeout. There are
> other reasons why a concurrent campaign may bottleneck, so it's
> unclear how the change helps.
What do you mean by "exact consequence"? Both changes were done because
of an observed situation where two nodes chose election timeout close to
each other and repeatedly encountered the situation described above. I
solved both issues by resetting the leader and randomizing the timeout again
for the next prevote (without term change).

> E.g. each candidate will vote for self to begin with. If there is
> a reason to reset the current leader, I don't understand it from
> reading the comment.
It is possible that for the reason you state here the vote will be
rejected, but that is better than being ignored.

>
> > + _current_leader = {};
> > +
> > + reset_election_timeout();
> > +
> > _tracker = std::nullopt;
> > _log_limiter_semaphore = std::nullopt;
> > // 3.4 Leader election
> > @@ -177,34 +183,55 @@ void fsm::become_candidate() {
> > _votes.emplace(_log.get_configuration());
> >
> > const auto& voters = _votes->voters();
> > - if (voters.find(server_address{_my_id}) == voters.end()) {
> > + if (!voters.contains(server_address{_my_id})) {
>
> This is an unrelated change. I personally don't care, but it seems
> to be a common habit to punish for unrelated changes during reviews :)
And rightly so! One unrelated change may be fine, but let it go and
patches will contain a fix and 100 unrelated changes. Will push it into a
separate patch.

>
> > // If the server is not part of the current configuration,
> > // revert to the follower state without increasing
> > // the current term.
> > become_follower(server_id{});
> > return;
> > }
> > - update_current_term(term_t{_current_term + 1});
> > +
> > + term_t term{_current_term + 1};
>
> I wonder if we should play this game with using the term from the future
> similarly to etcd, or rather avoid incrementing the current term
> when receiving the pre-vote message. We can treat PreVoteRequest message
> explicitly before looking and acting upon msg.current_term in fsm::step().
I do not understand. The whole idea is that the term is not incremented on
the sender; not incrementing it on a receiver is a consequence of that
(and is also done here in this patch).
It would cause a lot of code duplication. The function is pretty large
and only three places check for prevote.

>
> > + if (!request.prevote) { // Only record real votes
> > + // If a server grants a vote, it must reset its election
> > + // timer. See Raft Summary.
> > + _last_election_time = _clock.now();
> > + _voted_for = from;
> > + }
> > + // When responding to Msg{Pre,}Vote messages we include the term
> > + // from the message, not the local term. To see why, consider the
> > + // case where a single node was previously partitioned away and
> > + // it's local term is now out of date. If we include the local term
> > + // (recall that for pre-votes we don't update the local term), the
> > + // (pre-)campaigning node on the other end will proceed to ignore
> > + // the message (it ignores all out of date messages).
> > + // The term in the original message and current local term are the
> > + // same in the case of regular votes, but different for pre-votes.
> > + send_to(from, vote_reply{request.current_term, true, request.prevote});
>
> Here as well it seems that following etcd impl makes the
> existing logic harder to grasp. Apparently in etcd adding pre-voting
> happened after the fact. If pre-votes requests are handled separately
> in fsm::step() then the logic to respond to them could be in own
> request_prevote(), clear and explicit.
It was added no more after-the-fact than in our case. The logic is this way
because it makes sense. I do not see how handling pre-votes separately
helps. You still need the term-checking logic, so you either make things
more complicated or duplicate some parts of the step() logic for pre-vote.
Neither is good.

>
> Using an own PreVoteRequest and PreVoteReply RPC also seems to be
> safer than re-using an existing message.
>
I do not see why. All the same field are needed.

--
Gleb.

Konstantin Osipov

<kostja@scylladb.com>
unread,
Feb 26, 2021, 6:47:23 AM2/26/21
to Gleb Natapov, scylladb-dev@googlegroups.com
* Gleb Natapov <gl...@scylladb.com> [21/02/26 10:35]:
> > > -void fsm::become_candidate() {
> > > - _state = candidate{};
> > > +void fsm::campain(bool do_prevote) {
> > > + // When starting a campain we need to reset current leader otherwise
> > > + // distruptive server prevention will stall an election if qurum of nodes
> > > + // start election together since each one will ignore vote requests from others
> >
> > It seems the circumstances described in this comment are the exact
> > consequence of tweaking the reset of election timeout. There are
> > other reasons why a concurrent campaign may bottleneck, so it's
> > unclear how the change helps.
> What do you mean by "exact consequence"? Both changes were done because
> of an observed situation where two nodes chose election timeout close to
> each other and repeatedly encountered the situation described above. I
> solved both issue by resetting the leader and randomizing timeout again
> for the next prevote (without term change).

I agree sending out pre-vote messages should randomize election
timeout. But I don't see how resetting leader id is of significant
additional value. If two nodes enter pre-vote state
simultaneously, the pre-vote election may indeed end in a race.
The solution to this is revert to/stay the follower until next
expiration of the timeout. Why is resetting leader id so
important?

It may be an artifact of handling PreVoteRequest by the same code
as VoteRequest?

> > > - if (voters.find(server_address{_my_id}) == voters.end()) {
> > > + if (!voters.contains(server_address{_my_id})) {
> >
> > This is an unrelated change. I personally don't care, but it seems
> > to be a common habit to punish for unrelated changes during reviews :)
And rightly so! One unrelated change may be fine, but let it go and
> patches will contain a fix and 100 unrelated changes. Will push into
> separate patch.

Oh well...

> > > // revert to the follower state without increasing
> > > // the current term.
> > > become_follower(server_id{});
> > > return;
> > > }
> > > - update_current_term(term_t{_current_term + 1});
> > > +
> > > + term_t term{_current_term + 1};
> >
> > I wonder if we should play this game with using the term from the future
> > similarly to etcd, or rather avoid incrementing the current term
> > when receiving the pre-vote message. We can treat PreVoteRequest message
> > explicitly before looking and acting upon msg.current_term in fsm::step().
> I do not understand. The whole idea is that the term is not incremented on
> the sender, not incrementing it on a receiver is a consequence of it
> (and also done here in this patch).

I agree. But imagine you handle pre-votes explicitly in fsm::step, even
before you check msg.current_term or act on msg.current_term.
Then you don't care what is the message's term - it can be in
future, in the past, whatever. You respond with your own term,
which will guarantee that the candidate doing a pre-vote will
switch to the latest term possible for the actual election.
Simply setting candidate's term to +1 may be a hit or may be a
miss - the current cluster's term can be +1 or +10 to the
candidate's term, and the candidate doesn't know it.

> > > if (_votes->tally_votes() == vote_result::WON) {
> > > // A single node cluster.
> > > - become_leader();
> > > + if (!do_prevote) {
> > > + become_leader();
> > > + } else {
> > > + become_candidate();
> > > + }
> >
> > Looks like become_candidate() became more complex, and also
> > campaign() is ridden with do_prevote checks. The code could look
> > more clean if become_candidate() is left as is and prevote() was
> > doing the parts it needs explicitly.
> It will cause a lot of code duplication. The function is pretty large
> and only three places check for prevote.

On one hand I agree that anyone then can suggest to merge the two
to avoid code duplication. On the other, it will be easier to see
that PreVote is consistent, that changes in voting do not
impact pre-voting, and vice-versa. I doubt anyone touching this
code will be reckless enough to forget to modify both branches
when needed. Finally, by splitting the code into two, there will
be a chance to extract common helpers, so the duplication will be
limited.
Why do you think it's important to check the term for pre-votes?
On the contrary, since pre-votes are not supposed to have any
impact on the receiver's term, this logic is redundant. We can
safely ignore the term of pre-vote request, not touch our term or
state, and respond with our own term.

> > Using an own PreVoteRequest and PreVoteReply RPC also seems to be
> > safer than re-using an existing message.
> >
> I do not see why. All the same field are needed.

Except bool prevote!-)

What prevents you from inheriting PreVoteRequest from VoteRequest
and PreVoteReply from VoteReply if you would like to avoid code
duplication?

Konstantin Osipov

<kostja@scylladb.com>
unread,
Feb 26, 2021, 7:20:02 AM2/26/21
to Gleb Natapov, scylladb-dev@googlegroups.com
* Konstantin Osipov <kos...@scylladb.com> [21/02/26 14:47]:

BTW, in this paper:

https://www.openlife.cc/sites/default/files/4-modifications-for-Raft-consensus.pdf

PreVoteRequest uses a different message than VoteRequest and different
arguments.

Quoting:

Called by a server before changing itself to Candidate status. If
a majority of servers return true, proceed to Candidate.
Otherwise, wait for another election timeout.

Arguments:

nextTerm caller's term + 1
candidateId caller
lastLogIndex index of caller's last log entry
lastLogTerm term of caller's last log entry

Results:
term currentTerm, for caller to update itself
voteGranted true means caller would receive vote if it
was a candidate

Receiver implementation:

1. Reply false if last AppendEntries call was received
less than election timeout ago (leader stickiness)
2. Reply false if nextTerm < currentTerm
3. If caller's log is at least as up-to-date as receiver's
log, return true

Following the spirit of this description, we could have both
term and next_term in pre-vote rpc. Then pre-vote RPC will never
trigger term arms race.

What concerns me is that messages with old terms are typically
dropped, whereas dropping pre-vote RPC is dangerous - it may lead
to a deadlock/stagnation of voting.


--
Konstantin Osipov, Moscow, Russia
https://scylladb.com

Pekka Enberg

<penberg@scylladb.com>
unread,
Feb 27, 2021, 3:47:03 AM2/27/21
to Gleb Natapov, Konstantin Osipov, scylladb-dev
Hi Gleb,

On Thu, Feb 25, 2021 at 5:43 PM Gleb Natapov <gl...@scylladb.com> wrote:
One downside of Raft’s leader election algorithm is that a server that
has been partitioned from the cluster is likely to cause a disruption
when it regains connectivity.

Does this patch also address the liveness issue outlined in this blog post where the system is unable to establish a stable leader in the presence of partial network failures?


- Pekka 

Konstantin Osipov

<kostja@scylladb.com>
unread,
Feb 27, 2021, 8:29:59 AM2/27/21
to Pekka Enberg, Gleb Natapov, scylladb-dev
* Pekka Enberg <pen...@scylladb.com> [21/02/27 16:22]:
This post is somebody's attempt to hype up. etcd raft has
pre-voting, and this post is written as if it didn't. Yes, with
pre-voting the situation described in this post is impossible.
Which is why I have little trust in what's written in the post.

Pekka Enberg

<penberg@scylladb.com>
unread,
Feb 27, 2021, 9:34:42 AM2/27/21
to Konstantin Osipov, Pekka Enberg, Gleb Natapov, scylladb-dev
Hi Kostja,

AFAICT, pre-voting is not enabled in etcd by default:


The blog post is indeed vague about this, but hardly a reason not to trust anything in that post.

- Pekka 

Konstantin Osipov

<kostja@scylladb.com>
unread,
Feb 27, 2021, 10:06:11 AM2/27/21
to Pekka Enberg, Gleb Natapov, scylladb-dev
* Pekka Enberg <pen...@scylladb.com> [21/02/27 17:38]:
> > > On Thu, Feb 25, 2021 at 5:43 PM Gleb Natapov <gl...@scylladb.com> wrote:
> > >
> > > > One downside of Raft’s leader election algorithm is that a server that
> > > > has been partitioned from the cluster is likely to cause a disruption
> > > > when it regains connectivity.
> > > >
> > >
> > > Does this patch also address the liveness issue outlined in this blog
> > post
> > > where the system is unable to establish a stable leader in the presence
> > of
> > > partial network failures?
> > >
> > >
> > https://decentralizedthoughts.github.io/2020-12-12-raft-liveness-full-omission/
> >
> > This post is somebody's attempt to hype up. etcd raft has
> > pre-voting, and this post is written as if it didn't. Yes, with
> > pre-voting the situation described in this post is impossible.
> > Which is why I have little trust in what's written in the post.
> >
>
> AFAICT, pre-voting is not enabled in etcd by default:
>
> https://twitter.com/alexwritescode/status/1332889824924209154
>
> The blog post is indeed vague about this, but hardly a reason not to trust
> anything in that post.

Raft properties without pre-voting are well studied by Heidi
Howard in 2015, and addressed in the paper, on the raft mailing list
and in many implementations. But Heidi of course
took a chance to mention and promote her work one more time:

https://twitter.com/heidiann360/status/1332711011451867139?lang=en

The entire blog post should have been: ugh, we didn't use
pre-voting. As is, it portrays Raft as having
issues where it doesn't.

Raft main issue in my view is the cost of failure detection.
Configuration changes do not have a clear TLA+ proof. But that
bothers few.

Gleb Natapov

<gleb@scylladb.com>
unread,
Feb 28, 2021, 6:16:36 AM2/28/21
to Konstantin Osipov, scylladb-dev@googlegroups.com
On Fri, Feb 26, 2021 at 02:47:21PM +0300, Konstantin Osipov wrote:
> * Gleb Natapov <gl...@scylladb.com> [21/02/26 10:35]:
> > > > -void fsm::become_candidate() {
> > > > - _state = candidate{};
> > > > +void fsm::campain(bool do_prevote) {
> > > > + // When starting a campain we need to reset current leader otherwise
> > > > + // distruptive server prevention will stall an election if qurum of nodes
> > > > + // start election together since each one will ignore vote requests from others
> > >
> > > It seems the circumstances described in this comment are the exact
> > > consequence of tweaking the reset of election timeout. There are
> > > other reasons why a concurrent campaign may bottleneck, so it's
> > > unclear how the change helps.
> > What do you mean by "exact consequence"? Both changes were done because
> > of an observed situation where two nodes chose election timeout close to
> > each other and repeatedly encountered the situation described above. I
> > solved both issue by resetting the leader and randomizing timeout again
> > for the next prevote (without term change).
>
> I agree sending out pre-vote messages should randomize election
> timeout. But I don't see how resetting leader id is of significant
> additional value. If two nodes enter pre-vote state
> simultaneously, the pre-vote election may indeed end in a race.
> The solution to this is revert to/stay the follower until next
> expiration of the timeout. Why is resetting leader id so
> important?
What do you mean by "so important"? Will it work without? Yes, it
probably will, since exactly the same problem exists during voting and
randomizing the timeout on retry hides it. Why do I think the leader id
should be reset nonetheless? Because it will result in a negative reply
instead of ignoring the message and thus will move the algorithm to the retry faster.

>
> It may be an artifact of handling PreVoteRequest by the same code
> as VoteRequest?
>
Not at all. They are handled by the same code because, except for updating
the term, they do exactly the same thing.


> > > > // revert to the follower state without increasing
> > > > // the current term.
> > > > become_follower(server_id{});
> > > > return;
> > > > }
> > > > - update_current_term(term_t{_current_term + 1});
> > > > +
> > > > + term_t term{_current_term + 1};
> > >
> > > I wonder if we should play this game with using the term from the future
> > > similarly to etcd, or rather avoid incrementing the current term
> > > when receiving the pre-vote message. We can treat PreVoteRequest message
> > > explicitly before looking and acting upon msg.current_term in fsm::step().
> > I do not understand. The whole idea is that the term is not incremented on
> > the sender, not incrementing it on a receiver is a consequence of it
> > (and also done here in this patch).
>
> I agree. But imagine you handle pre-votes explicitly in fsm::step, even
> before you check msg.current_term or act on msg.current_term.
And that logic will consist of checking terms and sending replies.
So what is the value of handling it explicitly instead of handling it
reusing current code. The value of reusing is obviously because the code
is reused.
I do not subscribe to the idea that duplication is fine because nobody
will forget to update both code paths (yes they most definitely will). So
for that reason I am against the duplication. Having them in the same
function also makes it easy to see what is common between both and what
is different, which also helps whoever changes the code to consider
how his changes affect both algorithms. Extracting common parts into a
common function is fine, but I do not see how it can be done here.
What do you mean? We should not grant a vote if prevote term is smaller,
so you need to compare terms obviously. You may avoid comparing term on
preVoteReply and rely on reject status exclusively if you handle it
explicitly, but I do not see how writing a special case for this is
better than just setting the correct term on a reply, which is literally a
one-line change.

Why do we even bother handling all cases in step() anyway? Why not handle
each type of message in its specialized variant of step() and make it
crystal clear how each message is handled? You seem to want us to do it for
prevote but not other messages. I fail to see how prevote is special.

> > > Using an own PreVoteRequest and PreVoteReply RPC also seems to be
> > > safer than re-using an existing message.
> > >
> > I do not see why. All the same field are needed.
>
> Except bool prevote!-)
You either distinguish messages by type or using a flag. In that sense prevote
flag is used by both to tell which message is this.

>
> What prevents you from inheriting PreVoteRequest from VoteRequest
> and PreVoteReply from VoteReply if you would like to avoid code
> duplication?
>
That is what I did (the idl still has to list both). The complexity comes
from dealing with different types, as opposed to a flag, when messages
are handled in step(). Both can be done; the flag is simply easier and more
concise. Not to mention the reduction in the RPC interface.

--
Gleb.

Gleb Natapov

<gleb@scylladb.com>
unread,
Feb 28, 2021, 6:21:26 AM2/28/21
to Konstantin Osipov, scylladb-dev@googlegroups.com
On Fri, Feb 26, 2021 at 03:19:59PM +0300, Konstantin Osipov wrote:
> * Konstantin Osipov <kos...@scylladb.com> [21/02/26 14:47]:
>
> BTW, in this paper:
>
> https://www.openlife.cc/sites/default/files/4-modifications-for-Raft-consensus.pdf
>
> PreVoteRequest uses a different message than VoteRequest and different
> arguments.
>
We are probably reading very different docs, but the one I read has
exactly the same message for PreVoteRequest and VoteRequest. PreVoteRequest
has a different _meaning_ for the "term" field, which holds term + 1 instead
of term, and this is exactly what this patch implements.

--
Gleb.

Gleb Natapov

<gleb@scylladb.com>
unread,
Mar 3, 2021, 9:25:41 AM3/3/21
to scylladb-dev@googlegroups.com
This patcheset adds prevoting stage in leader election algorithm. See
patch 2 for explanation what is it and why it is needed.

v1->v2:
- move cleanups to a separate patch
- add boost tests
- make prevoting a variant of candidate state instead of a separate
state

Gleb Natapov (4):
raft: use modern unordered_set::contains instead of find in
become_candidate
raft: implement prevoting stage in leader election
raft: add boost tests for prevoting
rpc: store leader and candidate state in state variant

idl/raft.idl.hh | 2 +
raft/fsm.hh | 94 +++++++++++++++++----
raft/raft.hh | 4 +
raft/server.hh | 2 +
raft/fsm.cc | 153 ++++++++++++++++++++++-------------
raft/server.cc | 5 +-
test/boost/raft_etcd_test.cc | 2 +-
test/boost/raft_fsm_test.cc | 142 +++++++++++++++++++++++++++++++-
8 files changed, 326 insertions(+), 78 deletions(-)

--
2.29.2

Gleb Natapov

<gleb@scylladb.com>
unread,
Mar 3, 2021, 9:25:42 AM3/3/21
to scylladb-dev@googlegroups.com
One downside of Raft's leader election algorithm is that a server that
has been partitioned from the cluster is likely to cause a disruption
when it regains connectivity. When a server is partitioned, it will
not receive heartbeats. It will soon increment its term to start
an election, although it won't be able to collect enough votes to
become leader. When the server regains connectivity sometime later, its
larger term number will propagate to the rest of the cluster (either
through the server's RequestVote requests or through its AppendEntries
response). This will force the cluster leader to step down, and a new
election will have to take place to select a new leader.

Prevoting stage is addressing that. In the Prevote algorithm, a
candidate only increments its term if it first learns from a majority of
the cluster that they would be willing to grant the candidate their votes
(if the candidate's log is sufficiently up-to-date, and the voters have
not received heartbeats from a valid leader for at least a baseline
election timeout).

The Prevote algorithm solves the issue of a partitioned server disrupting
the cluster when it rejoins. While a server is partitioned, it won't
be able to increment its term, since it can't receive permission
from a majority of the cluster. Then, when it rejoins the cluster, it
still won't be able to increment its term, since the other servers
will have been receiving regular heartbeats from the leader. Once the
server receives a heartbeat from the leader itself, it will return to
the follower state (in the same term).
---
idl/raft.idl.hh | 2 ++
raft/fsm.hh | 51 ++++++++++++++++++++++----
raft/raft.hh | 4 +++
raft/server.hh | 2 ++
raft/tracker.hh | 6 +++-
raft/fsm.cc | 96 ++++++++++++++++++++++++++++++++++++-------------
raft/server.cc | 5 +--
raft/tracker.cc | 4 +--
8 files changed, 134 insertions(+), 36 deletions(-)

diff --git a/idl/raft.idl.hh b/idl/raft.idl.hh
index 97f26ad82a..5517dbf295 100644
--- a/idl/raft.idl.hh
+++ b/idl/raft.idl.hh
@@ -62,11 +62,13 @@ struct vote_request {
raft::internal::tagged_uint64<raft::term_tag> current_term;
raft::internal::tagged_uint64<raft::index_tag> last_log_idx;
raft::internal::tagged_uint64<raft::term_tag> last_log_term;
+ bool prevote;
};

struct vote_reply {
raft::internal::tagged_uint64<raft::term_tag> current_term;
bool vote_granted;
+ bool prevote;
};

struct install_snapshot {
diff --git a/raft/fsm.hh b/raft/fsm.hh
index 17fb348284..3c698534ba 100644
--- a/raft/fsm.hh
+++ b/raft/fsm.hh
@@ -48,6 +48,8 @@ struct fsm_config {
// is configured by the snapshot, otherwise the state
// machine will deadlock.
size_t max_log_length;
+ // If set to true will enable prevoting stage during election
+ bool enable_prevoting;
};

// 3.4 Leader election
@@ -221,10 +223,14 @@ class fsm {
}
}

+ void campain(bool prevote);
+
void become_leader();

void become_candidate();

+ void become_prevote();
+
void become_follower(server_id leader);

// Controls whether the follower has been responsive recently,
@@ -250,6 +256,8 @@ class fsm {
// Tick implementation on a leader
void tick_leader();

+ void reset_election_timeout();
+
public:
explicit fsm(server_id id, term_t current_term, server_id voted_for, log log,
failure_detector& failure_detector, fsm_config conf);
@@ -336,6 +344,31 @@ class fsm {
friend std::ostream& operator<<(std::ostream& os, const fsm& f);
};

+namespace internal {
+static inline bool need_update_term(const vote_request& r) {
+ return !r.prevote;
+}
+static inline bool need_update_term(const vote_reply& r) {
+ // We send pre-vote requests with a term in our future. If the
+ // pre-vote is granted, we will increment our term when we get a
+ // quorum. If it is not, the term comes from the node that
+ // rejected our vote so we should become a follower at the new
+ // term.
+ return !r.prevote || !r.vote_granted;
+}
+template<typename T>
+static inline bool need_update_term(const T& r) {
+ return true;
+}
+static inline bool is_prevote_request(const vote_request& r) {
+ return r.prevote;
+}
+template<typename T>
+static inline bool is_prevote_request(const T& r) {
+ return false;
+}
+}
+
template <typename Message>
void fsm::step(server_id from, Message&& msg) {
static_assert(std::is_rvalue_reference<decltype(msg)>::value, "must be rvalue");
@@ -353,11 +386,13 @@ void fsm::step(server_id from, Message&& msg) {
// follower state. If a server receives a request with
// a stale term number, it rejects the request.
if (msg.current_term > _current_term) {
+ server_id leader{};
+
logger.trace("{} [term: {}] received a message with higher term from {} [term: {}]",
_my_id, _current_term, from, msg.current_term);

if constexpr (std::is_same_v<Message, append_request>) {
- become_follower(from);
+ leader = from;
} else {
if constexpr (std::is_same_v<Message, vote_request>) {
if (_current_leader != server_id{} && election_elapsed() < ELECTION_TIMEOUT) {
@@ -366,15 +401,17 @@ void fsm::step(server_id from, Message&& msg) {
// within the minimum election timeout of
// hearing from a current leader, it does not
// update its term or grant its vote.
- logger.trace("{} [term: {}] not granting a vote within a minimum election timeout, elapsed {}",
- _my_id, _current_term, election_elapsed());
+ logger.trace("{} [term: {}] not granting a vote within a minimum election timeout, elapsed {} (current leader = {})",
+ _my_id, _current_term, election_elapsed(), _current_leader);
return;
}
}
- become_follower(server_id{});
}
- update_current_term(msg.current_term);
-
+ // Do not update term or become follower on prevote
+ if (internal::need_update_term(msg)) {
+ become_follower(leader);
+ update_current_term(msg.current_term);
+ }
} else if (msg.current_term < _current_term) {
if constexpr (std::is_same_v<Message, append_request>) {
// Instructs the leader to step down.
@@ -382,6 +419,8 @@ void fsm::step(server_id from, Message&& msg) {
send_to(from, std::move(reply));
} else if constexpr (std::is_same_v<Message, install_snapshot>) {
send_to(from, snapshot_reply{ .success = false });
+ } else if (internal::is_prevote_request(msg)) {
+ send_to(from, vote_reply{_current_term, false, true});
} else {
// Ignore other cases
logger.trace("{} [term: {}] ignored a message with lower term from {} [term: {}]",
diff --git a/raft/tracker.hh b/raft/tracker.hh
index e463bdd00e..6462217100 100644
--- a/raft/tracker.hh
+++ b/raft/tracker.hh
@@ -180,8 +180,9 @@ class votes {
server_address_set _voters;
election_tracker _current;
std::optional<election_tracker> _previous;
+ bool _prevote;
public:
- votes(configuration configuration);
+ votes(configuration configuration, bool prevote = false);

const server_address_set& voters() const {
return _voters;
@@ -189,6 +190,9 @@ class votes {

void register_vote(server_id from, bool granted);
vote_result tally_votes() const;
+ bool is_prevote() const {
+ return _prevote;
+ };

friend std::ostream& operator<<(std::ostream& os, const votes& v);
};
diff --git a/raft/fsm.cc b/raft/fsm.cc
index 5e28fa5e73..a4eb865c30 100644
--- a/raft/fsm.cc
+++ b/raft/fsm.cc
@@ -118,13 +118,11 @@ void fsm::update_current_term(term_t current_term)
assert(_current_term < current_term);
_current_term = current_term;
_voted_for = server_id{};
+}

+void fsm::reset_election_timeout() {
static thread_local std::default_random_engine re{std::random_device{}()};
static thread_local std::uniform_int_distribution<> dist(1, ELECTION_TIMEOUT.count());
- // Reset the randomized election timeout on each term
- // change, even if we do not plan to campaign during this
- // term: the main purpose of the timeout is to avoid
- // starting our campaign simultaneously with other followers.
_randomized_election_timeout = ELECTION_TIMEOUT + logical_clock::duration{dist(re)};
}

@@ -138,6 +136,7 @@ void fsm::become_leader() {
_log_limiter_semaphore.emplace(this);
_log_limiter_semaphore->sem.consume(_log.length());
_last_election_time = _clock.now();
+ reset_election_timeout();
// a new leader needs to commit at lease one entry to make sure that
// all existing entries in its log are commited as well. Also it should
// send append entries rpc as soon as possible to establish its leqdership
@@ -155,13 +154,21 @@ void fsm::become_follower(server_id leader) {
_tracker = std::nullopt;
_log_limiter_semaphore = std::nullopt;
_votes = std::nullopt;
+ reset_election_timeout();
if (_current_leader) {
_last_election_time = _clock.now();
}
}

-void fsm::become_candidate() {
+void fsm::campain(bool do_prevote) {
_state = candidate{};
+ // When starting a campain we need to reset current leader otherwise
+ // distruptive server prevention will stall an election if qurum of nodes
+ // start election together since each one will ignore vote requests from others
+ _current_leader = {};
+
+ reset_election_timeout();
+
_tracker = std::nullopt;
_log_limiter_semaphore = std::nullopt;
// 3.4 Leader election
@@ -174,7 +181,7 @@ void fsm::become_candidate() {
// and initiating another round of RequestVote RPCs.
_last_election_time = _clock.now();

- _votes.emplace(_log.get_configuration());
+ _votes.emplace(_log.get_configuration(), do_prevote);

const auto& voters = _votes->voters();
if (!voters.contains(server_address{_my_id})) {
@@ -184,27 +191,46 @@ void fsm::become_candidate() {
become_follower(server_id{});
return;
}
- update_current_term(term_t{_current_term + 1});
+
+ term_t term{_current_term + 1};
+ if (!do_prevote) {
+ update_current_term(term);
+ }
// Replicate RequestVote
for (const auto& server : voters) {
if (server.id == _my_id) {
// Vote for self.
_votes->register_vote(server.id, true);
- _voted_for = _my_id;
+ if (!do_prevote) {
+ // Only record real votes
+ _voted_for = _my_id;
+ }
// Already signaled _sm_events in update_current_term()
continue;
}
- logger.trace("{} [term: {}, index: {}, last log term: {}] sent vote request to {}",
- _my_id, _current_term, _log.last_idx(), _log.last_term(), server.id);
+ logger.trace("{} [term: {}, index: {}, last log term: {}{}] sent vote request to {}",
+ _my_id, term, _log.last_idx(), _log.last_term(), do_prevote ? ", prevote" : "", server.id);

- send_to(server.id, vote_request{_current_term, _log.last_idx(), _log.last_term()});
+ send_to(server.id, vote_request{term, _log.last_idx(), _log.last_term(), do_prevote});
}
if (_votes->tally_votes() == vote_result::WON) {
// A single node cluster.
- become_leader();
+ if (!do_prevote) {
+ become_leader();
+ } else {
+ become_candidate();
+ }
}
}

+void fsm::become_candidate() {
+ campain(false);
+}
+
+void fsm::become_prevote() {
+ campain(true);
+}
+
future<fsm_output> fsm::poll_output() {
logger.trace("fsm::poll_output() {} stable index: {} last index: {}",
_my_id, _log.stable_idx(), _log.last_idx());
@@ -406,7 +432,11 @@ void fsm::tick() {
} else if (is_past_election_timeout()) {
logger.trace("tick[{}]: becoming a candidate, last election: {}, now: {}", _my_id,
_last_election_time, _clock.now());
- become_candidate();
+ if (_config.enable_prevoting) {
+ become_prevote();
+ } else {
+ become_candidate();
+ }
}
}

@@ -535,13 +565,15 @@ void fsm::request_vote(server_id from, vote_request&& request) {
// We can cast a vote in any state. If the candidate's term is
// lower than ours, we ignore the request. Otherwise we first
// update our current term and convert to a follower.
- assert(_current_term == request.current_term);
+ assert(request.prevote || _current_term == request.current_term);

bool can_vote =
// We can vote if this is a repeat of a vote we've already cast...
_voted_for == from ||
// ...we haven't voted and we don't think there's a leader yet in this term...
- (_voted_for == server_id{} && _current_leader == server_id{});
+ (_voted_for == server_id{} && _current_leader == server_id{}) ||
+ // ...this is prevote for a future term...
+ (request.prevote && request.current_term > _current_term);

// ...and we believe the candidate is up to date.
if (can_vote && _log.is_up_to_date(request.last_log_idx, request.last_log_term)) {
@@ -550,23 +582,33 @@ void fsm::request_vote(server_id from, vote_request&& request) {
"voted for {} [log_term: {}, log_index: {}]",
_my_id, _current_term, _log.last_idx(), _log.last_term(), _voted_for,
from, request.last_log_term, request.last_log_idx);
- // If a server grants a vote, it must reset its election
- // timer. See Raft Summary.
- _last_election_time = _clock.now();
- _voted_for = from;
-
- send_to(from, vote_reply{_current_term, true});
+ if (!request.prevote) { // Only record real votes
+ // If a server grants a vote, it must reset its election
+ // timer. See Raft Summary.
+ _last_election_time = _clock.now();
+ _voted_for = from;
+ }
+ // When responding to Msg{Pre,}Vote messages we include the term
+ // from the message, not the local term. To see why, consider the
+ // case where a single node was previously partitioned away and
+ // it's local term is now out of date. If we include the local term
+ // (recall that for pre-votes we don't update the local term), the
+ // (pre-)campaigning node on the other end will proceed to ignore
+ // the message (it ignores all out of date messages).
+ // The term in the original message and current local term are the
+ // same in the case of regular votes, but different for pre-votes.
+ send_to(from, vote_reply{request.current_term, true, request.prevote});
} else {
// If a vote is not granted, this server is a potential
// viable candidate, so it should not reset its election
// timer, to avoid election disruption by non-viable
// candidates.
logger.trace("{} [term: {}, index: {}, log_term: {}, voted_for: {}] "
- "rejected vote for {} [log_term: {}, log_index: {}]",
+ "rejected vote for {} [current_term: {}, log_term: {}, log_index: {}, prevote: {}]",
_my_id, _current_term, _log.last_idx(), _log.last_term(), _voted_for,
- from, request.last_log_term, request.last_log_idx);
+ from, request.current_term, request.last_log_term, request.last_log_idx, request.prevote);

- send_to(from, vote_reply{_current_term, false});
+ send_to(from, vote_reply{_current_term, false, request.prevote});
}
}

@@ -581,7 +623,11 @@ void fsm::request_vote_reply(server_id from, vote_reply&& reply) {
case vote_result::UNKNOWN:
break;
case vote_result::WON:
- become_leader();
+ if (_votes->is_prevote()) {
+ become_candidate();
+ } else {
+ become_leader();
+ }
break;
case vote_result::LOST:
become_follower(server_id{});
diff --git a/raft/server.cc b/raft/server.cc
index 38ecb59e75..f4927faf81 100644
--- a/raft/server.cc
+++ b/raft/server.cc
@@ -198,7 +198,8 @@ future<> server_impl::start() {
_fsm = std::make_unique<fsm>(_id, term, vote, std::move(log), *_failure_detector,
fsm_config {
.append_request_threshold = _config.append_request_threshold,
- .max_log_length = _config.max_log_length
+ .max_log_length = _config.max_log_length,
+ .enable_prevoting = _config.enable_prevoting
});
assert(_fsm->get_current_term() != term_t(0));

@@ -604,7 +605,7 @@ void server_impl::register_metrics() {
}

future<> server_impl::elect_me_leader() {
- while (!_fsm->is_candidate() && !_fsm->is_leader()) {
+ while (_fsm->is_follower()) {
_fsm->tick();
}
do {
diff --git a/raft/tracker.cc b/raft/tracker.cc
index 39a9f75b14..b56890b9a4 100644
--- a/raft/tracker.cc
+++ b/raft/tracker.cc
@@ -202,9 +202,9 @@ index_t tracker::committed(index_t prev_commit_idx) {
}
}

-votes::votes(configuration configuration)
+votes::votes(configuration configuration, bool prevote)
:_voters(configuration.current)
- , _current(configuration.current) {
+ , _current(configuration.current), _prevote(prevote) {

if (configuration.is_joint()) {
_previous.emplace(configuration.previous);
--
2.29.2

Gleb Natapov

<gleb@scylladb.com>
unread,
Mar 3, 2021, 9:25:45 AM3/3/21
to scylladb-dev@googlegroups.com
We already have server-state-dependent state in fsm, so there is no need
to maintain the "voters" and "tracker" optionals as well. The upside is that
the optional and variant states cannot drift apart now.
---
raft/fsm.hh | 42 +++++++++++++++++++------
raft/tracker.hh | 6 +---
raft/fsm.cc | 59 +++++++++++++++++-------------------
raft/tracker.cc | 4 +--
test/boost/raft_etcd_test.cc | 2 +-
5 files changed, 63 insertions(+), 50 deletions(-)

diff --git a/raft/fsm.hh b/raft/fsm.hh
index 467851c6b2..d63228483f 100644
--- a/raft/fsm.hh
+++ b/raft/fsm.hh
@@ -69,9 +69,20 @@ static constexpr logical_clock::duration ELECTION_TIMEOUT = logical_clock::durat
// (if a client contacts a follower, the follower redirects it to
// the leader). The third state, candidate, is used to elect a new
// leader.
-class follower {};
-class candidate {};
-class leader {};
+struct follower : std::monostate {};
+struct candidate {
+ // Votes received during an election round.
+ votes votes;
+ // True if the candidate in prevote state
+ bool is_prevote;
+ candidate(configuration configuration, bool prevote) :
+ votes(std::move(configuration)), is_prevote(prevote) {}
+};
+struct leader {
+ // A state for each follower
+ tracker tracker;
+ leader(server_id id) : tracker(id) {}
+};

// Raft protocol finite state machine
//
@@ -160,13 +171,7 @@ class fsm {
// reset on each term change. For testing, it's necessary to have the value
// at election_timeout without becoming a candidate.
logical_clock::duration _randomized_election_timeout = ELECTION_TIMEOUT + logical_clock::duration{1};
- // Votes received during an election round. Available only in
- // candidate state.
- std::optional<votes> _votes;

-protected: // For testing
- // A state for each follower, maintained only on the leader.
- std::optional<tracker> _tracker;
private:
// Holds all replies to AppendEntries RPC which are not
// yet sent out. If AppendEntries request is accepted, we must
@@ -258,6 +263,23 @@ class fsm {

void reset_election_timeout();

+ candidate& candidate_state() {
+ return std::get<candidate>(_state);
+ }
+
+ const candidate& candidate_state() const {
+ return std::get<candidate>(_state);
+ }
+
+protected: // For testing
+ leader& leader_state() {
+ return std::get<leader>(_state);
+ }
+
+ const leader& leader_state() const {
+ return std::get<leader>(_state);
+ }
+
public:
explicit fsm(server_id id, term_t current_term, server_id voted_for, log log,
failure_detector& failure_detector, fsm_config conf);
@@ -272,7 +294,7 @@ class fsm {
return std::holds_alternative<candidate>(_state);
}
bool is_prevote_candidate() const {
- return is_candidate() && _votes->is_prevote();
+ return is_candidate() && std::get<candidate>(_state).is_prevote;
}
index_t log_last_idx() const {
return _log.last_idx();
diff --git a/raft/tracker.hh b/raft/tracker.hh
index 6462217100..e463bdd00e 100644
--- a/raft/tracker.hh
+++ b/raft/tracker.hh
@@ -180,9 +180,8 @@ class votes {
server_address_set _voters;
election_tracker _current;
std::optional<election_tracker> _previous;
- bool _prevote;
public:
- votes(configuration configuration, bool prevote = false);
+ votes(configuration configuration);

const server_address_set& voters() const {
return _voters;
@@ -190,9 +189,6 @@ class votes {

void register_vote(server_id from, bool granted);
vote_result tally_votes() const;
- bool is_prevote() const {
- return _prevote;
- };

friend std::ostream& operator<<(std::ostream& os, const votes& v);
};
diff --git a/raft/fsm.cc b/raft/fsm.cc
index a4eb865c30..04176b5cce 100644
--- a/raft/fsm.cc
+++ b/raft/fsm.cc
@@ -87,7 +87,7 @@ const log_entry& fsm::add_entry(T command) {
// entry is replicated to the C_new servers, and
// a majority of the new configuration is used to
// determine the C_new entry’s commitment.
- _tracker->set_configuration(_log.get_configuration(), _log.last_idx());
+ leader_state().tracker.set_configuration(_log.get_configuration(), _log.last_idx());
}

return *_log[_log.last_idx()];
@@ -128,11 +128,8 @@ void fsm::reset_election_timeout() {

void fsm::become_leader() {
assert(!std::holds_alternative<leader>(_state));
- assert(!_tracker);
- _state = leader{};
+ _state = leader(_my_id);
_current_leader = _my_id;
- _votes = std::nullopt;
- _tracker.emplace(_my_id);
_log_limiter_semaphore.emplace(this);
_log_limiter_semaphore->sem.consume(_log.length());
_last_election_time = _clock.now();
@@ -144,16 +141,14 @@ void fsm::become_leader() {
add_entry(log_entry::dummy());
// set_configuration() begins replicating from the last entry
// in the log.
- _tracker->set_configuration(_log.get_configuration(), _log.last_idx());
+ leader_state().tracker.set_configuration(_log.get_configuration(), _log.last_idx());
replicate();
}

void fsm::become_follower(server_id leader) {
_current_leader = leader;
_state = follower{};
- _tracker = std::nullopt;
_log_limiter_semaphore = std::nullopt;
- _votes = std::nullopt;
reset_election_timeout();
if (_current_leader) {
_last_election_time = _clock.now();
@@ -161,7 +156,7 @@ void fsm::become_follower(server_id leader) {
}

void fsm::campain(bool do_prevote) {
- _state = candidate{};
+ _state = candidate(_log.get_configuration(), do_prevote);
// When starting a campain we need to reset current leader otherwise
// distruptive server prevention will stall an election if qurum of nodes
// start election together since each one will ignore vote requests from others
@@ -169,7 +164,6 @@ void fsm::campain(bool do_prevote) {

reset_election_timeout();

- _tracker = std::nullopt;
_log_limiter_semaphore = std::nullopt;
// 3.4 Leader election
//
@@ -181,9 +175,9 @@ void fsm::campain(bool do_prevote) {
// and initiating another round of RequestVote RPCs.
_last_election_time = _clock.now();

- _votes.emplace(_log.get_configuration(), do_prevote);
+ auto& votes = candidate_state().votes;

- const auto& voters = _votes->voters();
+ const auto& voters = votes.voters();
if (!voters.contains(server_address{_my_id})) {
// If the server is not part of the current configuration,
// revert to the follower state without increasing
@@ -200,7 +194,7 @@ void fsm::campain(bool do_prevote) {
for (const auto& server : voters) {
if (server.id == _my_id) {
// Vote for self.
- _votes->register_vote(server.id, true);
+ votes.register_vote(server.id, true);
if (!do_prevote) {
// Only record real votes
_voted_for = _my_id;
@@ -213,7 +207,7 @@ void fsm::campain(bool do_prevote) {

send_to(server.id, vote_request{term, _log.last_idx(), _log.last_term(), do_prevote});
}
- if (_votes->tally_votes() == vote_result::WON) {
+ if (votes.tally_votes() == vote_result::WON) {
// A single node cluster.
if (!do_prevote) {
become_leader();
@@ -316,8 +310,8 @@ void fsm::advance_stable_idx(index_t idx) {
// If this server is leader and is part of the current
// configuration, update it's progress and optionally
// commit new entries.
- if (is_leader() && _tracker->leader_progress()) {
- _tracker->leader_progress()->stable_to(idx);
+ if (is_leader() && leader_state().tracker.leader_progress()) {
+ leader_state().tracker.leader_progress()->stable_to(idx);
replicate();
maybe_commit();
}
@@ -325,7 +319,7 @@ void fsm::advance_stable_idx(index_t idx) {

void fsm::maybe_commit() {

- index_t new_commit_idx = _tracker->committed(_commit_idx);
+ index_t new_commit_idx = leader_state().tracker.committed(_commit_idx);

if (new_commit_idx <= _commit_idx) {
return;
@@ -362,12 +356,12 @@ void fsm::maybe_commit() {
configuration cfg(_log.get_configuration());
cfg.leave_joint();
_log.emplace_back(seastar::make_lw_shared<log_entry>({_current_term, _log.next_idx(), std::move(cfg)}));
- _tracker->set_configuration(_log.get_configuration(), _log.last_idx());
+ leader_state().tracker.set_configuration(_log.get_configuration(), _log.last_idx());
// Leaving joint configuration may commit more entries
// even if we had no new acks, by switching the quorum
// from joint to simple majority.
maybe_commit();
- } else if (_tracker->leader_progress() == nullptr) {
+ } else if (leader_state().tracker.leader_progress() == nullptr) {
// 4.2.2 Removing the current leader
//
// A leader that is removed from the configuration
@@ -391,7 +385,7 @@ void fsm::tick_leader() {
}

size_t active = 1; // +1 for self
- for (auto& [id, progress] : *_tracker) {
+ for (auto& [id, progress] : leader_state().tracker) {
if (progress.id != _my_id) {
if (_failure_detector.is_alive(progress.id)) {
active++;
@@ -413,7 +407,7 @@ void fsm::tick_leader() {
}
}
}
- if (active >= _tracker->size()/2 + 1) {
+ if (active >= leader_state().tracker.size()/2 + 1) {
// Advance last election time if we heard from
// the quorum during this tick.
_last_election_time = _clock.now();
@@ -484,7 +478,7 @@ void fsm::append_entries(server_id from, append_request&& request) {
void fsm::append_entries_reply(server_id from, append_reply&& reply) {
assert(is_leader());

- follower_progress* opt_progress = _tracker->find(from);
+ follower_progress* opt_progress = leader_state().tracker.find(from);
if (opt_progress == nullptr) {
// A message from a follower removed from the
// configuration.
@@ -554,7 +548,7 @@ void fsm::append_entries_reply(server_id from, append_reply&& reply) {

// We may have just applied a configuration that removes this
// followre, so re-track it.
- opt_progress = _tracker->find(from);
+ opt_progress = leader_state().tracker.find(from);
if (opt_progress != nullptr) {
replicate_to(*opt_progress, false);
}
@@ -617,13 +611,14 @@ void fsm::request_vote_reply(server_id from, vote_reply&& reply) {

logger.trace("{} received a {} vote from {}", _my_id, reply.vote_granted ? "yes" : "no", from);

- _votes->register_vote(from, reply.vote_granted);
+ auto& state = std::get<candidate>(_state);
+ state.votes.register_vote(from, reply.vote_granted);

- switch (_votes->tally_votes()) {
+ switch (state.votes.tally_votes()) {
case vote_result::UNKNOWN:
break;
case vote_result::WON:
- if (_votes->is_prevote()) {
+ if (state.is_prevote) {
become_candidate();
} else {
become_leader();
@@ -743,7 +738,7 @@ void fsm::replicate_to(follower_progress& progress, bool allow_empty) {

void fsm::replicate() {
assert(is_leader());
- for (auto& [id, progress] : *_tracker) {
+ for (auto& [id, progress] : leader_state().tracker) {
if (progress.id != _my_id) {
replicate_to(progress, false);
}
@@ -770,7 +765,7 @@ bool fsm::can_read() {
}

void fsm::snapshot_status(server_id id, std::optional<index_t> idx) {
- follower_progress& progress = *_tracker->find(id);
+ follower_progress& progress = *leader_state().tracker.find(id);

if (progress.state != follower_progress::state::SNAPSHOT) {
logger.trace("snasphot_status[{}]: called not in snapshot state", _my_id);
@@ -821,8 +816,8 @@ std::ostream& operator<<(std::ostream& os, const fsm& f) {
os << "commit index: " << f._observed._commit_idx << "), ";
os << "current time: " << f._clock.now() << ", ";
os << "last election time: " << f._last_election_time << ", ";
- if (f._votes) {
- os << "votes (" << *f._votes << "), ";
+ if (f.is_candidate()) {
+ os << "votes (" << f.candidate_state().votes << "), ";
}
os << "messages: " << f._messages.size() << ", ";

@@ -833,9 +828,9 @@ std::ostream& operator<<(std::ostream& os, const fsm& f) {
} else if (std::holds_alternative<follower>(f._state)) {
os << "follower";
}
- if (f._tracker) {
+ if (f.is_leader()) {
os << "followers (";
- for (const auto& [server_id, follower_progress]: *f._tracker) {
+ for (const auto& [server_id, follower_progress]: f.leader_state().tracker) {
os << server_id << ", ";
os << follower_progress.next_idx << ", ";
os << follower_progress.match_idx << ", ";
diff --git a/raft/tracker.cc b/raft/tracker.cc
index b56890b9a4..39a9f75b14 100644
--- a/raft/tracker.cc
+++ b/raft/tracker.cc
@@ -202,9 +202,9 @@ index_t tracker::committed(index_t prev_commit_idx) {
}
}

-votes::votes(configuration configuration, bool prevote)
+votes::votes(configuration configuration)
:_voters(configuration.current)
- , _current(configuration.current), _prevote(prevote) {
+ , _current(configuration.current) {

if (configuration.is_joint()) {
_previous.emplace(configuration.previous);
diff --git a/test/boost/raft_etcd_test.cc b/test/boost/raft_etcd_test.cc
index 420742ec45..04757301e4 100644
--- a/test/boost/raft_etcd_test.cc
+++ b/test/boost/raft_etcd_test.cc
@@ -79,7 +79,7 @@ class fsm_debug : public raft::fsm {
public:
using raft::fsm::fsm;
const raft::follower_progress& get_progress(server_id id) {
- raft::follower_progress* progress = _tracker->find(id);
+ raft::follower_progress* progress = leader_state().tracker.find(id);
return *progress;
}
};
--
2.29.2

Gleb Natapov

<gleb@scylladb.com>
unread,
Mar 3, 2021, 9:28:49 AM3/3/21
to scylladb-dev@googlegroups.com
Why I wrote "rpc" in the subject? We will never know.
--
Gleb.

Konstantin Osipov

<kostja@scylladb.com>
unread,
Mar 3, 2021, 1:23:33 PM3/3/21
to Gleb Natapov, scylladb-dev@googlegroups.com
* Gleb Natapov <gl...@scylladb.com> [21/03/03 17:30]:

> +namespace internal {
> +static inline bool need_update_term(const vote_request& r) {
> + return !r.prevote;
> +}
> +static inline bool need_update_term(const vote_reply& r) {
> + // We send pre-vote requests with a term in our future. If the
> + // pre-vote is granted, we will increment our term when we get a
> + // quorum. If it is not, the term comes from the node that
> + // rejected our vote so we should become a follower at the new
> + // term.
> + return !r.prevote || !r.vote_granted;
> +}

If the follower's term is higher than the candidate's term, the
vote is not granted. So I don't see why we need to check
vote_reply at all - we should always update our term if
vote_reply's term is higher than ours, just like with other types of
messages.

> + if (internal::need_update_term(msg)) {
> + become_follower(leader);
> + update_current_term(msg.current_term);
> + }

It would be nice if need_update_term() would be named in terms of
what it checks, not what it is used for. If you decide to adopt suggestion
above, this could be replaced with if (!is_prevote_request(msg)) {

> + bool _prevote;
> public:
> - votes(configuration configuration);
> + votes(configuration configuration, bool prevote = false);
>
> const server_address_set& voters() const {
> return _voters;
> @@ -189,6 +190,9 @@ class votes {
>
> void register_vote(server_id from, bool granted);
> vote_result tally_votes() const;
> + bool is_prevote() const {
> + return _prevote;
> + };

It would be nice to name prevote consistently one of {is_prevote,
_is_prevote, is_prevote()} across entire code base (it is now
sometimes named is_prevote(), sometimes _prevote, and sometimes
do_prevote). enable_prevoting, is exempt from this
complaint.
>
> static thread_local std::uniform_int_distribution<> dist(1, ELECTION_TIMEOUT.count());
> - // Reset the randomized election timeout on each term
> - // change, even if we do not plan to campaign during this
> - // term: the main purpose of the timeout is to avoid
> - // starting our campaign simultaneously with other followers.
> _randomized_election_timeout = ELECTION_TIMEOUT + logical_clock::duration{dist(re)};

I still do not understand this change well.
Now that you removed a declaration for the previous policy, what
is the policy to reset election timeout now? In what circumstances
should it be done?

> +void fsm::campain(bool do_prevote) {
> _state = candidate{};
> + // When starting a campain we need to reset current leader otherwise
> + // distruptive server prevention will stall an election if qurum of nodes

quorum

> + // start election together since each one will ignore vote requests from others
> + _current_leader = {};

Would love to see it in an own patch, seems like a bug in the old
code regardless of whether we have prevote or not.

> +
> + reset_election_timeout();

I still don't understand why reset_election_timeout is no longer
associated with term change. The timeout used to be randomized
once per term change, do you want to randomize it more often or
less often, why?

> - become_leader();
> + if (!do_prevote) {
> + become_leader();
> + } else {
> + become_candidate();
> + }

Instead of introducing campaign() and become_prevote() we could
have become_candidate(bool is_prevote) - then here we could have

become_candidate(false);

> }
> }
>
> +void fsm::become_candidate() {
> + campain(false);
> +}
> +
> +void fsm::become_prevote() {
> + campain(true);
> +}
> +
> future<fsm_output> fsm::poll_output() {
> logger.trace("fsm::poll_output() {} stable index: {} last index: {}",
> _my_id, _log.stable_idx(), _log.last_idx());
> @@ -406,7 +432,11 @@ void fsm::tick() {
> } else if (is_past_election_timeout()) {
> logger.trace("tick[{}]: becoming a candidate, last election: {}, now: {}", _my_id,
> _last_election_time, _clock.now());
> - become_candidate();
> + if (_config.enable_prevoting) {
> + become_prevote();
> + } else {
> + become_candidate();

Here we could have
become_candidate(_config.enable_prevoting).

> - (_voted_for == server_id{} && _current_leader == server_id{});
> + (_voted_for == server_id{} && _current_leader == server_id{}) ||
> + // ...this is prevote for a future term...
> + (request.prevote && request.current_term > _current_term);

Could you please justify why we should grant prevote even though
we have heard from a leader within election timeout? If the
cluster has no new log entries, the pre-vote
candidate will collect a quorum, increment its vote and disrupt
the current leader.

So I rather think we should not.

> + // When responding to Msg{Pre,}Vote messages we include the term
> + // from the message, not the local term. To see why, consider the
> + // case where a single node was previously partitioned away and
> + // it's local term is now out of date. If we include the local term
> + // (recall that for pre-votes we don't update the local term), the
> + // (pre-)campaigning node on the other end will proceed to ignore
> + // the message (it ignores all out of date messages).

While I understand what is meant here, it took me an effort to
come to the conclusion: this would be bad
because pre-vote candidate will not be able to account our positive vote,
while our intent is to give him a chance to run a real campaign.

> + // The term in the original message and current local term are the
> + // same in the case of regular votes, but different for pre-votes.

Here we're granting a vote, so our term is lower or equal
to than the term of the pre-vote request. I would not use the word
"different" then but be more specific and put this sentence first, to
give the explanation some context.

PS The new version looks much easier to follow indeed.

--
Konstantin Osipov, Moscow, Russia
https://scylladb.com

Konstantin Osipov

<kostja@scylladb.com>
unread,
Mar 3, 2021, 1:25:24 PM3/3/21
to Gleb Natapov, scylladb-dev@googlegroups.com
* Konstantin Osipov <kos...@scylladb.com> [21/03/03 21:23]:
> candidate will collect a quorum, increment its vote and disrupt

increment its term

> the current leader.

Gleb Natapov

<gleb@scylladb.com>
unread,
Mar 3, 2021, 2:42:31 PM3/3/21
to Konstantin Osipov, scylladb-dev@googlegroups.com
On Wed, Mar 03, 2021 at 09:23:30PM +0300, Konstantin Osipov wrote:
> * Gleb Natapov <gl...@scylladb.com> [21/03/03 17:30]:
>
> > +namespace internal {
> > +static inline bool need_update_term(const vote_request& r) {
> > + return !r.prevote;
> > +}
> > +static inline bool need_update_term(const vote_reply& r) {
> > + // We send pre-vote requests with a term in our future. If the
> > + // pre-vote is granted, we will increment our term when we get a
> > + // quorum. If it is not, the term comes from the node that
> > + // rejected our vote so we should become a follower at the new
> > + // term.
> > + return !r.prevote || !r.vote_granted;
> > +}
>
> If the follower's term is higher than the candidate's term, the
> vote is not granted. So I don't see why we need to check
> vote_reply at all - we should always update our term if
> vote_reply's term is higher than ours, just like with out types of
> messages.
>
That will defeat the whole purpose of the exercise. A prevote request
carries a higher term than the node's actual term. If a node increases its
term on a prevote reply without accruing a quorum of replies, we have failed
to achieve the prevote's purpose. See the comment in the function which
explains exactly that.


> > + if (internal::need_update_term(msg)) {
> > + become_follower(leader);
> > + update_current_term(msg.current_term);
> > + }
>
> It would be nice if need_update_term() would be named in terms of
> what it checks, not what it is used for. If you decide to adopt suggestion
> above, this could be replaced with if (!is_prevote_request(msg)) {
I cannot adopt the suggestion and even if I could the check would not
check for a request only. I can split it to two and then the "if"
will look like:

if (!(is_prevote_request(msg) || is_prevote_granted_reply(msg))

But I do not think this will be any better.
Anytime state changes. It was never about prevote state. The comment is
incorrect for the reason I explained earlier: timeout can be reset
without term change when prevote stage starts.

>
> > +void fsm::campain(bool do_prevote) {
> > _state = candidate{};
> > + // When starting a campain we need to reset current leader otherwise
> > + // distruptive server prevention will stall an election if qurum of nodes
>
> quorum
>
> > + // start election together since each one will ignore vote requests from others
> > + _current_leader = {};
>
> Would love to see it in an own patch, seems like a bug in the old
> code regardless of whether we have prevote or not.
OK.

>
> > +
> > + reset_election_timeout();
>
> I still don't understand why reset_election_timeout is no longer
> associated with term change. The timeout used to be randomized
> once per term change, do you want to randomize it more often or
> less often, why?
>
If two nodes choose the same timeout when entering prevote state they
will livelock. They need to choose different timeout when re-entering
prevote and there is no term change during prevote.
I cannot justify it because this is not what the code is about. In case
there is a leader and we heard from it we will not be here (see 'return'
in step() in that case).

We can get here if this node has already voted for someone, but now it got
a prevote request with a higher term. It needs to answer positively because
if the node were to request a real vote it would have been granted.
--
Gleb.

Konstantin Osipov

<kostja@scylladb.com>
unread,
Mar 3, 2021, 5:13:51 PM3/3/21
to Gleb Natapov, scylladb-dev@googlegroups.com
* Gleb Natapov <gl...@scylladb.com> [21/03/03 23:23]:
> > > +namespace internal {
> > > +static inline bool need_update_term(const vote_request& r) {
> > > + return !r.prevote;
> > > +}
> > > +static inline bool need_update_term(const vote_reply& r) {
> > > + // We send pre-vote requests with a term in our future. If the
> > > + // pre-vote is granted, we will increment our term when we get a
> > > + // quorum. If it is not, the term comes from the node that
> > > + // rejected our vote so we should become a follower at the new
> > > + // term.
> > > + return !r.prevote || !r.vote_granted;
> > > +}
> >
> > If the follower's term is higher than the candidate's term, the
> > vote is not granted. So I don't see why we need to check
> > vote_reply at all - we should always update our term if
> > vote_reply's term is higher than ours, just like with out types of
> > messages.
> >
> That will defeat the whole purpose of the exercise. Prevote request
> caries higher term then actual node's term. If a node will increase its
> term on a prevote reply without accruing a quorum of replies we failed
> to achieve prevote purpose. See the comment in the function which
> explains exactly that.

Our goal is to avoid incrementing pre-vote candidate term ahead of
the term of an existing leader or candidate (I will call this term
"cluster term" below).

But it is fine to catch up the pre-vote candidate's term with an
existing cluster term when the pre-vote candidate's term is behind.

If the leader is alive and healthy the pre-vote candidate will not
win the election, neither this time, nor the next time when it
tries to run another election with cluster_term + 1. Neither it
will trigger terms race by running a pre-vote round with
cluster_term + 1, because pre-vote request term is not
incrementing any server's term.

> > > + if (internal::need_update_term(msg)) {
> > > + become_follower(leader);
> > > + update_current_term(msg.current_term);
> > > + }
> >
> > It would be nice if need_update_term() would be named in terms of
> > what it checks, not what it is used for. If you decide to adopt suggestion
> > above, this could be replaced with if (!is_prevote_request(msg)) {
> I cannot adopt the suggestion and even if I could the check would not
>, check for a request only. I can split it to two and then the "if"
> will look like:
>
> if (!(is_prevote_request(msg) || is_prevote_granted_reply(msg))
>
> But I do not think this will be any better.

Actually it is much more clear. I couldn't come up with name
is_prevote_granted_reply() myself.

> > It would be nice to name prevote consistently one of {is_prevote,
> > _is_prevote, is_prevote()} across entire code base (it is now
> > sometimes named is_prevote(), sometimes _prevote, and sometimes
> > do_prevote). enable_prevoting, is exempt from this
> > complaint.
> > >
> > > static thread_local std::uniform_int_distribution<> dist(1, ELECTION_TIMEOUT.count());
> > > - // Reset the randomized election timeout on each term
> > > - // change, even if we do not plan to campaign during this
> > > - // term: the main purpose of the timeout is to avoid
> > > - // starting our campaign simultaneously with other followers.
> > > _randomized_election_timeout = ELECTION_TIMEOUT + logical_clock::duration{dist(re)};
> >
> > I still do not understand this change well.
> > Now that you removed a declaration for the previous policy, what
> > is the policy to reset election timeout now? In what circumstances
> > should it be done?
> Anytime state changes. It was never about prevote state. The comment is
> incorrect for the reason I explained earlier: timeout can be reset
> without term change when prevote stage starts.

Yes, the comment has become incomplete ;) So instead of removing
it, could you please extend it?


> > > + reset_election_timeout();
> >
> > I still don't understand why reset_election_timeout is no longer
> > associated with term change. The timeout used to be randomized
> > once per term change, do you want to randomize it more often or
> > less often, why?
> >
> If two nodes choose the same timeout when entering prevote state they
> will livelock. They need to choose different timeout when re-entering
> prevote and there is no term change during prevote.

Looks like pre-vote adds one more case when one needs to reset the
election timeout. Why not call reset_election_timeout() from
update_current_term() with the old explanation, and add a new call
to reset_election_timeout() in become_candidate()?

For example, the patch now resets the election timeout in
become_follower(). It seems there is no harm in doing it, but
it is useless work: we can change the state without
changing the term.

BTW, resetting the timeout more frequently doesn't reduce the probability of
of conflict. There are other ways to reset it which do:
- use a float value and add some dispersion to tick() interval
- reset it only when a vote is denied because it has already been
granted to another candidate. This will guarantee all
nodes converge to using timeouts spaced apart.
- reset it only on configuration changes, based on a rank
computed deterministically from this server's relative order in the
config.


> > > + // ...this is prevote for a future term...
> > > + (request.prevote && request.current_term > _current_term);
> >
> > Could you please justify why we should grant prevote even though
> > we have heard from a leader within election timeout?
> I cannot justify it because this is not what the code is about. In case
> there is a leader and we heard from it we will not be here (see 'return'
> in step() in that case).
>
> We can get her if this node is already voted for someone, but now it got
> a prevote request with higher term. It need to answer positively because
> if the node would request a real vote it would have been granted.

Got it now, thanks. Please add this to the explanation of what's
going on though :)

Gleb Natapov

<gleb@scylladb.com>
unread,
Mar 4, 2021, 3:31:20 AM3/4/21
to Konstantin Osipov, scylladb-dev@googlegroups.com
I fail to understand your point. You seem to be saying that if a
prevote candidate cannot win an election because of another node having a
higher term, it should adopt that term, right? Well, this is what the code
is doing. If we always update the term, the code will do something
completely different and incorrect.

> > > > + if (internal::need_update_term(msg)) {
> > > > + become_follower(leader);
> > > > + update_current_term(msg.current_term);
> > > > + }
> > >
> > > It would be nice if need_update_term() would be named in terms of
> > > what it checks, not what it is used for. If you decide to adopt suggestion
> > > above, this could be replaced with if (!is_prevote_request(msg)) {
> > I cannot adopt the suggestion and even if I could the check would not
> >, check for a request only. I can split it to two and then the "if"
> > will look like:
> >
> > if (!(is_prevote_request(msg) || is_prevote_granted_reply(msg))
> >
> > But I do not think this will be any better.
>
> Actually it is much more clear. I couldn't come up with name
> is_prevote_granted_reply() myself.
>
It will double the amount of helper functions which is a pity. May be:
is_prevote_request_or_prevote_granted_reply(). Very long but used once
anyway.

> > > It would be nice to name prevote consistently one of {is_prevote,
> > > _is_prevote, is_prevote()} across entire code base (it is now
> > > sometimes named is_prevote(), sometimes _prevote, and sometimes
> > > do_prevote). enable_prevoting, is exempt from this
> > > complaint.
> > > >
> > > > static thread_local std::uniform_int_distribution<> dist(1, ELECTION_TIMEOUT.count());
> > > > - // Reset the randomized election timeout on each term
> > > > - // change, even if we do not plan to campaign during this
> > > > - // term: the main purpose of the timeout is to avoid
> > > > - // starting our campaign simultaneously with other followers.
> > > > _randomized_election_timeout = ELECTION_TIMEOUT + logical_clock::duration{dist(re)};
> > >
> > > I still do not understand this change well.
> > > Now that you removed a declaration for the previous policy, what
> > > is the policy to reset election timeout now? In what circumstances
> > > should it be done?
> > Anytime state changes. It was never about prevote state. The comment is
> > incorrect for the reason I explained earlier: timeout can be reset
> > without term change when prevote stage starts.
>
> Yes, the comment has become incomplete ;) So instead of removing
> it, could you please extend it?
Not incomplete, but incorrect. The code also moved from a function that
updates the term, so it does not make any sense to mention it there.

>
>
> > > > + reset_election_timeout();
> > >
> > > I still don't understand why reset_election_timeout is no longer
> > > associated with term change. The timeout used to be randomized
> > > once per term change, do you want to randomize it more often or
> > > less often, why?
> > >
> > If two nodes choose the same timeout when entering prevote state they
> > will livelock. They need to choose different timeout when re-entering
> > prevote and there is no term change during prevote.
>
> Looks like pre-vote adds one more case when one needs to reset the
> election timeout. Why not call reset_election_timeout() from
> update_current_term() with the old explanation, and add a new call
> to reset_election_timeout() in become_candidate()?
I do not understand why you even want to tie timer randomization to a
term change. The event that should trigger it is state change since you
want to remain in this state for at least that time or until another
state transition happens. Before prevote each state transition caused
term change, so the distinction was blurred, but prevote made it clear
that the reasoning is incorrect.

>
> For example, the patch now resets the election timeout in
> become_follower(). It seems there is no harm in doing it, but
> it is useless work: we can change the state without
> changing the term.
Why is it useless work? We choose how long the follower will wait
before starting next election. If there are several candidates that
move to the follower state simultaneously they want to randomize their
timeout to move to candidate state at different times to not clash. Of
course we can assume that the timeout was sufficiently random already,
but following that reasoning the only place that needs randomization
is when re-entering candidate state (because that could have happened
because of a clash), not on each term change. Why should a follower
randomize when it adopts a leader's term, but not otherwise?

>
> BTW, resetting the timeout more frequently doesn't reduce the probability of
> of conflict. There are other ways to reset it which do:
> - use a float value and add some dispersion to tick() interval
No need to add float calculations.

> - reset it only when a vote is denied because it has already been
> granted to another candidate. This will guarantee all
> nodes converge to using timeouts spaced apart.
Exactly! So I do not understand why you think term change is somehow
relevant and insist on the comment, which only confuses the matter.

> - reset it only on configuration changes, based on a rank
> computed deterministically from this server's relative order in the
> config.
>
This may still clash.

--
Gleb.

Gleb Natapov

<gleb@scylladb.com>
unread,
Mar 4, 2021, 7:33:02 AM3/4/21
to scylladb-dev@googlegroups.com
Not resetting the leader causes vote requests to be ignored instead of
rejected, which makes a voting round take more time to fail and may
slow down new leader election.
---
raft/fsm.cc | 4 ++++
1 file changed, 4 insertions(+)

diff --git a/raft/fsm.cc b/raft/fsm.cc
index 6edb4acfa1..90d26ee383 100644
--- a/raft/fsm.cc
+++ b/raft/fsm.cc
@@ -161,6 +161,10 @@ void fsm::become_follower(server_id leader) {
}

void fsm::become_candidate() {
+ // When starting a campain we need to reset current leader otherwise
+ // disruptive server prevention will stall an election if quorum of nodes
+ // start election together since each one will ignore vote requests from others
+ _current_leader = {};
_state = candidate{};
_tracker = std::nullopt;
_log_limiter_semaphore = std::nullopt;
--
2.29.2

Gleb Natapov

<gleb@scylladb.com>
unread,
Mar 4, 2021, 7:33:02 AM3/4/21
to scylladb-dev@googlegroups.com
This patchset adds a prevoting stage to the leader election algorithm. See
patch 2 for an explanation of what it is and why it is needed.

v2->v3:
- rename all *prevote to is_prevote
- randomize election timeout on entering candidate state only
- replace helper function to check prevote messages with inline
constexpr checks
- drop become_prevote() in favor of become_candidate(bool)

v1->v2:
- move cleanups to a separate patch
- add boost tests
- make prevoting a variant of candidate state instead of a separate
state

Gleb Natapov (5):
raft: use modern unordered_set::contains instead of find in
become_candidate
raft: reset the leader on entering candidate state
raft: implement prevoting stage in leader election
raft: add boost tests for prevoting
raft: store leader and candidate state in state variant

idl/raft.idl.hh | 2 +
raft/fsm.hh | 79 +++++++++++++++----
raft/raft.hh | 4 +
raft/server.hh | 2 +
raft/fsm.cc | 143 +++++++++++++++++++++--------------
raft/server.cc | 5 +-
test/boost/raft_etcd_test.cc | 2 +-
test/boost/raft_fsm_test.cc | 142 +++++++++++++++++++++++++++++++++-
8 files changed, 301 insertions(+), 78 deletions(-)

--
2.29.2

Gleb Natapov

<gleb@scylladb.com>
unread,
Mar 4, 2021, 7:33:02 AM3/4/21
to scylladb-dev@googlegroups.com
---
raft/fsm.cc | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/raft/fsm.cc b/raft/fsm.cc
index 6dc8aad956..6edb4acfa1 100644
--- a/raft/fsm.cc
+++ b/raft/fsm.cc
@@ -177,7 +177,7 @@ void fsm::become_candidate() {
_votes.emplace(_log.get_configuration());

const auto& voters = _votes->voters();
- if (voters.find(server_address{_my_id}) == voters.end()) {
+ if (!voters.contains(server_address{_my_id})) {
// If the server is not part of the current configuration,
// revert to the follower state without increasing
// the current term.
--
2.29.2

Gleb Natapov

<gleb@scylladb.com>
unread,
Mar 4, 2021, 7:33:04 AM3/4/21
to scylladb-dev@googlegroups.com
---
raft/fsm.hh | 3 +
test/boost/raft_fsm_test.cc | 142 +++++++++++++++++++++++++++++++++++-
2 files changed, 141 insertions(+), 4 deletions(-)

diff --git a/raft/fsm.hh b/raft/fsm.hh
index eaeb26e3e8..dce8027761 100644
--- a/raft/fsm.hh
+++ b/raft/fsm.hh
@@ -267,6 +267,9 @@ class fsm {
bool is_candidate() const {
return std::holds_alternative<candidate>(_state);
}
+ bool is_prevote_candidate() const {
+ return is_candidate() && _votes->is_prevote();
+ }
index_t log_last_idx() const {
return _log.last_idx();
}
diff --git a/test/boost/raft_fsm_test.cc b/test/boost/raft_fsm_test.cc
index fa751fa7be..1745952279 100644
--- a/test/boost/raft_fsm_test.cc
+++ b/test/boost/raft_fsm_test.cc
@@ -57,7 +57,7 @@ raft::snapshot log_snapshot(raft::log& log, index_t idx) {
return raft::snapshot{.idx = idx, .term = log.last_term(), .config = log.get_snapshot().config};
}

-raft::fsm_config fsm_cfg{.append_request_threshold = 1};
+raft::fsm_config fsm_cfg{.append_request_threshold = 1, .enable_prevoting = false};

BOOST_AUTO_TEST_CASE(test_votes) {
auto id = []() -> raft::server_address { return raft::server_address{utils::make_random_uuid()}; };
@@ -327,13 +327,13 @@ BOOST_AUTO_TEST_CASE(test_log_last_conf_idx) {
BOOST_CHECK_EQUAL(log.in_memory_size(), 1);
}

-BOOST_AUTO_TEST_CASE(test_election_single_node) {
+void test_election_single_node_helper(raft::fsm_config fcfg) {

failure_detector fd;
server_id id1{utils::make_random_uuid()};
raft::configuration cfg({id1});
raft::log log{raft::snapshot{.config = cfg}};
- raft::fsm fsm(id1, term_t{}, server_id{}, std::move(log), fd, fsm_cfg);
+ raft::fsm fsm(id1, term_t{}, server_id{}, std::move(log), fd, fcfg);

BOOST_CHECK(fsm.is_follower());

@@ -364,6 +364,9 @@ BOOST_AUTO_TEST_CASE(test_election_single_node) {
BOOST_CHECK(output.committed.size() == 1 && std::holds_alternative<raft::log_entry::dummy>(output.committed[0]->data));
}

+BOOST_AUTO_TEST_CASE(test_election_single_node) {
+ test_election_single_node_helper(fsm_cfg);
+}
// Test that adding an entry to a single-node cluster
// does not lead to RPC
BOOST_AUTO_TEST_CASE(test_single_node_is_quiet) {
@@ -419,7 +422,7 @@ BOOST_AUTO_TEST_CASE(test_election_two_nodes) {
BOOST_CHECK(fsm.is_leader());
// Out of order response to the previous election is ignored
fsm.step(id2, raft::vote_reply{output.term - term_t{1}, false});
- assert(fsm.is_leader());
+ BOOST_CHECK(fsm.is_leader());

// Vote request within the election timeout is ignored
// (avoiding disruptive leaders).
@@ -505,6 +508,137 @@ BOOST_AUTO_TEST_CASE(test_election_four_nodes) {
BOOST_CHECK(fsm.is_leader());
}

+BOOST_AUTO_TEST_CASE(test_election_single_node_prevote) {
+ auto fcfg = fsm_cfg;
+ fcfg.enable_prevoting = true;
+ test_election_single_node_helper(fcfg);
+}
+
+BOOST_AUTO_TEST_CASE(test_election_two_nodes_prevote) {
+ auto fcfg = fsm_cfg;
+ fcfg.enable_prevoting = true;
+
+ failure_detector fd;
+
+ server_id id1{utils::make_random_uuid()}, id2{utils::make_random_uuid()};
+
+ raft::configuration cfg({id1, id2});
+ raft::log log{raft::snapshot{.config = cfg}};
+
+ raft::fsm fsm(id1, term_t{}, server_id{}, std::move(log), fd, fcfg);
+
+ // Initial state is follower
+ BOOST_CHECK(fsm.is_follower());
+
+ // After election timeout, a follower becomes a prevote candidate
+ election_timeout(fsm);
+ BOOST_CHECK(fsm.is_prevote_candidate());
+ // Term was not increased
+ BOOST_CHECK_EQUAL(fsm.get_current_term(), term_t{});
+
+ // If nothing happens, the candidate stays this way
+ election_timeout(fsm);
+ BOOST_CHECK(fsm.is_prevote_candidate());
+ BOOST_CHECK_EQUAL(fsm.get_current_term(), term_t{});
+
+ auto output = fsm.get_output();
+ // After a favourable prevote reply, we become a regular candidate (quorum is 2)
+ fsm.step(id2, raft::vote_reply{output.term, true, true});
+ BOOST_CHECK(fsm.is_candidate() && !fsm.is_prevote_candidate());
+ // And increased our term this time
+ BOOST_CHECK_EQUAL(fsm.get_current_term(), term_t{1});
+
+ election_timeout(fsm);
+ // Check that rejected prevote with higher term causes prevote candidate move to follower
+ fsm.step(id2, raft::vote_reply{term_t{2}, false, true});
+ BOOST_CHECK(fsm.is_follower());
+ BOOST_CHECK_EQUAL(fsm.get_current_term(), term_t{2});
+
+ election_timeout(fsm);
+ (void)fsm.get_output();
+ // Check that receiving prevote with smaller term generate reject with newer term
+ fsm.step(id2, raft::vote_request{term_t{1}, index_t{}, term_t{}, true});
+ output = fsm.get_output();
+ BOOST_CHECK_EQUAL(output.messages.size(), 1);
+ auto msg = std::get<raft::vote_reply>(output.messages.back().second);
+ BOOST_CHECK(msg.current_term == term_t{2} && !msg.vote_granted);
+
+ // Check that prevote with higer term get a reply with term in the future
+ // and does not change local term.
+ // Move to follower again
+ fsm.step(id2, raft::vote_reply{term_t{3}, false, true});
+ BOOST_CHECK(fsm.is_follower());
+ // Send prevote with higher term
+ fsm.step(id2, raft::vote_request{term_t{4}, index_t{}, term_t{}, true});
+ output = fsm.get_output();
+ BOOST_CHECK_EQUAL(output.messages.size(), 1);
+ // Reply has request's term
+ msg = std::get<raft::vote_reply>(output.messages.back().second);
+ BOOST_CHECK(msg.current_term == term_t{4} && msg.vote_granted);
+ // But fsm current term stays the same
+ BOOST_CHECK_EQUAL(fsm.get_current_term(), term_t{3});
+}
+
+BOOST_AUTO_TEST_CASE(test_election_four_nodes_prevote) {
+ auto fcfg = fsm_cfg;
+ fcfg.enable_prevoting = true;
+
+ failure_detector fd;
+
+ server_id id1{utils::make_random_uuid()},
+ id2{utils::make_random_uuid()},
+ id3{utils::make_random_uuid()},
+ id4{utils::make_random_uuid()};
+
+ raft::configuration cfg({id1, id2, id3, id4});
+ raft::log log{raft::snapshot{.config = cfg}};
+
+ raft::fsm fsm(id1, term_t{}, server_id{}, std::move(log), fd, fcfg);
+
+ // Initial state is follower
+ BOOST_CHECK(fsm.is_follower());
+
+ // Inform FSM about a new leader at a new term
+ fsm.step(id4, raft::append_request{term_t{1}, id4, index_t{1}, term_t{1}});
+
+ (void) fsm.get_output();
+
+ // Request a prevote during the same term. Even though
+ // we haven't voted, we should deny a vote because we
+ // know about a leader for this term.
+ fsm.step(id3, raft::vote_request{term_t{1}, index_t{1}, term_t{1}, true});
+
+ auto output = fsm.get_output();
+ auto reply = std::get<raft::vote_reply>(output.messages.back().second);
+ BOOST_CHECK(!reply.vote_granted && reply.is_prevote);
+
+ // Run out of steam for this term. Start a new one.
+ fd.alive = false;
+ election_timeout(fsm);
+ BOOST_CHECK(fsm.is_candidate() && fsm.is_prevote_candidate());
+
+ output = fsm.get_output();
+ // Add a favourable prevote reply, not enough for quorum
+ fsm.step(id2, raft::vote_reply{output.term + term_t{1}, true, true});
+ BOOST_CHECK(fsm.is_candidate() && fsm.is_prevote_candidate());
+
+ // Add another one, this adds up to quorum
+ fsm.step(id3, raft::vote_reply{output.term + term_t{1}, true, true});
+ BOOST_CHECK(fsm.is_candidate() && !fsm.is_prevote_candidate());
+
+ // Check that prevote with future term is answered even if we voted already
+ // Request regular vote
+ fsm.step(id2, raft::vote_request{fsm.get_current_term(), index_t{1}, term_t{1}, false});
+ // Clear message queue
+ (void)fsm.get_output();
+ // Ask for prevote with future term
+ fsm.step(id3, raft::vote_request{fsm.get_current_term() + term_t{1}, index_t{1}, term_t{1}, true});
+ output = fsm.get_output();
+ BOOST_CHECK_EQUAL(output.messages.size(), 1);
+ reply = std::get<raft::vote_reply>(output.messages.back().second);
+ BOOST_CHECK(reply.vote_granted && reply.is_prevote);
+}
+
BOOST_AUTO_TEST_CASE(test_log_matching_rule) {

failure_detector fd;
--
2.29.2

Gleb Natapov

<gleb@scylladb.com>
unread,
Mar 4, 2021, 7:33:04 AM3/4/21
to scylladb-dev@googlegroups.com
raft/fsm.hh | 36 ++++++++++++++++++----
raft/raft.hh | 4 +++
raft/server.hh | 2 ++
raft/tracker.hh | 6 +++-
raft/fsm.cc | 82 ++++++++++++++++++++++++++++++++++---------------
raft/server.cc | 5 +--
raft/tracker.cc | 4 +--
8 files changed, 105 insertions(+), 36 deletions(-)

diff --git a/idl/raft.idl.hh b/idl/raft.idl.hh
index 97f26ad82a..6d81de7e0c 100644
--- a/idl/raft.idl.hh
+++ b/idl/raft.idl.hh
@@ -62,11 +62,13 @@ struct vote_request {
raft::internal::tagged_uint64<raft::term_tag> current_term;
raft::internal::tagged_uint64<raft::index_tag> last_log_idx;
raft::internal::tagged_uint64<raft::term_tag> last_log_term;
+ bool is_prevote;
};

struct vote_reply {
raft::internal::tagged_uint64<raft::term_tag> current_term;
bool vote_granted;
+ bool is_prevote;
};

struct install_snapshot {
diff --git a/raft/fsm.hh b/raft/fsm.hh
index 38e414c25c..eaeb26e3e8 100644
--- a/raft/fsm.hh
+++ b/raft/fsm.hh
@@ -48,6 +48,8 @@ struct fsm_config {
// is configured by the snapshot, otherwise the state
// machine will deadlock.
size_t max_log_size;
+ // If set to true will enable prevoting stage during election
+ bool enable_prevoting;
};

// 3.4 Leader election
@@ -223,7 +225,7 @@ class fsm {

void become_leader();

- void become_candidate();
+ void become_candidate(bool is_prevote);

void become_follower(server_id leader);

@@ -250,6 +252,8 @@ class fsm {
// Tick implementation on a leader
void tick_leader();

+ void reset_election_timeout();
+
public:
explicit fsm(server_id id, term_t current_term, server_id voted_for, log log,
failure_detector& failure_detector, fsm_config conf);
@@ -353,11 +357,13 @@ void fsm::step(server_id from, Message&& msg) {
// follower state. If a server receives a request with
// a stale term number, it rejects the request.
if (msg.current_term > _current_term) {
+ server_id leader{};
+
logger.trace("{} [term: {}] received a message with higher term from {} [term: {}]",
_my_id, _current_term, from, msg.current_term);

if constexpr (std::is_same_v<Message, append_request>) {
- become_follower(from);
+ leader = from;
} else {
if constexpr (std::is_same_v<Message, vote_request>) {
if (_current_leader != server_id{} && election_elapsed() < ELECTION_TIMEOUT) {
@@ -366,15 +372,29 @@ void fsm::step(server_id from, Message&& msg) {
// within the minimum election timeout of
// hearing from a current leader, it does not
// update its term or grant its vote.
- logger.trace("{} [term: {}] not granting a vote within a minimum election timeout, elapsed {}",
- _my_id, _current_term, election_elapsed());
+ logger.trace("{} [term: {}] not granting a vote within a minimum election timeout, elapsed {} (current leader = {})",
+ _my_id, _current_term, election_elapsed(), _current_leader);
return;
}
}
- become_follower(server_id{});
}
- update_current_term(msg.current_term);
+ bool ignore_term = false;
+ if constexpr (std::is_same_v<Message, vote_request>) {
+ // Do not update term on provote request
+ ignore_term = msg.is_prevote;
+ } else if constexpr (std::is_same_v<Message, vote_reply>) {
+ // We send pre-vote requests with a term in our future. If the
+ // pre-vote is granted, we will increment our term when we get a
+ // quorum. If it is not, the term comes from the node that
+ // rejected our vote so we should become a follower at the new
+ // term.
+ ignore_term = msg.is_prevote && msg.vote_granted;
+ }

+ if (!ignore_term) {
+ become_follower(leader);
+ update_current_term(msg.current_term);
+ }
} else if (msg.current_term < _current_term) {
if constexpr (std::is_same_v<Message, append_request>) {
// Instructs the leader to step down.
@@ -382,6 +402,10 @@ void fsm::step(server_id from, Message&& msg) {
send_to(from, std::move(reply));
} else if constexpr (std::is_same_v<Message, install_snapshot>) {
send_to(from, snapshot_reply{ .success = false });
+ } else if constexpr (std::is_same_v<Message, vote_request>) {
+ if (msg.is_prevote) {
+ send_to(from, vote_reply{_current_term, false, true});
+ }
} else {
// Ignore other cases
logger.trace("{} [term: {}] ignored a message with lower term from {} [term: {}]",
diff --git a/raft/raft.hh b/raft/raft.hh
index 50e0002706..889bbe1288 100644
--- a/raft/raft.hh
+++ b/raft/raft.hh
@@ -259,6 +259,8 @@ struct vote_request {
index_t last_log_idx;
// The term of the candidate's last log entry.
term_t last_log_term;
+ // True if this is prevote request
+ bool is_prevote;
};

struct vote_reply {
@@ -266,6 +268,8 @@ struct vote_reply {
term_t current_term;
// True means the candidate received a vote.
bool vote_granted;
+ // True if it is a reply to prevote request
+ bool is_prevote;
};

struct install_snapshot {
diff --git a/raft/server.hh b/raft/server.hh
index abfbed42d9..6dc81b93cc 100644
--- a/raft/server.hh
+++ b/raft/server.hh
@@ -46,6 +46,8 @@ class server {
// is configured by the snapshot, otherwise the state
// machine will deadlock on attempt to submit a new entry.
size_t max_log_size = 5000;
+ // If set to true will enable prevoting stage during election
+ bool enable_prevoting = true;
};

virtual ~server() {}
diff --git a/raft/tracker.hh b/raft/tracker.hh
index 0ddecef215..084320aa9e 100644
--- a/raft/tracker.hh
+++ b/raft/tracker.hh
@@ -176,8 +176,9 @@ class votes {
server_address_set _voters;
election_tracker _current;
std::optional<election_tracker> _previous;
+ bool _is_prevote;
public:
- votes(configuration configuration);
+ votes(configuration configuration, bool is_prevote = false);

const server_address_set& voters() const {
return _voters;
@@ -185,6 +186,9 @@ class votes {

void register_vote(server_id from, bool granted);
vote_result tally_votes() const;
+ bool is_prevote() const {
+ return _is_prevote;
+ };

friend std::ostream& operator<<(std::ostream& os, const votes& v);
};
diff --git a/raft/fsm.cc b/raft/fsm.cc
index 90d26ee383..fee32cd716 100644
--- a/raft/fsm.cc
+++ b/raft/fsm.cc
@@ -31,6 +31,7 @@ fsm::fsm(server_id id, term_t current_term, server_id voted_for, log log,

_observed.advance(*this);
logger.trace("{}: starting log length {}", _my_id, _log.last_idx());
+ reset_election_timeout();

assert(!bool(_current_leader));
}
@@ -118,13 +119,11 @@ void fsm::update_current_term(term_t current_term)
assert(_current_term < current_term);
_current_term = current_term;
_voted_for = server_id{};
+}

+void fsm::reset_election_timeout() {
static thread_local std::default_random_engine re{std::random_device{}()};
static thread_local std::uniform_int_distribution<> dist(1, ELECTION_TIMEOUT.count());
- // Reset the randomized election timeout on each term
- // change, even if we do not plan to campaign during this
- // term: the main purpose of the timeout is to avoid
- // starting our campaign simultaneously with other followers.
_randomized_election_timeout = ELECTION_TIMEOUT + logical_clock::duration{dist(re)};
}

@@ -160,12 +159,15 @@ void fsm::become_follower(server_id leader) {
}
}

-void fsm::become_candidate() {
+void fsm::become_candidate(bool is_prevote) {
// When starting a campain we need to reset current leader otherwise
// disruptive server prevention will stall an election if quorum of nodes
// start election together since each one will ignore vote requests from others
_current_leader = {};
_state = candidate{};
+
+ reset_election_timeout();
+
_tracker = std::nullopt;
_log_limiter_semaphore = std::nullopt;
// 3.4 Leader election
@@ -178,7 +180,7 @@ void fsm::become_candidate() {
// and initiating another round of RequestVote RPCs.
_last_election_time = _clock.now();

- _votes.emplace(_log.get_configuration());
+ _votes.emplace(_log.get_configuration(), is_prevote);

const auto& voters = _votes->voters();
if (!voters.contains(server_address{_my_id})) {
@@ -188,24 +190,35 @@ void fsm::become_candidate() {
become_follower(server_id{});
return;
}
- update_current_term(term_t{_current_term + 1});
+
+ term_t term{_current_term + 1};
+ if (!is_prevote) {
+ update_current_term(term);
+ }
// Replicate RequestVote
for (const auto& server : voters) {
if (server.id == _my_id) {
// Vote for self.
_votes->register_vote(server.id, true);
- _voted_for = _my_id;
+ if (!is_prevote) {
+ // Only record real votes
+ _voted_for = _my_id;
+ }
// Already signaled _sm_events in update_current_term()
continue;
}
- logger.trace("{} [term: {}, index: {}, last log term: {}] sent vote request to {}",
- _my_id, _current_term, _log.last_idx(), _log.last_term(), server.id);
+ logger.trace("{} [term: {}, index: {}, last log term: {}{}] sent vote request to {}",
+ _my_id, term, _log.last_idx(), _log.last_term(), is_prevote ? ", prevote" : "", server.id);

- send_to(server.id, vote_request{_current_term, _log.last_idx(), _log.last_term()});
+ send_to(server.id, vote_request{term, _log.last_idx(), _log.last_term(), is_prevote});
}
if (_votes->tally_votes() == vote_result::WON) {
// A single node cluster.
- become_leader();
+ if (!is_prevote) {
+ become_leader();
+ } else {
+ become_candidate(false);
+ }
}
}

@@ -422,7 +435,7 @@ void fsm::tick() {
} else if (is_past_election_timeout()) {
logger.trace("tick[{}]: becoming a candidate, last election: {}, now: {}", _my_id,
_last_election_time, _clock.now());
- become_candidate();
+ become_candidate(_config.enable_prevoting);
}
}

@@ -551,13 +564,17 @@ void fsm::request_vote(server_id from, vote_request&& request) {
// We can cast a vote in any state. If the candidate's term is
// lower than ours, we ignore the request. Otherwise we first
// update our current term and convert to a follower.
- assert(_current_term == request.current_term);
+ assert(request.is_prevote || _current_term == request.current_term);

bool can_vote =
// We can vote if this is a repeat of a vote we've already cast...
_voted_for == from ||
// ...we haven't voted and we don't think there's a leader yet in this term...
- (_voted_for == server_id{} && _current_leader == server_id{});
+ (_voted_for == server_id{} && _current_leader == server_id{}) ||
+ // ...this is prevote for a future term...
+ // (we will get here if the node does not know any leader yet and already
+ // voted for some other node, but now it get even newer prevote request)
+ (request.is_prevote && request.current_term > _current_term);

// ...and we believe the candidate is up to date.
if (can_vote && _log.is_up_to_date(request.last_log_idx, request.last_log_term)) {
@@ -566,23 +583,34 @@ void fsm::request_vote(server_id from, vote_request&& request) {
"voted for {} [log_term: {}, log_index: {}]",
_my_id, _current_term, _log.last_idx(), _log.last_term(), _voted_for,
from, request.last_log_term, request.last_log_idx);
- // If a server grants a vote, it must reset its election
- // timer. See Raft Summary.
- _last_election_time = _clock.now();
- _voted_for = from;
-
- send_to(from, vote_reply{_current_term, true});
+ if (!request.is_prevote) { // Only record real votes
+ // If a server grants a vote, it must reset its election
+ // timer. See Raft Summary.
+ _last_election_time = _clock.now();
+ _voted_for = from;
+ }
+ // The term in the original message and current local term are the
+ // same in the case of regular votes, but different for pre-votes.
+ //
+ // When responding to Msg{Pre,}Vote messages we include the term
+ // from the message, not the local term. To see why, consider the
+ // case where a single node was previously partitioned away and
+ // it's local term is now out of date. If we include the local term
+ // (recall that for pre-votes we don't update the local term), the
+ // (pre-)campaigning node on the other end will proceed to ignore
+ // the message (it ignores all out of date messages).
+ send_to(from, vote_reply{request.current_term, true, request.is_prevote});
} else {
// If a vote is not granted, this server is a potential
// viable candidate, so it should not reset its election
// timer, to avoid election disruption by non-viable
// candidates.
logger.trace("{} [term: {}, index: {}, log_term: {}, voted_for: {}] "
- "rejected vote for {} [log_term: {}, log_index: {}]",
+ "rejected vote for {} [current_term: {}, log_term: {}, log_index: {}, is_prevote: {}]",
_my_id, _current_term, _log.last_idx(), _log.last_term(), _voted_for,
- from, request.last_log_term, request.last_log_idx);
+ from, request.current_term, request.last_log_term, request.last_log_idx, request.is_prevote);

- send_to(from, vote_reply{_current_term, false});
+ send_to(from, vote_reply{_current_term, false, request.is_prevote});
}
}

@@ -597,7 +625,11 @@ void fsm::request_vote_reply(server_id from, vote_reply&& reply) {
case vote_result::UNKNOWN:
break;
case vote_result::WON:
- become_leader();
+ if (_votes->is_prevote()) {
+ become_candidate(false);
+ } else {
+ become_leader();
+ }
break;
case vote_result::LOST:
become_follower(server_id{});
diff --git a/raft/server.cc b/raft/server.cc
index 18f3236878..3d073dd3e9 100644
--- a/raft/server.cc
+++ b/raft/server.cc
@@ -198,7 +198,8 @@ future<> server_impl::start() {
_fsm = std::make_unique<fsm>(_id, term, vote, std::move(log), *_failure_detector,
fsm_config {
.append_request_threshold = _config.append_request_threshold,
- .max_log_size = _config.max_log_size
+ .max_log_size = _config.max_log_size,
+ .enable_prevoting = _config.enable_prevoting
});
assert(_fsm->get_current_term() != term_t(0));

@@ -604,7 +605,7 @@ void server_impl::register_metrics() {
}

future<> server_impl::elect_me_leader() {
- while (!_fsm->is_candidate() && !_fsm->is_leader()) {
+ while (_fsm->is_follower()) {
_fsm->tick();
}
do {
diff --git a/raft/tracker.cc b/raft/tracker.cc
index 39a9f75b14..eea557d9cc 100644
--- a/raft/tracker.cc
+++ b/raft/tracker.cc
@@ -202,9 +202,9 @@ index_t tracker::committed(index_t prev_commit_idx) {
}
}

-votes::votes(configuration configuration)
+votes::votes(configuration configuration, bool is_prevote)
:_voters(configuration.current)
- , _current(configuration.current) {
+ , _current(configuration.current), _is_prevote(is_prevote) {

Gleb Natapov

<gleb@scylladb.com>
unread,
Mar 4, 2021, 7:33:05 AM3/4/21
to scylladb-dev@googlegroups.com
We already have server-state-dependent state in fsm, so there is no need
to maintain the "votes" and "tracker" optionals as well. The upside is that
the optional and variant states can no longer drift apart.
---
raft/fsm.hh | 42 +++++++++++++++++++------
raft/tracker.hh | 6 +---
raft/fsm.cc | 59 +++++++++++++++++-------------------
raft/tracker.cc | 4 +--
test/boost/raft_etcd_test.cc | 2 +-
5 files changed, 63 insertions(+), 50 deletions(-)

diff --git a/raft/fsm.hh b/raft/fsm.hh
index dce8027761..fa6d68954c 100644
--- a/raft/fsm.hh
+++ b/raft/fsm.hh
@@ -254,6 +259,23 @@ class fsm {

void reset_election_timeout();

+ candidate& candidate_state() {
+ return std::get<candidate>(_state);
+ }
+
+ const candidate& candidate_state() const {
+ return std::get<candidate>(_state);
+ }
+
+protected: // For testing
+ leader& leader_state() {
+ return std::get<leader>(_state);
+ }
+
+ const leader& leader_state() const {
+ return std::get<leader>(_state);
+ }
+
public:
explicit fsm(server_id id, term_t current_term, server_id voted_for, log log,
failure_detector& failure_detector, fsm_config conf);
@@ -268,7 +290,7 @@ class fsm {
return std::holds_alternative<candidate>(_state);
}
bool is_prevote_candidate() const {
- return is_candidate() && _votes->is_prevote();
+ return is_candidate() && std::get<candidate>(_state).is_prevote;
}
index_t log_last_idx() const {
return _log.last_idx();
diff --git a/raft/tracker.hh b/raft/tracker.hh
index 084320aa9e..0ddecef215 100644
--- a/raft/tracker.hh
+++ b/raft/tracker.hh
@@ -176,9 +176,8 @@ class votes {
server_address_set _voters;
election_tracker _current;
std::optional<election_tracker> _previous;
- bool _is_prevote;
public:
- votes(configuration configuration, bool is_prevote = false);
+ votes(configuration configuration);

const server_address_set& voters() const {
return _voters;
@@ -186,9 +185,6 @@ class votes {

void register_vote(server_id from, bool granted);
vote_result tally_votes() const;
- bool is_prevote() const {
- return _is_prevote;
- };

friend std::ostream& operator<<(std::ostream& os, const votes& v);
};
diff --git a/raft/fsm.cc b/raft/fsm.cc
index fee32cd716..e26374bc9b 100644
--- a/raft/fsm.cc
+++ b/raft/fsm.cc
@@ -88,7 +88,7 @@ const log_entry& fsm::add_entry(T command) {
// entry is replicated to the C_new servers, and
// a majority of the new configuration is used to
// determine the C_new entry’s commitment.
- _tracker->set_configuration(_log.get_configuration(), _log.last_idx());
+ leader_state().tracker.set_configuration(_log.get_configuration(), _log.last_idx());
}

return *_log[_log.last_idx()];
@@ -129,11 +129,8 @@ void fsm::reset_election_timeout() {

void fsm::become_leader() {
assert(!std::holds_alternative<leader>(_state));
- assert(!_tracker);
- _state = leader{};
+ _state = leader(_my_id);
_current_leader = _my_id;
- _votes = std::nullopt;
- _tracker.emplace(_my_id);
_log_limiter_semaphore.emplace(this);
_log_limiter_semaphore->sem.consume(_log.in_memory_size());
_last_election_time = _clock.now();
@@ -144,16 +141,14 @@ void fsm::become_leader() {
add_entry(log_entry::dummy());
// set_configuration() begins replicating from the last entry
// in the log.
- _tracker->set_configuration(_log.get_configuration(), _log.last_idx());
+ leader_state().tracker.set_configuration(_log.get_configuration(), _log.last_idx());
replicate();
}

void fsm::become_follower(server_id leader) {
_current_leader = leader;
_state = follower{};
- _tracker = std::nullopt;
_log_limiter_semaphore = std::nullopt;
- _votes = std::nullopt;
if (_current_leader) {
_last_election_time = _clock.now();
}
@@ -164,11 +159,10 @@ void fsm::become_candidate(bool is_prevote) {
// disruptive server prevention will stall an election if quorum of nodes
// start election together since each one will ignore vote requests from others
_current_leader = {};
- _state = candidate{};
+ _state = candidate(_log.get_configuration(), is_prevote);

reset_election_timeout();

- _tracker = std::nullopt;
_log_limiter_semaphore = std::nullopt;
// 3.4 Leader election
//
@@ -180,9 +174,9 @@ void fsm::become_candidate(bool is_prevote) {
// and initiating another round of RequestVote RPCs.
_last_election_time = _clock.now();

- _votes.emplace(_log.get_configuration(), is_prevote);
+ auto& votes = candidate_state().votes;

- const auto& voters = _votes->voters();
+ const auto& voters = votes.voters();
if (!voters.contains(server_address{_my_id})) {
// If the server is not part of the current configuration,
// revert to the follower state without increasing
@@ -199,7 +193,7 @@ void fsm::become_candidate(bool is_prevote) {
for (const auto& server : voters) {
if (server.id == _my_id) {
// Vote for self.
- _votes->register_vote(server.id, true);
+ votes.register_vote(server.id, true);
if (!is_prevote) {
// Only record real votes
_voted_for = _my_id;
@@ -212,7 +206,7 @@ void fsm::become_candidate(bool is_prevote) {

send_to(server.id, vote_request{term, _log.last_idx(), _log.last_term(), is_prevote});
}
- if (_votes->tally_votes() == vote_result::WON) {
+ if (votes.tally_votes() == vote_result::WON) {
// A single node cluster.
if (!is_prevote) {
become_leader();
@@ -307,8 +301,8 @@ void fsm::advance_stable_idx(index_t idx) {
// If this server is leader and is part of the current
// configuration, update it's progress and optionally
// commit new entries.
- if (is_leader() && _tracker->leader_progress()) {
- _tracker->leader_progress()->accepted(idx);
+ if (is_leader() && leader_state().tracker.leader_progress()) {
+ leader_state().tracker.leader_progress()->accepted(idx);
replicate();
maybe_commit();
}
@@ -316,7 +310,7 @@ void fsm::advance_stable_idx(index_t idx) {

void fsm::maybe_commit() {

- index_t new_commit_idx = _tracker->committed(_commit_idx);
+ index_t new_commit_idx = leader_state().tracker.committed(_commit_idx);

if (new_commit_idx <= _commit_idx) {
return;
@@ -353,7 +347,7 @@ void fsm::maybe_commit() {
configuration cfg(_log.get_configuration());
cfg.leave_joint();
_log.emplace_back(seastar::make_lw_shared<log_entry>({_current_term, _log.next_idx(), std::move(cfg)}));
- _tracker->set_configuration(_log.get_configuration(), _log.last_idx());
+ leader_state().tracker.set_configuration(_log.get_configuration(), _log.last_idx());
// Leaving joint configuration may commit more entries
// even if we had no new acks. Imagine the cluster is
// in joint configuration {{A, B}, {A, B, C, D, E}}.
@@ -370,7 +364,7 @@ void fsm::maybe_commit() {
// configuration, and 6 if we assume we left it. Let
// it happen without an extra FSM step.
maybe_commit();
- } else if (_tracker->leader_progress() == nullptr) {
+ } else if (leader_state().tracker.leader_progress() == nullptr) {
// 4.2.2 Removing the current leader
//
// A leader that is removed from the configuration
@@ -394,7 +388,7 @@ void fsm::tick_leader() {
}

size_t active = 1; // +1 for self
- for (auto& [id, progress] : *_tracker) {
+ for (auto& [id, progress] : leader_state().tracker) {
if (progress.id != _my_id) {
if (_failure_detector.is_alive(progress.id)) {
active++;
@@ -416,7 +410,7 @@ void fsm::tick_leader() {
}
}
}
- if (active >= _tracker->size()/2 + 1) {
+ if (active >= leader_state().tracker.size()/2 + 1) {
// Advance last election time if we heard from
// the quorum during this tick.
_last_election_time = _clock.now();
@@ -483,7 +477,7 @@ void fsm::append_entries(server_id from, append_request&& request) {
void fsm::append_entries_reply(server_id from, append_reply&& reply) {
assert(is_leader());

- follower_progress* opt_progress = _tracker->find(from);
+ follower_progress* opt_progress = leader_state().tracker.find(from);
if (opt_progress == nullptr) {
// A message from a follower removed from the
// configuration.
@@ -553,7 +547,7 @@ void fsm::append_entries_reply(server_id from, append_reply&& reply) {

// We may have just applied a configuration that removes this
// follower, so re-track it.
- opt_progress = _tracker->find(from);
+ opt_progress = leader_state().tracker.find(from);
if (opt_progress != nullptr) {
replicate_to(*opt_progress, false);
}
@@ -619,13 +613,14 @@ void fsm::request_vote_reply(server_id from, vote_reply&& reply) {

logger.trace("{} received a {} vote from {}", _my_id, reply.vote_granted ? "yes" : "no", from);

- _votes->register_vote(from, reply.vote_granted);
+ auto& state = std::get<candidate>(_state);
+ state.votes.register_vote(from, reply.vote_granted);

- switch (_votes->tally_votes()) {
+ switch (state.votes.tally_votes()) {
case vote_result::UNKNOWN:
break;
case vote_result::WON:
- if (_votes->is_prevote()) {
+ if (state.is_prevote) {
become_candidate(false);
} else {
become_leader();
@@ -745,7 +740,7 @@ void fsm::replicate_to(follower_progress& progress, bool allow_empty) {

void fsm::replicate() {
assert(is_leader());
- for (auto& [id, progress] : *_tracker) {
+ for (auto& [id, progress] : leader_state().tracker) {
if (progress.id != _my_id) {
replicate_to(progress, false);
}
@@ -772,7 +767,7 @@ bool fsm::can_read() {
}

void fsm::snapshot_status(server_id id, std::optional<index_t> idx) {
- follower_progress& progress = *_tracker->find(id);
+ follower_progress& progress = *leader_state().tracker.find(id);

if (progress.state != follower_progress::state::SNAPSHOT) {
logger.trace("snasphot_status[{}]: called not in snapshot state", _my_id);
@@ -823,8 +818,8 @@ std::ostream& operator<<(std::ostream& os, const fsm& f) {
os << "commit index: " << f._observed._commit_idx << "), ";
os << "current time: " << f._clock.now() << ", ";
os << "last election time: " << f._last_election_time << ", ";
- if (f._votes) {
- os << "votes (" << *f._votes << "), ";
+ if (f.is_candidate()) {
+ os << "votes (" << f.candidate_state().votes << "), ";
}
os << "messages: " << f._messages.size() << ", ";

@@ -835,9 +830,9 @@ std::ostream& operator<<(std::ostream& os, const fsm& f) {
} else if (std::holds_alternative<follower>(f._state)) {
os << "follower";
}
- if (f._tracker) {
+ if (f.is_leader()) {
os << "followers (";
- for (const auto& [server_id, follower_progress]: *f._tracker) {
+ for (const auto& [server_id, follower_progress]: f.leader_state().tracker) {
os << server_id << ", ";
os << follower_progress.next_idx << ", ";
os << follower_progress.match_idx << ", ";
diff --git a/raft/tracker.cc b/raft/tracker.cc
index eea557d9cc..39a9f75b14 100644
--- a/raft/tracker.cc
+++ b/raft/tracker.cc
@@ -202,9 +202,9 @@ index_t tracker::committed(index_t prev_commit_idx) {
}
}

-votes::votes(configuration configuration, bool is_prevote)
+votes::votes(configuration configuration)
:_voters(configuration.current)
- , _current(configuration.current), _is_prevote(is_prevote) {
+ , _current(configuration.current) {

if (configuration.is_joint()) {
_previous.emplace(configuration.previous);
diff --git a/test/boost/raft_etcd_test.cc b/test/boost/raft_etcd_test.cc
index 30cdc84b6a..0cd4cf7577 100644

Gleb Natapov

<gleb@scylladb.com>
unread,
Mar 11, 2021, 4:11:13 AM3/11/21
to scylladb-dev@googlegroups.com
ping
--
Gleb.

Konstantin Osipov

<kostja@scylladb.com>
unread,
Mar 11, 2021, 4:23:09 AM3/11/21
to Gleb Natapov, scylladb-dev@googlegroups.com
* Gleb Natapov <gl...@scylladb.com> [21/03/04 16:15]:
> One downside of Raft^Yגs leader election algorithm is that a server that

Looks like ^Yג instead of ' sneaked in, across this entire commit
comment. The comment itself is very nice.

> + bool ignore_term = false;
> + if constexpr (std::is_same_v<Message, vote_request>) {
> + // Do not update term on provote request

provote -> prevote
Sorry for a nit, but perhaps revert this

if (is_prevote) {
become_candidate(false);
} else {
become_leader();
}


> + }
> + // The term in the original message and current local term are the
> + // same in the case of regular votes, but different for pre-votes.
> + //
> + // When responding to Msg{Pre,}Vote messages we include the term

Let's remove 'Msg' please ;)

> + // from the message, not the local term. To see why, consider the
> + // case where a single node was previously partitioned away and
> + // it's local term is now out of date. If we include the local term

it's -> its

> + // (recall that for pre-votes we don't update the local term), the
> + // (pre-)campaigning node on the other end will proceed to ignore
> + // the message (it ignores all out of date messages).

Overall looking really tidy.

Konstantin Osipov

<kostja@scylladb.com>
unread,
Mar 11, 2021, 4:26:42 AM3/11/21
to Gleb Natapov, scylladb-dev@googlegroups.com
* Gleb Natapov <gl...@scylladb.com> [21/03/04 16:15]:
> We already have server state dependent state in fsm, so there is no need
> to maintain "voters" and "tracker" optionals as well. The upside is that
> optional and variant states cannot drift apart now.

Very nice.
>
> + candidate& candidate_state() {
> + return std::get<candidate>(_state);
> + }

Maybe candidate& candidate() is a tad shorter?

Konstantin Osipov

<kostja@scylladb.com>
unread,
Mar 11, 2021, 4:27:15 AM3/11/21
to Gleb Natapov, scylladb-dev@googlegroups.com
* Gleb Natapov <gl...@scylladb.com> [21/03/04 16:15]:

Reviewed by: Konstantin Osipov <kos...@scylladb.com>

(left minor style comments).

> This patchset adds a prevoting stage to the leader election algorithm. See
> patch 2 for an explanation of what it is and why it is needed.
>
> v2->v3:
> - rename all *prevote to is_prevote
> - randomize election timeout on entering candidate state only
> - replace helper function to check prevote messages with inline
> constexpr checks
> - drop become_prevote() in favor of become_candidate(bool)

Gleb Natapov

<gleb@scylladb.com>
unread,
Mar 11, 2021, 4:58:59 AM3/11/21
to Konstantin Osipov, scylladb-dev@googlegroups.com
On Thu, Mar 11, 2021 at 12:26:40PM +0300, Konstantin Osipov wrote:
> * Gleb Natapov <gl...@scylladb.com> [21/03/04 16:15]:
> > We already have server state dependent state in fsm, so there is no need
> > to maintain "voters" and "tracker" optionals as well. The upside is that
> > optional and variant states cannot drift apart now.
>
> Very nice.
> >
> > + candidate& candidate_state() {
> > + return std::get<candidate>(_state);
> > + }
>
> Maybe candidate& candidate() is a tad shorter?
>
This one does not compile. It forces us to use "struct candidate"
instead of just "candidate" in many places and then confuses constructor
invocation with function call when "candidate" object is created.

--
Gleb.

Gleb Natapov

<gleb@scylladb.com>
unread,
Mar 11, 2021, 6:23:33 AM3/11/21
to scylladb-dev@googlegroups.com
---
raft/fsm.cc | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/raft/fsm.cc b/raft/fsm.cc
index 6dc8aad956..6edb4acfa1 100644
--- a/raft/fsm.cc
+++ b/raft/fsm.cc
@@ -177,7 +177,7 @@ void fsm::become_candidate() {
_votes.emplace(_log.get_configuration());

const auto& voters = _votes->voters();
- if (voters.find(server_address{_my_id}) == voters.end()) {
+ if (!voters.contains(server_address{_my_id})) {
// If the server is not part of the current configuration,
// revert to the follower state without increasing

Gleb Natapov

<gleb@scylladb.com>
unread,
Mar 11, 2021, 6:23:33 AM3/11/21
to scylladb-dev@googlegroups.com
---
raft/fsm.hh | 3 +
test/boost/raft_fsm_test.cc | 142 +++++++++++++++++++++++++++++++++++-
2 files changed, 141 insertions(+), 4 deletions(-)

diff --git a/raft/fsm.hh b/raft/fsm.hh
index 4f01da47b7..24b69345cb 100644
--- a/raft/fsm.hh
+++ b/raft/fsm.hh
@@ -267,6 +267,9 @@ class fsm {
bool is_candidate() const {
return std::holds_alternative<candidate>(_state);
}
+ bool is_prevote_candidate() const {
+ return is_candidate() && _votes->is_prevote();
+ }
index_t log_last_idx() const {
return _log.last_idx();
}

Gleb Natapov

<gleb@scylladb.com>
unread,
Mar 11, 2021, 6:23:33 AM3/11/21
to scylladb-dev@googlegroups.com
This patchset adds a prevoting stage to the leader election algorithm. See
patch 2 for an explanation of what it is and why it is needed.

v3->v4:
- drop broken unicode from commit message
- fix typos
- flipped 'if' condition in become_candidate()

v2->v3:
- rename all *prevote to is_prevote
- randomize election timeout on entering candidate state only
- replace helper function to check prevote messages with inline
constexpr checks
- drop become_prevote() in favor of become_candidate(bool)

Gleb Natapov

<gleb@scylladb.com>
unread,
Mar 11, 2021, 6:23:35 AM3/11/21
to scylladb-dev@googlegroups.com
Not resetting the leader causes vote requests to be ignored instead of
rejected, which makes a voting round take more time to fail and may
slow down the election of a new leader.
---
raft/fsm.cc | 4 ++++
1 file changed, 4 insertions(+)

diff --git a/raft/fsm.cc b/raft/fsm.cc
index 6edb4acfa1..90d26ee383 100644
--- a/raft/fsm.cc
+++ b/raft/fsm.cc
@@ -161,6 +161,10 @@ void fsm::become_follower(server_id leader) {
}

void fsm::become_candidate() {
+ // When starting a campain we need to reset current leader otherwise
+ // disruptive server prevention will stall an election if quorum of nodes
+ // start election together since each one will ignore vote requests from others
+ _current_leader = {};
_state = candidate{};
_tracker = std::nullopt;
_log_limiter_semaphore = std::nullopt;
--
2.29.2

Gleb Natapov

<gleb@scylladb.com>
unread,
Mar 11, 2021, 6:23:35 AM3/11/21
to scylladb-dev@googlegroups.com
One downside of Raft's leader election algorithm is that a server that
has been partitioned from the cluster is likely to cause a disruption
when it regains connectivity. When a server is partitioned, it will
not receive heartbeats. It will soon increment its term to start
an election, although it won't be able to collect enough votes to
become leader. When the server regains connectivity sometime later, its
larger term number will propagate to the rest of the cluster (either
through the server's RequestVote requests or through its AppendEntries
response). This will force the cluster leader to step down, and a new
election will have to take place to select a new leader.

Prevoting stage is addressing that. In the Prevote algorithm, a
candidate only increments its term if it first learns from a majority of
the cluster that they would be willing to grant the candidate their votes
(if the candidate's log is sufficiently up-to-date, and the voters have
not received heartbeats from a valid leader for at least a baseline
election timeout).

The Prevote algorithm solves the issue of a partitioned server disrupting
the cluster when it rejoins. While a server is partitioned, it won't
be able to increment its term, since it can't receive permission
from a majority of the cluster. Then, when it rejoins the cluster, it
still won't be able to increment its term, since the other servers
diff --git a/raft/fsm.hh b/raft/fsm.hh
index 38e414c25c..4f01da47b7 100644
--- a/raft/fsm.hh
+++ b/raft/fsm.hh
@@ -48,6 +48,8 @@ struct fsm_config {
// is configured by the snapshot, otherwise the state
// machine will deadlock.
size_t max_log_size;
+ // If set to true will enable prevoting stage during election
+ bool enable_prevoting;
};

// 3.4 Leader election
@@ -223,7 +225,7 @@ class fsm {

void become_leader();

- void become_candidate();
+ void become_candidate(bool is_prevote);

void become_follower(server_id leader);

@@ -250,6 +252,8 @@ class fsm {
// Tick implementation on a leader
void tick_leader();

+ void reset_election_timeout();
+
public:
explicit fsm(server_id id, term_t current_term, server_id voted_for, log log,
failure_detector& failure_detector, fsm_config conf);
@@ -353,11 +357,13 @@ void fsm::step(server_id from, Message&& msg) {
// follower state. If a server receives a request with
// a stale term number, it rejects the request.
if (msg.current_term > _current_term) {
+ server_id leader{};
+
logger.trace("{} [term: {}] received a message with higher term from {} [term: {}]",
_my_id, _current_term, from, msg.current_term);

if constexpr (std::is_same_v<Message, append_request>) {
- become_follower(from);
+ leader = from;
} else {
if constexpr (std::is_same_v<Message, vote_request>) {
if (_current_leader != server_id{} && election_elapsed() < ELECTION_TIMEOUT) {
@@ -366,15 +372,29 @@ void fsm::step(server_id from, Message&& msg) {
// within the minimum election timeout of
// hearing from a current leader, it does not
// update its term or grant its vote.
- logger.trace("{} [term: {}] not granting a vote within a minimum election timeout, elapsed {}",
- _my_id, _current_term, election_elapsed());
+ logger.trace("{} [term: {}] not granting a vote within a minimum election timeout, elapsed {} (current leader = {})",
+ _my_id, _current_term, election_elapsed(), _current_leader);
return;
}
}
- become_follower(server_id{});
}
- update_current_term(msg.current_term);
+ bool ignore_term = false;
+ if constexpr (std::is_same_v<Message, vote_request>) {
+ // Do not update term on prevote request
diff --git a/raft/tracker.hh b/raft/tracker.hh
index 0ddecef215..084320aa9e 100644
--- a/raft/tracker.hh
+++ b/raft/tracker.hh
@@ -176,8 +176,9 @@ class votes {
server_address_set _voters;
election_tracker _current;
std::optional<election_tracker> _previous;
+ bool _is_prevote;
public:
- votes(configuration configuration);
+ votes(configuration configuration, bool is_prevote = false);

const server_address_set& voters() const {
return _voters;
@@ -185,6 +186,9 @@ class votes {

void register_vote(server_id from, bool granted);
vote_result tally_votes() const;
+ bool is_prevote() const {
+ return _is_prevote;
+ };

friend std::ostream& operator<<(std::ostream& os, const votes& v);
};
diff --git a/raft/fsm.cc b/raft/fsm.cc
index 90d26ee383..143b23f21f 100644
--- a/raft/fsm.cc
+++ b/raft/fsm.cc
@@ -31,6 +31,7 @@ fsm::fsm(server_id id, term_t current_term, server_id voted_for, log log,

_observed.advance(*this);
logger.trace("{}: starting log length {}", _my_id, _log.last_idx());
+ reset_election_timeout();

assert(!bool(_current_leader));
}
@@ -118,13 +119,11 @@ void fsm::update_current_term(term_t current_term)
assert(_current_term < current_term);
_current_term = current_term;
_voted_for = server_id{};
+}

+void fsm::reset_election_timeout() {
static thread_local std::default_random_engine re{std::random_device{}()};
static thread_local std::uniform_int_distribution<> dist(1, ELECTION_TIMEOUT.count());
- // Reset the randomized election timeout on each term
- // change, even if we do not plan to campaign during this
- // term: the main purpose of the timeout is to avoid
- // starting our campaign simultaneously with other followers.
_randomized_election_timeout = ELECTION_TIMEOUT + logical_clock::duration{dist(re)};
}

@@ -160,12 +159,15 @@ void fsm::become_follower(server_id leader) {
}
}

-void fsm::become_candidate() {
+void fsm::become_candidate(bool is_prevote) {
// When starting a campain we need to reset current leader otherwise
// disruptive server prevention will stall an election if quorum of nodes
// start election together since each one will ignore vote requests from others
_current_leader = {};
_state = candidate{};
+
+ reset_election_timeout();
+
_tracker = std::nullopt;
_log_limiter_semaphore = std::nullopt;
// 3.4 Leader election
@@ -178,7 +180,7 @@ void fsm::become_candidate() {
// and initiating another round of RequestVote RPCs.
_last_election_time = _clock.now();

- _votes.emplace(_log.get_configuration());
+ _votes.emplace(_log.get_configuration(), is_prevote);

const auto& voters = _votes->voters();
if (!voters.contains(server_address{_my_id})) {
+ if (is_prevote) {
+ become_candidate(false);
+ } else {
+ become_leader();
+ }
}
}

+ }
+ // The term in the original message and current local term are the
+ // same in the case of regular votes, but different for pre-votes.
+ //
+ // When responding to {Pre,}Vote messages we include the term
+ // from the message, not the local term. To see why, consider the
+ // case where a single node was previously partitioned away and
+ // its local term is now out of date. If we include the local term
+ // (recall that for pre-votes we don't update the local term), the
+ // (pre-)campaigning node on the other end will proceed to ignore
+ // the message (it ignores all out of date messages).
+ send_to(from, vote_reply{request.current_term, true, request.is_prevote});
} else {
// If a vote is not granted, this server is a potential
// viable candidate, so it should not reset its election
// timer, to avoid election disruption by non-viable
// candidates.
logger.trace("{} [term: {}, index: {}, log_term: {}, voted_for: {}] "
- "rejected vote for {} [log_term: {}, log_index: {}]",
+ "rejected vote for {} [current_term: {}, log_term: {}, log_index: {}, is_prevote: {}]",
_my_id, _current_term, _log.last_idx(), _log.last_term(), _voted_for,
- from, request.last_log_term, request.last_log_idx);
+ from, request.current_term, request.last_log_term, request.last_log_idx, request.is_prevote);

- send_to(from, vote_reply{_current_term, false});
+ send_to(from, vote_reply{_current_term, false, request.is_prevote});
}
}

@@ -597,7 +625,11 @@ void fsm::request_vote_reply(server_id from, vote_reply&& reply) {
case vote_result::UNKNOWN:
break;
case vote_result::WON:
diff --git a/raft/tracker.cc b/raft/tracker.cc
index 39a9f75b14..eea557d9cc 100644
--- a/raft/tracker.cc
+++ b/raft/tracker.cc
@@ -202,9 +202,9 @@ index_t tracker::committed(index_t prev_commit_idx) {
}
}

Gleb Natapov

<gleb@scylladb.com>
unread,
Mar 11, 2021, 6:23:36 AM3/11/21
to scylladb-dev@googlegroups.com
We already have server state dependent state in fsm, so there is no need
to maintain "voters" and "tracker" optionals as well. The upside is that
optional and variant states cannot drift apart now.
---
raft/fsm.hh | 42 +++++++++++++++++++------
raft/tracker.hh | 6 +---
raft/fsm.cc | 59 +++++++++++++++++-------------------
raft/tracker.cc | 4 +--
test/boost/raft_etcd_test.cc | 2 +-
5 files changed, 63 insertions(+), 50 deletions(-)

diff --git a/raft/fsm.hh b/raft/fsm.hh
index 24b69345cb..7917084225 100644
--- a/raft/fsm.hh
+++ b/raft/fsm.hh
+ candidate& candidate_state() {
+ return std::get<candidate>(_state);
+ }
+
+ const candidate& candidate_state() const {
+ return std::get<candidate>(_state);
+ }
+
+protected: // For testing
+ leader& leader_state() {
+ return std::get<leader>(_state);
+ }
+
+ const leader& leader_state() const {
+ return std::get<leader>(_state);
+ }
+
public:
explicit fsm(server_id id, term_t current_term, server_id voted_for, log log,
failure_detector& failure_detector, fsm_config conf);
@@ -268,7 +290,7 @@ class fsm {
return std::holds_alternative<candidate>(_state);
}
bool is_prevote_candidate() const {
- return is_candidate() && _votes->is_prevote();
+ return is_candidate() && std::get<candidate>(_state).is_prevote;
}
index_t log_last_idx() const {
return _log.last_idx();
diff --git a/raft/tracker.hh b/raft/tracker.hh
index 084320aa9e..0ddecef215 100644
--- a/raft/tracker.hh
+++ b/raft/tracker.hh
@@ -176,9 +176,8 @@ class votes {
server_address_set _voters;
election_tracker _current;
std::optional<election_tracker> _previous;
- bool _is_prevote;
public:
- votes(configuration configuration, bool is_prevote = false);
+ votes(configuration configuration);

const server_address_set& voters() const {
return _voters;
@@ -186,9 +185,6 @@ class votes {

void register_vote(server_id from, bool granted);
vote_result tally_votes() const;
- bool is_prevote() const {
- return _is_prevote;
- };

friend std::ostream& operator<<(std::ostream& os, const votes& v);
};
diff --git a/raft/fsm.cc b/raft/fsm.cc
index 143b23f21f..f2352cba62 100644
--- a/raft/fsm.cc
+++ b/raft/fsm.cc
- _tracker = std::nullopt;
_log_limiter_semaphore = std::nullopt;
- _votes = std::nullopt;
if (_current_leader) {
_last_election_time = _clock.now();
}
@@ -164,11 +159,10 @@ void fsm::become_candidate(bool is_prevote) {
// disruptive server prevention will stall an election if quorum of nodes
// start election together since each one will ignore vote requests from others
_current_leader = {};
- _state = candidate{};
+ _state = candidate(_log.get_configuration(), is_prevote);

reset_election_timeout();

- _tracker = std::nullopt;
_log_limiter_semaphore = std::nullopt;
// 3.4 Leader election
//
@@ -180,9 +174,9 @@ void fsm::become_candidate(bool is_prevote) {
// and initiating another round of RequestVote RPCs.
_last_election_time = _clock.now();

- _votes.emplace(_log.get_configuration(), is_prevote);
+ auto& votes = candidate_state().votes;

- const auto& voters = _votes->voters();
+ const auto& voters = votes.voters();
if (!voters.contains(server_address{_my_id})) {
// If the server is not part of the current configuration,
// revert to the follower state without increasing
@@ -199,7 +193,7 @@ void fsm::become_candidate(bool is_prevote) {
for (const auto& server : voters) {
if (server.id == _my_id) {
// Vote for self.
- _votes->register_vote(server.id, true);
+ votes.register_vote(server.id, true);
if (!is_prevote) {
// Only record real votes
_voted_for = _my_id;
@@ -212,7 +206,7 @@ void fsm::become_candidate(bool is_prevote) {

send_to(server.id, vote_request{term, _log.last_idx(), _log.last_term(), is_prevote});
}
- if (_votes->tally_votes() == vote_result::WON) {
+ if (votes.tally_votes() == vote_result::WON) {
// A single node cluster.
if (is_prevote) {
become_candidate(false);
case vote_result::UNKNOWN:
break;
case vote_result::WON:
diff --git a/raft/tracker.cc b/raft/tracker.cc
index eea557d9cc..39a9f75b14 100644
--- a/raft/tracker.cc
+++ b/raft/tracker.cc
@@ -202,9 +202,9 @@ index_t tracker::committed(index_t prev_commit_idx) {
}
}

Tomasz Grabiec

<tgrabiec@scylladb.com>
unread,
Mar 11, 2021, 8:04:49 AM3/11/21
to Gleb Natapov, scylladb-dev
GIT URL please

--
You received this message because you are subscribed to the Google Groups "ScyllaDB development" group.
To unsubscribe from this group and stop receiving emails from it, send an email to scylladb-dev...@googlegroups.com.
To view this discussion on the web visit https://groups.google.com/d/msgid/scylladb-dev/20210311112324.197066-1-gleb%40scylladb.com.

Gleb Natapov

<gleb@scylladb.com>
unread,
Mar 11, 2021, 8:15:05 AM3/11/21
to Tomasz Grabiec, scylladb-dev
On Thu, Mar 11, 2021 at 02:04:36PM +0100, Tomasz Grabiec wrote:
> GIT URL please
>
scylla-dev raft-prevote-v4
--
Gleb.

Tomasz Grabiec

<tgrabiec@scylladb.com>
unread,
Mar 11, 2021, 10:48:38 AM3/11/21
to Gleb Natapov, scylladb-dev
On Thu, Mar 11, 2021 at 12:23 PM Gleb Natapov <gl...@scylladb.com> wrote:
One downside of Raft's leader election algorithm is that a server that
has been partitioned from the cluster is likely to cause a disruption
when it regains connectivity. When a server is partitioned, it will
not receive heartbeats. It will soon increment its term to start
an election, although it won't be able to collect enough votes to
become leader. When the server regains connectivity sometime later, its
larger term number will propagate to the rest of the cluster (either
through the server's RequestVote requests or through its AppendEntries
response). This will force the cluster leader to step down, and a new
election will have to take place to select a new leader.

Isn't this scenario already prevented by the means of this if (for vote request):

if constexpr (std::is_same_v<Message, vote_request>) {
                if (_current_leader != server_id{} && election_elapsed() < ELECTION_TIMEOUT) {
                    // 4.2.3 Disruptive servers
                    // If a server receives a RequestVote request

                    // within the minimum election timeout of
                    // hearing from a current leader, it does not
                    // update its term or grant its vote.
                    logger.trace("{} [term: {}] not granting a vote within a minimum election timeout, elapsed {}",
                        _my_id, _current_term, election_elapsed());
                    return;
                }
            }

Since the non-partitioned nodes hear from the leader via heartbeats, they won't let the disruptive server to start an election. 

Gleb Natapov

<gleb@scylladb.com>
unread,
Mar 11, 2021, 10:55:56 AM3/11/21
to Tomasz Grabiec, scylladb-dev
A vote attempt will be rejected, but a leader will get an elevated term
via append_reply and will turn itself to a follower which will eventually
trigger new election cycle.

BTW the commit message here is verbatim copy from the PhD.

--
Gleb.

Tomasz Grabiec

<tgrabiec@scylladb.com>
unread,
Mar 11, 2021, 11:20:02 AM3/11/21
to Gleb Natapov, scylladb-dev
Perhaps this wording was created with the assumption that this extra protection is not in place. We should adjust it to match our current implementation. 

The PhD has also this in section "4.2.3 Disruptive Servers":

"Because of this scenario, we now believe that no solution based on comparing logs alone (such
as the Pre-Vote check) will be sufficient to tell if an election will be disruptive.
(...)
Raft’s solution uses heartbeats to determine when a valid leader exists. In Raft, a leader is
considered active if it is able to maintain heartbeats to its followers (otherwise, another server will
start an election). Thus, servers should not be able to disrupt a leader whose cluster is receiving
heartbeats. 
(...)
while a leader is able to get heartbeats to its cluster, it will not be deposed
by larger term numbers"

So if we modify append reject handler to ignore larger term if it can still maintain a heartbeat to a quorum, we should not need pre-voting.

This article [1] seems to agree:

"An alternative fix is to require servers to ignore RequestVote RPCs if they have received an AppendEntries RPC from the leader within the election timeout. This was suggested in section 6 in the Raft paper as mitigation for liveness issues during reconfiguration. However, we will focus on PreVote instead."
 



 

Gleb Natapov

<gleb@scylladb.com>
unread,
Mar 11, 2021, 11:30:33 AM3/11/21
to Tomasz Grabiec, scylladb-dev
The existing protection is for entirely different scenario. It is to
protect from a node that may think that it is part of the cluster while
it is already not.

> The PhD has also this in section "4.2.3 Disruptive Servers":
>
> "Because of this scenario, we now believe that no solution based on
> comparing logs alone (such
> as the Pre-Vote check) will be sufficient to tell if an election will be
> disruptive.
> (...)
> Raft’s solution uses heartbeats to determine when a valid leader exists. In
> Raft, a leader is
> considered active if it is able to maintain heartbeats to its followers
> (otherwise, another server will
> start an election). Thus, servers should not be able to disrupt a leader
> whose cluster is receiving
> heartbeats.
> (...)
That part of PhD explains exactly why existing code that you referred
to above is needed and if you read one paragraph above of what you cited
here you will see:

"For example, Figure 4.7 shows a server that is being removed from a
cluster. Once the leader creates the Cnew log entry, the server being
removed could be disruptive"

>
> *while a leader is able to get heartbeats to its cluster, it will not be
> deposed*
> *by larger term numbers*"
Yes, and when it cannot deliver heartbeats to its cluster it can be. And
if a node is partitioned away it cannot.

>
> So if we modify append reject handler to ignore larger term if it can still
> maintain a heartbeat to a quorum, we should not need pre-voting.
Are you doubting the wisdom of the PhD?

>
> This article [1] seems to agree:
>
> "An alternative fix is to require servers to ignore RequestVote RPCs if
> they have received an AppendEntries RPC from the leader within the election
> timeout. This was suggested in section 6 in the Raft paper as mitigation
> for liveness issues during reconfiguration. However, we will focus on
> PreVote instead."
>
How is it the same thing? It describes disruptive server solution
we have in place now. It does not suggest that you can ignore term of
appendEntryReply. If you will do that you will never be able to update
such follower.

--
Gleb.

Konstantin Osipov

<kostja@scylladb.com>
unread,
Mar 11, 2021, 11:55:30 AM3/11/21
to Tomasz Grabiec, Gleb Natapov, scylladb-dev
* Tomasz Grabiec <tgra...@scylladb.com> [21/03/11 19:23]:

Tomasz, this can be tricky to grasp unless you see it happen.

The disruption comes *not* from the server becoming a leader,
but from it starting a terms race.

It becomes a candidate, increment its term, then converts back to
follower, but the bigger term has to be dealt with.

The follower won't decrement its term, so it sends a bigger term
in AppendReply reject to the leader. The leader can't simply bump
its term when it sees a higher one, it has to become a follower
first.

And the leader has to act on this AppendReply reject - otherwise
this follower will never be able to rejoin the cluster.

Does this scenario make sense?
> *while a leader is able to get heartbeats to its cluster, it will not be
> deposed*
> *by larger term numbers*"
>
> So if we modify append reject handler to ignore larger term if it can still
> maintain a heartbeat to a quorum, we should not need pre-voting.
>
> This article [1] seems to agree:
>
> "An alternative fix is to require servers to ignore RequestVote RPCs if
> they have received an AppendEntries RPC from the leader within the election
> timeout. This was suggested in section 6 in the Raft paper as mitigation
> for liveness issues during reconfiguration. However, we will focus on
> PreVote instead."
>
>
> [1]
> https://decentralizedthoughts.github.io/2020-12-12-raft-liveness-full-omission/
>
> --
> You received this message because you are subscribed to the Google Groups "ScyllaDB development" group.
> To unsubscribe from this group and stop receiving emails from it, send an email to scylladb-dev...@googlegroups.com.
> To view this discussion on the web visit https://groups.google.com/d/msgid/scylladb-dev/CAO2XSW5Oy-qV%2B6122BGmya%2Bgo25jZyHqJd9EWJek_TcAGYZKiw%40mail.gmail.com.

Tomasz Grabiec

<tgrabiec@scylladb.com>
unread,
Mar 11, 2021, 12:53:14 PM3/11/21
to Konstantin Osipov, Tomasz Grabiec, Gleb Natapov, scylladb-dev
On Thu, Mar 11, 2021 at 5:55 PM Konstantin Osipov <kos...@scylladb.com> wrote:
* Tomasz Grabiec <tgra...@scylladb.com> [21/03/11 19:23]:

Tomasz, this can be tricky to grasp unless you see it happen.

The disruption comes *not* from the server becoming a leader,
but from it starting a terms race.

It becomes a candidate, increment its term, then converts back to
follower, but the bigger term has to be dealt with.

The follower won't decrement its term, so it sends a bigger term
in AppendReply reject to the leader. The leader can't simply bump
its term when it sees a higher one, it has to become a follower
first.

And the leader has to act on this AppendReply reject - otherwise
this follower will never be able to rejoin the cluster.

Does this scenario make sense?

Yes.

Tomasz Grabiec

<tgrabiec@scylladb.com>
unread,
Mar 11, 2021, 1:08:36 PM3/11/21
to Gleb Natapov, scylladb-dev
Still, the commit message is a bit misleading because the scenario it describes for vote_request cannot currently cause problems.
Well, it doesn't mention appendEntryReply, but it mentions that there is an alternative fix which makes pre-voting unnecessary, and mentions "section 6", so maybe section 6 also somehow implies that appendEntryReply should be changed as well, and we forgot to do that. I thought it did, but on a closer look, it doesn't. So we need pre-voting and the article is wrong in suggesting that there is an alternative fix to pre-voting. 

Gleb Natapov

<gleb@scylladb.com>
unread,
Mar 11, 2021, 1:28:15 PM3/11/21
to Tomasz Grabiec, scylladb-dev
The commit message mentions both vote_request and append_reply. I see
the value in having the commit message be the verbatim citation from
PhD. It is correct that disruptive leader optimization takes care of
vote requests, but why does it matter? What if the patches were committed
in different order?

We may also make each extension optional one day, who knows.
I think when article talks about alternative solution it specifically
means an alternative solution for specific scenario that it describes
earlier. And indeed the scenario is fixed by having only "disruptive
server" optimization. Server 3 will stay to be the leader until
partition heals. But to prevent leader disruption after partition heals
you need pre-vote.

--
Gleb.

Tomasz Grabiec

<tgrabiec@scylladb.com>
unread,
Mar 11, 2021, 3:00:13 PM3/11/21
to Gleb Natapov, scylladb-dev
The commit message should be interpreted relative to the state at the time of commit. It's generally not expected that commit messages still make sense relative to any other version. It's better if they do, but only if it doesn't sacrifice important details. 

It's maybe not a big deal, but it did confuse me when I tried to understand why we need this change, so there's that.

Gleb Natapov

<gleb@scylladb.com>
unread,
Mar 11, 2021, 3:20:32 PM3/11/21
to Tomasz Grabiec, scylladb-dev
Would it be OK to clarify that this is the quote from the PhD and
clarify that vote part is taken care of by something else?

--
Gleb.

Tomasz Grabiec

<tgrabiec@scylladb.com>
unread,
Mar 11, 2021, 3:33:48 PM3/11/21
to Gleb Natapov, scylladb-dev
Yes.

The series LGTM, so just post a new GIT URL.

Gleb Natapov

<gleb@scylladb.com>
unread,
Mar 12, 2021, 4:17:05 AM3/12/21
to Tomasz Grabiec, scylladb-dev
On Thu, Mar 11, 2021 at 09:33:36PM +0100, Tomasz Grabiec wrote:
> > Would it be OK to clarify that this is the quote from the PhD and
> > clarify that vote part is taken care of by something else?
> >
>
> Yes.
>
> The series LGTM, so just post a new GIT URL.
Done:

scylla-dev raft-prevote-v5

--
Gleb.

Commit Bot

<bot@cloudius-systems.com>
unread,
Mar 12, 2021, 5:16:09 AM3/12/21
to scylladb-dev@googlegroups.com, Gleb Natapov
From: Gleb Natapov <gl...@scylladb.com>
Committer: Gleb Natapov <gl...@scylladb.com>
Branch: next

raft: reset the leader on entering candidate state

Not resetting the leader causes vote requests to be ignored instead of
rejected, which makes a voting round take more time to fail and may
slow down the election of a new leader.

---
diff --git a/raft/fsm.cc b/raft/fsm.cc
--- a/raft/fsm.cc
+++ b/raft/fsm.cc
@@ -161,6 +161,10 @@ void fsm::become_follower(server_id leader) {
}

void fsm::become_candidate() {
+ // When starting a campain we need to reset current leader otherwise
+ // disruptive server prevention will stall an election if quorum of nodes
+ // start election together since each one will ignore vote requests from others
+ _current_leader = {};
_state = candidate{};

Commit Bot

<bot@cloudius-systems.com>
unread,
Mar 12, 2021, 5:16:10 AM3/12/21
to scylladb-dev@googlegroups.com, Gleb Natapov
From: Gleb Natapov <gl...@scylladb.com>
Committer: Gleb Natapov <gl...@scylladb.com>
Branch: next

raft: use modern unordered_set::contains instead of find in become_candidate

---
diff --git a/raft/fsm.cc b/raft/fsm.cc
--- a/raft/fsm.cc
+++ b/raft/fsm.cc
@@ -177,7 +177,7 @@ void fsm::become_candidate() {
_votes.emplace(_log.get_configuration());

const auto& voters = _votes->voters();
- if (voters.find(server_address{_my_id}) == voters.end()) {
+ if (!voters.contains(server_address{_my_id})) {
// If the server is not part of the current configuration,
// revert to the follower state without increasing
// the current term.

Commit Bot

<bot@cloudius-systems.com>
unread,
Mar 12, 2021, 5:16:11 AM3/12/21
to scylladb-dev@googlegroups.com, Gleb Natapov
From: Gleb Natapov <gl...@scylladb.com>
Committer: Gleb Natapov <gl...@scylladb.com>
Branch: next

raft: implement prevoting stage in leader election

This is how the PhD thesis explains the need for the prevoting stage:

One downside of Raft's leader election algorithm is that a server that
has been partitioned from the cluster is likely to cause a disruption
when it regains connectivity. When a server is partitioned, it will
not receive heartbeats. It will soon increment its term to start
an election, although it won't be able to collect enough votes to
become leader. When the server regains connectivity sometime later, its
larger term number will propagate to the rest of the cluster (either
through the server's RequestVote requests or through its AppendEntries
response). This will force the cluster leader to step down, and a new
election will have to take place to select a new leader.

Prevoting stage is addressing that. In the Prevote algorithm, a
candidate only increments its term if it first learns from a majority of
the cluster that they would be willing to grant the candidate their votes
(if the candidate's log is sufficiently up-to-date, and the voters have
not received heartbeats from a valid leader for at least a baseline
election timeout).

The Prevote algorithm solves the issue of a partitioned server disrupting
the cluster when it rejoins. While a server is partitioned, it won't
be able to increment its term, since it can't receive permission
from a majority of the cluster. Then, when it rejoins the cluster, it
still won't be able to increment its term, since the other servers
will have been receiving regular heartbeats from the leader. Once the
server receives a heartbeat from the leader itself, it will return to
the follower state (in the same term).

In our implementation we have a "stable leader" extension that prevents
a spurious RequestVote from deposing an active leader, but an AppendEntries
with a higher term will still do that, so the prevoting extension is also
required.

---
diff --git a/idl/raft.idl.hh b/idl/raft.idl.hh
--- a/idl/raft.idl.hh
+++ b/idl/raft.idl.hh
@@ -62,11 +62,13 @@ struct vote_request {
raft::internal::tagged_uint64<raft::term_tag> current_term;
raft::internal::tagged_uint64<raft::index_tag> last_log_idx;
raft::internal::tagged_uint64<raft::term_tag> last_log_term;
+ bool is_prevote;
};

struct vote_reply {
raft::internal::tagged_uint64<raft::term_tag> current_term;
bool vote_granted;
+ bool is_prevote;
};

struct install_snapshot {
diff --git a/raft/fsm.cc b/raft/fsm.cc
--- a/raft/fsm.cc
+++ b/raft/fsm.cc
@@ -31,6 +31,7 @@ fsm::fsm(server_id id, term_t current_term, server_id voted_for, log log,

_observed.advance(*this);
logger.trace("{}: starting log length {}", _my_id, _log.last_idx());
+ reset_election_timeout();

assert(!bool(_current_leader));
}
@@ -118,13 +119,11 @@ void fsm::update_current_term(term_t current_term)
assert(_current_term < current_term);
_current_term = current_term;
_voted_for = server_id{};
+}

+void fsm::reset_election_timeout() {
static thread_local std::default_random_engine re{std::random_device{}()};
static thread_local std::uniform_int_distribution<> dist(1, ELECTION_TIMEOUT.count());
- // Reset the randomized election timeout on each term
- // change, even if we do not plan to campaign during this
- // term: the main purpose of the timeout is to avoid
- // starting our campaign simultaneously with other followers.
_randomized_election_timeout = ELECTION_TIMEOUT + logical_clock::duration{dist(re)};
}

@@ -160,12 +159,15 @@ void fsm::become_follower(server_id leader) {
}
}

-void fsm::become_candidate() {
+void fsm::become_candidate(bool is_prevote) {
// When starting a campain we need to reset current leader otherwise
// disruptive server prevention will stall an election if quorum of nodes
// start election together since each one will ignore vote requests from others
_current_leader = {};
_state = candidate{};
+
+ reset_election_timeout();
+
_tracker = std::nullopt;
_log_limiter_semaphore = std::nullopt;
// 3.4 Leader election
@@ -178,7 +180,7 @@ void fsm::become_candidate() {
// and initiating another round of RequestVote RPCs.
_last_election_time = _clock.now();

- _votes.emplace(_log.get_configuration());
+ _votes.emplace(_log.get_configuration(), is_prevote);

const auto& voters = _votes->voters();
if (!voters.contains(server_address{_my_id})) {
@@ -188,24 +190,35 @@ void fsm::become_candidate() {
become_follower(server_id{});
return;
}
- update_current_term(term_t{_current_term + 1});
+
+ term_t term{_current_term + 1};
+ if (!is_prevote) {
+ update_current_term(term);
+ }
// Replicate RequestVote
for (const auto& server : voters) {
if (server.id == _my_id) {
// Vote for self.
_votes->register_vote(server.id, true);
- _voted_for = _my_id;
+ if (!is_prevote) {
+ // Only record real votes
+ _voted_for = _my_id;
+ }
// Already signaled _sm_events in update_current_term()
continue;
}
- logger.trace("{} [term: {}, index: {}, last log term: {}] sent vote request to {}",
- _my_id, _current_term, _log.last_idx(), _log.last_term(), server.id);
+ logger.trace("{} [term: {}, index: {}, last log term: {}{}] sent vote request to {}",
+ _my_id, term, _log.last_idx(), _log.last_term(), is_prevote ? ", prevote" : "", server.id);

- send_to(server.id, vote_request{_current_term, _log.last_idx(), _log.last_term()});
+ send_to(server.id, vote_request{term, _log.last_idx(), _log.last_term(), is_prevote});
}
if (_votes->tally_votes() == vote_result::WON) {
// A single node cluster.
@@ -597,7 +625,11 @@ void fsm::request_vote_reply(server_id from, vote_reply&& reply) {
case vote_result::UNKNOWN:
break;
case vote_result::WON:
- become_leader();
+ if (_votes->is_prevote()) {
+ become_candidate(false);
+ } else {
+ become_leader();
+ }
break;
case vote_result::LOST:
become_follower(server_id{});
diff --git a/raft/fsm.hh b/raft/fsm.hh
--- a/raft/fsm.hh
+++ b/raft/fsm.hh
@@ -48,6 +48,8 @@ struct fsm_config {
// is configured by the snapshot, otherwise the state
// machine will deadlock.
size_t max_log_size;
+ // If set to true will enable prevoting stage during election
+ bool enable_prevoting;
};

// 3.4 Leader election
@@ -223,7 +225,7 @@ private:

void become_leader();

- void become_candidate();
+ void become_candidate(bool is_prevote);

void become_follower(server_id leader);

@@ -250,6 +252,8 @@ private:
// Tick implementation on a leader
void tick_leader();

+ void reset_election_timeout();
+
public:
explicit fsm(server_id id, term_t current_term, server_id voted_for, log log,
failure_detector& failure_detector, fsm_config conf);
@@ -353,11 +357,13 @@ void fsm::step(server_id from, Message&& msg) {
// follower state. If a server receives a request with
// a stale term number, it rejects the request.
if (msg.current_term > _current_term) {
+ server_id leader{};
+
logger.trace("{} [term: {}] received a message with higher term from {} [term: {}]",
_my_id, _current_term, from, msg.current_term);

if constexpr (std::is_same_v<Message, append_request>) {
- become_follower(from);
+ leader = from;
} else {
if constexpr (std::is_same_v<Message, vote_request>) {
if (_current_leader != server_id{} && election_elapsed() < ELECTION_TIMEOUT) {
@@ -366,22 +372,40 @@ void fsm::step(server_id from, Message&& msg) {
// within the minimum election timeout of
// hearing from a current leader, it does not
// update its term or grant its vote.
append_reply reply{_current_term, _commit_idx, append_reply::rejected{msg.prev_log_idx, _log.last_idx()}};
send_to(from, std::move(reply));
} else if constexpr (std::is_same_v<Message, install_snapshot>) {
send_to(from, snapshot_reply{ .success = false });
+ } else if constexpr (std::is_same_v<Message, vote_request>) {
+ if (msg.is_prevote) {
+ send_to(from, vote_reply{_current_term, false, true});
+ }
} else {
// Ignore other cases
logger.trace("{} [term: {}] ignored a message with lower term from {} [term: {}]",
diff --git a/raft/raft.hh b/raft/raft.hh
--- a/raft/raft.hh
+++ b/raft/raft.hh
@@ -259,13 +259,17 @@ struct vote_request {
index_t last_log_idx;
// The term of the candidate's last log entry.
term_t last_log_term;
+ // True if this is prevote request
+ bool is_prevote;
};

struct vote_reply {
// Current term, for the candidate to update itself.
term_t current_term;
// True means the candidate received a vote.
bool vote_granted;
+ // True if it is a reply to prevote request
+ bool is_prevote;
};

struct install_snapshot {
diff --git a/raft/server.cc b/raft/server.cc
--- a/raft/server.cc
+++ b/raft/server.cc
@@ -198,7 +198,8 @@ future<> server_impl::start() {
_fsm = std::make_unique<fsm>(_id, term, vote, std::move(log), *_failure_detector,
fsm_config {
.append_request_threshold = _config.append_request_threshold,
- .max_log_size = _config.max_log_size
+ .max_log_size = _config.max_log_size,
+ .enable_prevoting = _config.enable_prevoting
});
assert(_fsm->get_current_term() != term_t(0));

@@ -604,7 +605,7 @@ void server_impl::register_metrics() {
}

future<> server_impl::elect_me_leader() {
- while (!_fsm->is_candidate() && !_fsm->is_leader()) {
+ while (_fsm->is_follower()) {
_fsm->tick();
}
do {
diff --git a/raft/server.hh b/raft/server.hh
--- a/raft/server.hh
+++ b/raft/server.hh
@@ -46,6 +46,8 @@ public:
// is configured by the snapshot, otherwise the state
// machine will deadlock on attempt to submit a new entry.
size_t max_log_size = 5000;
+ // If set to true will enable prevoting stage during election
+ bool enable_prevoting = true;
};

virtual ~server() {}
diff --git a/raft/tracker.cc b/raft/tracker.cc
--- a/raft/tracker.cc
+++ b/raft/tracker.cc
@@ -202,9 +202,9 @@ index_t tracker::committed(index_t prev_commit_idx) {
}
}

-votes::votes(configuration configuration)
+votes::votes(configuration configuration, bool is_prevote)
:_voters(configuration.current)
- , _current(configuration.current) {
+ , _current(configuration.current), _is_prevote(is_prevote) {

if (configuration.is_joint()) {
_previous.emplace(configuration.previous);
diff --git a/raft/tracker.hh b/raft/tracker.hh
--- a/raft/tracker.hh
+++ b/raft/tracker.hh
@@ -176,15 +176,19 @@ class votes {
server_address_set _voters;
election_tracker _current;
std::optional<election_tracker> _previous;
+ bool _is_prevote;
public:
- votes(configuration configuration);
+ votes(configuration configuration, bool is_prevote = false);

const server_address_set& voters() const {
return _voters;
}

void register_vote(server_id from, bool granted);
vote_result tally_votes() const;
+ bool is_prevote() const {
+ return _is_prevote;
+ };

Commit Bot

<bot@cloudius-systems.com>
unread,
Mar 12, 2021, 5:16:12 AM3/12/21
to scylladb-dev@googlegroups.com, Gleb Natapov
From: Gleb Natapov <gl...@scylladb.com>
Committer: Gleb Natapov <gl...@scylladb.com>
Branch: next

raft: add boost tests for prevoting

---
diff --git a/raft/fsm.hh b/raft/fsm.hh
--- a/raft/fsm.hh
+++ b/raft/fsm.hh
@@ -267,6 +267,9 @@ public:
bool is_candidate() const {
return std::holds_alternative<candidate>(_state);
}
+ bool is_prevote_candidate() const {
+ return is_candidate() && _votes->is_prevote();
+ }
index_t log_last_idx() const {
return _log.last_idx();
}
diff --git a/test/boost/raft_fsm_test.cc b/test/boost/raft_fsm_test.cc

Commit Bot

<bot@cloudius-systems.com>
unread,
Mar 12, 2021, 5:16:13 AM3/12/21
to scylladb-dev@googlegroups.com, Gleb Natapov
From: Gleb Natapov <gl...@scylladb.com>
Committer: Gleb Natapov <gl...@scylladb.com>
Branch: next

raft: store leader and candidate state in state variant

We already have server-state-dependent state in the fsm, so there is no need
to maintain the "voters" and "tracker" optionals as well. The upside is that
the optional and variant states cannot drift apart now.

---
diff --git a/raft/fsm.cc b/raft/fsm.cc
--- a/raft/fsm.cc
+++ b/raft/fsm.cc
- _tracker = std::nullopt;
_log_limiter_semaphore = std::nullopt;
- _votes = std::nullopt;
if (_current_leader) {
_last_election_time = _clock.now();
}
@@ -164,11 +159,10 @@ void fsm::become_candidate(bool is_prevote) {
// disruptive server prevention will stall an election if quorum of nodes
// start election together since each one will ignore vote requests from others
_current_leader = {};
- _state = candidate{};
+ _state = candidate(_log.get_configuration(), is_prevote);

reset_election_timeout();

- _tracker = std::nullopt;
_log_limiter_semaphore = std::nullopt;
// 3.4 Leader election
//
@@ -180,9 +174,9 @@ void fsm::become_candidate(bool is_prevote) {
// and initiating another round of RequestVote RPCs.
_last_election_time = _clock.now();

- _votes.emplace(_log.get_configuration(), is_prevote);
+ auto& votes = candidate_state().votes;

- const auto& voters = _votes->voters();
+ const auto& voters = votes.voters();
if (!voters.contains(server_address{_my_id})) {
// If the server is not part of the current configuration,
// revert to the follower state without increasing
@@ -199,7 +193,7 @@ void fsm::become_candidate(bool is_prevote) {
for (const auto& server : voters) {
if (server.id == _my_id) {
// Vote for self.
- _votes->register_vote(server.id, true);
+ votes.register_vote(server.id, true);
if (!is_prevote) {
// Only record real votes
_voted_for = _my_id;
@@ -212,7 +206,7 @@ void fsm::become_candidate(bool is_prevote) {

send_to(server.id, vote_request{term, _log.last_idx(), _log.last_term(), is_prevote});
}
- if (_votes->tally_votes() == vote_result::WON) {
+ if (votes.tally_votes() == vote_result::WON) {
// A single node cluster.
if (is_prevote) {
become_candidate(false);
@@ -307,16 +301,16 @@ void fsm::advance_stable_idx(index_t idx) {
// If this server is leader and is part of the current
// configuration, update it's progress and optionally
// commit new entries.
- if (is_leader() && _tracker->leader_progress()) {
- _tracker->leader_progress()->accepted(idx);
+ if (is_leader() && leader_state().tracker.leader_progress()) {
+ leader_state().tracker.leader_progress()->accepted(idx);
replicate();
maybe_commit();
}
}

case vote_result::UNKNOWN:
break;
case vote_result::WON:
diff --git a/raft/fsm.hh b/raft/fsm.hh
--- a/raft/fsm.hh
+++ b/raft/fsm.hh
@@ -254,6 +259,23 @@ private:

void reset_election_timeout();

+ candidate& candidate_state() {
+ return std::get<candidate>(_state);
+ }
+
+ const candidate& candidate_state() const {
+ return std::get<candidate>(_state);
+ }
+
+protected: // For testing
+ leader& leader_state() {
+ return std::get<leader>(_state);
+ }
+
+ const leader& leader_state() const {
+ return std::get<leader>(_state);
+ }
+
public:
explicit fsm(server_id id, term_t current_term, server_id voted_for, log log,
failure_detector& failure_detector, fsm_config conf);
@@ -268,7 +290,7 @@ public:
return std::holds_alternative<candidate>(_state);
}
bool is_prevote_candidate() const {
- return is_candidate() && _votes->is_prevote();
+ return is_candidate() && std::get<candidate>(_state).is_prevote;
}
index_t log_last_idx() const {
return _log.last_idx();
diff --git a/raft/tracker.cc b/raft/tracker.cc
--- a/raft/tracker.cc
+++ b/raft/tracker.cc
@@ -202,9 +202,9 @@ index_t tracker::committed(index_t prev_commit_idx) {
}
}

-votes::votes(configuration configuration, bool is_prevote)
+votes::votes(configuration configuration)
:_voters(configuration.current)
- , _current(configuration.current), _is_prevote(is_prevote) {
+ , _current(configuration.current) {

if (configuration.is_joint()) {
_previous.emplace(configuration.previous);
diff --git a/raft/tracker.hh b/raft/tracker.hh
--- a/raft/tracker.hh
+++ b/raft/tracker.hh
@@ -176,19 +176,15 @@ class votes {
server_address_set _voters;
election_tracker _current;
std::optional<election_tracker> _previous;
- bool _is_prevote;
public:
- votes(configuration configuration, bool is_prevote = false);
+ votes(configuration configuration);

const server_address_set& voters() const {
return _voters;
}

void register_vote(server_id from, bool granted);
vote_result tally_votes() const;
- bool is_prevote() const {
- return _is_prevote;
- };

friend std::ostream& operator<<(std::ostream& os, const votes& v);
};
diff --git a/test/boost/raft_etcd_test.cc b/test/boost/raft_etcd_test.cc

Commit Bot

<bot@cloudius-systems.com>
unread,
Mar 12, 2021, 10:31:36 AM3/12/21
to scylladb-dev@googlegroups.com, Gleb Natapov
From: Gleb Natapov <gl...@scylladb.com>
Committer: Gleb Natapov <gl...@scylladb.com>
Branch: master

raft: use modern unordered_set::contains instead of find in become_candidate

---
diff --git a/raft/fsm.cc b/raft/fsm.cc
--- a/raft/fsm.cc
+++ b/raft/fsm.cc
@@ -177,7 +177,7 @@ void fsm::become_candidate() {
_votes.emplace(_log.get_configuration());

const auto& voters = _votes->voters();
- if (voters.find(server_address{_my_id}) == voters.end()) {
+ if (!voters.contains(server_address{_my_id})) {
// If the server is not part of the current configuration,
// revert to the follower state without increasing
// the current term.

Commit Bot

<bot@cloudius-systems.com>
unread,
Mar 12, 2021, 10:31:38 AM3/12/21
to scylladb-dev@googlegroups.com, Gleb Natapov
From: Gleb Natapov <gl...@scylladb.com>
Committer: Gleb Natapov <gl...@scylladb.com>
Branch: master

raft: reset the leader on entering candidate state

Not resetting the leader causes vote requests to be ignored instead of
rejected, which makes the voting round take more time to fail and may
slow down the new leader election.

---
diff --git a/raft/fsm.cc b/raft/fsm.cc
--- a/raft/fsm.cc
+++ b/raft/fsm.cc
@@ -161,6 +161,10 @@ void fsm::become_follower(server_id leader) {
}

void fsm::become_candidate() {
+ // When starting a campain we need to reset current leader otherwise
+ // disruptive server prevention will stall an election if quorum of nodes
+ // start election together since each one will ignore vote requests from others
+ _current_leader = {};
_state = candidate{};

Commit Bot

<bot@cloudius-systems.com>
unread,
Mar 12, 2021, 10:31:39 AM3/12/21
to scylladb-dev@googlegroups.com, Gleb Natapov
From: Gleb Natapov <gl...@scylladb.com>
Committer: Gleb Natapov <gl...@scylladb.com>
Branch: master

diff --git a/raft/fsm.cc b/raft/fsm.cc
--- a/raft/fsm.cc
+++ b/raft/fsm.cc
@@ -31,6 +31,7 @@ fsm::fsm(server_id id, term_t current_term, server_id voted_for, log log,

_observed.advance(*this);
logger.trace("{}: starting log length {}", _my_id, _log.last_idx());
+ reset_election_timeout();

assert(!bool(_current_leader));
}
@@ -118,13 +119,11 @@ void fsm::update_current_term(term_t current_term)
assert(_current_term < current_term);
_current_term = current_term;
_voted_for = server_id{};
+}

+void fsm::reset_election_timeout() {
static thread_local std::default_random_engine re{std::random_device{}()};
static thread_local std::uniform_int_distribution<> dist(1, ELECTION_TIMEOUT.count());
- // Reset the randomized election timeout on each term
- // change, even if we do not plan to campaign during this
- // term: the main purpose of the timeout is to avoid
- // starting our campaign simultaneously with other followers.
_randomized_election_timeout = ELECTION_TIMEOUT + logical_clock::duration{dist(re)};
}

@@ -160,12 +159,15 @@ void fsm::become_follower(server_id leader) {
}
}

-void fsm::become_candidate() {
+void fsm::become_candidate(bool is_prevote) {
// When starting a campain we need to reset current leader otherwise
// disruptive server prevention will stall an election if quorum of nodes
// start election together since each one will ignore vote requests from others
_current_leader = {};
_state = candidate{};
+
+ reset_election_timeout();
+
_tracker = std::nullopt;
_log_limiter_semaphore = std::nullopt;
// 3.4 Leader election
@@ -178,7 +180,7 @@ void fsm::become_candidate() {
// and initiating another round of RequestVote RPCs.
_last_election_time = _clock.now();

- _votes.emplace(_log.get_configuration());
+ _votes.emplace(_log.get_configuration(), is_prevote);

const auto& voters = _votes->voters();
if (!voters.contains(server_address{_my_id})) {
@@ -188,24 +190,35 @@ void fsm::become_candidate() {
become_follower(server_id{});
return;
}
- update_current_term(term_t{_current_term + 1});
+
+ term_t term{_current_term + 1};
+ if (!is_prevote) {
+ update_current_term(term);
+ }
// Replicate RequestVote
for (const auto& server : voters) {
if (server.id == _my_id) {
// Vote for self.
_votes->register_vote(server.id, true);
- _voted_for = _my_id;
+ if (!is_prevote) {
+ // Only record real votes
+ _voted_for = _my_id;
+ }
// Already signaled _sm_events in update_current_term()
continue;
}
- logger.trace("{} [term: {}, index: {}, last log term: {}] sent vote request to {}",
- _my_id, _current_term, _log.last_idx(), _log.last_term(), server.id);
+ logger.trace("{} [term: {}, index: {}, last log term: {}{}] sent vote request to {}",
+ _my_id, term, _log.last_idx(), _log.last_term(), is_prevote ? ", prevote" : "", server.id);

- send_to(server.id, vote_request{_current_term, _log.last_idx(), _log.last_term()});
+ send_to(server.id, vote_request{term, _log.last_idx(), _log.last_term(), is_prevote});
}
if (_votes->tally_votes() == vote_result::WON) {
// A single node cluster.
@@ -597,7 +625,11 @@ void fsm::request_vote_reply(server_id from, vote_reply&& reply) {
case vote_result::UNKNOWN:
break;
case vote_result::WON:
- become_leader();
+ if (_votes->is_prevote()) {
+ become_candidate(false);
+ } else {
+ become_leader();
+ }
break;
case vote_result::LOST:
become_follower(server_id{});
diff --git a/raft/fsm.hh b/raft/fsm.hh
--- a/raft/fsm.hh
+++ b/raft/fsm.hh
@@ -48,6 +48,8 @@ struct fsm_config {
// is configured by the snapshot, otherwise the state
// machine will deadlock.
size_t max_log_size;
+ // If set to true will enable prevoting stage during election
+ bool enable_prevoting;
};

// 3.4 Leader election
@@ -223,7 +225,7 @@ private:

void become_leader();

- void become_candidate();
+ void become_candidate(bool is_prevote);

void become_follower(server_id leader);

@@ -250,6 +252,8 @@ private:
// Tick implementation on a leader
void tick_leader();

+ void reset_election_timeout();
+
public:
explicit fsm(server_id id, term_t current_term, server_id voted_for, log log,
failure_detector& failure_detector, fsm_config conf);
diff --git a/raft/tracker.cc b/raft/tracker.cc
--- a/raft/tracker.cc
+++ b/raft/tracker.cc
@@ -202,9 +202,9 @@ index_t tracker::committed(index_t prev_commit_idx) {
}
}

-votes::votes(configuration configuration)
+votes::votes(configuration configuration, bool is_prevote)
:_voters(configuration.current)
- , _current(configuration.current) {
+ , _current(configuration.current), _is_prevote(is_prevote) {

if (configuration.is_joint()) {
_previous.emplace(configuration.previous);
diff --git a/raft/tracker.hh b/raft/tracker.hh
--- a/raft/tracker.hh
+++ b/raft/tracker.hh
@@ -176,15 +176,19 @@ class votes {
server_address_set _voters;
election_tracker _current;
std::optional<election_tracker> _previous;
+ bool _is_prevote;
public:
- votes(configuration configuration);
+ votes(configuration configuration, bool is_prevote = false);

const server_address_set& voters() const {
return _voters;
}

void register_vote(server_id from, bool granted);
vote_result tally_votes() const;
+ bool is_prevote() const {
+ return _is_prevote;
+ };

Commit Bot

<bot@cloudius-systems.com>
unread,
Mar 12, 2021, 10:31:41 AM3/12/21
to scylladb-dev@googlegroups.com, Gleb Natapov
From: Gleb Natapov <gl...@scylladb.com>
Committer: Gleb Natapov <gl...@scylladb.com>
Branch: master

raft: store leader and candidate state in state variant

We already have server-state-dependent state in the fsm, so there is no need
to maintain the "voters" and "tracker" optionals as well. The upside is that
the optional and variant states cannot drift apart now.

---
diff --git a/raft/fsm.cc b/raft/fsm.cc
--- a/raft/fsm.cc
+++ b/raft/fsm.cc
- _tracker = std::nullopt;
_log_limiter_semaphore = std::nullopt;
- _votes = std::nullopt;
if (_current_leader) {
_last_election_time = _clock.now();
}
@@ -164,11 +159,10 @@ void fsm::become_candidate(bool is_prevote) {
// disruptive server prevention will stall an election if quorum of nodes
// start election together since each one will ignore vote requests from others
_current_leader = {};
- _state = candidate{};
+ _state = candidate(_log.get_configuration(), is_prevote);

reset_election_timeout();

- _tracker = std::nullopt;
_log_limiter_semaphore = std::nullopt;
// 3.4 Leader election
//
@@ -180,9 +174,9 @@ void fsm::become_candidate(bool is_prevote) {
// and initiating another round of RequestVote RPCs.
_last_election_time = _clock.now();

- _votes.emplace(_log.get_configuration(), is_prevote);
+ auto& votes = candidate_state().votes;

- const auto& voters = _votes->voters();
+ const auto& voters = votes.voters();
if (!voters.contains(server_address{_my_id})) {
// If the server is not part of the current configuration,
// revert to the follower state without increasing
@@ -199,7 +193,7 @@ void fsm::become_candidate(bool is_prevote) {
for (const auto& server : voters) {
if (server.id == _my_id) {
// Vote for self.
- _votes->register_vote(server.id, true);
+ votes.register_vote(server.id, true);
if (!is_prevote) {
// Only record real votes
_voted_for = _my_id;
@@ -212,7 +206,7 @@ void fsm::become_candidate(bool is_prevote) {

send_to(server.id, vote_request{term, _log.last_idx(), _log.last_term(), is_prevote});
}
- if (_votes->tally_votes() == vote_result::WON) {
+ if (votes.tally_votes() == vote_result::WON) {
// A single node cluster.
case vote_result::UNKNOWN:
break;
case vote_result::WON:
diff --git a/raft/fsm.hh b/raft/fsm.hh
--- a/raft/fsm.hh
+++ b/raft/fsm.hh
+
public:
explicit fsm(server_id id, term_t current_term, server_id voted_for, log log,
failure_detector& failure_detector, fsm_config conf);
@@ -268,7 +290,7 @@ public:
return std::holds_alternative<candidate>(_state);
}
bool is_prevote_candidate() const {
- return is_candidate() && _votes->is_prevote();
+ return is_candidate() && std::get<candidate>(_state).is_prevote;
}
index_t log_last_idx() const {
return _log.last_idx();
diff --git a/raft/tracker.cc b/raft/tracker.cc
--- a/raft/tracker.cc
+++ b/raft/tracker.cc
@@ -202,9 +202,9 @@ index_t tracker::committed(index_t prev_commit_idx) {
}
}

-votes::votes(configuration configuration, bool is_prevote)
+votes::votes(configuration configuration)
:_voters(configuration.current)
- , _current(configuration.current), _is_prevote(is_prevote) {
+ , _current(configuration.current) {

if (configuration.is_joint()) {
_previous.emplace(configuration.previous);
diff --git a/raft/tracker.hh b/raft/tracker.hh
--- a/raft/tracker.hh
+++ b/raft/tracker.hh
@@ -176,19 +176,15 @@ class votes {
server_address_set _voters;
election_tracker _current;
std::optional<election_tracker> _previous;
- bool _is_prevote;
public:
- votes(configuration configuration, bool is_prevote = false);
+ votes(configuration configuration);

const server_address_set& voters() const {
return _voters;
}

void register_vote(server_id from, bool granted);
vote_result tally_votes() const;
- bool is_prevote() const {
- return _is_prevote;
- };

friend std::ostream& operator<<(std::ostream& os, const votes& v);
};

Commit Bot

<bot@cloudius-systems.com>
unread,
Mar 12, 2021, 10:31:42 AM3/12/21
to scylladb-dev@googlegroups.com, Gleb Natapov
From: Gleb Natapov <gl...@scylladb.com>
Committer: Gleb Natapov <gl...@scylladb.com>
Branch: master

raft: add boost tests for prevoting

---
diff --git a/raft/fsm.hh b/raft/fsm.hh
--- a/raft/fsm.hh
+++ b/raft/fsm.hh
@@ -267,6 +267,9 @@ public:
bool is_candidate() const {
return std::holds_alternative<candidate>(_state);
}
+ bool is_prevote_candidate() const {
+ return is_candidate() && _votes->is_prevote();
+ }
index_t log_last_idx() const {
return _log.last_idx();
}
Reply all
Reply to author
Forward
0 new messages