Since the gossiper works on host ids now it is incorrect to leave this
function to work on ip. It makes it impossible to delete outdated entry
since the "gossiper.get_host_id(endpoint) != id" check will always be
false for such entries (get_host_id() always returns most up -to-date
mapping.
---
test/cluster/test_gossiper_orphan_remover.py | 3 ++-
gms/gossiper.hh | 2 +-
api/gossiper.cc | 2 +-
gms/gossiper.cc | 19 +++++++++----------
service/storage_service.cc | 9 ++++-----
5 files changed, 17 insertions(+), 18 deletions(-)
diff --git a/test/cluster/test_gossiper_orphan_remover.py b/test/cluster/test_gossiper_orphan_remover.py
index 54afc28c246..954526bfbe6 100644
--- a/test/cluster/test_gossiper_orphan_remover.py
+++ b/test/cluster/test_gossiper_orphan_remover.py
@@ -50,7 +50,8 @@ async def test_crashed_node_substitution(manager: ManagerClient):
[await manager.api.message_injection(s.ip_addr, 'fast_orphan_removal_fiber') for s in servers]
log = await manager.server_open_log(servers[0].server_id)
- await log.wait_for(f"Finished to force remove node {orphan_ip}")
+ failed_id = await manager.get_host_id(failed_server.server_id)
+ await log.wait_for(f"Finished to force remove node {failed_id}")
post_wait_live_eps = await manager.api.client.get_json("/gossiper/endpoint/live", host=servers[0].ip_addr)
post_wait_down_eps = await manager.api.client.get_json("/gossiper/endpoint/down", host=servers[0].ip_addr)
diff --git a/gms/gossiper.hh b/gms/gossiper.hh
index 218b00864dd..1d1f984cee9 100644
--- a/gms/gossiper.hh
+++ b/gms/gossiper.hh
@@ -347,7 +347,7 @@ class gossiper : public seastar::async_sharded_service<gossiper>, public seastar
*/
future<> remove_endpoint(locator::host_id endpoint, permit_id);
// Returns true if an endpoint was removed
- future<bool> force_remove_endpoint(inet_address endpoint, locator::host_id id, permit_id);
+ future<> force_remove_endpoint(locator::host_id id, permit_id);
private:
/**
* Quarantines the endpoint for QUARANTINE_DELAY
diff --git a/api/gossiper.cc b/api/gossiper.cc
index 5f5d03066cb..b1c756c8538 100644
--- a/api/gossiper.cc
+++ b/api/gossiper.cc
@@ -60,7 +60,7 @@ void set_gossiper(http_context& ctx, routes& r, gms::gossiper& g) {
httpd::gossiper_json::force_remove_endpoint.set(r, [&g](std::unique_ptr<http::request> req) {
gms::inet_address ep(req->get_path_param("addr"));
- return g.force_remove_endpoint(ep, g.get_host_id(ep), gms::null_permit_id).then([] (bool) {
+ return g.force_remove_endpoint(g.get_host_id(ep), gms::null_permit_id).then([] () {
return make_ready_future<json::json_return_type>(json_void());
});
});
diff --git a/gms/gossiper.cc b/gms/gossiper.cc
index 828e9cc11dd..582f6403ae1 100644
--- a/gms/gossiper.cc
+++ b/gms/gossiper.cc
@@ -682,24 +682,23 @@ future<> gossiper::apply_state_locally(std::map<inet_address, endpoint_state> ma
std::chrono::steady_clock::now() - start).count());
}
-future<bool> gossiper::force_remove_endpoint(inet_address endpoint, locator::host_id id, permit_id pid) {
- return container().invoke_on(0, [this, endpoint, pid, id] (auto& gossiper) mutable -> future<bool> {
+future<> gossiper::force_remove_endpoint(locator::host_id id, permit_id pid) {
+ return container().invoke_on(0, [this, pid, id] (auto& gossiper) mutable -> future<> {
auto permit = co_await gossiper.lock_endpoint(id, pid);
pid =
permit.id();
try {
- if (gossiper.get_host_id(endpoint) != id) {
- co_return false;
+ if (id == my_host_id()) {
+ throw std::runtime_error(format("Can not force remove node {} itself", id));
}
- if (endpoint == get_broadcast_address()) {
- throw std::runtime_error(format("Can not force remove node {} itself", endpoint));
+ if (!gossiper._endpoint_state_map.contains(id)) {
+ logger.debug("Force remove node is called on non exiting endpoint {}", id);
+ co_return;
}
co_await gossiper.remove_endpoint(id, pid);
co_await gossiper.evict_from_membership(id, pid);
-
logger.info("Finished to force remove node {}", endpoint);
- co_return true;
+
logger.info("Finished to force remove node {}", id);
} catch (...) {
- logger.warn("Failed to force remove node {}: {}", endpoint, std::current_exception());
- co_return false;
+ logger.warn("Failed to force remove node {}: {}", id, std::current_exception());
}
});
}
diff --git a/service/storage_service.cc b/service/storage_service.cc
index 47db9971bd7..96c47f2f56b 100644
--- a/service/storage_service.cc
+++ b/service/storage_service.cc
@@ -476,7 +476,7 @@ future<> storage_service::raft_topology_update_ip(locator::host_id id, gms::inet
auto old_ip = it->second;
sys_ks_futures.push_back(_sys_ks.local().remove_endpoint(old_ip));
- co_await _gossiper.force_remove_endpoint(old_ip, id, gms::null_permit_id);
+ co_await _gossiper.force_remove_endpoint(id, gms::null_permit_id);
}
}
break;
@@ -517,9 +517,8 @@ future<storage_service::nodes_to_notify_after_sync> storage_service::sync_raft_t
if (ip) {
sys_ks_futures.push_back(_sys_ks.local().remove_endpoint(*ip));
- if (co_await _gossiper.force_remove_endpoint(*ip, host_id, gms::null_permit_id)) {
- nodes_to_notify.left.push_back({*ip, host_id});
- }
+ co_await _gossiper.force_remove_endpoint(host_id, gms::null_permit_id);
+ nodes_to_notify.left.push_back({*ip, host_id});
}
if (t.left_nodes_rs.find(id) != t.left_nodes_rs.end()) {
@@ -942,7 +941,7 @@ class storage_service::ip_address_updater: public gms::i_endpoint_state_change_s
// in gossiper messages and allows for clearer
// expectations of the gossiper state in tests.
- co_await _ss._gossiper.force_remove_endpoint(endpoint, id, permit_id);
+ co_await _ss._gossiper.force_remove_endpoint(id, permit_id);
co_return;
}
--
2.47.1