[PATCH v1 2/4] gossiper: move force_remove_endpoint to work on host id

2 views
Skip to first unread message

Gleb Natapov

<gleb@scylladb.com>
unread,
Apr 7, 2025, 3:41:12 AMApr 7
to scylladb-dev@googlegroups.com
Since the gossiper works on host ids now it is incorrect to leave this
function to work on ip. It makes it impossible to delete outdated entry
since the "gossiper.get_host_id(endpoint) != id" check will always be
false for such entries (get_host_id() always returns most up -to-date
mapping.
---
test/cluster/test_gossiper_orphan_remover.py | 3 ++-
gms/gossiper.hh | 2 +-
api/gossiper.cc | 2 +-
gms/gossiper.cc | 19 +++++++++----------
service/storage_service.cc | 9 ++++-----
5 files changed, 17 insertions(+), 18 deletions(-)

diff --git a/test/cluster/test_gossiper_orphan_remover.py b/test/cluster/test_gossiper_orphan_remover.py
index 54afc28c246..954526bfbe6 100644
--- a/test/cluster/test_gossiper_orphan_remover.py
+++ b/test/cluster/test_gossiper_orphan_remover.py
@@ -50,7 +50,8 @@ async def test_crashed_node_substitution(manager: ManagerClient):
[await manager.api.message_injection(s.ip_addr, 'fast_orphan_removal_fiber') for s in servers]

log = await manager.server_open_log(servers[0].server_id)
- await log.wait_for(f"Finished to force remove node {orphan_ip}")
+ failed_id = await manager.get_host_id(failed_server.server_id)
+ await log.wait_for(f"Finished to force remove node {failed_id}")

post_wait_live_eps = await manager.api.client.get_json("/gossiper/endpoint/live", host=servers[0].ip_addr)
post_wait_down_eps = await manager.api.client.get_json("/gossiper/endpoint/down", host=servers[0].ip_addr)
diff --git a/gms/gossiper.hh b/gms/gossiper.hh
index 218b00864dd..1d1f984cee9 100644
--- a/gms/gossiper.hh
+++ b/gms/gossiper.hh
@@ -347,7 +347,7 @@ class gossiper : public seastar::async_sharded_service<gossiper>, public seastar
*/
future<> remove_endpoint(locator::host_id endpoint, permit_id);
// Returns true if an endpoint was removed
- future<bool> force_remove_endpoint(inet_address endpoint, locator::host_id id, permit_id);
+ future<> force_remove_endpoint(locator::host_id id, permit_id);
private:
/**
* Quarantines the endpoint for QUARANTINE_DELAY
diff --git a/api/gossiper.cc b/api/gossiper.cc
index 5f5d03066cb..b1c756c8538 100644
--- a/api/gossiper.cc
+++ b/api/gossiper.cc
@@ -60,7 +60,7 @@ void set_gossiper(http_context& ctx, routes& r, gms::gossiper& g) {

httpd::gossiper_json::force_remove_endpoint.set(r, [&g](std::unique_ptr<http::request> req) {
gms::inet_address ep(req->get_path_param("addr"));
- return g.force_remove_endpoint(ep, g.get_host_id(ep), gms::null_permit_id).then([] (bool) {
+ return g.force_remove_endpoint(g.get_host_id(ep), gms::null_permit_id).then([] () {
return make_ready_future<json::json_return_type>(json_void());
});
});
diff --git a/gms/gossiper.cc b/gms/gossiper.cc
index 828e9cc11dd..582f6403ae1 100644
--- a/gms/gossiper.cc
+++ b/gms/gossiper.cc
@@ -682,24 +682,23 @@ future<> gossiper::apply_state_locally(std::map<inet_address, endpoint_state> ma
std::chrono::steady_clock::now() - start).count());
}

-future<bool> gossiper::force_remove_endpoint(inet_address endpoint, locator::host_id id, permit_id pid) {
- return container().invoke_on(0, [this, endpoint, pid, id] (auto& gossiper) mutable -> future<bool> {
+future<> gossiper::force_remove_endpoint(locator::host_id id, permit_id pid) {
+ return container().invoke_on(0, [this, pid, id] (auto& gossiper) mutable -> future<> {
auto permit = co_await gossiper.lock_endpoint(id, pid);
pid = permit.id();
try {
- if (gossiper.get_host_id(endpoint) != id) {
- co_return false;
+ if (id == my_host_id()) {
+ throw std::runtime_error(format("Can not force remove node {} itself", id));
}
- if (endpoint == get_broadcast_address()) {
- throw std::runtime_error(format("Can not force remove node {} itself", endpoint));
+ if (!gossiper._endpoint_state_map.contains(id)) {
+ logger.debug("Force remove node is called on non exiting endpoint {}", id);
+ co_return;
}
co_await gossiper.remove_endpoint(id, pid);
co_await gossiper.evict_from_membership(id, pid);
- logger.info("Finished to force remove node {}", endpoint);
- co_return true;
+ logger.info("Finished to force remove node {}", id);
} catch (...) {
- logger.warn("Failed to force remove node {}: {}", endpoint, std::current_exception());
- co_return false;
+ logger.warn("Failed to force remove node {}: {}", id, std::current_exception());
}
});
}
diff --git a/service/storage_service.cc b/service/storage_service.cc
index 47db9971bd7..96c47f2f56b 100644
--- a/service/storage_service.cc
+++ b/service/storage_service.cc
@@ -476,7 +476,7 @@ future<> storage_service::raft_topology_update_ip(locator::host_id id, gms::inet
auto old_ip = it->second;
sys_ks_futures.push_back(_sys_ks.local().remove_endpoint(old_ip));

- co_await _gossiper.force_remove_endpoint(old_ip, id, gms::null_permit_id);
+ co_await _gossiper.force_remove_endpoint(id, gms::null_permit_id);
}
}
break;
@@ -517,9 +517,8 @@ future<storage_service::nodes_to_notify_after_sync> storage_service::sync_raft_t
if (ip) {
sys_ks_futures.push_back(_sys_ks.local().remove_endpoint(*ip));

- if (co_await _gossiper.force_remove_endpoint(*ip, host_id, gms::null_permit_id)) {
- nodes_to_notify.left.push_back({*ip, host_id});
- }
+ co_await _gossiper.force_remove_endpoint(host_id, gms::null_permit_id);
+ nodes_to_notify.left.push_back({*ip, host_id});
}

if (t.left_nodes_rs.find(id) != t.left_nodes_rs.end()) {
@@ -942,7 +941,7 @@ class storage_service::ip_address_updater: public gms::i_endpoint_state_change_s
// in gossiper messages and allows for clearer
// expectations of the gossiper state in tests.

- co_await _ss._gossiper.force_remove_endpoint(endpoint, id, permit_id);
+ co_await _ss._gossiper.force_remove_endpoint(id, permit_id);
co_return;
}

--
2.47.1

Reply all
Reply to author
Forward
0 new messages