From: Liu Yuan <
liu...@cmss.chinamobile.com>
This feature allows sheep to rejoin the cluster without sending a zk join event
after being killed, as if the node had never died. It takes advantage of the zk
session-timeout window together with the ephemeral znode.
So a live upgrade (rolling restart) is possible with this feature:
update sheep -> kill -9 sheep -> start sheep with the old args plus -u, within the zk timeout
WARN: We can't recover the block event states from the cluster, so the admin
should make sure there are NO unhandled BLOCK EVENTS before the rolling upgrade.
One idea to mitigate this problem is to check the block events in the queue
carefully and, if there ARE unhandled block events, simply exit. I'd like to
leave this problem for the future with a (better) solution.
sheep/cluster.h | 1 +
sheep/cluster/zookeeper.c | 153 +++++++++++++++++++++++++++++++++++++---------
sheep/sheep.c | 37 ++++++-----
3 files changed, 142 insertions(+), 49 deletions(-)
diff --git a/sheep/cluster.h b/sheep/cluster.h
index 16c1273..0fde142 100644
--- a/sheep/cluster.h
+++ b/sheep/cluster.h
@@ -20,6 +20,7 @@
#include <memory.h>
#include "sheepdog_proto.h"
+#include "sheep_priv.h"
#include "sheep.h"
#include "config.h"
#include "common.h"
diff --git a/sheep/cluster/zookeeper.c b/sheep/cluster/zookeeper.c
index eb06717..2532db5 100644
--- a/sheep/cluster/zookeeper.c
+++ b/sheep/cluster/zookeeper.c
@@ -1002,6 +1002,122 @@ out_unlock:
sd_rw_unlock(&zk_compete_master_lock);
}
+static int zk_connect(const char *host, watcher_fn watcher, int timeout,
+ clientid_t *sid)
+{
+ int interval, max_retry, retry;
+
+ zhandle = zookeeper_init(host, watcher, timeout, sid, NULL, 0);
+
+ if (!zhandle) {
+ sd_err("failed to initialize zk server %s", host);
+ return -1;
+ }
+
+ interval = 100;
+ retry = 0;
+ max_retry = timeout / interval;
+ while (zoo_state(zhandle) != ZOO_CONNECTED_STATE) {
+ usleep(interval * 1000);
+ if (++retry >= max_retry) {
+ sd_err("failed to connect to zk server %s "
+ "after %d retries", host, retry);
+ return -1;
+ }
+ }
+ return 0;
+}
+
+/*XXX: Check block event */
+static void recover_zk_states(void)
+{
+ struct String_vector strs;
+ char path[MAX_NODE_STR_LEN];
+ clientid_t sid;
+ zhandle_t *tmp_handle = zhandle;
+ int len = sizeof(clientid_t), rc;
+
+ /* Recover the old session at first */
+ snprintf(path, sizeof(path), MEMBER_ZNODE "/%s",
+ node_to_str(&this_node.node));
+ rc = zoo_get(tmp_handle, path, 0, (char *)&sid, &len, NULL);
+ switch (rc) {
+ case ZOK:
+ break;
+ case ZNONODE:
+ sd_err("No node %s, exiting...", path);
+ exit(1);
+ default:
+ sd_err("Failed to get data for %s, %s, exiting", path,
+ zerror(rc));
+ exit(1);
+ }
+ zookeeper_close(tmp_handle);
+ if (zk_connect(zk_hosts, zk_watcher, zk_timeout, &sid) < 0)
+ exit(1);
+
+ /* Now we've recovered the session, then set watchers and nodes */
+ RETURN_VOID_IF_ERROR(zk_get_children(MEMBER_ZNODE, &strs), "");
+ FOR_EACH_ZNODE(MEMBER_ZNODE, path, &strs) {
+ struct sd_node n;
+ struct zk_node zk;
+
+ RETURN_VOID_IF_ERROR(zk_node_exists(path), "");
+ str_to_node(path, &n);
+ mempcpy(&zk.node, &n, sizeof(struct sd_node));
+ zk_tree_add(&zk); /* current sd_nodes just have ip:port */
+ }
+}
+
+static void recover_sheep_states(void)
+{
+ struct sd_req hdr;
+ struct cluster_info cinfo;
+ struct sd_node *n;
+ struct zk_node *zk;
+ int ret = SD_RES_CLUSTER_ERROR;
+
+ rb_for_each_entry(n, &sd_node_root, rb) {
+ if (node_eq(&this_node.node, n))
+ continue;
+ sd_init_req(&hdr, SD_OP_CLUSTER_INFO);
+ hdr.data_length = sizeof(cinfo);
+ ret = sheep_exec_req(&n->nid, &hdr, &cinfo);
+ if (ret == SD_RES_SUCCESS)
+ break;
+ }
+ if (ret != SD_RES_SUCCESS) {
+ sd_err("We can't get cluster state from the cluster, %s. Please"
+ " check the network and the cluster. Exiting...",
+ sd_strerror(ret));
+ exit(1);
+ }
+
+ /* Update nodes from sys->cinfo */
+ rb_for_each_entry(zk, &zk_node_root, rb) {
+ for (int i = 0; i < cinfo.nr_nodes; i++) {
+ if (node_eq(&zk->node, &cinfo.nodes[i])) {
+ zk->node = cinfo.nodes[i];
+ sd_debug("%s", node_to_str(&zk->node));
+ }
+ }
+ }
+
+ joined = true;
+ set_cluster_shutdown(true); /* Fake the node state to avoid recovery */
+ build_node_list();
+ sd_accept_handler(&this_node.node, &sd_node_root, nr_sd_nodes,
+ &cinfo);
+}
+
+static int direct_join(void)
+{
+ recover_zk_states();
+ recover_sheep_states();
+
+ return ZOK;
+}
+
static int zk_join(const struct sd_node *myself,
void *opaque, size_t opaque_len)
{
@@ -1014,13 +1130,16 @@ static int zk_join(const struct sd_node *myself,
rc1 = zk_node_exists(path);
snprintf(path, sizeof(path), QUEUE_POS_ZNODE "/%s",
- node_to_str(myself));
+ node_to_str(myself));
rc2 = zk_node_exists(path);
if (rc1 == ZOK || rc2 == ZOK) {
+ if (sys->upgrade)
+ return direct_join();
sd_err("Previous zookeeper session exist, shoot myself. Please "
- "wait for %d seconds to join me again.",
- DIV_ROUND_UP(zk_timeout, 1000));
+ "wait for %d seconds to join me again. Or you can "
+ "specify --upgrade to make rolling update.",
+ DIV_ROUND_UP(zk_timeout, 1000));
exit(1);
}
@@ -1039,7 +1158,7 @@ static int zk_leave(void)
if (uatomic_is_true(&is_master)) {
snprintf(path, sizeof(path), MASTER_ZNODE "/%010"PRId32,
- my_master_seq);
+ my_master_seq);
zk_delete_node(path, -1);
}
@@ -1434,32 +1553,6 @@ static void zk_unlock(uint64_t lock_id)
sd_debug("unlock %"PRIu64, lock_id);
}
-static int zk_connect(const char *host, watcher_fn watcher, int timeout,
- clientid_t *sid)
-{
- int interval, max_retry, retry;
-
- zhandle = zookeeper_init(host, watcher, timeout, sid, NULL, 0);
-
- if (!zhandle) {
- sd_err("failed to initialize zk server %s", host);
- return -1;
- }
-
- interval = 100;
- retry = 0;
- max_retry = timeout / interval;
- while (zoo_state(zhandle) != ZOO_CONNECTED_STATE) {
- usleep(interval * 1000);
- if (++retry >= max_retry) {
- sd_err("failed to connect to zk server %s "
- "after %d retries", host, retry);
- return -1;
- }
- }
- return 0;
-}
-
static int zk_prepare_root(const char *hosts)
{
char root[MAX_NODE_STR_LEN];
diff --git a/sheep/sheep.c b/sheep/sheep.c
index 7dfa198..0b068dd 100644
--- a/sheep/sheep.c
+++ b/sheep/sheep.c
@@ -839,12 +839,6 @@ int main(int argc, char **argv)
if (ret)
goto cleanup_log;
- ret = create_cluster(port, zone, nr_vnodes, explicit_addr);
- if (ret) {
- sd_err("failed to create sheepdog cluster");
- goto cleanup_log;
- }
-
/* We should init trace for work queue before journal init */
ret = wq_trace_init();
if (ret) {
@@ -863,15 +857,15 @@ int main(int argc, char **argv)
*/
ret = create_work_queues();
if (ret)
- goto cleanup_cluster;
+ goto cleanup_log;
ret = sockfd_init();
if (ret)
- goto cleanup_cluster;
+ goto cleanup_log;
ret = init_store_driver(sys->gateway_only);
if (ret)
- goto cleanup_cluster;
+ goto cleanup_log;
if (sys->enable_object_cache) {
if (!strlen(ocpath))
@@ -879,27 +873,27 @@ int main(int argc, char **argv)
memcpy(ocpath, dir, strlen(dir));
ret = object_cache_init(ocpath);
if (ret)
- goto cleanup_cluster;
+ goto cleanup_log;
}
ret = trace_init();
if (ret)
- goto cleanup_cluster;
+ goto cleanup_log;
ret = livepatch_init(dir);
if (ret)
- goto cleanup_cluster;
+ goto cleanup_log;
if (http_options && http_init(http_options) != 0)
- goto cleanup_cluster;
+ goto cleanup_log;
ret = nfs_init(NULL);
if (ret)
- goto cleanup_cluster;
+ goto cleanup_log;
if (pid_file && (create_pidfile(pid_file) != 0)) {
sd_err("failed to pid file '%s' - %m", pid_file);
- goto cleanup_cluster;
+ goto cleanup_log;
}
if (chdir(dir) < 0) {
@@ -908,6 +902,13 @@ int main(int argc, char **argv)
}
check_host_env();
+
+ ret = create_cluster(port, zone, nr_vnodes, explicit_addr);
+ if (ret) {
+ sd_err("failed to create sheepdog cluster");
+ goto cleanup_pid_file;
+ }
+
sd_info("sheepdog daemon (version %s) started", PACKAGE_VERSION);
while (sys->nr_outstanding_reqs != 0 ||
@@ -918,13 +919,11 @@ int main(int argc, char **argv)
rc = 0;
sd_info("shutdown");
+ leave_cluster();
+
cleanup_pid_file:
if (pid_file)
unlink(pid_file);
-
-cleanup_cluster:
- leave_cluster();
-
cleanup_log:
log_close();
--
1.9.1