[PATCH 00/12] add hot upgrade support for zk driver

9 views
Skip to first unread message

Liu Yuan

unread,
Nov 17, 2016, 4:06:16 AM11/17/16
to sheep...@googlegroups.com
From: Liu Yuan <liu...@cmss.chinamobile.com>

With this patch set, you can hot-upgrade sheep binary without causing recovery
by simply one bash line if you are running sheep with zookeeper driver:

pkill -9 sheep; sleep 1; sheep xxx -u # sleep 1 is not necessarily required

option: -u or --upgrade means to hot upgrade the sheep binary on the fly

Liu Yuan (12):
Revert "zookeeper: don't loop on ZCONNECTIONLOSS endlessly"
zookeeper: Set parameter 'hosts' to be global
zookeeper: add session_id to zk_connect()
zookeeper: join directly if --upgrade is set
zookeeper: recover the queue state for direct_join
zookeeper: compete master for direct join
sheep: get block-event information from sheep
zookeeper: check block event for rolling update
local: use tgkill to send the notify signal
zookeeper: fix next queue pos of rolling-update
plain store: fix init_vdi_state reading stale objects
zookeeper: fix wrong node address for rolling-upgrade with io nic set

dog/cluster.c | 2 +
include/internal_proto.h | 5 +-
include/util.h | 1 +
lib/util.c | 9 +-
sheep/cluster.h | 3 +
sheep/cluster/local.c | 3 +-
sheep/cluster/zookeeper.c | 258 +++++++++++++++++++++++++++++++++++++---------
sheep/ops.c | 5 +
sheep/sheep.c | 37 ++++---
sheep/store/plain_store.c | 76 +++++++-------
10 files changed, 294 insertions(+), 105 deletions(-)

--
1.9.1

Liu Yuan

unread,
Nov 17, 2016, 4:06:18 AM11/17/16
to sheep...@googlegroups.com
From: Liu Yuan <liu...@cmss.chinamobile.com>

This reverts commit 8c95e03f89615e2a6ce25c23f864277d1aab7615.

This commit is only half-baked: it does not consider the block/unblock problem
and other corner cases. So it is better to revert it for a future fix.

Signed-off-by: Liu Yuan <liu...@cmss.chinamobile.com>
---
sheep/cluster/zookeeper.c | 16 ++++++++--------
1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/sheep/cluster/zookeeper.c b/sheep/cluster/zookeeper.c
index ad562f4..47a36ee 100644
--- a/sheep/cluster/zookeeper.c
+++ b/sheep/cluster/zookeeper.c
@@ -196,7 +196,7 @@ static inline ZOOAPI int zk_delete_node(const char *path, int version)
int rc;
do {
rc = zoo_delete(zhandle, path, version);
- } while (rc == ZOPERATIONTIMEOUT);
+ } while (rc == ZOPERATIONTIMEOUT || rc == ZCONNECTIONLOSS);
CHECK_ZK_RC(rc, path);

return rc;
@@ -209,7 +209,7 @@ zk_init_node(const char *path)
do {
rc = zoo_create(zhandle, path, "", 0, &ZOO_OPEN_ACL_UNSAFE, 0,
NULL, 0);
- } while (rc == ZOPERATIONTIMEOUT);
+ } while (rc == ZOPERATIONTIMEOUT || rc == ZCONNECTIONLOSS);
CHECK_ZK_RC(rc, path);

if (rc == ZNODEEXISTS)
@@ -226,7 +226,7 @@ zk_create_node(const char *path, const char *value, int valuelen,
do {
rc = zoo_create(zhandle, path, value, valuelen, acl,
flags, path_buffer, path_buffer_len);
- } while (rc == ZOPERATIONTIMEOUT);
+ } while (rc == ZOPERATIONTIMEOUT || rc == ZCONNECTIONLOSS);
CHECK_ZK_RC(rc, path);

return rc;
@@ -261,7 +261,7 @@ static inline ZOOAPI int zk_get_data(const char *path, void *buffer,
do {
rc = zoo_get(zhandle, path, 1, (char *)buffer,
buffer_len, NULL);
- } while (rc == ZOPERATIONTIMEOUT);
+ } while (rc == ZOPERATIONTIMEOUT || rc == ZCONNECTIONLOSS);
CHECK_ZK_RC(rc, path);

return rc;
@@ -273,7 +273,7 @@ zk_set_data(const char *path, const char *buffer, int buflen, int version)
int rc;
do {
rc = zoo_set(zhandle, path, buffer, buflen, version);
- } while (rc == ZOPERATIONTIMEOUT);
+ } while (rc == ZOPERATIONTIMEOUT || rc == ZCONNECTIONLOSS);
CHECK_ZK_RC(rc, path);

return rc;
@@ -284,7 +284,7 @@ static inline ZOOAPI int zk_node_exists(const char *path)
int rc;
do {
rc = zoo_exists(zhandle, path, 1, NULL);
- } while (rc == ZOPERATIONTIMEOUT);
+ } while (rc == ZOPERATIONTIMEOUT || rc == ZCONNECTIONLOSS);
CHECK_ZK_RC(rc, path);

return rc;
@@ -296,7 +296,7 @@ static inline ZOOAPI int zk_get_children(const char *path,
int rc;
do {
rc = zoo_get_children(zhandle, path, 1, strings);
- } while (rc == ZOPERATIONTIMEOUT);
+ } while (rc == ZOPERATIONTIMEOUT || rc == ZCONNECTIONLOSS);
CHECK_ZK_RC(rc, path);

return rc;
@@ -961,7 +961,7 @@ static void zk_compete_master(void)
MAX_NODE_STR_LEN,
my_compete_path,
MAX_NODE_STR_LEN, true);
- } while (rc == ZOPERATIONTIMEOUT);
+ } while (rc == ZOPERATIONTIMEOUT || rc == ZCONNECTIONLOSS);
CHECK_ZK_RC(rc, MASTER_ZNODE "/");
if (rc != ZOK)
goto out_unlock;
--
1.9.1

Liu Yuan

unread,
Nov 17, 2016, 4:06:20 AM11/17/16
to sheep...@googlegroups.com
From: Liu Yuan <liu...@cmss.chinamobile.com>

This is a preparation patch.

Signed-off-by: Liu Yuan <liu...@cmss.chinamobile.com>
---
sheep/cluster/zookeeper.c | 16 ++++++++--------
1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/sheep/cluster/zookeeper.c b/sheep/cluster/zookeeper.c
index 47a36ee..e5a1839 100644
--- a/sheep/cluster/zookeeper.c
+++ b/sheep/cluster/zookeeper.c
@@ -36,6 +36,7 @@

static int zk_timeout = SESSION_TIMEOUT;
static int my_master_seq;
+static char zk_hosts[MAX_NODE_STR_LEN];

/* structure for distributed lock */
struct cluster_lock {
@@ -1488,7 +1489,6 @@ static int zk_init(const char *option)
{
char *hosts, *to, *p;
int ret, timeo;
- char conn[MAX_NODE_STR_LEN];

if (!option) {
sd_err("You must specify zookeeper servers.");
@@ -1504,17 +1504,17 @@ static int zk_init(const char *option)
p = strstr(hosts, "timeout");
*--p = '\0';
}
- pstrcpy(conn, MAX_NODE_STR_LEN, hosts);
- if (!strchr(conn, '/'))
- strcat(conn, DEFAULT_BASE);
- if (zk_prepare_root(conn) != 0) {
- sd_err("failed to initialize zk server %s", conn);
+ pstrcpy(zk_hosts, MAX_NODE_STR_LEN, hosts);
+ if (!strchr(zk_hosts, '/'))
+ strcat(zk_hosts, DEFAULT_BASE);
+ if (zk_prepare_root(zk_hosts) != 0) {
+ sd_err("failed to initialize zk server %s", zk_hosts);
return -1;
}

sd_info("version %d.%d.%d, address %s, timeout %d", ZOO_MAJOR_VERSION,
- ZOO_MINOR_VERSION, ZOO_PATCH_VERSION, conn, zk_timeout);
- if (zk_connect(conn, zk_watcher, zk_timeout) < 0)
+ ZOO_MINOR_VERSION, ZOO_PATCH_VERSION, zk_hosts, zk_timeout);
+ if (zk_connect(zk_hosts, zk_watcher, zk_timeout) < 0)
return -1;

timeo = zoo_recv_timeout(zhandle);
--
1.9.1

Liu Yuan

unread,
Nov 17, 2016, 4:06:21 AM11/17/16
to sheep...@googlegroups.com
From: Liu Yuan <liu...@cmss.chinamobile.com>

This is a preparation patch for the next one.

Signed-off-by: Liu Yuan <liu...@cmss.chinamobile.com>
---
sheep/cluster/zookeeper.c | 9 +++++----
1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/sheep/cluster/zookeeper.c b/sheep/cluster/zookeeper.c
index e5a1839..eb06717 100644
--- a/sheep/cluster/zookeeper.c
+++ b/sheep/cluster/zookeeper.c
@@ -1434,11 +1434,12 @@ static void zk_unlock(uint64_t lock_id)
sd_debug("unlock %"PRIu64, lock_id);
}

-static int zk_connect(const char *host, watcher_fn watcher, int timeout)
+static int zk_connect(const char *host, watcher_fn watcher, int timeout,
+ clientid_t *sid)
{
int interval, max_retry, retry;

- zhandle = zookeeper_init(host, watcher, timeout, NULL, NULL, 0);
+ zhandle = zookeeper_init(host, watcher, timeout, sid, NULL, 0);

if (!zhandle) {
sd_err("failed to initialize zk server %s", host);
@@ -1474,7 +1475,7 @@ static int zk_prepare_root(const char *hosts)
}
conn[i] = '\0';

- if (zk_connect(conn, zk_watcher, zk_timeout) < 0)
+ if (zk_connect(conn, zk_watcher, zk_timeout, NULL) < 0)
return -1;

sd_debug("sheepdog cluster_id %s", root);
@@ -1514,7 +1515,7 @@ static int zk_init(const char *option)

sd_info("version %d.%d.%d, address %s, timeout %d", ZOO_MAJOR_VERSION,
ZOO_MINOR_VERSION, ZOO_PATCH_VERSION, zk_hosts, zk_timeout);
- if (zk_connect(zk_hosts, zk_watcher, zk_timeout) < 0)
+ if (zk_connect(zk_hosts, zk_watcher, zk_timeout, NULL) < 0)

Liu Yuan

unread,
Nov 17, 2016, 4:06:23 AM11/17/16
to sheep...@googlegroups.com
From: Liu Yuan <liu...@cmss.chinamobile.com>

This feature allows sheep to rejoin the cluster without sending a zk join event
after being killed, as if the node were never dead. It takes advantage of the zk
timeout window together with the ephemeral znode.

So the live upgrade is possible with this feature(rolling restart):

update sheep -> kill -9 sheep -> start sheep -old args -u within the zk timeout

WARN: We can't recover the block event states from the cluster, so the admin
should make sure there is NO BLOCK EVENT unhandled before rolling upgrade.

One idea to mitigate this problem is to check the block events of the queue
carefully and, if there ARE unhandled block events, simply exit. I'd like to
leave this problem for the future with a (better) solution.

Signed-off-by: Liu Yuan <liu...@cmss.chinamobile.com>
---
sheep/cluster.h | 1 +
sheep/cluster/zookeeper.c | 153 +++++++++++++++++++++++++++++++++++++---------
sheep/sheep.c | 37 ++++++-----
3 files changed, 142 insertions(+), 49 deletions(-)

diff --git a/sheep/cluster.h b/sheep/cluster.h
index 16c1273..0fde142 100644
--- a/sheep/cluster.h
+++ b/sheep/cluster.h
@@ -20,6 +20,7 @@
#include <memory.h>

#include "sheepdog_proto.h"
+#include "sheep_priv.h"
#include "sheep.h"
#include "config.h"
#include "common.h"
diff --git a/sheep/cluster/zookeeper.c b/sheep/cluster/zookeeper.c
index eb06717..2532db5 100644
--- a/sheep/cluster/zookeeper.c
+++ b/sheep/cluster/zookeeper.c
@@ -1002,6 +1002,122 @@ out_unlock:
sd_rw_unlock(&zk_compete_master_lock);
}

+static int zk_connect(const char *host, watcher_fn watcher, int timeout,
+ clientid_t *sid)
+{
+ int interval, max_retry, retry;
+
+ zhandle = zookeeper_init(host, watcher, timeout, sid, NULL, 0);
+
+ if (!zhandle) {
+ sd_err("failed to initialize zk server %s", host);
+ return -1;
+ }
+
+ interval = 100;
+ retry = 0;
+ max_retry = timeout / interval;
+ while (zoo_state(zhandle) != ZOO_CONNECTED_STATE) {
+ usleep(interval * 1000);
+ if (++retry >= max_retry) {
+ sd_err("failed to connect to zk server %s "
+ "after %d retries", host, retry);
+ return -1;
+ }
+ }
+ return 0;
+}
+
+/*XXX: Check block event */
+static void recover_zk_states(void)
+{
+ struct String_vector strs;
+ char path[MAX_NODE_STR_LEN];
+ clientid_t sid;
+ zhandle_t *tmp_handle = zhandle;
+ int len = sizeof(clientid_t), rc;
+
+ /* Recover the old session at first */
+ snprintf(path, sizeof(path), MEMBER_ZNODE "/%s",
+ node_to_str(&this_node.node));
+ rc = zoo_get(tmp_handle, path, 0, (char *)&sid, &len, NULL);
+ switch (rc) {
+ case ZOK:
+ break;
+ case ZNONODE:
+ sd_err("No node %s, exiting...", path);
+ exit(1);
+ default:
+ sd_err("Failed to get data for %s, %s, exiting", path,
+ zerror(rc));
+ exit(1);
+ }
+ zookeeper_close(tmp_handle);
+ if (zk_connect(zk_hosts, zk_watcher, zk_timeout, &sid) < 0)
+ exit(1);
+
+ /* Now we've recovered the session, then set watchers and nodes */
+ RETURN_VOID_IF_ERROR(zk_get_children(MEMBER_ZNODE, &strs), "");
+ FOR_EACH_ZNODE(MEMBER_ZNODE, path, &strs) {
+ struct sd_node n;
+ struct zk_node zk;
+
+ RETURN_VOID_IF_ERROR(zk_node_exists(path), "");
+ str_to_node(path, &n);
+ mempcpy(&zk.node, &n, sizeof(struct sd_node));
+ zk_tree_add(&zk); /* current sd_nodes just have ip:port */
+ }
+}
+
+static void recover_sheep_states(void)
+{
+ struct sd_req hdr;
+ struct cluster_info cinfo;
+ struct sd_node *n;
+ struct zk_node *zk;
+ int ret = SD_RES_CLUSTER_ERROR;
+
+ rb_for_each_entry(n, &sd_node_root, rb) {
+ if (node_eq(&this_node.node, n))
+ continue;
+ sd_init_req(&hdr, SD_OP_CLUSTER_INFO);
+ hdr.data_length = sizeof(cinfo);
+ ret = sheep_exec_req(&n->nid, &hdr, &cinfo);
+ if (ret == SD_RES_SUCCESS)
+ break;
+ }
+ if (ret != SD_RES_SUCCESS) {
+ sd_err("We can't get cluster state from the cluster, %s. Please"
+ " check the network and the cluster. Exiting...",
+ sd_strerror(ret));
+ exit(1);
+ }
+
+ /* Update nodes from sys->cinfo */
+ rb_for_each_entry(zk, &zk_node_root, rb) {
+ for (int i = 0; i < cinfo.nr_nodes; i++) {
+ if (node_eq(&zk->node, &cinfo.nodes[i])) {
+ zk->node = cinfo.nodes[i];
+ sd_debug("%s", node_to_str(&zk->node));
+ }
+ }
+ }
+
+ joined = true;
+ set_cluster_shutdown(true); /* Fake the node state to avoid recovery */
+ build_node_list();
+ sd_accept_handler(&this_node.node, &sd_node_root, nr_sd_nodes,
+ &cinfo);
+}
+
+static int direct_join(void)
+{
+ recover_zk_states();
+ recover_sheep_states();
+
+ return ZOK;
+}
+
static int zk_join(const struct sd_node *myself,
void *opaque, size_t opaque_len)
{
@@ -1014,13 +1130,16 @@ static int zk_join(const struct sd_node *myself,
rc1 = zk_node_exists(path);

snprintf(path, sizeof(path), QUEUE_POS_ZNODE "/%s",
- node_to_str(myself));
+ node_to_str(myself));
rc2 = zk_node_exists(path);

if (rc1 == ZOK || rc2 == ZOK) {
+ if (sys->upgrade)
+ return direct_join();
sd_err("Previous zookeeper session exist, shoot myself. Please "
- "wait for %d seconds to join me again.",
- DIV_ROUND_UP(zk_timeout, 1000));
+ "wait for %d seconds to join me again. Or you can "
+ "specify --upgrade to make rolling update.",
+ DIV_ROUND_UP(zk_timeout, 1000));
exit(1);
}

@@ -1039,7 +1158,7 @@ static int zk_leave(void)

if (uatomic_is_true(&is_master)) {
snprintf(path, sizeof(path), MASTER_ZNODE "/%010"PRId32,
- my_master_seq);
+ my_master_seq);
zk_delete_node(path, -1);
}

@@ -1434,32 +1553,6 @@ static void zk_unlock(uint64_t lock_id)
sd_debug("unlock %"PRIu64, lock_id);
}

-static int zk_connect(const char *host, watcher_fn watcher, int timeout,
- clientid_t *sid)
-{
- int interval, max_retry, retry;
-
- zhandle = zookeeper_init(host, watcher, timeout, sid, NULL, 0);
-
- if (!zhandle) {
- sd_err("failed to initialize zk server %s", host);
- return -1;
- }
-
- interval = 100;
- retry = 0;
- max_retry = timeout / interval;
- while (zoo_state(zhandle) != ZOO_CONNECTED_STATE) {
- usleep(interval * 1000);
- if (++retry >= max_retry) {
- sd_err("failed to connect to zk server %s "
- "after %d retries", host, retry);
- return -1;
- }
- }
- return 0;
-}
-
static int zk_prepare_root(const char *hosts)
{
char root[MAX_NODE_STR_LEN];
diff --git a/sheep/sheep.c b/sheep/sheep.c
index 7dfa198..0b068dd 100644
--- a/sheep/sheep.c
+++ b/sheep/sheep.c
@@ -839,12 +839,6 @@ int main(int argc, char **argv)
if (ret)
goto cleanup_log;

- ret = create_cluster(port, zone, nr_vnodes, explicit_addr);
- if (ret) {
- sd_err("failed to create sheepdog cluster");
- goto cleanup_log;
- }
-
/* We should init trace for work queue before journal init */
ret = wq_trace_init();
if (ret) {
@@ -863,15 +857,15 @@ int main(int argc, char **argv)
*/
ret = create_work_queues();
if (ret)
- goto cleanup_cluster;
+ goto cleanup_log;

ret = sockfd_init();
if (ret)
- goto cleanup_cluster;
+ goto cleanup_log;

ret = init_store_driver(sys->gateway_only);
if (ret)
- goto cleanup_cluster;
+ goto cleanup_log;

if (sys->enable_object_cache) {
if (!strlen(ocpath))
@@ -879,27 +873,27 @@ int main(int argc, char **argv)
memcpy(ocpath, dir, strlen(dir));
ret = object_cache_init(ocpath);
if (ret)
- goto cleanup_cluster;
+ goto cleanup_log;
}

ret = trace_init();
if (ret)
- goto cleanup_cluster;
+ goto cleanup_log;

ret = livepatch_init(dir);
if (ret)
- goto cleanup_cluster;
+ goto cleanup_log;

if (http_options && http_init(http_options) != 0)
- goto cleanup_cluster;
+ goto cleanup_log;

ret = nfs_init(NULL);
if (ret)
- goto cleanup_cluster;
+ goto cleanup_log;

if (pid_file && (create_pidfile(pid_file) != 0)) {
sd_err("failed to pid file '%s' - %m", pid_file);
- goto cleanup_cluster;
+ goto cleanup_log;
}

if (chdir(dir) < 0) {
@@ -908,6 +902,13 @@ int main(int argc, char **argv)
}

check_host_env();
+
+ ret = create_cluster(port, zone, nr_vnodes, explicit_addr);
+ if (ret) {
+ sd_err("failed to create sheepdog cluster");
+ goto cleanup_pid_file;
+ }
+
sd_info("sheepdog daemon (version %s) started", PACKAGE_VERSION);

while (sys->nr_outstanding_reqs != 0 ||
@@ -918,13 +919,11 @@ int main(int argc, char **argv)
rc = 0;
sd_info("shutdown");

+ leave_cluster();
+
cleanup_pid_file:
if (pid_file)
unlink(pid_file);
-
-cleanup_cluster:
- leave_cluster();
-
cleanup_log:
log_close();

--
1.9.1

Liu Yuan

unread,
Nov 17, 2016, 4:06:25 AM11/17/16
to sheep...@googlegroups.com
From: Liu Yuan <liu...@cmss.chinamobile.com>

zk sequence node's Stat.numChildren is the next sequence number of the counter.

Signed-off-by: Liu Yuan <liu...@cmss.chinamobile.com>
---
sheep/cluster/zookeeper.c | 27 +++++++++++++++++++++++++++
1 file changed, 27 insertions(+)

diff --git a/sheep/cluster/zookeeper.c b/sheep/cluster/zookeeper.c
index 2532db5..78dde37 100644
--- a/sheep/cluster/zookeeper.c
+++ b/sheep/cluster/zookeeper.c
@@ -1034,8 +1034,11 @@ static void recover_zk_states(void)
struct String_vector strs;
char path[MAX_NODE_STR_LEN];
clientid_t sid;
+ struct Stat stat;
zhandle_t *tmp_handle = zhandle;
int len = sizeof(clientid_t), rc;
+ char buf[512];
+ int buflen = sizeof(buf);

/* Recover the old session at first */
snprintf(path, sizeof(path), MEMBER_ZNODE "/%s",
@@ -1067,6 +1070,30 @@ static void recover_zk_states(void)
mempcpy(&zk.node, &n, sizeof(struct sd_node));
zk_tree_add(&zk); /* current sd_nodes just have ip:port */
}
+
+ /* Set queue position and corresponding watcher */
+ snprintf(path, sizeof(path), QUEUE_ZNODE);
+ rc = zoo_get(zhandle, path, 0, buf, &buflen, &stat);
+ switch (rc) {
+ case ZOK:
+ break;
+ case ZNONODE:
+ sd_err("No node %s, exiting...", path);
+ exit(1);
+ default:
+ sd_err("Failed to get data for %s, %s, exiting", path,
+ zerror(rc));
+ exit(1);
+ }
+ sd_debug("next queue pos: %d", stat.numChildren);
+ queue_pos = stat.numChildren;
+ first_push = false;
+ snprintf(path, sizeof(path), QUEUE_ZNODE "/%010"PRId32, queue_pos);
+ if (zk_node_exists(path) != ZNONODE) {
+ sd_err("Failed to watch %s, %s, exiting", path,
+ zerror(rc));
+ exit(1);
+ }
}

static void recover_sheep_states(void)
--
1.9.1

Liu Yuan

unread,
Nov 17, 2016, 4:06:27 AM11/17/16
to sheep...@googlegroups.com
From: Liu Yuan <liu...@cmss.chinamobile.com>

Signed-off-by: Liu Yuan <liu...@cmss.chinamobile.com>
---
sheep/cluster/zookeeper.c | 3 +++
1 file changed, 3 insertions(+)

diff --git a/sheep/cluster/zookeeper.c b/sheep/cluster/zookeeper.c
index 78dde37..5c4c3c8 100644
--- a/sheep/cluster/zookeeper.c
+++ b/sheep/cluster/zookeeper.c
@@ -1094,6 +1094,9 @@ static void recover_zk_states(void)
zerror(rc));
exit(1);
}
+
+ /* If the master is roll-updating, we need recover the mastership */
+ zk_compete_master();

Liu Yuan

unread,
Nov 17, 2016, 4:06:29 AM11/17/16
to sheep...@googlegroups.com
From: Liu Yuan <liu...@cmss.chinamobile.com>

pending_block_list only holds a partial set of this node's blocking events. Only
the cluster driver has knowledge of the whole block event state.

The block-event information is useful for a rolling update to check whether it
is safe to direct_join the cluster.

Signed-off-by: Liu Yuan <liu...@cmss.chinamobile.com>
---
dog/cluster.c | 2 ++
include/internal_proto.h | 5 +++--
sheep/cluster.h | 2 ++
sheep/cluster/zookeeper.c | 16 ++++++++++++++++
sheep/ops.c | 5 +++++
5 files changed, 28 insertions(+), 2 deletions(-)

diff --git a/dog/cluster.c b/dog/cluster.c
index b5bdbea..4eeed63 100644
--- a/dog/cluster.c
+++ b/dog/cluster.c
@@ -320,6 +320,8 @@ retry:
printf("disk");
else
printf("node");
+
+ printf("\nCluster block event: %d\n", logs->block_event_number);
}

if (!raw_output && rsp->data_length > 0) {
diff --git a/include/internal_proto.h b/include/internal_proto.h
index 4c8c2f8..352b315 100644
--- a/include/internal_proto.h
+++ b/include/internal_proto.h
@@ -250,7 +250,8 @@ struct cluster_info {
uint8_t copy_policy;
enum sd_status status : 8;
enum sd_status last_status : 8;
- uint8_t __pad[3];
+ bool have_block_event;
+ uint8_t __pad[2];
uint8_t store[STORE_LEN];

/* Node list at cluster_info->epoch */
@@ -262,7 +263,7 @@ struct epoch_log {
uint64_t time; /* treated as time_t */
uint32_t epoch;
uint32_t nr_nodes;
- uint8_t pad;
+ uint8_t block_event_number;
uint8_t nr_copies;
uint8_t copy_policy;
uint8_t __pad[3];
diff --git a/sheep/cluster.h b/sheep/cluster.h
index 0fde142..f0b2753 100644
--- a/sheep/cluster.h
+++ b/sheep/cluster.h
@@ -51,6 +51,8 @@ struct cluster_driver {
*/
int (*get_local_addr)(uint8_t *myaddr);

+ uint8_t (*block_event_number)(void);
+
/*
* Join the cluster
*
diff --git a/sheep/cluster/zookeeper.c b/sheep/cluster/zookeeper.c
index 5c4c3c8..fda7e6c 100644
--- a/sheep/cluster/zookeeper.c
+++ b/sheep/cluster/zookeeper.c
@@ -1686,6 +1686,21 @@ static int zk_update_node(struct sd_node *node)
return add_event(EVENT_UPDATE_NODE, &znode, NULL, 0);
}

+static uint8_t zk_block_event_number(void)
+{
+ struct list_node *tmp;
+ int num = 0;
+
+ list_for_each(tmp, &zk_block_list) {
+ num++;
+ }
+
+ if (num > UINT8_MAX)
+ num = UINT8_MAX;
+
+ return num;
+}
+
static struct cluster_driver cdrv_zookeeper = {
.name = "zookeeper",

@@ -1699,6 +1714,7 @@ static struct cluster_driver cdrv_zookeeper = {
.unlock = zk_unlock,
.update_node = zk_update_node,
.get_local_addr = get_local_addr,
+ .block_event_number = zk_block_event_number,
};

cdrv_register(cdrv_zookeeper);
diff --git a/sheep/ops.c b/sheep/ops.c
index d54f528..4fd7395 100644
--- a/sheep/ops.c
+++ b/sheep/ops.c
@@ -486,6 +486,9 @@ static int local_stat_cluster(struct request *req)
elog->nr_copies = sys->cinfo.nr_copies;
elog->copy_policy = sys->cinfo.copy_policy;
elog->flags = sys->cinfo.flags;
+ if (sys->cdrv->block_event_number)
+ elog->block_event_number =
+ sys->cdrv->block_event_number();
pstrcpy(elog->drv_name, STORE_LEN,
(char *)sys->cinfo.store);
}
@@ -1125,6 +1128,8 @@ static int local_oids_exist(const struct sd_req *req, struct sd_rsp *rsp,
static int local_cluster_info(const struct sd_req *req, struct sd_rsp *rsp,
void *data, const struct sd_node *sender)
{
+ if (sys->cdrv->block_event_number)
+ sys->cinfo.have_block_event = !!sys->cdrv->block_event_number();
memcpy(data, &sys->cinfo, sizeof(sys->cinfo));
rsp->data_length = sizeof(sys->cinfo);
return SD_RES_SUCCESS;
--
1.9.1

Liu Yuan

unread,
Nov 17, 2016, 4:06:30 AM11/17/16
to sheep...@googlegroups.com
From: Liu Yuan <liu...@cmss.chinamobile.com>

The admin is advised to run 'cluster info -v' to check for block events before
doing a rolling update.

If there IS block event outstanding, sheep will exit with an error message.

Signed-off-by: Liu Yuan <liu...@cmss.chinamobile.com>
---
sheep/cluster/zookeeper.c | 7 ++++++-
1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/sheep/cluster/zookeeper.c b/sheep/cluster/zookeeper.c
index fda7e6c..0549b1e 100644
--- a/sheep/cluster/zookeeper.c
+++ b/sheep/cluster/zookeeper.c
@@ -1028,7 +1028,6 @@ static int zk_connect(const char *host, watcher_fn watcher, int timeout,
return 0;
}

-/*XXX: Check block event */
static void recover_zk_states(void)
{
struct String_vector strs;
@@ -1123,6 +1122,12 @@ static void recover_sheep_states(void)
exit(1);
}

+ if (cinfo.have_block_event) {
+ sd_err("There are blocking events unhandled in the cluster, we "
+ "can't join directly. Exiting...");
+ exit(1);
+ }
+
/* Update nodes from sys->cinfo */
rb_for_each_entry(zk, &zk_node_root, rb) {
for (int i = 0; i < cinfo.nr_nodes; i++) {
--
1.9.1

Liu Yuan

unread,
Nov 17, 2016, 4:06:32 AM11/17/16
to sheep...@googlegroups.com
From: Liu Yuan <liu...@cmss.chinamobile.com>

With rolling update, create_cluster is called in a multithreaded environment
where the signal handler is only registered for the main thread, so we have to
use tgkill to send the signal to the main thread only, instead of signalling
the whole process with kill.

Signed-off-by: Liu Yuan <liu...@cmss.chinamobile.com>
---
include/util.h | 1 +
lib/util.c | 9 ++++++++-
sheep/cluster/local.c | 3 ++-
3 files changed, 11 insertions(+), 2 deletions(-)

diff --git a/include/util.h b/include/util.h
index f034e01..bb74f60 100644
--- a/include/util.h
+++ b/include/util.h
@@ -146,6 +146,7 @@ bool is_numeric(const char *p);
const char *data_to_str(void *data, size_t data_length);
pid_t gettid(void);
int tkill(int tid, int sig);
+int tgkill(int tgid, int tid, int sig);
bool is_xattr_enabled(const char *path);
const char *my_exe_path(void);

diff --git a/lib/util.c b/lib/util.c
index eca0c5f..4134e12 100644
--- a/lib/util.c
+++ b/lib/util.c
@@ -399,9 +399,16 @@ pid_t gettid(void)
return syscall(SYS_gettid);
}

+int tgkill(int tgid, int tid, int sig)
+{
+
+ return syscall(SYS_tgkill, tgid, tid, sig);
+}
+
+/* Kill the thread of the tid in current process */
int tkill(int tid, int sig)
{
- return syscall(SYS_tgkill, getpid(), tid, sig);
+ return tgkill(getpid(), tid, sig);
}

bool is_xattr_enabled(const char *path)
diff --git a/sheep/cluster/local.c b/sheep/cluster/local.c
index 72b64fe..5da47ef 100644
--- a/sheep/cluster/local.c
+++ b/sheep/cluster/local.c
@@ -223,7 +223,8 @@ static void shm_queue_notify(void)

for (i = 0; i < nr; i++) {
sd_debug("send signal to %s", lnode_to_str(lnodes + i));
- kill(lnodes[i].pid, SIGUSR1);
+ /* pid == tid for main thread */
+ tgkill(lnodes[i].pid, lnodes[i].pid, SIGUSR1);
}
}

--
1.9.1

Liu Yuan

unread,
Nov 17, 2016, 4:06:34 AM11/17/16
to sheep...@googlegroups.com
From: Liu Yuan <liu...@cmss.chinamobile.com>

zk_control may delete znodes of the queue to reduce the memory footprint, so
.numChildren is not a reliable way to get the next_queue_pos.

Warning: the queue counter may wrap around; in that case you need to make sure
the sequence id is sane before the rolling update.

Signed-off-by: Liu Yuan <liu...@cmss.chinamobile.com>
---
sheep/cluster/zookeeper.c | 46 ++++++++++++++++++++++++++++------------------
1 file changed, 28 insertions(+), 18 deletions(-)

diff --git a/sheep/cluster/zookeeper.c b/sheep/cluster/zookeeper.c
index 0549b1e..2ec5f95 100644
--- a/sheep/cluster/zookeeper.c
+++ b/sheep/cluster/zookeeper.c
@@ -1028,16 +1028,39 @@ static int zk_connect(const char *host, watcher_fn watcher, int timeout,
return 0;
}

+static int32_t zk_get_max_queue_seq(void)
+{
+ char path[MAX_NODE_STR_LEN], *p, *tmp;
+ struct String_vector strs;
+ int32_t max_seq, seq;
+ int rc;
+
+ max_seq = INT32_MIN;
+
+ rc = zk_get_children(QUEUE_ZNODE, &strs);
+ if (rc != ZOK) {
+ sd_err("failed to get queue sequence, %s, exiting.",
+ zerror(rc));
+ exit(1);
+ }
+
+ FOR_EACH_ZNODE(QUEUE_ZNODE, path, &strs) {
+ p = strrchr(path, '/');
+ seq = strtol(++p, &tmp, 10);
+ if (seq > max_seq)
+ max_seq = seq;
+ }
+
+ return max_seq;
+}
+
static void recover_zk_states(void)
{
struct String_vector strs;
char path[MAX_NODE_STR_LEN];
clientid_t sid;
- struct Stat stat;
zhandle_t *tmp_handle = zhandle;
int len = sizeof(clientid_t), rc;
- char buf[512];
- int buflen = sizeof(buf);

/* Recover the old session at first */
snprintf(path, sizeof(path), MEMBER_ZNODE "/%s",
@@ -1071,21 +1094,8 @@ static void recover_zk_states(void)
}

/* Set queue position and corresponding watcher */
- snprintf(path, sizeof(path), QUEUE_ZNODE);
- rc = zoo_get(zhandle, path, 0, buf, &buflen, &stat);
- switch (rc) {
- case ZOK:
- break;
- case ZNONODE:
- sd_err("No node %s, exiting...", path);
- exit(1);
- default:
- sd_err("Failed to get data for %s, %s, exiting", path,
- zerror(rc));
- exit(1);
- }
- sd_debug("next queue pos: %d", stat.numChildren);
- queue_pos = stat.numChildren;
+ queue_pos = zk_get_max_queue_seq() + 1;
+ sd_debug("next queue pos: %d", queue_pos);
first_push = false;
snprintf(path, sizeof(path), QUEUE_ZNODE "/%010"PRId32, queue_pos);
if (zk_node_exists(path) != ZNONODE) {
--
1.9.1

Liu Yuan

unread,
Nov 17, 2016, 4:06:36 AM11/17/16
to sheep...@googlegroups.com
From: Liu Yuan <liu...@cmss.chinamobile.com>

Rolling-update (commit 320a622) moved create_cluster() towards the end, which
causes sys_epoch() to return 0 at the init phase for init_vdi_state(). We
shouldn't rely on the calling order of init_vdi_state(); instead we should
directly read the object from the path indicated by the parameters.

Signed-off-by: Liu Yuan <liu...@cmss.chinamobile.com>
---
sheep/store/plain_store.c | 76 +++++++++++++++++++++++++----------------------
1 file changed, 40 insertions(+), 36 deletions(-)

diff --git a/sheep/store/plain_store.c b/sheep/store/plain_store.c
index 00307a8..85d4ce4 100644
--- a/sheep/store/plain_store.c
+++ b/sheep/store/plain_store.c
@@ -141,6 +141,38 @@ int default_cleanup(void)
return SD_RES_SUCCESS;
}

+static int default_read_from_path(uint64_t oid, const char *path,
+ const struct siocb *iocb)
+{
+ int flags = prepare_iocb(oid, iocb, false), fd,
+ ret = SD_RES_SUCCESS;
+ ssize_t size;
+
+ /*
+ * Make sure oid is in the right place because oid might be misplaced
+ * in a wrong place, due to 'shutdown/restart with less disks' or any
+ * bugs. We need call err_to_sderr() to return EIO if disk is broken.
+ *
+ * For stale path, get_store_stale_path already does default_exist job.
+ */
+ if (!is_stale_path(path) && !default_exist(oid, iocb->ec_index))
+ return err_to_sderr(path, oid, ENOENT);
+
+ fd = open(path, flags);
+ if (fd < 0)
+ return err_to_sderr(path, oid, errno);
+
+ size = xpread(fd, iocb->buf, iocb->length, iocb->offset);
+ if (size < 0) {
+ sd_err("failed to read object %"PRIx64", path=%s, offset=%"
+ PRId32", size=%"PRId32", result=%zd, %m", oid, path,
+ iocb->offset, iocb->length, size);
+ ret = err_to_sderr(path, oid, errno);
+ }
+ close(fd);
+ return ret;
+}
+
static int init_vdi_state(uint64_t oid, const char *wd, uint32_t epoch)
{
int ret;
@@ -150,16 +182,20 @@ static int init_vdi_state(uint64_t oid, const char *wd, uint32_t epoch)
.buf = inode,
.length = SD_INODE_HEADER_SIZE,
};
+ char path[PATH_MAX];

- ret = default_read(oid, &iocb);
+ if (epoch == 0)
+ get_store_path(oid, iocb.ec_index, path);
+ else
+ get_store_stale_path(oid, iocb.epoch, iocb.ec_index, path);
+
+ ret = default_read_from_path(oid, path, &iocb);
if (ret != SD_RES_SUCCESS) {
sd_err("failed to read inode header %" PRIx64 " %" PRId32
- "wat %s", oid, epoch, wd);
+ "at %s", oid, epoch, path);
goto out;
}
atomic_set_bit(oid_to_vid(oid), sys->vdi_inuse);
-
- ret = SD_RES_SUCCESS;
out:
free(inode);
return ret;
@@ -199,38 +235,6 @@ int default_init(void)
return for_each_object_in_wd(init_objlist_and_vdi_bitmap, true, NULL);
}

-static int default_read_from_path(uint64_t oid, const char *path,
- const struct siocb *iocb)
-{
- int flags = prepare_iocb(oid, iocb, false), fd,
- ret = SD_RES_SUCCESS;
- ssize_t size;
-
- /*
- * Make sure oid is in the right place because oid might be misplaced
- * in a wrong place, due to 'shutdown/restart with less disks' or any
- * bugs. We need call err_to_sderr() to return EIO if disk is broken.
- *
- * For stale path, get_store_stale_path already does default_exist job.
- */
- if (!is_stale_path(path) && !default_exist(oid, iocb->ec_index))
- return err_to_sderr(path, oid, ENOENT);
-
- fd = open(path, flags);
- if (fd < 0)
- return err_to_sderr(path, oid, errno);
-
- size = xpread(fd, iocb->buf, iocb->length, iocb->offset);
- if (size < 0) {
- sd_err("failed to read object %"PRIx64", path=%s, offset=%"
- PRId32", size=%"PRId32", result=%zd, %m", oid, path,
- iocb->offset, iocb->length, size);
- ret = err_to_sderr(path, oid, errno);
- }
- close(fd);
- return ret;
-}
-
int default_read(uint64_t oid, const struct siocb *iocb)
{
int ret;
--
1.9.1

Liu Yuan

unread,
Nov 17, 2016, 4:06:39 AM11/17/16
to sheep...@googlegroups.com
From: Liu Yuan <liu...@cmss.chinamobile.com>

The io_address is not initialized yet, so we can only use the node address.

Signed-off-by: Liu Yuan <liu...@cmss.chinamobile.com>
---
sheep/cluster/zookeeper.c | 15 +++++++++++++--
1 file changed, 13 insertions(+), 2 deletions(-)

diff --git a/sheep/cluster/zookeeper.c b/sheep/cluster/zookeeper.c
index 2ec5f95..9e0decb 100644
--- a/sheep/cluster/zookeeper.c
+++ b/sheep/cluster/zookeeper.c
@@ -1117,12 +1117,23 @@ static void recover_sheep_states(void)
int ret = SD_RES_CLUSTER_ERROR;

rb_for_each_entry(n, &sd_node_root, rb) {
+ struct sd_rsp *rsp = (struct sd_rsp *)&hdr;
+ int fd;
+
if (node_eq(&this_node.node, n))
continue;
sd_init_req(&hdr, SD_OP_CLUSTER_INFO);
hdr.data_length = sizeof(cinfo);
- ret = sheep_exec_req(&n->nid, &hdr, &cinfo);
- if (ret == SD_RES_SUCCESS)
+ fd = connect_to_addr(n->nid.addr, n->nid.port);
+ if (fd < 0)
+ continue;
+ sd_debug("try to get cinfo from node: %s",
+ addr_to_str(n->nid.addr, n->nid.port));
+ ret = exec_req(fd, &hdr, &cinfo, NULL, 0, 0);
+ close(fd);
+ if (ret)
+ continue;
+ if (rsp->result == SD_RES_SUCCESS)
break;
}
if (ret != SD_RES_SUCCESS) {
--
1.9.1

Reply all
Reply to author
Forward
0 new messages