From: Gui Hecheng <
guihe...@cmss.chinamobile.com>
Suppose we have 3 store nodes and 1 gateway node in a cluster
  Id   Host:Port        V-Nodes   Zone
   0   10.0.0.1:7000      98        1
   1   10.0.0.1:7001       0        2
   2   10.0.0.2:7000      98        3
   3   10.0.0.3:7000      98        4
1. dog vdi create test 1g
2. dog node kill 2
3. dog vdi track test
we'll get the following:
Tracking the inode object 0x7c2b25 with 3 nodes
obj 807c2b2500000000 locations at epoch 2, copies = 3
---------------------------------------------------
PANIC: can't find a valid vnode
dog exits unexpectedly (Aborted).
dog.c:375: crash_handler
/lib64/libpthread.so.0(+0xf0ff) [0x7f81c538c0ff]
/lib64/libc.so.6(gsignal+0x36) [0x7f81c4de95f6]
/lib64/libc.so.6(abort+0x147) [0x7f81c4deace7]
sheep.h:81: oid_to_vnodes
sheep.h:80: oid_to_vnodes
dog.c:576: main
/lib64/libc.so.6(__libc_start_main+0xf4) [0x7f81c4dd5b14]
dog() [0x403dc8]
The panic above means that one copy has no node to be stored on: only
two store nodes remain, while three copies are expected.
The vdi track command does try to handle this situation, but its check
does not cover the case where gateway-only nodes are present:
/*
* When # of nodes is less than nr_copies, we only print
* remaining nodes that holds all the remaining copies.
*/
if (log->nr_nodes < nr_copies) {
...
continue; // also a bug, fixed together
}
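Here we should exclude gateway-only nodes, both when counting nodes and
when printing them.
Also, we should jump ("goto") to the code that advances to the next epoch
log instead of using a plain "continue", which skips that advance.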
Signed-off-by: Gui Hecheng <
guihe...@cmss.chinamobile.com>
---
dog/vdi.c | 14 ++++++++++++--
1 file changed, 12 insertions(+), 2 deletions(-)
diff --git a/dog/vdi.c b/dog/vdi.c
index 58265b4..4018087 100644
--- a/dog/vdi.c
+++ b/dog/vdi.c
@@ -1090,6 +1090,7 @@ retry:
for (i = nr_logs - 1; i >= 0; i--) {
struct rb_root vroot = RB_ROOT;
struct rb_root nroot = RB_ROOT;
+ uint32_t store_nodes_nr = 0;
log = (struct epoch_log *)next_log;
@@ -1102,17 +1103,25 @@ retry:
data, parity);
printf("---------------------------------------------------\n");
+ for (int k = 0; k < log->nr_nodes; k++)
+ if (log->nodes[k].nr_vnodes != 0)
+ store_nodes_nr++;
+
/*
* When # of nodes is less than nr_copies, we only print
* remaining nodes that holds all the remaining copies.
+ * Here # of nodes do not include gateway only nodes.
*/
- if (log->nr_nodes < nr_copies) {
+ if (store_nodes_nr < nr_copies) {
for (j = 0; j < log->nr_nodes; j++) {
+ if (log->nodes[j].nr_vnodes == 0)
+ continue;
+
const struct node_id *n = &log->nodes[j].nid;
printf("%s\n", addr_to_str(n->addr, n->port));
}
- continue;
+ goto next;
}
for (int k = 0; k < log->nr_nodes; k++)
rb_insert(&nroot, &log->nodes[k], rb, node_cmp);
@@ -1127,6 +1136,7 @@ retry:
printf("%s\n", addr_to_str(n->addr, n->port));
}
rb_destroy(&vroot, struct sd_vnode, rb);
+next:
next_log = (char *)log->nodes
+ nodes_nr * sizeof(struct sd_node);
}
--
1.8.3.1