[PATCH] sheep/recovery: schedule prio_oids to be recovered really in FIFO order

2 views

Skip to first unread message

Meng Lingkun

unread,

Aug 29, 2016, 5:25:07 AM8/29/16

to sheep...@googlegroups.com, menglingkun, Xu Shenghong

From: menglingkun <mengl...@cmss.chinamobile.com>

prepare_schedule_oid() appends oids to recovery_info.prio_oids in order,
and finish_schedule_oids() inserts recovery_info.prio_oids at the position
of the next oid to be recovered. As a result, newly scheduled prio_oids are
placed ahead of the previously scheduled prio_oids that have not been
recovered yet. In the worst case, some prio_oids keep being pushed back by
newer ones and are never recovered in time, which makes the gateway time
out. This patch fixes it.

In our tests (6 nodes, 3 disks per node):
the I/O latency increases to 40s (write) / 210s (read).
With this patch applied, the max latency is 2.52s (write) / 2.39s (read).

Signed-off-by: Meng Lingkun <mengl...@cmss.chinamobile.com>
Signed-off-by: Xu Shenghong <xushe...@cmss.chinamobile.com>
---
sheep/recovery.c | 30 +++++++++++++++++++++++++-----
1 file changed, 25 insertions(+), 5 deletions(-)

diff --git a/sheep/recovery.c b/sheep/recovery.c
index e61f4fb..415fba8 100644
--- a/sheep/recovery.c
+++ b/sheep/recovery.c
@@ -52,6 +52,8 @@ struct recovery_info {
uint32_t tgt_epoch;
uint64_t done;
uint64_t next;
+ /* indicate the next idx of the LAST prio_oids put in oids*/
+ uint64_t last_prio;

bool notify_complete;

@@ -594,6 +596,13 @@ static inline void prepare_schedule_oid(uint64_t oid)
return;
}

+ if (rinfo->last_prio > rinfo->next
+ && xlfind(&oid, rinfo->oids + rinfo->next,
+ rinfo->last_prio - rinfo->next, oid_cmp)) {
+ sd_debug("%" PRIx64 " has been already scheduled", oid);
+ return;
+ }
+
rinfo->nr_prio_oids++;
rinfo->prio_oids = xrealloc(rinfo->prio_oids,
rinfo->nr_prio_oids * sizeof(uint64_t));
@@ -817,6 +826,7 @@ static inline bool oid_in_prio_oids(struct recovery_info *rinfo, uint64_t oid)
static inline void finish_schedule_oids(struct recovery_info *rinfo)
{
uint64_t i, nr_recovered = rinfo->next, new_idx;
+ uint64_t last_prio = rinfo->last_prio;
uint64_t *new_oids;

/* If I am the last oid, done */
@@ -824,12 +834,22 @@ static inline void finish_schedule_oids(struct recovery_info *rinfo)
goto done;

new_oids = xmalloc(list_buffer_size);
- memcpy(new_oids, rinfo->oids, nr_recovered * sizeof(uint64_t));
- memcpy(new_oids + nr_recovered, rinfo->prio_oids,
- rinfo->nr_prio_oids * sizeof(uint64_t));
- new_idx = nr_recovered + rinfo->nr_prio_oids;
+ if (last_prio > 0 && last_prio > nr_recovered) {
+ memcpy(new_oids, rinfo->oids, last_prio * sizeof(uint64_t));
+ memcpy(new_oids + last_prio, rinfo->prio_oids,
+ rinfo->nr_prio_oids * sizeof(uint64_t));
+ new_idx = last_prio + rinfo->nr_prio_oids;
+ i = last_prio;
+ } else {
+ memcpy(new_oids, rinfo->oids, nr_recovered * sizeof(uint64_t));
+ memcpy(new_oids + nr_recovered, rinfo->prio_oids,
+ rinfo->nr_prio_oids * sizeof(uint64_t));
+ new_idx = nr_recovered + rinfo->nr_prio_oids;
+ i = nr_recovered;
+ }
+ rinfo->last_prio = new_idx;

- for (i = rinfo->next; i < rinfo->count; i++) {
+ for (; i < rinfo->count; i++) {
if (oid_in_prio_oids(rinfo, rinfo->oids[i]))
continue;
new_oids[new_idx++] = rinfo->oids[i];
--
1.8.3.1

Liu Yuan

unread,

Aug 30, 2016, 1:12:06 AM8/30/16

to Meng Lingkun, sheep...@googlegroups.com, menglingkun, Xu Shenghong

On Mon, Aug 29, 2016 at 02:24:33AM -0700, Meng Lingkun wrote:
> From: menglingkun <mengl...@cmss.chinamobile.com>
>
> Prepare_schedule_oid place oids to recovery_info.prio_oids in order,
> and finish_schedule_oids place recovery_info.prio_oids to the next
> to be recovered. So the recovery_info.prio_oids will take the place
> of the LAST recovery_info.prio_oids which hasn't been recovered.
> Seriously, some of the recovery_info.prio_oids will always be taken
> the place of, and make the gateway timeout. This patch fixes it.
>
> In our tests (6 nodes, 3 disks per node):
> the IO lantency will increase to 40s(write)/210s(read)
> Applying this patch, the max lantency is 2.52s(write)/2.39s(read).
>
> Signed-off-by: Meng Lingkun <mengl...@cmss.chinamobile.com>
> Signed-off-by: Xu Shenghong <xushe...@cmss.chinamobile.com>