[RFC v1] add new io-scheduler to use cgroup on high-speed device

Showing 1-12 of 12 messages
[RFC v1] add new io-scheduler to use cgroup on high-speed device Robin Dong 6/4/13 7:30 PM
We want to use blkio.cgroup on high-speed device (like fusionio) for our mysql clusters.
After testing different io-scheduler, we found that  cfq is too slow and deadline can't run on cgroup.
So we developed a new io-scheduler: tpps (Tiny Parallel Proportion Scheduler).It dispatch requests
only by using their individual weight and total weight (proportion) therefore it's simply and efficient.

Test case: fusionio card, 4 cgroups, iodepth-512

groupname  weight
test1      1000
test2      800
test3      600
test4      400

Use tpps, the result is:

groupname  iops    avg-rt(ms)   max-rt(ms)
test1      30220   16           54
test2      28261   18           56
test3      26333   19           69
test4      20152   25           87

Use cfq, the result is:

groupname  iops    avg-rt(ms)   max-rt(ms)
test1      16478   30           242
test2      13015   39           347
test3       9300   54           371
test4       5806   87           393

Signed-off-by: Robin Dong <san...@taobao.com>
Signed-off-by: Zhu Yanhai <gaoya...@taobao.com>
Cc: Tejun Heo <t...@kernel.org>
Cc: Vivek Goyal <vgo...@redhat.com>
Cc: Jens Axboe <ax...@kernel.dk>
Cc: Tao Ma <taom...@gmail.com>
---
 block/Kconfig.iosched  |   13 +
 block/Makefile         |    1 +
 block/tpps-iosched.c   | 1272 ++++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/blkdev.h |    2 +-
 4 files changed, 1287 insertions(+), 1 deletions(-)
 create mode 100644 block/tpps-iosched.c

diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched
index 421bef9..e5e28c2 100644
--- a/block/Kconfig.iosched
+++ b/block/Kconfig.iosched
@@ -21,6 +21,16 @@ config IOSCHED_DEADLINE
          a new point in the service tree and doing a batch of IO from there
          in case of expiry.

+config IOSCHED_TPPS
+       tristate "TPPS I/O scheduler"
+       # If BLK_CGROUP is a module, TPPS has to be built as module.
+       default y
+       ---help---
+         The TPPS I/O scheduler tries to distribute iops proportional
+         among all cgroups in the system. It should also provide a low
+         latency working environment, suitable for flash-based device.
+         Note: If BLK_CGROUP=m, then TPPS can be built only as module.
+
 config IOSCHED_CFQ
        tristate "CFQ I/O scheduler"
        default y
@@ -49,6 +59,9 @@ choice
        config DEFAULT_DEADLINE
                bool "Deadline" if IOSCHED_DEADLINE=y

+       config DEFAULT_TPPS
+               bool "Tiny Parallel Proportion" if IOSCHED_TPPS=y
+
        config DEFAULT_CFQ
                bool "CFQ" if IOSCHED_CFQ=y

diff --git a/block/Makefile b/block/Makefile
index 39b76ba..6e30ef4 100644
--- a/block/Makefile
+++ b/block/Makefile
@@ -15,6 +15,7 @@ obj-$(CONFIG_BLK_DEV_THROTTLING)      += blk-throttle.o
 obj-$(CONFIG_IOSCHED_NOOP)     += noop-iosched.o
 obj-$(CONFIG_IOSCHED_DEADLINE) += deadline-iosched.o
 obj-$(CONFIG_IOSCHED_CFQ)      += cfq-iosched.o
+obj-$(CONFIG_IOSCHED_TPPS)     += tpps-iosched.o

 obj-$(CONFIG_BLOCK_COMPAT)     += compat_ioctl.o
 obj-$(CONFIG_BLK_DEV_INTEGRITY)        += blk-integrity.o
diff --git a/block/tpps-iosched.c b/block/tpps-iosched.c
new file mode 100644
index 0000000..981fde2
--- /dev/null
+++ b/block/tpps-iosched.c
@@ -0,0 +1,1272 @@
+/*
+ *  TPPS, or Tiny Parallel Proportion disk Scheduler.
+ *
+ *  Based on ideas from Zhu Yanhai <gaoya...@taobao.com>
+ *
+ *  Copyright (C) 2013 Robin Dong <san...@taobao.com>
+ */
+#include <linux/module.h>
+#include <linux/blkdev.h>
+#include <linux/elevator.h>
+#include <linux/jiffies.h>
+#include <linux/rbtree.h>
+#include <linux/ioprio.h>
+#include <linux/blktrace_api.h>
+#include "blk-cgroup.h"
+#include "blk.h"
+
+static struct kmem_cache *tpps_pool;
+
+struct tpps_queue {
+       /* reference count */
+       int ref;
+       /* parent tpps_data */
+       struct tpps_data *tppd;
+       /* tpps_group member */
+       struct list_head tppg_node;
+       /* sorted list of pending requests */
+       struct list_head sort_list;
+       struct tpps_group *tppg;
+       pid_t pid;
+       int online;
+       int rq_queued;
+};
+
+struct tppg_stats {
+       /* total bytes transferred */
+       struct blkg_rwstat              service_bytes;
+       /* total IOs serviced, post merge */
+       struct blkg_rwstat              serviced;
+       /* number of ios merged */
+       struct blkg_rwstat              merged;
+       /* total time spent on device in ns, may not be accurate w/ queueing */
+       struct blkg_rwstat              service_time;
+       /* total time spent waiting in scheduler queue in ns */
+       struct blkg_rwstat              wait_time;
+       /* number of IOs queued up */
+       struct blkg_rwstat              queued;
+       /* total sectors transferred */
+       struct blkg_stat                sectors;
+       /* total disk time and nr sectors dispatched by this group */
+       struct blkg_stat                time;
+};
+
+struct tpps_group {
+       struct blkg_policy_data pd;
+       /* tpps_data member */
+       struct list_head tppd_node;
+       struct list_head *cur_dispatcher;
+
+       unsigned int weight;
+       unsigned int new_weight;
+       unsigned int dev_weight;
+       unsigned int leaf_weight;
+       unsigned int new_leaf_weight;
+       unsigned int dev_leaf_weight;
+
+       bool needs_update;
+
+       /*
+        * lists of queues with requests.
+        */
+       struct list_head queue_list;
+       int nr_tppq;
+       int rq_queued;
+       int rq_in_driver;
+
+       struct tppg_stats stats;        /* stats for this tppg */
+       struct tppg_stats dead_stats;   /* stats pushed from dead children */
+};
+
+struct tpps_io_cq {
+       struct io_cq            icq;            /* must be the first member */
+       struct tpps_queue       *tppq;
+       uint64_t                        blkcg_id;       /* the current blkcg ID */
+};
+
+struct tpps_data {
+       struct request_queue *queue;
+       struct tpps_group *root_group;
+
+       /* List of tpps groups being managed on this device*/
+       struct list_head group_list;
+
+       unsigned int busy_queues;
+       int dispatched;
+       int rq_in_driver;
+
+       struct work_struct unplug_work;
+
+       /* Number of groups which are on blkcg->blkg_list */
+       unsigned int nr_blkcg_linked_grps;
+
+       unsigned total_weight;
+};
+
+static inline struct blkcg_gq *tppg_to_blkg(struct tpps_group *tppg)
+{
+       return pd_to_blkg(&tppg->pd);
+}
+
+#define tpps_log_tppq(tppd, tppq, fmt, args...)        do {                    \
+       char __pbuf[128];                                               \
+                                                                       \
+       blkg_path(tppg_to_blkg((tppq)->tppg), __pbuf, sizeof(__pbuf));  \
+       blk_add_trace_msg((tppd)->queue, "tpps%d %s " fmt, (tppq)->pid, \
+                         __pbuf, ##args);                              \
+} while (0)
+
+#define tpps_log_tppg(tppd, tppg, fmt, args...)        do {                    \
+       char __pbuf[128];                                               \
+                                                                       \
+       blkg_path(tppg_to_blkg(tppg), __pbuf, sizeof(__pbuf));          \
+       blk_add_trace_msg((tppd)->queue, "%s " fmt, __pbuf, ##args);    \
+} while (0)
+#define tpps_log(tppd, fmt, args...)   \
+       blk_add_trace_msg((tppd)->queue, "tpps " fmt, ##args)
+
+static inline struct tpps_io_cq *icq_to_tic(struct io_cq *icq)
+{
+       /* tic->icq is the first member, %NULL will convert to %NULL */
+       return container_of(icq, struct tpps_io_cq, icq);
+}
+
+#define RQ_TIC(rq)     icq_to_tic((rq)->elv.icq)
+#define RQ_TPPQ(rq)    (struct tpps_queue *) ((rq)->elv.priv[0])
+#define RQ_TPPG(rq)    (struct tpps_group *) ((rq)->elv.priv[1])
+
+#define TPPS_WEIGHT_DEFAULT    (500)
+#define MIN_DISPATCH_RQ                (8)
+
+static struct blkcg_policy blkcg_policy_tpps;
+
+static inline struct tpps_group *pd_to_tppg(struct blkg_policy_data *pd)
+{
+       return pd ? container_of(pd, struct tpps_group, pd) : NULL;
+}
+
+static inline struct tpps_group *blkg_to_tppg(struct blkcg_gq *blkg)
+{
+       return pd_to_tppg(blkg_to_pd(blkg, &blkcg_policy_tpps));
+}
+
+static inline struct tpps_io_cq *
+tpps_tic_lookup(struct tpps_data *tppd, struct io_context *ioc)
+{
+       if (ioc)
+               return icq_to_tic(ioc_lookup_icq(ioc, tppd->queue));
+       return NULL;
+}
+
+static inline struct tpps_queue *tic_to_tppq(struct tpps_io_cq *tic)
+{
+       return tic->tppq;
+}
+
+static inline void tic_set_tppq(struct tpps_io_cq *tic, struct tpps_queue *tppq)
+{
+       tic->tppq = tppq;
+}
+
+static inline struct tpps_data *tic_to_tppd(struct tpps_io_cq *tic)
+{
+       return tic->icq.q->elevator->elevator_data;
+}
+
+static inline void tppg_get(struct tpps_group *tppg)
+{
+       return blkg_get(tppg_to_blkg(tppg));
+}
+
+static inline void tppg_put(struct tpps_group *tppg)
+{
+       return blkg_put(tppg_to_blkg(tppg));
+}
+
+static inline void tppg_stats_update_io_add(struct tpps_group *tppg,
+                                           struct tpps_group *curr_tppg, int rw)
+{
+       blkg_rwstat_add(&tppg->stats.queued, rw, 1);
+}
+
+static inline void tppg_stats_update_io_remove(struct tpps_group *tppg, int rw)
+{
+       blkg_rwstat_add(&tppg->stats.queued, rw, -1);
+}
+
+static inline void tppg_stats_update_io_merged(struct tpps_group *tppg, int rw)
+{
+       blkg_rwstat_add(&tppg->stats.merged, rw, 1);
+}
+
+static inline void tppg_stats_update_dispatch(struct tpps_group *tppg,
+                                             uint64_t bytes, int rw)
+{
+       blkg_stat_add(&tppg->stats.sectors, bytes >> 9);
+       blkg_rwstat_add(&tppg->stats.serviced, rw, 1);
+       blkg_rwstat_add(&tppg->stats.service_bytes, rw, bytes);
+}
+
+static inline void tppg_stats_update_completion(struct tpps_group *tppg,
+                       uint64_t start_time, uint64_t io_start_time, int rw)
+{
+       struct tppg_stats *stats = &tppg->stats;
+       unsigned long long now = sched_clock();
+
+       if (time_after64(now, io_start_time))
+               blkg_rwstat_add(&stats->service_time, rw, now - io_start_time);
+       if (time_after64(io_start_time, start_time))
+               blkg_rwstat_add(&stats->wait_time, rw,
+                               io_start_time - start_time);
+}
+
+static void tpps_del_queue(struct tpps_queue *tppq)
+{
+       struct tpps_data *tppd = tppq->tppd;
+       struct tpps_group *tppg = tppq->tppg;
+
+       if (!list_empty(&tppq->tppg_node)) {
+               list_del_init(&tppq->tppg_node);
+               tpps_log_tppq(tppd, tppq, "del queue\n");
+               tppg->cur_dispatcher = NULL;
+               tppq->tppg = NULL;
+       }
+
+       printk("%p nr_tppq:%d\n", tppg, tppg->nr_tppq);
+       BUG_ON(tppg->nr_tppq < 1);
+       tppg->nr_tppq--;
+       if (!tppg->nr_tppq)
+               tppd->total_weight -= tppg->pd.blkg->blkcg->cfq_weight;
+
+       BUG_ON(!tppd->busy_queues);
+       tppd->busy_queues--;
+}
+
+/*
+ * task holds one reference to the queue, dropped when task exits. each rq
+ * in-flight on this queue also holds a reference, dropped when rq is freed.
+ *
+ * Each tpps queue took a reference on the parent group. Drop it now.
+ * queue lock must be held here.
+ */
+static void tpps_put_queue(struct tpps_queue *tppq)
+{
+       struct tpps_data *tppd = tppq->tppd;
+       struct tpps_group *tppg;
+
+       BUG_ON(tppq->ref <= 0);
+
+       tppq->ref--;
+       if (tppq->ref)
+               return;
+
+       tpps_log_tppq(tppd, tppq, "put_queue");
+       BUG_ON(!list_empty(&tppq->sort_list));
+       tppg = tppq->tppg;
+
+       tpps_del_queue(tppq);
+       kmem_cache_free(tpps_pool, tppq);
+       tppg_put(tppg);
+}
+
+static void tpps_init_tppq(struct tpps_data *tppd, struct tpps_queue *tppq,
+                         pid_t pid)
+{
+       INIT_LIST_HEAD(&tppq->tppg_node);
+       INIT_LIST_HEAD(&tppq->sort_list);
+
+       tppq->ref = 0;
+       tppq->tppd = tppd;
+       tppq->pid = pid;
+
+}
+
+static void tpps_link_tppq_tppg(struct tpps_queue *tppq,
+               struct tpps_group *tppg)
+{
+       tppq->tppg = tppg;
+       /* tppq reference on tppg */
+       tppg_get(tppg);
+}
+
+static struct tpps_group *tpps_lookup_create_tppg(struct tpps_data *tppd,
+                                               struct blkcg *blkcg)
+{
+       struct request_queue *q = tppd->queue;
+       struct tpps_group *tppg = NULL;
+
+       /* avoid lookup for the common case where there's no blkcg */
+       if (blkcg == &blkcg_root) {
+               tppg = tppd->root_group;
+       } else {
+               struct blkcg_gq *blkg;
+
+               blkg = blkg_lookup_create(blkcg, q);
+               if (!IS_ERR(blkg))
+                       tppg = blkg_to_tppg(blkg);
+       }
+
+       return tppg;
+}
+
+static struct tpps_queue *
+tpps_find_alloc_queue(struct tpps_data *tppd, struct tpps_io_cq* tic, struct bio *bio,
+               gfp_t gfp_mask)
+{
+       struct tpps_queue *tppq, *new_tppq = NULL;
+       struct tpps_group *tppg;
+       struct blkcg *blkcg;
+
+retry:
+       rcu_read_lock();
+
+       blkcg = bio_blkcg(bio);
+       tppg = tpps_lookup_create_tppg(tppd, blkcg);
+       tppq = tic_to_tppq(tic);
+
+       if (!tppq) {
+               if (new_tppq) {
+                       tppq = new_tppq;
+                       new_tppq = NULL;
+               } else if (gfp_mask & __GFP_WAIT) {
+                       rcu_read_unlock();
+                       spin_unlock_irq(tppd->queue->queue_lock);
+                       new_tppq = kmem_cache_alloc_node(tpps_pool,
+                                       gfp_mask | __GFP_ZERO,
+                                       tppd->queue->node);
+                       spin_lock_irq(tppd->queue->queue_lock);
+                       if (new_tppq)
+                               goto retry;
+               } else
+                       tppq = kmem_cache_alloc_node(tpps_pool,
+                                       gfp_mask | __GFP_ZERO,
+                                       tppd->queue->node);
+
+               if (tppq) {
+                       tpps_init_tppq(tppd, tppq, current->pid);
+                       tpps_link_tppq_tppg(tppq, tppg);
+                       tpps_log_tppq(tppd, tppq, "alloced");
+               }
+       }
+
+       if (new_tppq)
+               kmem_cache_free(tpps_pool, new_tppq);
+
+       rcu_read_unlock();
+       return tppq;
+}
+
+static struct tpps_queue *
+tpps_get_queue(struct tpps_data *tppd, struct tpps_io_cq *tic, struct bio *bio,
+                       gfp_t gfp_mask)
+{
+       struct tpps_queue *tppq;
+
+       tppq = tpps_find_alloc_queue(tppd, tic, bio, gfp_mask);
+       tppq->ref++;
+       return tppq;
+}
+
+/*
+ * scheduler run of queue, if there are requests pending and no one in the
+ * driver that will restart queueing
+ */
+static inline void tpps_schedule_dispatch(struct tpps_data *tppd)
+{
+       if (tppd->busy_queues) {
+               tpps_log(tppd, "schedule dispatch");
+               kblockd_schedule_work(tppd->queue, &tppd->unplug_work);
+       }
+}
+
+static void check_blkcg_changed(struct tpps_io_cq *tic, struct bio *bio)
+{
+       struct tpps_data *tppd = tic_to_tppd(tic);
+       struct tpps_queue *tppq;
+       uint64_t id;
+
+       rcu_read_lock();
+       id = bio_blkcg(bio)->id;
+       rcu_read_unlock();
+
+       /*
+        * Check whether blkcg has changed.  The condition may trigger
+        * spuriously on a newly created tic but there's no harm.
+        */
+       if (unlikely(!tppd) || likely(tic->blkcg_id == id))
+               return;
+
+       tppq = tic_to_tppq(tic);
+       if (tppq) {
+               /*
+                * Drop reference to sync queue. A new sync queue will be
+                * assigned in new group upon arrival of a fresh request.
+                */
+               tpps_log_tppq(tppd, tppq, "changed cgroup");
+               tic_set_tppq(tic, NULL);
+               tpps_put_queue(tppq);
+       }
+
+       tic->blkcg_id = id;
+}
+
+static int
+tpps_set_request(struct request_queue *q, struct request *rq, struct bio *bio,
+                       gfp_t gfp_mask)
+{
+       struct tpps_data *tppd = q->elevator->elevator_data;
+       struct tpps_io_cq *tic = icq_to_tic(rq->elv.icq);
+       struct tpps_queue *tppq;
+
+       might_sleep_if(gfp_mask & __GFP_WAIT);
+
+       spin_lock_irq(q->queue_lock);
+
+       check_blkcg_changed(tic, bio);
+
+       tppq = tic_to_tppq(tic);
+       if (!tppq) {
+               tppq = tpps_get_queue(tppd, tic, bio, gfp_mask);
+               tic_set_tppq(tic, tppq);
+       }
+
+       tppq->ref++;
+       tppg_get(tppq->tppg);
+       rq->elv.priv[0] = tppq;
+       rq->elv.priv[1] = tppq->tppg;
+       spin_unlock_irq(q->queue_lock);
+       return 0;
+}
+
+/*
+ * queue lock held here
+ */
+static void tpps_put_request(struct request *rq)
+{
+       struct tpps_queue *tppq = RQ_TPPQ(rq);
+
+       if (tppq) {
+               WARN_ON(tppq->tppg != RQ_TPPG(rq));
+
+               /* Put down rq reference on cfqg */
+               tppg_put(RQ_TPPG(rq));
+               rq->elv.priv[0] = NULL;
+               rq->elv.priv[1] = NULL;
+
+               tpps_put_queue(tppq);
+       }
+}
+
+static void
+tpps_update_group_weight(struct tpps_group *tppg)
+{
+       if (tppg->needs_update) {
+               tppg->weight = tppg->new_weight;
+               tppg->needs_update = false;
+       }
+}
+
+static void tpps_add_queue(struct tpps_data *tppd, struct tpps_queue *tppq)
+{
+       struct tpps_group *tppg;
+
+       if (!tppq->online) {
+               tppq->online = 1;
+               tppg = tppq->tppg;
+               tpps_log_tppq(tppd, tppq, "add queue");
+               tppg->nr_tppq++;
+               tppd->busy_queues++;
+               list_add(&tppq->tppg_node, &tppg->queue_list);
+               printk("add tppq %p to %p\n", tppq, tppg);
+               tpps_update_group_weight(tppg);
+               if (tppg->nr_tppq <= 1) {
+                       tppd->total_weight += tppg->pd.blkg->blkcg->cfq_weight;
+                       list_add(&tppg->tppd_node, &tppd->group_list);
+                       printk("twt:%u, wt:%u %u %d %p\n", tppd->total_weight, tppg->weight,
+                                       tppg->pd.blkg->blkcg->cfq_weight,
+                                       tppg->nr_tppq,
+                                       tppg);
+               }
+       }
+}
+
+static void tpps_insert_request(struct request_queue *q, struct request *rq)
+{
+       struct tpps_data *tppd = q->elevator->elevator_data;
+       struct tpps_queue *tppq = RQ_TPPQ(rq);
+
+       tpps_log_tppq(tppd, tppq, "insert_request");
+
+       list_add_tail(&rq->queuelist, &tppq->sort_list);
+       tppq->rq_queued++;
+       tppq->tppg->rq_queued++;
+       tppd->dispatched++;
+       tpps_add_queue(tppd, tppq);
+       tppg_stats_update_io_add(RQ_TPPG(rq), tppq->tppg, rq->cmd_flags);
+}
+
+static void tpps_remove_request(struct request *rq)
+{
+       struct tpps_queue *tppq = RQ_TPPQ(rq);
+
+       list_del_init(&rq->queuelist);
+       tppq->rq_queued--;
+       tppq->tppg->rq_queued--;
+       tppg_stats_update_io_remove(RQ_TPPG(rq), rq->cmd_flags);
+}
+
+/*
+ * Move request from internal lists to the request queue dispatch list.
+ */
+static int tpps_dispatch_insert(struct request_queue *q,
+                               struct tpps_queue *tppq)
+{
+       struct list_head *rbnext = tppq->sort_list.next;
+       struct request *rq;
+
+       if (rbnext == &tppq->sort_list)
+               return 0;
+
+       rq = rq_entry_fifo(rbnext);
+       tpps_remove_request(rq);
+       elv_dispatch_sort(q, rq);
+       tppg_stats_update_dispatch(tppq->tppg, blk_rq_bytes(rq), rq->cmd_flags);
+       return 1;
+}
+
+static int tpps_dispatch_requests_nr(struct tpps_data *tppd,
+                               struct tpps_queue *tppq, int count)
+{
+       int cnt = 0, ret;
+
+       if (!tppq->rq_queued)
+               return cnt;
+
+       do {
+               ret = tpps_dispatch_insert(tppd->queue, tppq);
+               if (ret) {
+                       cnt++;
+                       tppd->dispatched--;
+               }
+       } while (ret && cnt < count);
+
+       return cnt;
+}
+
+static int tpps_dispatch_requests(struct request_queue *q, int force)
+{
+       struct tpps_data *tppd = q->elevator->elevator_data;
+       struct tpps_group *tppg, *group_n;
+       struct tpps_queue *tppq;
+       struct list_head *next;
+       int count = 0, total = 0, ret;
+       int quota, grp_quota;
+
+       if (!tppd->total_weight)
+               return 0;
+
+       quota = q->nr_requests - tppd->rq_in_driver;
+       if (quota < MIN_DISPATCH_RQ && !force)
+               return 0;
+
+       list_for_each_entry_safe(tppg, group_n, &tppd->group_list, tppd_node) {
+               if (!tppg->nr_tppq)
+                       continue;
+               grp_quota = (quota * tppg->pd.blkg->blkcg->cfq_weight
+                                       / tppd->total_weight) - tppg->rq_in_driver;
+               tpps_log_tppg(tppd, tppg,
+                       "nr:%d, wt:%u total_wt:%u in_driver:%d %d quota:%d grp_quota:%d",
+                       tppg->nr_tppq, tppg->pd.blkg->blkcg->cfq_weight,
+                       tppd->total_weight, tppg->rq_in_driver, tppg->rq_queued,
+                       quota, grp_quota);
+               if (grp_quota <= 0 && !force)
+                       continue;
+               BUG_ON(tppg->queue_list.next == &tppg->queue_list);
+               if (!tppg->cur_dispatcher)
+                       tppg->cur_dispatcher = tppg->queue_list.next;
+               next = tppg->cur_dispatcher;
+               count = 0;
+               do {
+                       tppq = list_entry(next, struct tpps_queue, tppg_node);
+                       tpps_log_tppq(tppd, tppq, "tppq: %d\n", tppq->rq_queued);
+                       if (force)
+                               ret = tpps_dispatch_requests_nr(tppd, tppq, -1);
+                       else
+                               ret = tpps_dispatch_requests_nr(tppd, tppq, 1);
+                       count += ret;
+                       total += ret;
+                       next = next->next;
+                       if (next == &tppg->queue_list)
+                               next = tppg->queue_list.next;
+                       if (count >= grp_quota && !force) {
+                               tppg->cur_dispatcher = next;
+                               break;
+                       }
+                       BUG_ON(tppg->cur_dispatcher == &tppg->queue_list);
+               } while (next != tppg->cur_dispatcher);
+       }
+       return total > 0;
+}
+
+static void tpps_kick_queue(struct work_struct *work)
+{
+       struct tpps_data *tppd =
+               container_of(work, struct tpps_data, unplug_work);
+       struct request_queue *q = tppd->queue;
+
+       spin_lock_irq(q->queue_lock);
+       __blk_run_queue(q);
+       spin_unlock_irq(q->queue_lock);
+}
+
+static void tpps_init_tppg_base(struct tpps_group *tppg)
+{
+       INIT_LIST_HEAD(&tppg->tppd_node);
+       INIT_LIST_HEAD(&tppg->queue_list);
+       tppg->cur_dispatcher = NULL;
+
+}
+
+static int tpps_init_queue(struct request_queue *q)
+{
+       struct tpps_data *tppd;
+       struct tpps_group *tppg;
+       int ret;
+
+       tppd = kmalloc_node(sizeof(*tppd), GFP_KERNEL | __GFP_ZERO, q->node);
+       if (!tppd)
+               return -ENOMEM;
+
+       tppd->queue = q;
+       q->elevator->elevator_data = tppd;
+
+       INIT_LIST_HEAD(&tppd->group_list);
+
+       ret = blkcg_activate_policy(q, &blkcg_policy_tpps);
+       if (ret)
+               goto out_free;
+
+       /* Init root group */
+       tppd->root_group = blkg_to_tppg(q->root_blkg);
+       tppg = tppd->root_group;
+       tpps_init_tppg_base(tppg);
+
+       /* Give preference to root group over other groups */
+       tppg->weight = 2 * TPPS_WEIGHT_DEFAULT;
+       tppg->leaf_weight = 2 * TPPS_WEIGHT_DEFAULT;
+
+       INIT_WORK(&tppd->unplug_work, tpps_kick_queue);
+
+       return 0;
+
+out_free:
+       kfree(tppd);
+       return ret;
+}
+
+static void tpps_exit_queue(struct elevator_queue *e)
+{
+       struct tpps_data *tppd = e->elevator_data;
+       struct request_queue *q = tppd->queue;
+
+       cancel_work_sync(&tppd->unplug_work);
+
+       blkcg_deactivate_policy(q, &blkcg_policy_tpps);
+       kfree(tppd->root_group);
+       kfree(tppd);
+}
+
+static void tpps_activate_request(struct request_queue *q, struct request *rq)
+{
+       struct tpps_queue *tppq = RQ_TPPQ(rq);
+       struct tpps_data *tppd = q->elevator->elevator_data;
+       tppd->rq_in_driver++;
+       tppq->tppg->rq_in_driver++;
+       tpps_log_tppq(tppd, RQ_TPPQ(rq), "activate rq, drv=%d",
+                                               tppd->rq_in_driver);
+}
+
+static void tpps_deactivate_request(struct request_queue *q, struct request *rq)
+{
+       struct tpps_queue *tppq = RQ_TPPQ(rq);
+       struct tpps_data *tppd = q->elevator->elevator_data;
+
+       WARN_ON(!tppd->rq_in_driver);
+       tppd->rq_in_driver--;
+       tppq->tppg->rq_in_driver--;
+       tpps_log_tppq(tppd, RQ_TPPQ(rq), "deactivate rq, drv=%d",
+                                               tppd->rq_in_driver);
+}
+
+static void tpps_completed_request(struct request_queue *q, struct request *rq)
+{
+       struct tpps_queue *tppq = RQ_TPPQ(rq);
+       struct tpps_data *tppd = tppq->tppd;
+
+       WARN_ON(!tppq);
+       WARN_ON(tppq->tppg != RQ_TPPG(rq));
+
+       tpps_log_tppq(tppd, tppq, "complete rqnoidle %d",
+                       !!(rq->cmd_flags & REQ_NOIDLE));
+       WARN_ON(!tppd->rq_in_driver);
+       tppd->rq_in_driver--;
+       tppq->tppg->rq_in_driver--;
+       tppg_stats_update_completion(tppq->tppg,
+                       rq_start_time_ns(rq), rq_io_start_time_ns(rq), rq->cmd_flags);
+
+       if (!tppd->rq_in_driver)
+               tpps_schedule_dispatch(tppd);
+}
+
+static void
+tpps_merged_request(struct request_queue *q, struct request *rq, int type)
+{
+       if (type == ELEVATOR_FRONT_MERGE) {
+               struct tpps_queue *tppq = RQ_TPPQ(rq);
+               list_del_init(&rq->queuelist);
+               tppq->rq_queued--;
+               tppg_stats_update_io_remove(RQ_TPPG(rq), rq->cmd_flags);
+               list_add_tail(&rq->queuelist, &tppq->sort_list);
+               tppq->rq_queued++;
+               tppg_stats_update_io_add(RQ_TPPG(rq), tppq->tppg, rq->cmd_flags);
+       }
+}
+
+static void
+tpps_merged_requests(struct request_queue *q, struct request *rq,
+                       struct request *next)
+{
+       tpps_remove_request(next);
+       tppg_stats_update_io_merged(RQ_TPPG(rq), rq->cmd_flags);
+}
+
+static void tpps_init_icq(struct io_cq *icq)
+{ }
+
+static void tpps_exit_icq(struct io_cq *icq)
+{
+       struct tpps_io_cq *tic = icq_to_tic(icq);
+
+       if (tic->tppq) {
+               tpps_put_queue(tic->tppq);
+               tic->tppq = NULL;
+       }
+}
+
+static struct elevator_type iosched_tpps = {
+       .ops = {
+               .elevator_merged_fn =           tpps_merged_request,
+               .elevator_merge_req_fn =        tpps_merged_requests,
+               .elevator_dispatch_fn =         tpps_dispatch_requests,
+               .elevator_add_req_fn =          tpps_insert_request,
+               .elevator_activate_req_fn =     tpps_activate_request,
+               .elevator_deactivate_req_fn =   tpps_deactivate_request,
+               .elevator_completed_req_fn =    tpps_completed_request,
+               .elevator_init_icq_fn =         tpps_init_icq,
+               .elevator_exit_icq_fn =         tpps_exit_icq,
+               .elevator_set_req_fn =          tpps_set_request,
+               .elevator_put_req_fn =          tpps_put_request,
+               .elevator_init_fn =             tpps_init_queue,
+               .elevator_exit_fn =             tpps_exit_queue,
+       },
+       .icq_size               = sizeof(struct tpps_io_cq),
+       .icq_align              = __alignof__(struct tpps_io_cq),
+       .elevator_name  =       "tpps",
+       .elevator_owner =       THIS_MODULE,
+};
+
+static u64 tppg_prfill_weight_device(struct seq_file *sf,
+                                    struct blkg_policy_data *pd, int off)
+{
+       struct tpps_group *tppg = pd_to_tppg(pd);
+
+       if (!tppg->dev_weight)
+               return 0;
+       return __blkg_prfill_u64(sf, pd, tppg->dev_weight);
+}
+
+static int tppg_print_weight_device(struct cgroup *cgrp, struct cftype *cft,
+                                   struct seq_file *sf)
+{
+       blkcg_print_blkgs(sf, cgroup_to_blkcg(cgrp),
+                         tppg_prfill_weight_device, &blkcg_policy_tpps, 0,
+                         false);
+       return 0;
+}
+
+static u64 tppg_prfill_leaf_weight_device(struct seq_file *sf,
+                                         struct blkg_policy_data *pd, int off)
+{
+       struct tpps_group *tppg = pd_to_tppg(pd);
+
+       if (!tppg->dev_leaf_weight)
+               return 0;
+       return __blkg_prfill_u64(sf, pd, tppg->dev_leaf_weight);
+}
+
+static int tppg_print_leaf_weight_device(struct cgroup *cgrp,
+                                        struct cftype *cft,
+                                        struct seq_file *sf)
+{
+       blkcg_print_blkgs(sf, cgroup_to_blkcg(cgrp),
+                         tppg_prfill_leaf_weight_device, &blkcg_policy_tpps, 0,
+                         false);
+       return 0;
+}
+
+static int tppg_print_weight(struct cgroup *cgrp, struct cftype *cft,
+                           struct seq_file *sf)
+{
+       seq_printf(sf, "%u\n", cgroup_to_blkcg(cgrp)->cfq_weight);
+       return 0;
+}
+
+static int tppg_print_leaf_weight(struct cgroup *cgrp, struct cftype *cft,
+                                struct seq_file *sf)
+{
+       seq_printf(sf, "%u\n",
+                  cgroup_to_blkcg(cgrp)->cfq_leaf_weight);
+       return 0;
+}
+
+static int __tppg_set_weight_device(struct cgroup *cgrp, struct cftype *cft,
+                                   const char *buf, bool is_leaf_weight)
+{
+       struct blkcg *blkcg = cgroup_to_blkcg(cgrp);
+       struct blkg_conf_ctx ctx;
+       struct tpps_group *tppg;
+       int ret;
+
+       ret = blkg_conf_prep(blkcg, &blkcg_policy_tpps, buf, &ctx);
+       if (ret)
+               return ret;
+
+       ret = -EINVAL;
+       tppg = blkg_to_tppg(ctx.blkg);
+       if (!ctx.v || (ctx.v >= CFQ_WEIGHT_MIN && ctx.v <= CFQ_WEIGHT_MAX)) {
+               if (!is_leaf_weight) {
+                       tppg->dev_weight = ctx.v;
+                       tppg->new_weight = ctx.v ?: blkcg->cfq_weight;
+               } else {
+                       tppg->dev_leaf_weight = ctx.v;
+                       tppg->new_leaf_weight = ctx.v ?: blkcg->cfq_leaf_weight;
+               }
+               ret = 0;
+       }
+
+       blkg_conf_finish(&ctx);
+       return ret;
+}
+
+static int tppg_set_weight_device(struct cgroup *cgrp, struct cftype *cft,
+                                 const char *buf)
+{
+       return __tppg_set_weight_device(cgrp, cft, buf, false);
+}
+
+static int tppg_set_leaf_weight_device(struct cgroup *cgrp, struct cftype *cft,
+                                      const char *buf)
+{
+       return __tppg_set_weight_device(cgrp, cft, buf, true);
+}
+
+static int __tpps_set_weight(struct cgroup *cgrp, struct cftype *cft, u64 val,
+                           bool is_leaf_weight)
+{
+       struct blkcg *blkcg = cgroup_to_blkcg(cgrp);
+       struct blkcg_gq *blkg;
+
+       if (val < CFQ_WEIGHT_MIN || val > CFQ_WEIGHT_MAX)
+               return -EINVAL;
+
+       spin_lock_irq(&blkcg->lock);
+
+       if (!is_leaf_weight)
+               blkcg->cfq_weight = val;
+       else
+               blkcg->cfq_leaf_weight = val;
+
+       hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) {
+               struct tpps_group *tppg = blkg_to_tppg(blkg);
+
+               if (!tppg)
+                       continue;
+
+               if (!is_leaf_weight) {
+                       if (!tppg->dev_weight)
+                               tppg->new_weight = blkcg->cfq_weight;
+               } else {
+                       if (!tppg->dev_leaf_weight)
+                               tppg->new_leaf_weight = blkcg->cfq_leaf_weight;
+               }
+       }
+
+       spin_unlock_irq(&blkcg->lock);
+       return 0;
+}
+
+static int tpps_set_weight(struct cgroup *cgrp, struct cftype *cft, u64 val)
+{
+       return __tpps_set_weight(cgrp, cft, val, false);
+}
+
+static int tpps_set_leaf_weight(struct cgroup *cgrp, struct cftype *cft, u64 val)
+{
+       return __tpps_set_weight(cgrp, cft, val, true);
+}
+
+/* offset delta from tppg->stats to tppg->dead_stats */
+static const int dead_stats_off_delta = offsetof(struct tpps_group, dead_stats) -
+                                       offsetof(struct tpps_group, stats);
+
+/* to be used by recursive prfill, sums live and dead rwstats recursively */
+static struct blkg_rwstat tppg_rwstat_pd_recursive_sum(struct blkg_policy_data *pd,
+                                                      int off)
+{
+       struct blkg_rwstat a, b;
+
+       a = blkg_rwstat_recursive_sum(pd, off);
+       b = blkg_rwstat_recursive_sum(pd, off + dead_stats_off_delta);
+       blkg_rwstat_merge(&a, &b);
+       return a;
+}
+
+/* to be used by recursive prfill, sums live and dead stats recursively */
+static u64 tppg_stat_pd_recursive_sum(struct blkg_policy_data *pd, int off)
+{
+       u64 sum = 0;
+
+       sum += blkg_stat_recursive_sum(pd, off);
+       sum += blkg_stat_recursive_sum(pd, off + dead_stats_off_delta);
+       return sum;
+}
+
+static int tppg_print_stat(struct cgroup *cgrp, struct cftype *cft,
+                          struct seq_file *sf)
+{
+       struct blkcg *blkcg = cgroup_to_blkcg(cgrp);
+
+       blkcg_print_blkgs(sf, blkcg, blkg_prfill_stat, &blkcg_policy_tpps,
+                         cft->private, false);
+       return 0;
+}
+
+static int tppg_print_rwstat(struct cgroup *cgrp, struct cftype *cft,
+                            struct seq_file *sf)
+{
+       struct blkcg *blkcg = cgroup_to_blkcg(cgrp);
+
+       blkcg_print_blkgs(sf, blkcg, blkg_prfill_rwstat, &blkcg_policy_tpps,
+                         cft->private, true);
+       return 0;
+}
+
+static u64 tppg_prfill_stat_recursive(struct seq_file *sf,
+                                     struct blkg_policy_data *pd, int off)
+{
+       u64 sum = tppg_stat_pd_recursive_sum(pd, off);
+
+       return __blkg_prfill_u64(sf, pd, sum);
+}
+
+static u64 tppg_prfill_rwstat_recursive(struct seq_file *sf,
+                                       struct blkg_policy_data *pd, int off)
+{
+       struct blkg_rwstat sum = tppg_rwstat_pd_recursive_sum(pd, off);
+
+       return __blkg_prfill_rwstat(sf, pd, &sum);
+}
+
+static int tppg_print_stat_recursive(struct cgroup *cgrp, struct cftype *cft,
+                                    struct seq_file *sf)
+{
+       struct blkcg *blkcg = cgroup_to_blkcg(cgrp);
+
+       blkcg_print_blkgs(sf, blkcg, tppg_prfill_stat_recursive,
+                         &blkcg_policy_tpps, cft->private, false);
+       return 0;
+}
+
+static int tppg_print_rwstat_recursive(struct cgroup *cgrp, struct cftype *cft,
+                                      struct seq_file *sf)
+{
+       struct blkcg *blkcg = cgroup_to_blkcg(cgrp);
+
+       blkcg_print_blkgs(sf, blkcg, tppg_prfill_rwstat_recursive,
+                         &blkcg_policy_tpps, cft->private, true);
+       return 0;
+}
+
+static struct cftype tpps_blkcg_files[] = {
+       /* on root, weight is mapped to leaf_weight */
+       {
+               .name = "tpps.weight_device",
+               .flags = CFTYPE_ONLY_ON_ROOT,
+               .read_seq_string = tppg_print_leaf_weight_device,
+               .write_string = tppg_set_leaf_weight_device,
+               .max_write_len = 256,
+       },
+       {
+               .name = "tpps.weight",
+               .flags = CFTYPE_ONLY_ON_ROOT,
+               .read_seq_string = tppg_print_leaf_weight,
+               .write_u64 = tpps_set_leaf_weight,
+       },
+
+       /* no such mapping necessary for !roots */
+       {
+               .name = "tpps.weight_device",
+               .flags = CFTYPE_NOT_ON_ROOT,
+               .read_seq_string = tppg_print_weight_device,
+               .write_string = tppg_set_weight_device,
+               .max_write_len = 256,
+       },
+       {
+               .name = "tpps.weight",
+               .flags = CFTYPE_NOT_ON_ROOT,
+               .read_seq_string = tppg_print_weight,
+               .write_u64 = tpps_set_weight,
+       },
+
+       {
+               .name = "tpps.leaf_weight_device",
+               .read_seq_string = tppg_print_leaf_weight_device,
+               .write_string = tppg_set_leaf_weight_device,
+               .max_write_len = 256,
+       },
+       {
+               .name = "tpps.leaf_weight",
+               .read_seq_string = tppg_print_leaf_weight,
+               .write_u64 = tpps_set_leaf_weight,
+       },
+
+       /* statistics, covers only the tasks in the tppg */
+       {
+               .name = "tpps.time",
+               .private = offsetof(struct tpps_group, stats.time),
+               .read_seq_string = tppg_print_stat,
+       },
+       {
+               .name = "tpps.sectors",
+               .private = offsetof(struct tpps_group, stats.sectors),
+               .read_seq_string = tppg_print_stat,
+       },
+       {
+               .name = "tpps.io_service_bytes",
+               .private = offsetof(struct tpps_group, stats.service_bytes),
+               .read_seq_string = tppg_print_rwstat,
+       },
+       {
+               .name = "tpps.io_serviced",
+               .private = offsetof(struct tpps_group, stats.serviced),
+               .read_seq_string = tppg_print_rwstat,
+       },
+       {
+               .name = "tpps.io_service_time",
+               .private = offsetof(struct tpps_group, stats.service_time),
+               .read_seq_string = tppg_print_rwstat,
+       },
+       {
+               .name = "tpps.io_wait_time",
+               .private = offsetof(struct tpps_group, stats.wait_time),
+               .read_seq_string = tppg_print_rwstat,
+       },
+       {
+               .name = "tpps.io_merged",
+               .private = offsetof(struct tpps_group, stats.merged),
+               .read_seq_string = tppg_print_rwstat,
+       },
+       {
+               .name = "tpps.io_queued",
+               .privat...
[RFC v1] add new io-scheduler to use cgroup on high-speed device Robin Dong 6/4/13 7:30 PM
From: Robin Dong <san...@taobao.com>
+                .private = offsetof(struct tpps_group, stats.queued),
+                .read_seq_string = tppg_print_rwstat,
+        },
+
+        /* the same statictics which cover the tppg and its descendants */
+        {
+                .name = "tpps.time_recursive",
+                .private = offsetof(struct tpps_group, stats.time),
+                .read_seq_string = tppg_print_stat_recursive,
+        },
+        {
+                .name = "tpps.sectors_recursive",
+                .private = offsetof(struct tpps_group, stats.sectors),
+                .read_seq_string = tppg_print_stat_recursive,
+        },
+        {
+                .name = "tpps.io_service_bytes_recursive",
+                .private = offsetof(struct tpps_group, stats.service_bytes),
+                .read_seq_string = tppg_print_rwstat_recursive,
+        },
+        {
+                .name = "tpps.io_serviced_recursive",
+                .private = offsetof(struct tpps_group, stats.serviced),
+                .read_seq_string = tppg_print_rwstat_recursive,
+        },
+        {
+                .name = "tpps.io_service_time_recursive",
+                .private = offsetof(struct tpps_group, stats.service_time),
+                .read_seq_string = tppg_print_rwstat_recursive,
+        },
+        {
+                .name = "tpps.io_wait_time_recursive",
+                .private = offsetof(struct tpps_group, stats.wait_time),
+                .read_seq_string = tppg_print_rwstat_recursive,
+        },
+        {
+                .name = "tpps.io_merged_recursive",
+                .private = offsetof(struct tpps_group, stats.merged),
+                .read_seq_string = tppg_print_rwstat_recursive,
+        },
+        {
+                .name = "tpps.io_queued_recursive",
+                .private = offsetof(struct tpps_group, stats.queued),
+                .read_seq_string = tppg_print_rwstat_recursive,
+        },
+        { }        /* terminate */
+};
+
+static void tpps_pd_init(struct blkcg_gq *blkg)
+{
+        struct tpps_group *tppg = blkg_to_tppg(blkg);
+
+        tpps_init_tppg_base(tppg);
+        tppg->weight = blkg->blkcg->cfq_weight;
+        tppg->leaf_weight = blkg->blkcg->cfq_leaf_weight;
+}
+
+static inline struct tpps_group *tppg_parent(struct tpps_group *tppg)
+{
+        struct blkcg_gq *pblkg = tppg_to_blkg(tppg)->parent;
+
+        return pblkg ? blkg_to_tppg(pblkg) : NULL;
+}
+
+static void tppg_stats_reset(struct tppg_stats *stats)
+{
+        /* queued stats shouldn't be cleared */
+        blkg_rwstat_reset(&stats->service_bytes);
+        blkg_rwstat_reset(&stats->serviced);
+        blkg_rwstat_reset(&stats->merged);
+        blkg_rwstat_reset(&stats->service_time);
+        blkg_rwstat_reset(&stats->wait_time);
+        blkg_stat_reset(&stats->time);
+#ifdef CONFIG_DEBUG_BLK_CGROUP
+        blkg_stat_reset(&stats->unaccounted_time);
+        blkg_stat_reset(&stats->avg_queue_size_sum);
+        blkg_stat_reset(&stats->avg_queue_size_samples);
+        blkg_stat_reset(&stats->dequeue);
+        blkg_stat_reset(&stats->group_wait_time);
+        blkg_stat_reset(&stats->idle_time);
+        blkg_stat_reset(&stats->empty_time);
+#endif
+}
+
+/* @to += @from */
+static void tppg_stats_merge(struct tppg_stats *to, struct tppg_stats *from)
+{
+        /* queued stats shouldn't be cleared */
+        blkg_rwstat_merge(&to->service_bytes, &from->service_bytes);
+        blkg_rwstat_merge(&to->serviced, &from->serviced);
+        blkg_rwstat_merge(&to->merged, &from->merged);
+        blkg_rwstat_merge(&to->service_time, &from->service_time);
+        blkg_rwstat_merge(&to->wait_time, &from->wait_time);
+        blkg_stat_merge(&from->time, &from->time);
+#ifdef CONFIG_DEBUG_BLK_CGROUP
+        blkg_stat_merge(&to->unaccounted_time, &from->unaccounted_time);
+        blkg_stat_merge(&to->avg_queue_size_sum, &from->avg_queue_size_sum);
+        blkg_stat_merge(&to->avg_queue_size_samples, &from->avg_queue_size_samples);
+        blkg_stat_merge(&to->dequeue, &from->dequeue);
+        blkg_stat_merge(&to->group_wait_time, &from->group_wait_time);
+        blkg_stat_merge(&to->idle_time, &from->idle_time);
+        blkg_stat_merge(&to->empty_time, &from->empty_time);
+#endif
+}
+
+static void tppg_stats_xfer_dead(struct tpps_group *tppg)
+{
+        struct tpps_group *parent = tppg_parent(tppg);
+
+        lockdep_assert_held(tppg_to_blkg(tppg)->q->queue_lock);
+
+        if (unlikely(!parent))
+                return;
+
+        tppg_stats_merge(&parent->dead_stats, &tppg->stats);
+        tppg_stats_merge(&parent->dead_stats, &tppg->dead_stats);
+        tppg_stats_reset(&tppg->stats);
+        tppg_stats_reset(&tppg->dead_stats);
+}
+
+static void tpps_pd_offline(struct blkcg_gq *blkg)
+{
+        struct tpps_group *tppg = blkg_to_tppg(blkg);
+        /*
+         * @blkg is going offline and will be ignored by
+         * blkg_[rw]stat_recursive_sum().  Transfer stats to the parent so
+         * that they don't get lost.  If IOs complete after this point, the
+         * stats for them will be lost.  Oh well...
+         */
+        tppg_stats_xfer_dead(tppg);
+
+        if (!list_empty(&tppg->tppd_node))
+                list_del_init(&tppg->tppd_node);
+
+        //BUG_ON(!list_empty(&(tppg->queue_list)));
+}
+
+static void tpps_pd_reset_stats(struct blkcg_gq *blkg)
+{
+        struct tpps_group *tppg = blkg_to_tppg(blkg);
+
+        tppg_stats_reset(&tppg->stats);
+        tppg_stats_reset(&tppg->dead_stats);
+}
+
+static struct blkcg_policy blkcg_policy_tpps = {
+        .pd_size                        = sizeof(struct tpps_group),
+        .cftypes                        = tpps_blkcg_files,
+        .pd_init_fn                        = tpps_pd_init,
+        .pd_offline_fn                = tpps_pd_offline,
+        .pd_reset_stats_fn        = tpps_pd_reset_stats,
+};
+
+static int __init tpps_init(void)
+{
+        int ret;
+
+        ret = blkcg_policy_register(&blkcg_policy_tpps);
+        if (ret)
+                return ret;
+
+        ret = -ENOMEM;
+        tpps_pool = KMEM_CACHE(tpps_queue, 0);
+        if (!tpps_pool)
+                goto err_pol_unreg;
+
+        ret = elv_register(&iosched_tpps);
+        if (ret)
+                goto err_free_pool;
+
+        return 0;
+
+err_free_pool:
+        kmem_cache_destroy(tpps_pool);
+err_pol_unreg:
+        blkcg_policy_unregister(&blkcg_policy_tpps);
+        return ret;
+}
+
+static void __exit tpps_exit(void)
+{
+        blkcg_policy_unregister(&blkcg_policy_tpps);
+        elv_unregister(&iosched_tpps);
+        kmem_cache_destroy(tpps_pool);
+}
+
+module_init(tpps_init);
+module_exit(tpps_exit);
+
+MODULE_AUTHOR("Robin Dong");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Tiny Parallel Proportion io Scheduler");
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 2fdb4a4..489257a 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -42,7 +42,7 @@ struct blkcg_gq;
  * Maximum number of blkcg policies allowed to be registered concurrently.
  * Defined here to simplify include dependency.
  */
-#define BLKCG_MAX_POLS                2
+#define BLKCG_MAX_POLS                3
 
 struct request;
 typedef void (rq_end_io_fn)(struct request *, int);
--
1.7.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majo...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/
Re: [RFC v1] add new io-scheduler to use cgroup on high-speed device Tejun Heo 6/4/13 8:10 PM
(cc'ing Kent.  Original posting at
 http://thread.gmane.org/gmane.linux.kernel/1502484 )

Hello,

On Wed, Jun 05, 2013 at 10:09:31AM +0800, Robin Dong wrote:
> We want to use blkio.cgroup on high-speed device (like fusionio) for our mysql clusters.
> After testing different io-scheduler, we found that  cfq is too slow and deadline can't run on cgroup.
> So we developed a new io-scheduler: tpps (Tiny Parallel Proportion Scheduler).It dispatch requests
> only by using their individual weight and total weight (proportion) therefore it's simply and efficient.
>
> Test case: fusionio card, 4 cgroups, iodepth-512

So, while I understand the intention behind it, I'm not sure a
separate io-sched for this is what we want.  Kent and Jens have been
thinking about this lately so they'll probably chime in.  From my POV,
I see a few largish issues.

* It has to be scalable with relatively large scale SMP / NUMA
  configurations.  It better integrate with blk-mq support currently
  being brewed.

* It definitely has to support hierarchy.  Nothing which doesn't
  support full hierarchy can be added to cgroup at this point.

* We already have separate implementations in blk-throtl and
  cfq-iosched.  Maybe it's too late and too different for cfq-iosched
  given that it's primarily targeted at disks, but I wonder whether we
  can make blk-throtl generic and scalable enough to cover all other
  use cases.

Thanks.

--
tejun
Re: [RFC v1] add new io-scheduler to use cgroup on high-speed device sanbai 6/4/13 8:40 PM
On 2013年06月05日 11:03, Tejun Heo wrote:
> (cc'ing Kent.  Original posting at
>   http://thread.gmane.org/gmane.linux.kernel/1502484 )
>
> Hello,
>
> On Wed, Jun 05, 2013 at 10:09:31AM +0800, Robin Dong wrote:
>> We want to use blkio.cgroup on high-speed device (like fusionio) for our mysql clusters.
>> After testing different io-scheduler, we found that  cfq is too slow and deadline can't run on cgroup.
>> So we developed a new io-scheduler: tpps (Tiny Parallel Proportion Scheduler).It dispatch requests
>> only by using their individual weight and total weight (proportion) therefore it's simply and efficient.
>>
>> Test case: fusionio card, 4 cgroups, iodepth-512
> So, while I understand the intention behind it, I'm not sure a
> separate io-sched for this is what we want.  Kent and Jens have been
> thinking about this lately so they'll probably chime in.  From my POV,
> I see a few largish issues.
>
> * It has to be scalable with relatively large scale SMP / NUMA
>    configurations.  It better integrate with blk-mq support currently
>    being brewed.
Ok, I will go on to look Jens's blk-mq branch.

>
> * It definitely has to support hierarchy.  Nothing which doesn't
>    support full hierarchy can be added to cgroup at this point.
Thanks for your note, hierarchy supporting will be added in my next
version patch.
>
> * We already have separate implementations in blk-throtl and
>    cfq-iosched.  Maybe it's too late and too different for cfq-iosched
>    given that it's primarily targeted at disks, but I wonder whether we
>    can make blk-throtl generic and scalable enough to cover all other
>    use cases.
I have the same feeling. Let's wait other person's response.
>
> Thanks.
>


--

Robin Dong
董昊(花名:三百)
阿里巴巴 集团 核心系统部 内核组
分机:72370
手机:13520865473
email:san...@taobao.com
Re: [RFC v1] add new io-scheduler to use cgroup on high-speed device Vivek Goyal 6/5/13 6:40 AM
On Wed, Jun 05, 2013 at 10:09:31AM +0800, Robin Dong wrote:
> We want to use blkio.cgroup on high-speed device (like fusionio) for our mysql clusters.
> After testing different io-scheduler, we found that  cfq is too slow and deadline can't run on cgroup.

So why not enhance deadline to be able to be used with cgroups instead of
coming up with a new scheduler?

> So we developed a new io-scheduler: tpps (Tiny Parallel Proportion Scheduler).It dispatch requests
> only by using their individual weight and total weight (proportion) therefore it's simply and efficient.

Can you give more details. Do you idle? Idling kills performance. If not,
then without idling how do you achieve performance differentiation.

>
> Test case: fusionio card, 4 cgroups, iodepth-512
>
> groupname  weight
> test1      1000
> test2      800
> test3      600
> test4      400
>

What's the workload used for this?

> Use tpps, the result is:
>
> groupname  iops    avg-rt(ms)   max-rt(ms)
> test1      30220   16           54
> test2      28261   18           56
> test3      26333   19           69
> test4      20152   25           87
>
> Use cfq, the result is:
>
> groupname  iops    avg-rt(ms)   max-rt(ms)
> test1      16478   30           242
> test2      13015   39           347
> test3       9300   54           371
> test4       5806   87           393

How do results look like with cfq if this is run with slice_idle=0 and
quatum=128 or higher.

cfqq idles on 3 things. queue (cfqq), service tree and cfq group.
slice_idle will disable idling on cfqq but not no service tree. If
we provide a knob for that, then idling on service tree can be disabled
too and then we will be left with group idling only and then it should
be much better.

Thanks
Vivek
Re: [RFC v1] add new io-scheduler to use cgroup on high-speed device Vivek Goyal 6/5/13 7:00 AM
On Tue, Jun 04, 2013 at 08:03:37PM -0700, Tejun Heo wrote:
> (cc'ing Kent.  Original posting at
>  http://thread.gmane.org/gmane.linux.kernel/1502484 )
>
> Hello,
>
> On Wed, Jun 05, 2013 at 10:09:31AM +0800, Robin Dong wrote:
> > We want to use blkio.cgroup on high-speed device (like fusionio) for our mysql clusters.
> > After testing different io-scheduler, we found that  cfq is too slow and deadline can't run on cgroup.
> > So we developed a new io-scheduler: tpps (Tiny Parallel Proportion Scheduler).It dispatch requests
> > only by using their individual weight and total weight (proportion) therefore it's simply and efficient.
> >
> > Test case: fusionio card, 4 cgroups, iodepth-512
>
> So, while I understand the intention behind it, I'm not sure a
> separate io-sched for this is what we want.  Kent and Jens have been
> thinking about this lately so they'll probably chime in.  From my POV,
> I see a few largish issues.
>
> * It has to be scalable with relatively large scale SMP / NUMA
>   configurations.  It better integrate with blk-mq support currently
>   being brewed.

Agreed that any new algorithm to do proportional IO should integrate
well will blk-mq support. I have yet to look at that implementation but
my understanding was that current algorithm is per queue and one
queue would not know about other queue.

As you suggested in the past, may be some kind of token based scheme
will work better instead of trying to service differentation based
on time slice.

>
> * It definitely has to support hierarchy.  Nothing which doesn't
>   support full hierarchy can be added to cgroup at this point.
>
> * We already have separate implementations in blk-throtl and
>   cfq-iosched.  Maybe it's too late and too different for cfq-iosched
>   given that it's primarily targeted at disks, but I wonder whether we
>   can make blk-throtl generic and scalable enough to cover all other
>   use cases.

I think it will be hard to cover all the use cases. There is a reason
why CFQ got so complicated and bulky because it tried to cover all the
use cases and provide service differentiation among workloads. blk-cgroup
will try to do the same thing at group level. All these question will
arise when to idle, how much to idle, how much device queue depth we
should drive to keep service differention better, how much outstanding
IO from each group we should allow in the queue.

And all of this affects what kind of service differentation you see
on different devices for different workloads.

I think generic implementation can be written with the goal of trying to
make it work with faster flash devices (which will typically use blk-mq).
And for slower disks, one can continue to use CFQ's cgroup implementation.

On a side note, it would be nice if we handle problem of managing buffered
writes using cgroup first. Otherwise there are very few practical
scenarios where proportional IO thing can be used.

Robin, what's the workload/setup which will benefit from this even without
buffered write support.

Thanks
Vivek
Re: [RFC v1] add new io-scheduler to use cgroup on high-speed device Vivek Goyal 6/5/13 7:50 AM
On Tue, Jun 04, 2013 at 08:03:37PM -0700, Tejun Heo wrote:
> (cc'ing Kent.  Original posting at
>  http://thread.gmane.org/gmane.linux.kernel/1502484 )
>
> Hello,
>
> On Wed, Jun 05, 2013 at 10:09:31AM +0800, Robin Dong wrote:
> > We want to use blkio.cgroup on high-speed device (like fusionio) for our mysql clusters.
> > After testing different io-scheduler, we found that  cfq is too slow and deadline can't run on cgroup.
> > So we developed a new io-scheduler: tpps (Tiny Parallel Proportion Scheduler).It dispatch requests
> > only by using their individual weight and total weight (proportion) therefore it's simply and efficient.
> >
> > Test case: fusionio card, 4 cgroups, iodepth-512
>
> So, while I understand the intention behind it, I'm not sure a
> separate io-sched for this is what we want.  Kent and Jens have been
> thinking about this lately so they'll probably chime in.  From my POV,
> I see a few largish issues.
>
> * It has to be scalable with relatively large scale SMP / NUMA
>   configurations.  It better integrate with blk-mq support currently
>   being brewed.
>
> * It definitely has to support hierarchy.  Nothing which doesn't
>   support full hierarchy can be added to cgroup at this point.
>
> * We already have separate implementations in blk-throtl and
>   cfq-iosched.  Maybe it's too late and too different for cfq-iosched
>   given that it's primarily targeted at disks, but I wonder whether we
>   can make blk-throtl generic and scalable enough to cover all other
>   use cases.

A generic implementation at block layer also has the advantage that we
can use it for any block device which are not using IO scheduler (dm,md)
and we can enforce the algorithm higher up in the stack.

Thanks
Vivek
Re: [RFC v1] add new io-scheduler to use cgroup on high-speed device Tejun Heo 6/5/13 10:40 AM
Hello, Vivek.

On Wed, Jun 05, 2013 at 09:55:12AM -0400, Vivek Goyal wrote:
> I think it will be hard to cover all the use cases. There is a reason
> why CFQ got so complicated and bulky because it tried to cover all the
> use cases and provide service differentiation among workloads. blk-cgroup
> will try to do the same thing at group level. All these question will
> arise when to idle, how much to idle, how much device queue depth we
> should drive to keep service differention better, how much outstanding
> IO from each group we should allow in the queue.

Yes but that's because we were trying to serve disks with rotating
platters.  I don't think we can use the same thing for disks and
non-rotational devices.  For the latter, things get a lot simpler.
Note that even the current blk-throttle doesn't care about the
underlying device at all.  We can do the same thing for proportional
control in a hopefully better and scalable way of course.

> And all of this affects what kind of service differentation you see
> on different devices for different workloads.
>
> I think generic implementation can be written with the goal of trying to
> make it work with faster flash devices (which will typically use blk-mq).
> And for slower disks, one can continue to use CFQ's cgroup implementation.

Yeap.  Pretty much the same feeling here.

> On a side note, it would be nice if we handle problem of managing buffered
> writes using cgroup first. Otherwise there are very few practical
> scenarios where proportional IO thing can be used.

Indeed.  It's currently below unified hierarchy in my todo list but if
you wanna tackle it, I'll be happy to help.

Thanks.

--
tejun
Re: [RFC v1] add new io-scheduler to use cgroup on high-speed device sanbai 6/6/13 8:20 PM
On 2013年06月05日 21:30, Vivek Goyal wrote:
> On Wed, Jun 05, 2013 at 10:09:31AM +0800, Robin Dong wrote:
>> We want to use blkio.cgroup on high-speed device (like fusionio) for our mysql clusters.
>> After testing different io-scheduler, we found that  cfq is too slow and deadline can't run on cgroup.
> So why not enhance deadline to be able to be used with cgroups instead of
> coming up with a new scheduler?
I think if we add cgroups support into deadline, it will not be suitable
to call "deadline" anymore...so a new ioscheduler and a new name may not
confuse users.
>
>> So we developed a new io-scheduler: tpps (Tiny Parallel Proportion Scheduler).It dispatch requests
>> only by using their individual weight and total weight (proportion) therefore it's simply and efficient.
> Can you give more details. Do you idle? Idling kills performance. If not,
> then without idling how do you achieve performance differentiation.
We don't idle, when comes to .elevator_dispatch_fn,we just compute
quota for every group:

quota = nr_requests - rq_in_driver;
group_quota = quota * group_weight / total_weight;

and dispatch 'group_quota' requests for the coordinate group. Therefore
high-weight group
will dispatch more requests than low-weight group.
I do the test again for cfq (slice_idle=0, quatum=128) and tpps

cfq (slice_idle=0, quatum=128)
groupname iops avg-rt(ms) max-rt(ms)
test1 16148 15 188
test2 12756 20 117
test3 9778 26 268
test4 6198 41 209

tpps
groupname iops avg-rt(ms) max-rt(ms)
test1 17292 14 65
test2 15221 16 80
test3 12080 21 66
test4 7995 32 90

Looks cfq with is much better than before.

My fio script is :
[global]
direct=1
ioengine=libaio
#ioengine=psync
runtime=30
bs=4k
rw=randread
iodepth=256

filename=/dev/fioa
numjobs=2
#group_reporting

[read1]
cgroup=test1
cgroup_weight=1000

[read2]
cgroup=test2
cgroup_weight=800

[read3]
cgroup=test3
cgroup_weight=600

[read4]
cgroup=test4
cgroup_weight=400


>
> Thanks
> Vivek


--

Robin Dong
董昊(花名:三百)
阿里巴巴 集团 核心系统部 内核组
分机:72370
手机:13520865473
email:san...@taobao.com

Re: [RFC v1] add new io-scheduler to use cgroup on high-speed device Vivek Goyal 6/7/13 1:00 PM
On Fri, Jun 07, 2013 at 11:09:54AM +0800, sanbai wrote:
> On 2013年06月05日 21:30, Vivek Goyal wrote:
> >On Wed, Jun 05, 2013 at 10:09:31AM +0800, Robin Dong wrote:
> >>We want to use blkio.cgroup on high-speed device (like fusionio) for our mysql clusters.
> >>After testing different io-scheduler, we found that  cfq is too slow and deadline can't run on cgroup.
> >So why not enhance deadline to be able to be used with cgroups instead of
> >coming up with a new scheduler?
> I think if we add cgroups support into deadline, it will not be
> suitable to call "deadline" anymore...so a new ioscheduler and a new
> name may not confuse users.

Nobody got confused when we added cgroup support to CFQ. Not that
I am saying go add support to deadline. I am just saying that need
for cgroup support does not sound like it justfies need of a new
IO scheduler.

[..]
> >Can you give more details. Do you idle? Idling kills performance. If not,
> >then without idling how do you achieve performance differentiation.
> We don't idle, when comes to .elevator_dispatch_fn,we just compute
> quota for every group:
>
> quota = nr_requests - rq_in_driver;
> group_quota = quota * group_weight / total_weight;
>
> and dispatch 'group_quota' requests for the coordinate group.
> Therefore high-weight group
> will dispatch more requests than low-weight group.

Ok, this works only if all the groups are full all the time otherwise
groups will lose their fair share. This simplifies the things a lot.
That is fairness is provided only if group is always backlogged. In
practice, this happens only if a group is doing IO at very high rate
(like your fio scripts). Have you tried running any real life workload
in these cgroups (apache, databases etc) and see how good is service
differentiation.

Anyway, sounds like this can be done at generic block layer like
blk-throtl and it can sit on top so that it can work with all schedulers
and can also work with bio based block drivers.  

[..]
> I do the test again for cfq (slice_idle=0, quatum=128) and tpps
>
> cfq (slice_idle=0, quatum=128)
> groupname iops avg-rt(ms) max-rt(ms)
> test1 16148 15 188
> test2 12756 20 117
> test3 9778 26 268
> test4 6198 41 209
>
> tpps
> groupname iops avg-rt(ms) max-rt(ms)
> test1 17292 14 65
> test2 15221 16 80
> test3 12080 21 66
> test4 7995 32 90
>
> Looks cfq with is much better than before.

Yep, I am sure there are more simple opportunites for optimization
where it can help. Can you try couple more things.

- Drive even deeper queue depth. Set quantum=512.

- set group_idle=0.

  Ideally this should effectively emulate what you are doing. That is try
  to provide fairness without idling on group.

  In practice I could not keep group queue full and before group exhausted
  its slice, it got empty and got deleted from service tree and lost its
  fair share. So if group_idle=0 leads to no service differentiation,
  try slice_sync=10 and see what happens.

Thanks
Vivek
Re: [RFC v1] add new io-scheduler to use cgroup on high-speed device sanbai 6/7/13 9:00 PM
That's a new idea, I will give a try later.
>    
>
> [..]
>> I do the test again for cfq (slice_idle=0, quatum=128) and tpps
>>
>> cfq (slice_idle=0, quatum=128)
>> groupname iops avg-rt(ms) max-rt(ms)
>> test1 16148 15 188
>> test2 12756 20 117
>> test3 9778 26 268
>> test4 6198 41 209
>>
>> tpps
>> groupname iops avg-rt(ms) max-rt(ms)
>> test1 17292 14 65
>> test2 15221 16 80
>> test3 12080 21 66
>> test4 7995 32 90
>>
>> Looks cfq with is much better than before.
> Yep, I am sure there are more simple opportunites for optimization
> where it can help. Can you try couple more things.
>
> - Drive even deeper queue depth. Set quantum=512.
>
> - set group_idle=0.
I changed the iodepth to 512 in fio script and the new result is:

cfq (group_idle=0, quantum=512)
groupname    iops        avg-rt(ms)   max-rt(ms)
test1               15259    33                305
test2               11858    42                345
test3               8885      57                335
test4               5738      89                355

cfq (group_idle=0, quantum=512, slice_sync=10)
groupname    iops        avg-rt(ms)   max-rt(ms)
test1               16507    31                177
test2               12896    39                366
test3               9301      55                188
test4               6023      84                545

tpps
groupname    iops        avg-rt(ms)   max-rt(ms)
test1               16316    31                99
test2               15066    33                106
test3               12182    42                101
test4               8350      61                180

looks cfq works much better now.
>
>    Ideally this should effectively emulate what you are doing. That is try
>    to provide fairness without idling on group.
>
>    In practice I could not keep group queue full and before group exhausted
>    its slice, it got empty and got deleted from service tree and lost its
>    fair share. So if group_idle=0 leads to no service differentiation,
>    try slice_sync=10 and see what happens.
>
> Thanks
> Vivek


--

Robin Dong
董昊(花名:三百)
阿里巴巴 集团 核心系统部 内核组
分机:72370
手机:13520865473
email:san...@taobao.com

Re: [RFC v1] add new io-scheduler to use cgroup on high-speed device sanbai 6/7/13 9:40 PM
But after I changed to 'randrw', the condition is a little different:

cfq (group_idle=0, quantum=512, slice_sync=10,slice_async=10)
groupname    iops(r/w)        avg-rt(ms)   max-rt(ms)
test1               8717/8726    26/31           553/576
test2               6944/6943    34/39           507/514
test3               4974/4961    49/53           725/658
test4               3117/3109    79/84           1107/1094

tpps
groupname    iops(r/w)        avg-rt(ms)    max-rt(ms)
test1               9130/9147    25/30            85/98
test2               7644/7652    30/36            103/118
test3               5727/5733    41/47           132/146
test4               3889/3891    62/68           193/214
>>
>>    Ideally this should effectively emulate what you are doing. That
>> is try
>>    to provide fairness without idling on group.
>>
>>    In practice I could not keep group queue full and before group
>> exhausted
>>    its slice, it got empty and got deleted from service tree and lost
>> its
>>    fair share. So if group_idle=0 leads to no service differentiation,
>>    try slice_sync=10 and see what happens.
>>
>> Thanks
>> Vivek
>
>


--

Robin Dong