Google Groups no longer supports new Usenet posts or subscriptions. Historical content remains viewable.
Dismiss

[PATCH 2/1] io-controller: Add a new interface "policy" for IO Controller

5 views
Skip to first unread message

Gui Jianfeng

unread,
Mar 3, 2010, 1:30:02 AM3/3/10
to
Currently, IO Controller makes use of blkio.weight to assign weight for
all devices. Here a new user interface "blkio.policy" is introduced to
assign different weights for different devices. blkio.weight becomes the
default value for devices which are not configured by blkio.policy.

You can use the following format to assigned specific weight for a given
device:
#echo "major:minor weight" > blkio.policy

major:minor represents device number.

And you can remove a policy as following:
#echo "major:minor 0" > blkio.policy

Signed-off-by: Gui Jianfeng <guiji...@cn.fujitsu.com>
---
block/blk-cgroup.c | 248 +++++++++++++++++++++++++++++++++++++++++++++++++++
block/blk-cgroup.h | 9 ++
block/cfq-iosched.c | 2 +-
3 files changed, 258 insertions(+), 1 deletions(-)

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index e7dbbaf..1fddb98 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -16,6 +16,7 @@
#include <linux/module.h>
#include <linux/err.h>
#include "blk-cgroup.h"
+#include <linux/genhd.h>

static DEFINE_SPINLOCK(blkio_list_lock);
static LIST_HEAD(blkio_list);
@@ -23,6 +24,12 @@ static LIST_HEAD(blkio_list);
struct blkio_cgroup blkio_root_cgroup = { .weight = 2*BLKIO_WEIGHT_DEFAULT };
EXPORT_SYMBOL_GPL(blkio_root_cgroup);

+static inline void policy_insert_node(struct blkio_cgroup *blkcg,
+ struct io_policy_node *pn);
+static inline void policy_delete_node(struct io_policy_node *pn);
+static struct io_policy_node *policy_search_node(const struct blkio_cgroup *blkcg,
+ dev_t dev);
+
bool blkiocg_css_tryget(struct blkio_cgroup *blkcg)
{
if (!css_tryget(&blkcg->css))
@@ -142,6 +149,7 @@ blkiocg_weight_write(struct cgroup *cgroup, struct cftype *cftype, u64 val)
struct blkio_group *blkg;
struct hlist_node *n;
struct blkio_policy_type *blkiop;
+ struct io_policy_node *pn;

if (val < BLKIO_WEIGHT_MIN || val > BLKIO_WEIGHT_MAX)
return -EINVAL;
@@ -150,7 +158,13 @@ blkiocg_weight_write(struct cgroup *cgroup, struct cftype *cftype, u64 val)
spin_lock(&blkio_list_lock);
spin_lock_irq(&blkcg->lock);
blkcg->weight = (unsigned int)val;
+
hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
+ pn = policy_search_node(blkcg, blkg->dev);
+
+ if (pn)
+ continue;
+
list_for_each_entry(blkiop, &blkio_list, list)
blkiop->ops.blkio_update_group_weight_fn(blkg,
blkcg->weight);
@@ -199,8 +213,235 @@ void blkiocg_update_blkio_group_dequeue_stats(struct blkio_group *blkg,
EXPORT_SYMBOL_GPL(blkiocg_update_blkio_group_dequeue_stats);
#endif

+static int check_dev_num(dev_t dev)
+{
+ int part = 0;
+ struct gendisk *disk;
+
+ disk = get_gendisk(dev, &part);
+
+ if (!disk || part)
+ return -ENODEV;
+
+ return 0;
+}
+
+static int policy_parse_and_set(char *buf, struct io_policy_node *newpn)
+{
+ char *s[4], *p, *major_s = NULL, *minor_s = NULL;
+ int ret;
+ unsigned long major, minor, temp;
+ int i = 0;
+ dev_t dev;
+
+ memset(s, 0, sizeof(s));
+
+ while ((p = strsep(&buf, " ")) != NULL) {
+ if (!*p)
+ continue;
+
+ s[i++] = p;
+
+ /* Prevent from inputing too many things */
+ if (i == 3)
+ break;
+ }
+
+ if (i != 2)
+ return -EINVAL;
+
+ p = strsep(&s[0], ":");
+
+ if (p != NULL)
+ major_s = p;
+ else
+ return -EINVAL;
+
+ minor_s = s[0];
+
+ if (!minor_s)
+ return -EINVAL;
+
+ ret = strict_strtoul(major_s, 10, &major);
+ if (ret)
+ return -EINVAL;
+
+ ret = strict_strtoul(minor_s, 10, &minor);
+ if (ret)
+ return -EINVAL;
+
+ dev = MKDEV(major, minor);
+
+ ret = check_dev_num(dev);
+ if (ret)
+ return ret;
+
+ newpn->dev = dev;
+
+ if (s[1] == NULL)
+ return -EINVAL;
+
+ ret = strict_strtoul(s[1], 10, &temp);
+ if (ret || (temp < BLKIO_WEIGHT_MIN && temp > 0) ||
+ temp > BLKIO_WEIGHT_MAX)
+ return -EINVAL;
+
+ newpn->weight = temp;
+
+ return 0;
+}
+
+unsigned int blkcg_get_weight(struct blkio_cgroup *blkcg,
+ dev_t dev)
+{
+ struct io_policy_node *pn;
+
+ pn = policy_search_node(blkcg, dev);
+
+ if (pn)
+ return pn->weight;
+ else
+ return blkcg->weight;
+}
+EXPORT_SYMBOL_GPL(blkcg_get_weight);
+
+static inline void policy_insert_node(struct blkio_cgroup *blkcg,
+ struct io_policy_node *pn)
+{
+ list_add(&pn->node, &blkcg->policy_list);
+}
+
+/* Must be called with blkcg->lock held */
+static inline void policy_delete_node(struct io_policy_node *pn)
+{
+ list_del(&pn->node);
+}
+
+/* Must be called with blkcg->lock held */
+static struct io_policy_node *policy_search_node(const struct blkio_cgroup *blkcg,
+ dev_t dev)
+{
+ struct io_policy_node *pn;
+
+ if (list_empty(&blkcg->policy_list))
+ return NULL;
+
+ list_for_each_entry(pn, &blkcg->policy_list, node) {
+ if (pn->dev == dev)
+ return pn;
+ }
+
+ return NULL;
+}
+
+
+static int blkiocg_policy_write(struct cgroup *cgrp, struct cftype *cft,
+ const char *buffer)
+{
+ int ret = 0;
+ char *buf;
+ struct io_policy_node *newpn, *pn;
+ struct blkio_cgroup *blkcg;
+ struct blkio_group *blkg;
+ int keep_newpn = 0;
+ struct hlist_node *n;
+ struct blkio_policy_type *blkiop;
+
+ buf = kstrdup(buffer, GFP_KERNEL);
+ if (!buf)
+ return -ENOMEM;
+
+ newpn = kzalloc(sizeof(*newpn), GFP_KERNEL);
+ if (!newpn) {
+ ret = -ENOMEM;
+ goto free_buf;
+ }
+
+ ret = policy_parse_and_set(buf, newpn);
+ if (ret)
+ goto free_newpn;
+
+ blkcg = cgroup_to_blkio_cgroup(cgrp);
+
+ spin_lock_irq(&blkcg->lock);
+
+ pn = policy_search_node(blkcg, newpn->dev);
+ if (!pn) {
+ if (newpn->weight != 0) {
+ policy_insert_node(blkcg, newpn);
+ keep_newpn = 1;
+ }
+ spin_unlock_irq(&blkcg->lock);
+ goto update_io_group;
+ }
+
+ if (newpn->weight == 0) {
+ /* weight == 0 means deleteing a policy */
+ policy_delete_node(pn);
+ spin_unlock_irq(&blkcg->lock);
+ goto update_io_group;
+ }
+ spin_unlock_irq(&blkcg->lock);
+
+ pn->weight = newpn->weight;
+
+update_io_group:
+ /* update weight for each cfqg */
+ spin_lock(&blkio_list_lock);
+ spin_lock_irq(&blkcg->lock);
+
+ hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
+ if (newpn->dev == blkg->dev) {
+ list_for_each_entry(blkiop, &blkio_list, list)
+ blkiop->ops.blkio_update_group_weight_fn(blkg,
+ newpn->weight ?
+ newpn->weight :
+ blkcg->weight);
+ }
+ }
+
+ spin_unlock_irq(&blkcg->lock);
+ spin_unlock(&blkio_list_lock);
+
+free_newpn:
+ if (!keep_newpn)
+ kfree(newpn);
+free_buf:
+ kfree(buf);
+ return ret;
+}
+
+static int blkiocg_policy_read(struct cgroup *cgrp, struct cftype *cft,
+ struct seq_file *m)
+{
+ struct blkio_cgroup *blkcg;
+ struct io_policy_node *pn;
+
+ blkcg = cgroup_to_blkio_cgroup(cgrp);
+
+ if (list_empty(&blkcg->policy_list))
+ goto out;
+
+ seq_printf(m, "dev\tweight\n");
+
+ spin_lock_irq(&blkcg->lock);
+ list_for_each_entry(pn, &blkcg->policy_list, node) {
+ seq_printf(m, "%u:%u\t%u\n", MAJOR(pn->dev),
+ MINOR(pn->dev), pn->weight);
+ }
+ spin_unlock_irq(&blkcg->lock);
+out:
+ return 0;
+}
+
struct cftype blkio_files[] = {
{
+ .name = "policy",
+ .read_seq_string = blkiocg_policy_read,
+ .write_string = blkiocg_policy_write,
+ .max_write_len = 256,
+ },
+ {
.name = "weight",
.read_u64 = blkiocg_weight_read,
.write_u64 = blkiocg_weight_write,
@@ -234,6 +475,7 @@ static void blkiocg_destroy(struct cgroup_subsys *subsys, struct cgroup *cgroup)
struct blkio_group *blkg;
void *key;
struct blkio_policy_type *blkiop;
+ struct io_policy_node *pn, *pntmp;

rcu_read_lock();
remove_entry:
@@ -264,7 +506,12 @@ remove_entry:
blkiop->ops.blkio_unlink_group_fn(key, blkg);
spin_unlock(&blkio_list_lock);
goto remove_entry;
+
done:
+ list_for_each_entry_safe(pn, pntmp, &blkcg->policy_list, node) {
+ policy_delete_node(pn);
+ kfree(pn);
+ }
free_css_id(&blkio_subsys, &blkcg->css);
rcu_read_unlock();
kfree(blkcg);
@@ -294,6 +541,7 @@ done:
spin_lock_init(&blkcg->lock);
INIT_HLIST_HEAD(&blkcg->blkg_list);

+ INIT_LIST_HEAD(&blkcg->policy_list);
return &blkcg->css;
}

diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
index 4d316df..0551aa1 100644
--- a/block/blk-cgroup.h
+++ b/block/blk-cgroup.h
@@ -22,6 +22,7 @@ struct blkio_cgroup {
unsigned int weight;
spinlock_t lock;
struct hlist_head blkg_list;
+ struct list_head policy_list; /* list of io_policy_node */
};

struct blkio_group {
@@ -43,8 +44,16 @@ struct blkio_group {
unsigned long sectors;
};

+struct io_policy_node {
+ struct list_head node;
+ dev_t dev;
+ unsigned int weight;
+};
+
extern bool blkiocg_css_tryget(struct blkio_cgroup *blkcg);
extern void blkiocg_css_put(struct blkio_cgroup *blkcg);
+extern unsigned int blkcg_get_weight(struct blkio_cgroup *blkcg,
+ dev_t dev);

typedef void (blkio_unlink_group_fn) (void *key, struct blkio_group *blkg);
typedef void (blkio_update_group_weight_fn) (struct blkio_group *blkg,
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 023f4e6..3d0dea1 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -963,7 +963,6 @@ cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create)
if (!cfqg)
goto done;

- cfqg->weight = blkcg->weight;
for_each_cfqg_st(cfqg, i, j, st)
*st = CFQ_RB_ROOT;
RB_CLEAR_NODE(&cfqg->rb_node);
@@ -980,6 +979,7 @@ cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create)
sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
blkiocg_add_blkio_group(blkcg, &cfqg->blkg, (void *)cfqd,
MKDEV(major, minor));
+ cfqg->weight = blkcg_get_weight(blkcg, cfqg->blkg.dev);

/* Add group on cfqd list */
hlist_add_head(&cfqg->cfqd_node, &cfqd->cfqg_list);
--
1.5.4.rc3

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majo...@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/

Jens Axboe

unread,
Mar 3, 2010, 4:10:02 AM3/3/10
to
On Wed, Mar 03 2010, Gui Jianfeng wrote:
> Currently, IO Controller makes use of blkio.weight to assign weight for
> all devices. Here a new user interface "blkio.policy" is introduced to
> assign different weights for different devices. blkio.weight becomes the
> default value for devices which are not configured by blkio.policy.
>
> You can use the following format to assigned specific weight for a given
> device:
> #echo "major:minor weight" > blkio.policy
>
> major:minor represents device number.
>
> And you can remove a policy as following:
> #echo "major:minor 0" > blkio.policy

Looks good to me. Vivek, any objections?

--
Jens Axboe

Vivek Goyal

unread,
Mar 3, 2010, 9:40:02 AM3/3/10
to
On Wed, Mar 03, 2010 at 02:21:11PM +0800, Gui Jianfeng wrote:
> Currently, IO Controller makes use of blkio.weight to assign weight for
> all devices. Here a new user interface "blkio.policy" is introduced to
> assign different weights for different devices. blkio.weight becomes the
> default value for devices which are not configured by blkio.policy.
>
> You can use the following format to assigned specific weight for a given
> device:
> #echo "major:minor weight" > blkio.policy
>

Hi Gui,

Can we use a different name for per device weight file name than "blkio.policy".
May be "blkio.weight_device" or "blkio.weight_per_device" etc.

The reason being that "blkio.policy" name might be more suitable to switch
between differnt kind of BW control policies (proportional, max bandwidth etc),
once we implement some kind of max BW controlling policies also. So it
might be a good idea to not use that file name for specifying per device
weights.

Secondly, where is it useful. I mean do you have a scenario where you will
use it. Why I am asking is that we never had ioprio per device kind of
interface. ioprio was associated with task and was global and same on
every device in the system.

I am wondering, what are the scenarios where an application is more important
on one device and less important on other device to warrant different weights
on different devices.

Thanks
Vivek

Vivek Goyal

unread,
Mar 3, 2010, 10:40:01 AM3/3/10
to
On Wed, Mar 03, 2010 at 02:21:11PM +0800, Gui Jianfeng wrote:
^^^^^^^^^^
How about blkio_policy_node?

So this structure will represent per cgroup per device policy as specified
by user. We can re-use the same structure if there are more per cgroup
per device policies down the line.

> +static inline void policy_delete_node(struct io_policy_node *pn);
> +static struct io_policy_node *policy_search_node(const struct blkio_cgroup *blkcg,
> + dev_t dev);

Not very sure about it but may be a good idea to prefix "blkio_" or
"blkcg_" before all the functions you are defining. Just some uniformity in
function names like cfq.

> +
> bool blkiocg_css_tryget(struct blkio_cgroup *blkcg)

You might want to rebase it on Jen's for-linus branch as blkiocg_css_tryget()
is gone in that branch

Apart from above minor nits, this patch looks good to me. Will do some
testing on it.

Thanks
Vivek

> {
> if (!css_tryget(&blkcg->css))
> @@ -142,6 +149,7 @@ blkiocg_weight_write(struct cgroup *cgroup, struct cftype *cftype, u64 val)
> struct blkio_group *blkg;
> struct hlist_node *n;
> struct blkio_policy_type *blkiop;
> + struct io_policy_node *pn;
>
> if (val < BLKIO_WEIGHT_MIN || val > BLKIO_WEIGHT_MAX)
> return -EINVAL;
> @@ -150,7 +158,13 @@ blkiocg_weight_write(struct cgroup *cgroup, struct cftype *cftype, u64 val)
> spin_lock(&blkio_list_lock);
> spin_lock_irq(&blkcg->lock);
> blkcg->weight = (unsigned int)val;
> +
> hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
> + pn = policy_search_node(blkcg, blkg->dev);
> +
> + if (pn)
> + continue;

Ok, so if we have specified a weight for a device, then blkio.weight
update does not affect the weight on that device. Makes sense.

Vivek Goyal

unread,
Mar 3, 2010, 10:50:02 AM3/3/10
to
On Wed, Mar 03, 2010 at 09:33:11AM -0500, Vivek Goyal wrote:
> On Wed, Mar 03, 2010 at 02:21:11PM +0800, Gui Jianfeng wrote:
> > Currently, IO Controller makes use of blkio.weight to assign weight for
> > all devices. Here a new user interface "blkio.policy" is introduced to
> > assign different weights for different devices. blkio.weight becomes the
> > default value for devices which are not configured by blkio.policy.
> >
> > You can use the following format to assigned specific weight for a given
> > device:
> > #echo "major:minor weight" > blkio.policy
> >
>
> Hi Gui,
>
> Can we use a different name for per device weight file name than "blkio.policy".
> May be "blkio.weight_device" or "blkio.weight_per_device" etc.
>
> The reason being that "blkio.policy" name might be more suitable to switch
> between differnt kind of BW control policies (proportional, max bandwidth etc),
> once we implement some kind of max BW controlling policies also. So it
> might be a good idea to not use that file name for specifying per device
> weights.

Well, thinking more about it, what kind of policy you implement on a block
device will not be a per cgroup property. It will not be the case that on
a device you are implementing max-bw for one cgroup and proportional
weight for other cgroup. It probably will be a per device attribute and
if need be should be controlled by /sys/block/<dev> interface.

Still, being very clear what a particular cgroup file does makes sense. So
that in future for max-bw control, we can bring in more cgroup files like
blkio.max_bw or blkio.max_iops etc which can co-exist with blkio.weight_device
etc.

Thanks
Vivek

Gui Jianfeng

unread,
Mar 3, 2010, 7:40:01 PM3/3/10
to
Hi Vivek,

Vivek Goyal wrote:
> On Wed, Mar 03, 2010 at 02:21:11PM +0800, Gui Jianfeng wrote:
>> Currently, IO Controller makes use of blkio.weight to assign weight for
>> all devices. Here a new user interface "blkio.policy" is introduced to
>> assign different weights for different devices. blkio.weight becomes the
>> default value for devices which are not configured by blkio.policy.
>>
>> You can use the following format to assigned specific weight for a given
>> device:
>> #echo "major:minor weight" > blkio.policy
>>
>
> Hi Gui,
>
> Can we use a different name for per device weight file name than "blkio.policy".
> May be "blkio.weight_device" or "blkio.weight_per_device" etc.

ok, will change.

>
> The reason being that "blkio.policy" name might be more suitable to switch
> between differnt kind of BW control policies (proportional, max bandwidth etc),
> once we implement some kind of max BW controlling policies also. So it
> might be a good idea to not use that file name for specifying per device
> weights.
>
> Secondly, where is it useful. I mean do you have a scenario where you will
> use it. Why I am asking is that we never had ioprio per device kind of
> interface. ioprio was associated with task and was global and same on
> every device in the system.
>
> I am wondering, what are the scenarios where an application is more important
> on one device and less important on other device to warrant different weights
> on different devices.

Imaging that, if most of user's data stores on sdb, and most of system data stores
on sda for system application. So we can assign more weight on sdb and less weight
on sda for that user. So that user application achieve high speed on sdb, and will
not stall system application when accessing into sda.

Thanks
Gui

--
Regards
Gui Jianfeng

Gui Jianfeng

unread,
Mar 3, 2010, 7:40:01 PM3/3/10
to

Thanks for the comments vivek, will post V2.

Thanks
Gui

--
Regards
Gui Jianfeng

Chad Talbott

unread,
Mar 3, 2010, 9:30:01 PM3/3/10
to
On Wed, Mar 3, 2010 at 3:19 PM, Nauman Rafique <nau...@google.com> wrote:
> On Wed, Mar 03, 2010 at 09:33:11AM -0500, Vivek Goyal wrote:
>> On Wed, Mar 03, 2010 at 02:21:11PM +0800, Gui Jianfeng wrote:
>> > You can use the following format to assigned specific weight for a given
>> > device:
>> > #echo "major:minor weight" > blkio.policy

>> Can we use a different name for per device weight file name than "blkio.policy".


>> May be "blkio.weight_device" or "blkio.weight_per_device" etc.

I agree with Vivek here, and his reasoning below. This becomes more
important as more attributes are added.

>> The reason being that "blkio.policy" name might be more suitable to switch
>> between differnt kind of BW control policies (proportional, max bandwidth etc),
>> once we implement some kind of max BW controlling policies also. So it
>> might be a good idea to not use that file name for specifying per device
>> weights.
>
> Well, thinking more about it, what kind of policy you implement on a block
> device will not be a per cgroup property. It will not be the case that on
> a device you are implementing max-bw for one cgroup and proportional
> weight for other cgroup. It probably will be a per device attribute and
> if need be should be controlled by /sys/block/<dev> interface.
>
> Still, being very clear what a particular cgroup file does makes sense. So
> that in future for max-bw control, we can bring in more cgroup files like
> blkio.max_bw or blkio.max_iops etc which can co-exist with blkio.weight_device
> etc.

Agreed. I'd like to add that since we are already thinking about
expanding the policy with more attributes, perhaps
blkio_update_group_weight_fn in blkio_policy_ops should be renamed to
blkio_policy_updated_fn. Then it could be called if the user changed
any part of the policy. Further, instead of storing "weight" in
blkio_cgroup, we could store a blkio_policy_node there instead. Then
the handling of whole-cgroup and per-block-device policy items could
be more regular.

Some quick code comments:

policy_parse_and_set() could be simplified with scanf, like:

if (sscanf(buf, "%d:%d %d", &major, &minor, &temp) != 3)
return -EINVAL;

blkcg_get_weight() might better be blkcg_get_policy() and it could
return a per-disk policy node, or fall back to the cgroup policy if
none existed for this dev. This would be across policy attributes,
rather than just weight.

Thanks,
Chad

Gui Jianfeng

unread,
Mar 3, 2010, 10:40:01 PM3/3/10
to

Hi Chad,

Thanks for the comments.

>
> policy_parse_and_set() could be simplified with scanf, like:
>
> if (sscanf(buf, "%d:%d %d", &major, &minor, &temp) != 3)
> return -EINVAL;

This can't handle the invalid format like following
echo "8:16 500 500 500 ...." > blkio.policy

>
> blkcg_get_weight() might better be blkcg_get_policy() and it could
> return a per-disk policy node, or fall back to the cgroup policy if
> none existed for this dev. This would be across policy attributes,
> rather than just weight.

For the time being, i still choose blkcg_get_weight(). For there's
only one attributes here, when more attributs is added, then we might
change the name.

Thanks
Gui

Gui Jianfeng

unread,
Mar 4, 2010, 2:50:01 AM3/4/10
to
Currently, IO Controller makes use of blkio.weight to assign weight for
all devices. Here a new user interface "blkio.weight_device" is introduced to
assign different weights for different devices. blkio.weight becomes the
default value for devices which are not configured by "blkio.weight_device"

You can use the following format to assigned specific weight for a given
device:

#echo "major:minor weight" > blkio.weight_device

major:minor represents device number.

And you can remove a specific weight as following:
#echo "major:minor 0" > blkio.weight_device

V1->V2 changes:
- use user interface "weight_device" instead of "policy" suggested by Vivek
- rename some struct suggested by Vivek
- rebase to 2.6-block "for-linus" branch
- remove an useless list_empty check pointed out by Li Zefan
- some trivial typo fix

Signed-off-by: Gui Jianfeng <guiji...@cn.fujitsu.com>
---

block/blk-cgroup.c | 240 +++++++++++++++++++++++++++++++++++++++++++++++++++
block/blk-cgroup.h | 10 ++
block/cfq-iosched.c | 2 +-
3 files changed, 251 insertions(+), 1 deletions(-)

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index c85d74c..37f1d8f 100644


--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -16,6 +16,7 @@
#include <linux/module.h>
#include <linux/err.h>
#include "blk-cgroup.h"
+#include <linux/genhd.h>

static DEFINE_SPINLOCK(blkio_list_lock);
static LIST_HEAD(blkio_list);
@@ -23,6 +24,12 @@ static LIST_HEAD(blkio_list);
struct blkio_cgroup blkio_root_cgroup = { .weight = 2*BLKIO_WEIGHT_DEFAULT };
EXPORT_SYMBOL_GPL(blkio_root_cgroup);

+static inline void policy_insert_node(struct blkio_cgroup *blkcg,

+ struct blkio_policy_node *pn);
+static inline void policy_delete_node(struct blkio_policy_node *pn);
+static struct blkio_policy_node *policy_search_node(const struct blkio_cgroup *blkcg,
+ dev_t dev);
+
struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup)
{
return container_of(cgroup_subsys_state(cgroup, blkio_subsys_id),
@@ -128,6 +135,7 @@ blkiocg_weight_write(struct cgroup *cgroup, struct cftype *cftype, u64 val)


struct blkio_group *blkg;
struct hlist_node *n;
struct blkio_policy_type *blkiop;

+ struct blkio_policy_node *pn;



if (val < BLKIO_WEIGHT_MIN || val > BLKIO_WEIGHT_MAX)
return -EINVAL;

@@ -136,7 +144,13 @@ blkiocg_weight_write(struct cgroup *cgroup, struct cftype *cftype, u64 val)


spin_lock(&blkio_list_lock);
spin_lock_irq(&blkcg->lock);
blkcg->weight = (unsigned int)val;
+
hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
+ pn = policy_search_node(blkcg, blkg->dev);
+
+ if (pn)
+ continue;
+
list_for_each_entry(blkiop, &blkio_list, list)
blkiop->ops.blkio_update_group_weight_fn(blkg,
blkcg->weight);

@@ -185,8 +199,227 @@ void blkiocg_update_blkio_group_dequeue_stats(struct blkio_group *blkg,


EXPORT_SYMBOL_GPL(blkiocg_update_blkio_group_dequeue_stats);
#endif

+static int check_dev_num(dev_t dev)
+{
+ int part = 0;
+ struct gendisk *disk;
+
+ disk = get_gendisk(dev, &part);
+ if (!disk || part)
+ return -ENODEV;
+
+ return 0;
+}
+

+static int policy_parse_and_set(char *buf, struct blkio_policy_node *newpn)

+ struct blkio_policy_node *pn;


+
+ pn = policy_search_node(blkcg, dev);
+ if (pn)
+ return pn->weight;
+ else
+ return blkcg->weight;
+}
+EXPORT_SYMBOL_GPL(blkcg_get_weight);
+
+static inline void policy_insert_node(struct blkio_cgroup *blkcg,

+ struct blkio_policy_node *pn)


+{
+ list_add(&pn->node, &blkcg->policy_list);
+}
+
+/* Must be called with blkcg->lock held */

+static inline void policy_delete_node(struct blkio_policy_node *pn)


+{
+ list_del(&pn->node);
+}
+
+/* Must be called with blkcg->lock held */

+static struct blkio_policy_node *policy_search_node(const struct blkio_cgroup *blkcg,
+ dev_t dev)
+{
+ struct blkio_policy_node *pn;


+
+ list_for_each_entry(pn, &blkcg->policy_list, node) {
+ if (pn->dev == dev)
+ return pn;
+ }
+
+ return NULL;
+}
+
+

+static int blkiocg_weight_device_write(struct cgroup *cgrp, struct cftype *cft,


+ const char *buffer)
+{
+ int ret = 0;
+ char *buf;

+ struct blkio_policy_node *newpn, *pn;

+ /* weight == 0 means deleteing a specific weight */

+static int blkiocg_weight_device_read(struct cgroup *cgrp, struct cftype *cft,


+ struct seq_file *m)
+{
+ struct blkio_cgroup *blkcg;

+ struct blkio_policy_node *pn;


+
+ seq_printf(m, "dev\tweight\n");
+

+ blkcg = cgroup_to_blkio_cgroup(cgrp);


+ if (list_empty(&blkcg->policy_list))
+ goto out;
+

+ spin_lock_irq(&blkcg->lock);
+ list_for_each_entry(pn, &blkcg->policy_list, node) {
+ seq_printf(m, "%u:%u\t%u\n", MAJOR(pn->dev),
+ MINOR(pn->dev), pn->weight);
+ }
+ spin_unlock_irq(&blkcg->lock);
+out:
+ return 0;
+}
+
struct cftype blkio_files[] = {
{

+ .name = "weight_device",
+ .read_seq_string = blkiocg_weight_device_read,
+ .write_string = blkiocg_weight_device_write,


+ .max_write_len = 256,
+ },
+ {
.name = "weight",
.read_u64 = blkiocg_weight_read,
.write_u64 = blkiocg_weight_write,

@@ -220,6 +453,7 @@ static void blkiocg_destroy(struct cgroup_subsys *subsys, struct cgroup *cgroup)


struct blkio_group *blkg;
void *key;
struct blkio_policy_type *blkiop;

+ struct blkio_policy_node *pn, *pntmp;

rcu_read_lock();
remove_entry:
@@ -250,7 +484,12 @@ remove_entry:


blkiop->ops.blkio_unlink_group_fn(key, blkg);
spin_unlock(&blkio_list_lock);
goto remove_entry;
+
done:
+ list_for_each_entry_safe(pn, pntmp, &blkcg->policy_list, node) {
+ policy_delete_node(pn);
+ kfree(pn);
+ }
free_css_id(&blkio_subsys, &blkcg->css);
rcu_read_unlock();
kfree(blkcg);

@@ -280,6 +519,7 @@ done:


spin_lock_init(&blkcg->lock);
INIT_HLIST_HEAD(&blkcg->blkg_list);

+ INIT_LIST_HEAD(&blkcg->policy_list);
return &blkcg->css;
}

diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h

index 84bf745..c8144a2 100644


--- a/block/blk-cgroup.h
+++ b/block/blk-cgroup.h
@@ -22,6 +22,7 @@ struct blkio_cgroup {
unsigned int weight;
spinlock_t lock;
struct hlist_head blkg_list;

+ struct list_head policy_list; /* list of blkio_policy_node */
};

struct blkio_group {
@@ -43,6 +44,15 @@ struct blkio_group {
unsigned long sectors;
};

+struct blkio_policy_node {


+ struct list_head node;
+ dev_t dev;
+ unsigned int weight;
+};
+

+extern unsigned int blkcg_get_weight(struct blkio_cgroup *blkcg,
+ dev_t dev);

+


typedef void (blkio_unlink_group_fn) (void *key, struct blkio_group *blkg);
typedef void (blkio_update_group_weight_fn) (struct blkio_group *blkg,

unsigned int weight);
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index dee9d93..6945e9b 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -954,7 +954,6 @@ cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create)


if (!cfqg)
goto done;

- cfqg->weight = blkcg->weight;
for_each_cfqg_st(cfqg, i, j, st)
*st = CFQ_RB_ROOT;
RB_CLEAR_NODE(&cfqg->rb_node);

@@ -971,6 +970,7 @@ cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create)


sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
blkiocg_add_blkio_group(blkcg, &cfqg->blkg, (void *)cfqd,
MKDEV(major, minor));
+ cfqg->weight = blkcg_get_weight(blkcg, cfqg->blkg.dev);

/* Add group on cfqd list */
hlist_add_head(&cfqg->cfqd_node, &cfqd->cfqg_list);
--
1.5.4.rc3

--

Gui Jianfeng

unread,
Mar 4, 2010, 2:50:02 AM3/4/10
to
Here is the document for blkio.weight_device

Signed-off-by: Gui Jianfeng <guiji...@cn.fujitsu.com>
---

Documentation/cgroups/blkio-controller.txt | 35 +++++++++++++++++++++++++++-
1 files changed, 34 insertions(+), 1 deletions(-)

diff --git a/Documentation/cgroups/blkio-controller.txt b/Documentation/cgroups/blkio-controller.txt
index 630879c..dcd6bdc 100644
--- a/Documentation/cgroups/blkio-controller.txt
+++ b/Documentation/cgroups/blkio-controller.txt
@@ -76,7 +76,9 @@ CONFIG_DEBUG_BLK_CGROUP
Details of cgroup files
=======================
- blkio.weight
- - Specifies per cgroup weight.
+ - Specifies per cgroup weight. This is default weight of the group
+ on all the devices until and unless overridden by per device rule.
+ (See blkio.weight_device).

Currently allowed range of weights is from 100 to 1000.

@@ -99,6 +101,37 @@ Details of cgroup files
and minor number of the device and third field specifies the number
of times a group was dequeued from a particular device.

+- blkio.weight_device
+ - One can specify per cgroup per device rules using this interface.
+ These rules override the default value of group weight as specified
+ by blkio.weight.
+
+ Following is the format.
+
+ #echo dev_maj:dev_minor weight > /patch/to/cgroup/blkio.weight_device
+
+ weight=0 means removing weight on a given device.
+
+ Examples:
+ Configure weight=300 on /dev/sdb (8:16) in this cgroup
+ # echo 8:16 300 > blkio.weight_device
+ # cat blkio.weight_device
+ dev weight
+ 8:16 300
+
+ Configure weight=500 on /dev/sda (8:0) in this cgroup
+ # echo 8:0 500 > blkio.weight_device
+ # cat blkio.weight_device
+ dev weight
+ 8:0 500
+ 8:16 300
+
+ Remove specific weight for /dev/sda in this cgroup
+ # echo 8:0 0 > blkio.weight_device
+ # cat blkio.weight_device
+ dev weight
+ 8:16 300
+
CFQ sysfs tunable
=================
/sys/block/<disk>/queue/iosched/group_isolation
--
1.5.4.rc3

Takuya Yoshikawa

unread,
Mar 4, 2010, 6:30:03 AM3/4/10
to
Hi, I found a typo.

Gui Jianfeng wrote:
> +- blkio.weight_device
> + - One can specify per cgroup per device rules using this interface.
> + These rules override the default value of group weight as specified
> + by blkio.weight.
> +
> + Following is the format.
> +
> + #echo dev_maj:dev_minor weight > /patch/to/cgroup/blkio.weight_device

You mean "path to"?

Thanks,
Takuya

--

Vivek Goyal

unread,
Mar 4, 2010, 10:10:01 AM3/4/10
to

Currently there is only one policy (proportional weight) and there is
only one attribute (weight). So I think we can leave the function name
as it is. We can make it more generic as we introduce new policies or
policy attributes.

> Further, instead of storing "weight" in
> blkio_cgroup, we could store a blkio_policy_node there instead.

Sorry, did not get it. Are you suggesting that don't store group weight in
blkio_cgroup, instead store it in blkio_policy_node? But we don't create
blkio_policy_node, until and unless user create a specific weight policy
for a device.

> Then
> the handling of whole-cgroup and per-block-device policy items could
> be more regular.
>
> Some quick code comments:
>
> policy_parse_and_set() could be simplified with scanf, like:
>
> if (sscanf(buf, "%d:%d %d", &major, &minor, &temp) != 3)
> return -EINVAL;
>
> blkcg_get_weight() might better be blkcg_get_policy() and it could
> return a per-disk policy node, or fall back to the cgroup policy if
> none existed for this dev. This would be across policy attributes,
> rather than just weight.

This is an interesting idea. But again we can implement it once we have
more policy attributes. For the time being, it is simple to understand
that get group weight from blk-cgroup layer for that particular device.

Thanks
Vivek

Vivek Goyal

unread,
Mar 4, 2010, 10:30:02 AM3/4/10
to

Hi Gui,

Thanks. Can we move above functions up in the file and get rid of these
forward declarations?

Also as I suggested last time, it might be good idea to prefix "blkio_"
before function names.

policy_insert_node ---> blkio_policy_insert_node()
policy_delete_node ---> blkio_policy_delete_node()
policy_search_node ---> blkio_policy_search_node()
chekc_dev_num ---> blkio_check_dev_num()
.
.

Thanks
Vivek

Gui Jianfeng

unread,
Mar 4, 2010, 7:40:01 PM3/4/10
to

Sure, will post again.

Thanks,
Gui

--
Regards
Gui Jianfeng

Gui Jianfeng

unread,
Mar 4, 2010, 7:50:01 PM3/4/10
to
Takuya Yoshikawa wrote:
> Hi, I found a typo.
>
> Gui Jianfeng wrote:
>> +- blkio.weight_device
>> + - One can specify per cgroup per device rules using this interface.
>> + These rules override the default value of group weight as
>> specified
>> + by blkio.weight.
>> +
>> + Following is the format.
>> +
>> + #echo dev_maj:dev_minor weight >
>> /patch/to/cgroup/blkio.weight_device
>
> You mean "path to"?
>

yeah, will fix.

Thanks
Gui

Gui Jianfeng

unread,
Mar 4, 2010, 9:40:01 PM3/4/10
to
Currently, IO Controller makes use of blkio.weight to assign weight for
all devices. Here a new user interface "blkio.weight_device" is introduced to
assign different weights for different devices. blkio.weight becomes the
default value for devices which are not configured by "blkio.weight_device"

You can use the following format to assigned specific weight for a given
device:

major:minor represents device number.

And you can remove a specific weight as following:

V1->V2 changes:


- use user interface "weight_device" instead of "policy" suggested by Vivek
- rename some struct suggested by Vivek
- rebase to 2.6-block "for-linus" branch
- remove an useless list_empty check pointed out by Li Zefan
- some trivial typo fix

V2->V3 changes:
- Move policy_*_node() functions up to get rid of forward declarations
- rename related functions by adding prefix "blkio_"

Signed-off-by: Gui Jianfeng <guiji...@cn.fujitsu.com>
---

block/blk-cgroup.c | 236 ++++++++++++++++++++++++++++++++++++++++++++++++++-


block/blk-cgroup.h | 10 ++
block/cfq-iosched.c | 2 +-

3 files changed, 246 insertions(+), 2 deletions(-)

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index c85d74c..8825e49 100644


--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -16,6 +16,7 @@
#include <linux/module.h>
#include <linux/err.h>
#include "blk-cgroup.h"
+#include <linux/genhd.h>

static DEFINE_SPINLOCK(blkio_list_lock);
static LIST_HEAD(blkio_list);

@@ -23,6 +24,32 @@ static LIST_HEAD(blkio_list);


struct blkio_cgroup blkio_root_cgroup = { .weight = 2*BLKIO_WEIGHT_DEFAULT };
EXPORT_SYMBOL_GPL(blkio_root_cgroup);

+static inline void blkio_policy_insert_node(struct blkio_cgroup *blkcg,


+ struct blkio_policy_node *pn)
+{
+ list_add(&pn->node, &blkcg->policy_list);
+}
+
+/* Must be called with blkcg->lock held */

+static inline void blkio_policy_delete_node(struct blkio_policy_node *pn)


+{
+ list_del(&pn->node);
+}
+
+/* Must be called with blkcg->lock held */
+static struct blkio_policy_node *

+blkio_policy_search_node(const struct blkio_cgroup *blkcg, dev_t dev)


+{
+ struct blkio_policy_node *pn;
+
+ list_for_each_entry(pn, &blkcg->policy_list, node) {
+ if (pn->dev == dev)
+ return pn;
+ }
+
+ return NULL;
+}
+

struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup)
{
return container_of(cgroup_subsys_state(cgroup, blkio_subsys_id),

@@ -128,6 +155,7 @@ blkiocg_weight_write(struct cgroup *cgroup, struct cftype *cftype, u64 val)


struct blkio_group *blkg;
struct hlist_node *n;
struct blkio_policy_type *blkiop;
+ struct blkio_policy_node *pn;

if (val < BLKIO_WEIGHT_MIN || val > BLKIO_WEIGHT_MAX)
return -EINVAL;

@@ -136,7 +164,13 @@ blkiocg_weight_write(struct cgroup *cgroup, struct cftype *cftype, u64 val)


spin_lock(&blkio_list_lock);
spin_lock_irq(&blkcg->lock);
blkcg->weight = (unsigned int)val;
+
hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {

+ pn = blkio_policy_search_node(blkcg, blkg->dev);


+
+ if (pn)
+ continue;
+
list_for_each_entry(blkiop, &blkio_list, list)
blkiop->ops.blkio_update_group_weight_fn(blkg,
blkcg->weight);

@@ -178,15 +212,208 @@ SHOW_FUNCTION_PER_GROUP(dequeue);

#ifdef CONFIG_DEBUG_BLK_CGROUP
void blkiocg_update_blkio_group_dequeue_stats(struct blkio_group *blkg,
- unsigned long dequeue)
+ unsigned long dequeue)
{
blkg->dequeue += dequeue;
}
EXPORT_SYMBOL_GPL(blkiocg_update_blkio_group_dequeue_stats);
#endif

+static int blkio_check_dev_num(dev_t dev)


+{
+ int part = 0;
+ struct gendisk *disk;
+
+ disk = get_gendisk(dev, &part);
+ if (!disk || part)
+ return -ENODEV;
+
+ return 0;
+}
+

+static int blkio_policy_parse_and_set(char *buf,
+ struct blkio_policy_node *newpn)

+ ret = blkio_check_dev_num(dev);


+ if (ret)
+ return ret;
+
+ newpn->dev = dev;
+
+ if (s[1] == NULL)
+ return -EINVAL;
+
+ ret = strict_strtoul(s[1], 10, &temp);
+ if (ret || (temp < BLKIO_WEIGHT_MIN && temp > 0) ||
+ temp > BLKIO_WEIGHT_MAX)
+ return -EINVAL;
+
+ newpn->weight = temp;
+
+ return 0;
+}
+
+unsigned int blkcg_get_weight(struct blkio_cgroup *blkcg,
+ dev_t dev)
+{
+ struct blkio_policy_node *pn;
+

+ pn = blkio_policy_search_node(blkcg, dev);


+ if (pn)
+ return pn->weight;
+ else
+ return blkcg->weight;
+}
+EXPORT_SYMBOL_GPL(blkcg_get_weight);
+

+static int blkiocg_weight_device_write(struct cgroup *cgrp, struct cftype *cft,
+ const char *buffer)
+{
+ int ret = 0;
+ char *buf;
+ struct blkio_policy_node *newpn, *pn;
+ struct blkio_cgroup *blkcg;
+ struct blkio_group *blkg;
+ int keep_newpn = 0;
+ struct hlist_node *n;
+ struct blkio_policy_type *blkiop;
+
+ buf = kstrdup(buffer, GFP_KERNEL);
+ if (!buf)
+ return -ENOMEM;
+
+ newpn = kzalloc(sizeof(*newpn), GFP_KERNEL);
+ if (!newpn) {
+ ret = -ENOMEM;
+ goto free_buf;
+ }
+

+ ret = blkio_policy_parse_and_set(buf, newpn);


+ if (ret)
+ goto free_newpn;
+
+ blkcg = cgroup_to_blkio_cgroup(cgrp);
+
+ spin_lock_irq(&blkcg->lock);
+

+ pn = blkio_policy_search_node(blkcg, newpn->dev);


+ if (!pn) {
+ if (newpn->weight != 0) {

+ blkio_policy_insert_node(blkcg, newpn);


+ keep_newpn = 1;
+ }
+ spin_unlock_irq(&blkcg->lock);
+ goto update_io_group;
+ }
+
+ if (newpn->weight == 0) {
+ /* weight == 0 means deleteing a specific weight */

+ blkio_policy_delete_node(pn);

@@ -220,6 +447,7 @@ static void blkiocg_destroy(struct cgroup_subsys *subsys, struct cgroup *cgroup)


struct blkio_group *blkg;
void *key;
struct blkio_policy_type *blkiop;
+ struct blkio_policy_node *pn, *pntmp;

rcu_read_lock();
remove_entry:

@@ -250,7 +478,12 @@ remove_entry:


blkiop->ops.blkio_unlink_group_fn(key, blkg);
spin_unlock(&blkio_list_lock);
goto remove_entry;
+
done:
+ list_for_each_entry_safe(pn, pntmp, &blkcg->policy_list, node) {

+ blkio_policy_delete_node(pn);


+ kfree(pn);
+ }
free_css_id(&blkio_subsys, &blkcg->css);
rcu_read_unlock();
kfree(blkcg);

@@ -280,6 +513,7 @@ done:

Gui Jianfeng

unread,
Mar 4, 2010, 9:40:02 PM3/4/10
to
Here is the document for blkio.weight_device

Signed-off-by: Gui Jianfeng <guiji...@cn.fujitsu.com>
---
Documentation/cgroups/blkio-controller.txt | 35 +++++++++++++++++++++++++++-
1 files changed, 34 insertions(+), 1 deletions(-)

diff --git a/Documentation/cgroups/blkio-controller.txt b/Documentation/cgroups/blkio-controller.txt
index 630879c..e551c85 100644


--- a/Documentation/cgroups/blkio-controller.txt
+++ b/Documentation/cgroups/blkio-controller.txt
@@ -76,7 +76,9 @@ CONFIG_DEBUG_BLK_CGROUP
Details of cgroup files
=======================
- blkio.weight
- - Specifies per cgroup weight.
+ - Specifies per cgroup weight. This is default weight of the group
+ on all the devices until and unless overridden by per device rule.
+ (See blkio.weight_device).

Currently allowed range of weights is from 100 to 1000.

@@ -99,6 +101,37 @@ Details of cgroup files
and minor number of the device and third field specifies the number
of times a group was dequeued from a particular device.

+- blkio.weight_device
+ - One can specify per cgroup per device rules using this interface.
+ These rules override the default value of group weight as specified
+ by blkio.weight.
+
+ Following is the format.
+

+ #echo dev_maj:dev_minor weight > /pach/to/cgroup/blkio.weight_device

Takuya Yoshikawa

unread,
Mar 4, 2010, 9:50:02 PM3/4/10
to
Hi, thank you for fixing ...

Gui Jianfeng wrote:
> +- blkio.weight_device
> + - One can specify per cgroup per device rules using this interface.
> + These rules override the default value of group weight as specified
> + by blkio.weight.
> +
> + Following is the format.
> +
> + #echo dev_maj:dev_minor weight > /pach/to/cgroup/blkio.weight_device

You deleted wrong character! We need 't' not 'c'.

Thanks,
Takuya

--

Gui Jianfeng

unread,
Mar 4, 2010, 9:50:02 PM3/4/10
to
Takuya Yoshikawa wrote:
> Hi, thank you for fixing ...
>
> Gui Jianfeng wrote:
>> +- blkio.weight_device
>> + - One can specify per cgroup per device rules using this interface.
>> + These rules override the default value of group weight as
>> specified
>> + by blkio.weight.
>> +
>> + Following is the format.
>> +
>> + #echo dev_maj:dev_minor weight >
>> /pach/to/cgroup/blkio.weight_device
>
> You deleted wrong character! We need 't' not 'c'.

OOPS, Sorry :(

Vivek Goyal

unread,
Mar 5, 2010, 9:20:02 AM3/5/10
to
On Fri, Mar 05, 2010 at 10:26:39AM +0800, Gui Jianfeng wrote:
> Here is the document for blkio.weight_device
>
> Signed-off-by: Gui Jianfeng <guiji...@cn.fujitsu.com>

Looks good to me.

Acked-by: Vivek Goyal <vgo...@redhat.com>

Vivek

Vivek Goyal

unread,
Mar 5, 2010, 9:20:02 AM3/5/10
to
On Fri, Mar 05, 2010 at 10:25:58AM +0800, Gui Jianfeng wrote:
> Currently, IO Controller makes use of blkio.weight to assign weight for
> all devices. Here a new user interface "blkio.weight_device" is introduced to
> assign different weights for different devices. blkio.weight becomes the
> default value for devices which are not configured by "blkio.weight_device"
>
> You can use the following format to assigned specific weight for a given
> device:
>
> major:minor represents device number.
>
> And you can remove a specific weight as following:
>
> V1->V2 changes:
> - use user interface "weight_device" instead of "policy" suggested by Vivek
> - rename some struct suggested by Vivek
> - rebase to 2.6-block "for-linus" branch
> - remove an useless list_empty check pointed out by Li Zefan
> - some trivial typo fix
>
> V2->V3 changes:
> - Move policy_*_node() functions up to get rid of forward declarations
> - rename related functions by adding prefix "blkio_"
>

Thanks for the changes Gui. Looks good to me.

Acked-by: Vivek Goyal <vgo...@redhat.com>

Vivek

> Signed-off-by: Gui Jianfeng <guiji...@cn.fujitsu.com>

Nauman Rafique

unread,
Mar 8, 2010, 2:50:02 PM3/8/10
to

I am not quite sure if exposing a mojor,minor number is the best
interface that can be exposed to user space. How about actual disk
names like sda, sdb, .. etc? The only problem I see there is that it
seems tricky to get to these disk names from within the block layer.
"struct request_queue" has a pointer to backing_dev which has a device
from which we can get major,minor. But in order to get to disk name,
we would have to call get_gendisk which can hold a semaphore. Is this
the reason for us going with major,minor as a user interface to
specify a disk? I bet there are good reasons for us not keeping a
pointer to "struct gendisk" from "struct request_queue". If we could
keep that pointer, our user interface could be very easily modified to
be the disk name like sda, sdb, etc.

Vivek Goyal

unread,
Mar 8, 2010, 6:10:01 PM3/8/10
to

That's a good question. Why not use device names instead of device
numbers? From user's perspective, device names will be more intutive
to use.

At the same time, will it look odd to handle devices with their names as WWID.

/dev/mapper/3600508b400105df70000e000026f0000

Though I see that there is an alternate way to address the same device
like /dev/dm-2 etc.

So from user's perspective I think it will be more intutive to handle
disk names instead of numbers.

Gui, did you forsee issues in implementing disk names?

Thanks
Vivek

Gui Jianfeng

unread,
Mar 8, 2010, 9:00:01 PM3/8/10
to

Hi Vivek,

From the implementation of view, we need a device number as a key in blkio_policy_node,
if using device name as user interface, i can't figure out a way to retirve the
corresponding device number by means of device name (like sda, not "/dev/sda").
Jens, is there any method to handle this?
Another cgroup subsystem "device" cgroup also make use of device number as user interface,
I guess the reason is the same.

Thanks,
Gui

Vivek Goyal

unread,
Mar 9, 2010, 2:10:02 PM3/9/10
to
On Tue, Mar 09, 2010 at 09:52:06AM +0800, Gui Jianfeng wrote:

[..]

Hi Gui,

How about using full device path names (/dev/sda)? "blockdev" utility also
expects full device pathnames. Same seems to be the case with device mapper
targets.

"device" cgroup controller probably is using major and minor numbers because
it needs to control creation of device file (mknod).

May be we can use lookup_bdev() to get block_device pointer and then
get_gendisk() to check if it is a partition.

I am not very sure but device name/path interface might turn out to be
more intutive.

Jens, do you have any thoughts on this?

Vivek Goyal

unread,
Mar 9, 2010, 3:20:02 PM3/9/10
to
On Mon, Mar 08, 2010 at 11:39:54AM -0800, Nauman Rafique wrote:
[..]

Hi Nauman,

Do we really store a device name in "struct gendisk"? IIUC, a disk is
identified by its major and minor number and then there can be multiple
device files pointing to same disk.

So I have a disk /dev/sdc in my system and I created another alias to
same disk using mknod and mounted the disk using the alias.

mknod /dev/sdc-alias b 8 32
mount /dev/sdc-alias /mnt

If that's the case, there is no way gendisk can store the pathname.
Instead, device file has inode associated with it, and there we store
major, minor number of disk, and using that we operate on disk/partition.

If that's the case, then major/minor number based interface for blkio
makes sense. Because we also need to export stats regarding the disk
time consumed by cgroup on a particular device, the only unique identifier
of the disk seems to be {major,minor} pair and multiple block device
files can be pointing to same disk. Because it is many to one mapping, it
will not be possible to reverse map it.

So I guess we need to continue to handle rules and stats using major/minor
numbers. One improvement probably we can do and that is allow setting
rules both by major/minor number and device file path. But internally
cgroup code will map device file path to major minor numbers and rules
will be displayed against major/minor number and not original device path.

Thanks
Vivek

Nauman Rafique

unread,
Mar 9, 2010, 3:40:01 PM3/9/10
to

You are right, you can create more aliases to point the same device.
But there is one name stored in disk_name field of struct gendisk. And
I guess this is the same name that you will see if you do "ls
/sys/block/". block layer exposes all its sysfs variables going
through the disk names. For example, if you have to switch a scheduler
on a block device, you would use the name like sdb and do "echo cfq >
/sys/block/sdb/queue/scheduler". The same holds true if you want to
change a io scheduler specific tunable, e.g.
/sys/block/sdb/queue/iosched/back_seek_max.

My point is that all per device interfaces in the io cgroup category
should use the same device names; this includes all the stats
reporting, interfaces to set weights, and so on. And not come up with
a different way of identifying devices.

Gui Jianfeng

unread,
Mar 9, 2010, 7:40:01 PM3/9/10
to

Hi Vivek,

I don't think using an inode path name as interface is a good idea. Because, one
can create new file to point to the same device. Also, pathname could be removed
or renamed by user.
So, i think device number is a better choice.

Thanks
Gui

Chad Talbott

unread,
Mar 10, 2010, 12:50:02 AM3/10/10
to
On Tue, Mar 9, 2010 at 4:33 PM, Gui Jianfeng <guiji...@cn.fujitsu.com> wrote:
> Vivek Goyal wrote:
>> On Tue, Mar 09, 2010 at 09:52:06AM +0800, Gui Jianfeng wrote:
>> I am not very sure but device name/path interface might turn out to be
>> more intutive.
>
> I don't think using an inode path name as interface is a good idea. Because, one
> can create new file to point to the same device. Also, pathname could be removed
> or renamed by user.
> So, i think device number is a better choice.

Sorry to butt in, but I don't think anyone is suggesting trying to
remember names from device nodes. I think the idea is to use the
names derived from the subsystem - such as 'sda' which is reflected in
/sys and dmesg.

Chad

Vivek Goyal

unread,
Mar 10, 2010, 10:40:02 AM3/10/10
to

Hi Gui,

Yes, full device file path name might not be the best option. But
disk_name (as seen in sysfs) still remains appealing.

For mapping a device name (sda) to its gendisk object, can't we iterate
through all the devices in block class in sysfs and then verify that device
name user has passed to set the rule is valid or not?

Look at block/genhd.c printk_all_partitions(). This functions iterates
through all gendisk objects.

This still leaves the issue of reaching a gendisk object from request
queue. Looking into it.

Chad Talbott

unread,
Mar 10, 2010, 12:40:03 PM3/10/10
to
On Wed, Mar 10, 2010 at 7:30 AM, Vivek Goyal <vgo...@redhat.com> wrote:
> This still leaves the issue of reaching a gendisk object from request
> queue. Looking into it.

It looks like we have that pairing way back in blk_register_queue()
which takes a gendisk. Is there any reason we don't hold onto the
gendisk there? Eyeballing add_disk() and unlink_gendisk() seems to
confirm that gendisk lifetime spans request_queue.

Nauman and I were also wondering why blkio_group and blkio_policy_node
store a dev_t, rather than a direct pointer to gendisk. dev_t seems
more like a userspace<->kernel interface than an inside-the-kernel
interface.

Chad

Vivek Goyal

unread,
Mar 10, 2010, 1:10:02 PM3/10/10
to
On Wed, Mar 10, 2010 at 09:38:35AM -0800, Chad Talbott wrote:
> On Wed, Mar 10, 2010 at 7:30 AM, Vivek Goyal <vgo...@redhat.com> wrote:
> > This still leaves the issue of reaching a gendisk object from request
> > queue. Looking into it.
>
> It looks like we have that pairing way back in blk_register_queue()
> which takes a gendisk. Is there any reason we don't hold onto the
> gendisk there? Eyeballing add_disk() and unlink_gendisk() seems to
> confirm that gendisk lifetime spans request_queue.
>

Yes, looking at the code, it looks like gendisk and request_queue object's
lifetime is same and probably we can store a pointer to gendisk in
request_queue at blk_register_queue() time. And then use this pointer to
retrieve gendisk->disk_name to report stats.

> Nauman and I were also wondering why blkio_group and blkio_policy_node
> store a dev_t, rather than a direct pointer to gendisk. dev_t seems
> more like a userspace<->kernel interface than an inside-the-kernel
> interface.
>

blkio_policy_node currently can't store a pointer to gendisk because there
is no mechanism to call back into blkio if device is removed. So if we
implement something so that once device is removed, blkio layer gets a
callback and we cleanup any state/rules associated with that device, then
I think we should be able to store the pointer to gendisk.

I am still trying to figure out how elevator/ioscheduler state is cleaned
up if a device is removed while some IO is happening to it.

OTOH, Gui, may be one can use blk_lookup_devt() to lookup the dev_t of a
device using the disk name (sda). I just noticed it while reading the
code.

Thanks
Vivek

Vivek Goyal

unread,
Mar 10, 2010, 3:40:02 PM3/10/10
to
On Wed, Mar 10, 2010 at 01:03:36PM -0500, Vivek Goyal wrote:
> On Wed, Mar 10, 2010 at 09:38:35AM -0800, Chad Talbott wrote:
> > On Wed, Mar 10, 2010 at 7:30 AM, Vivek Goyal <vgo...@redhat.com> wrote:
> > > This still leaves the issue of reaching a gendisk object from request
> > > queue. Looking into it.
> >
> > It looks like we have that pairing way back in blk_register_queue()
> > which takes a gendisk. Is there any reason we don't hold onto the
> > gendisk there? Eyeballing add_disk() and unlink_gendisk() seems to
> > confirm that gendisk lifetime spans request_queue.
> >
>
> Yes, looking at the code, it looks like gendisk and request_queue object's
> lifetime is same and probably we can store a pointer to gendisk in
> request_queue at blk_register_queue() time. And then use this pointer to
> retrieve gendisk->disk_name to report stats.
>

Well, gendisk and request_queue have little different life span. Following
seems to be the sequence a block driver follows.

blk_init_queue()
alloc_disk() and add_disk()
device_removed
del_gendisk()
blk_cleanup_queue()

So first we cleaup the gendisk structure and later driver calls to cleanup
the request queue.

> > Nauman and I were also wondering why blkio_group and blkio_policy_node
> > store a dev_t, rather than a direct pointer to gendisk. dev_t seems
> > more like a userspace<->kernel interface than an inside-the-kernel
> > interface.
> >
>
> blkio_policy_node currently can't store a pointer to gendisk because there
> is no mechanism to call back into blkio if device is removed. So if we
> implement something so that once device is removed, blkio layer gets a
> callback and we cleanup any state/rules associated with that device, then
> I think we should be able to store the pointer to gendisk.
>
> I am still trying to figure out how elevator/ioscheduler state is cleaned
> up if a device is removed while some IO is happening to it.
>

So blk_cleanup_queue() will do this. That means few things.

- We can't store pointers to gendisk in blkio_policy_node or blkio_group
because gendisk might have gone away but request queue is still there.
May be one can try saving a pointer and taking a reference, but I guess
that becomes littles complicated.

- If we are using disk name for rules and reporting stats, then we also
need to make sure that these rules are cleared from cgroups once device
has disappeared. Otherwise, following might happen.

- Create a rule for sda (x,y) for cgroup test1. x,y are major and
minor numbers.
- sda goes away. Rules still remains in blkio cgroup.
- Another device gets plugged in and i guess following can happen.
- device name is different but dev_t is same as sda.
- device name is same (sda) but device number is
different.

In both the cases a user will be confused with stale rules
in cgroups.

Cleaning up cgroups rules can get little complicated. I guess we need to
create a function in blkio-cgroup.c to traverse through all the cgroups
and cleanup any blkio_policy_nodes belonging to device going away.

In a nutshell, it probably is doable. You are welcome to write a patch. At
the same time I am not against deivce major/minor number based interface,
because it keeps things little simple.

Thanks
Vivek

-

Manuel Benitez

unread,
Mar 11, 2010, 2:30:02 PM3/11/10
to
On a closely related topic, I've just recently made a change to one of
my branches that exposes the blkio.time and blkio.sectors information
for the root cgroup. These stats would not show because the major and
minor information for the root blkio_croup structures is zero. This
information is not available at the when the root blkio_cgroup
structures are instantiated, so they are left without major and minor
information.

I have a simple fix that updates the major and minor information for
the root structures at a later time. It looks something like this:

diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index cd79be0..b34c952 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -956,6 +956,11 @@ cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *c
return NULL;;

cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key));
+ if (cfqg && !cfqg->blkg.dev && bdi->dev && dev_name(bdi->dev)) {
+ sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
+ cfqg->blkg.dev = MKDEV(major, minor);
+ goto done;
+ }
if (cfqg || !create)
goto done;

If folks think that this would be of interest, I can submit a formal
patch. If someone can suggest a better way to do it that doesn't
require extensive changes elsewhere, I'm open to working that up as
well.

-Ricky

Vivek Goyal

unread,
Mar 15, 2010, 10:00:03 AM3/15/10
to
On Thu, Mar 11, 2010 at 11:21:50AM -0800, Manuel Benitez wrote:
> On a closely related topic, I've just recently made a change to one of
> my branches that exposes the blkio.time and blkio.sectors information
> for the root cgroup. These stats would not show because the major and
> minor information for the root blkio_croup structures is zero. This
> information is not available at the when the root blkio_cgroup
> structures are instantiated, so they are left without major and minor
> information.
>
> I have a simple fix that updates the major and minor information for
> the root structures at a later time. It looks something like this:
>

Hi Ricky,

I think it is a good idea to export stats for root cgroup also. I also had
noticed this problem of major,minor not being available for root group as
at request queue initialization time it is not available. Checking for
blkg.dev being NULL at group creation time is not the cleanest solution
but at the same time can't think of better thing right now.

Can you please write a separate patch to fix it and post to lkml.

Thanks
Vivek

Gui Jianfeng

unread,
Mar 25, 2010, 2:30:02 AM3/25/10
to
Vivek Goyal wrote:
> On Fri, Mar 05, 2010 at 10:25:58AM +0800, Gui Jianfeng wrote:
>> Currently, IO Controller makes use of blkio.weight to assign weight for
>> all devices. Here a new user interface "blkio.weight_device" is introduced to
>> assign different weights for different devices. blkio.weight becomes the
>> default value for devices which are not configured by "blkio.weight_device"
>>
>> You can use the following format to assigned specific weight for a given
>> device:
>>
>> major:minor represents device number.
>>
>> And you can remove a specific weight as following:
>>
>> V1->V2 changes:
>> - use user interface "weight_device" instead of "policy" suggested by Vivek
>> - rename some struct suggested by Vivek
>> - rebase to 2.6-block "for-linus" branch
>> - remove an useless list_empty check pointed out by Li Zefan
>> - some trivial typo fix
>>
>> V2->V3 changes:
>> - Move policy_*_node() functions up to get rid of forward declarations
>> - rename related functions by adding prefix "blkio_"
>>
>
> Thanks for the changes Gui. Looks good to me.
>
> Acked-by: Vivek Goyal <vgo...@redhat.com>

Hi Jens,

This patchset seems to get lost. Just for reminding. Any comments?

Thanks,
Gui

Chad Talbott

unread,
Apr 7, 2010, 1:20:02 PM4/7/10
to
On Wed, Mar 24, 2010 at 11:28 PM, Gui Jianfeng
<guiji...@cn.fujitsu.com> wrote:
> Vivek Goyal wrote:
>> On Fri, Mar 05, 2010 at 10:25:58AM +0800, Gui Jianfeng wrote:
>>> V2->V3 changes:
>>> - Move policy_*_node() functions up to get rid of forward declarations
>>> - rename related functions by adding prefix "blkio_"
>>>
>>
>> Thanks for the changes Gui. Looks good to me.
>>
>> Acked-by: Vivek Goyal <vgo...@redhat.com>
>
> Hi Jens,
>
> This patchset seems to get lost. Just for reminding. Any comments?

Hi Jens,

Just want to add a quick note that we're excited about this
functionality and are hoping to see it go in.

Chad

Vivek Goyal

unread,
Apr 7, 2010, 1:30:02 PM4/7/10
to
On Wed, Apr 07, 2010 at 10:12:03AM -0700, Chad Talbott wrote:
> On Wed, Mar 24, 2010 at 11:28 PM, Gui Jianfeng
> <guiji...@cn.fujitsu.com> wrote:
> > Vivek Goyal wrote:
> >> On Fri, Mar 05, 2010 at 10:25:58AM +0800, Gui Jianfeng wrote:
> >>> V2->V3 changes:
> >>> - Move policy_*_node() functions up to get rid of forward declarations
> >>> - rename related functions by adding prefix "blkio_"
> >>>
> >>
> >> Thanks for the changes Gui. Looks good to me.
> >>
> >> Acked-by: Vivek Goyal <vgo...@redhat.com>
> >
> > Hi Jens,
> >
> > This patchset seems to get lost. Just for reminding. Any comments?
>
> Hi Jens,
>
> Just want to add a quick note that we're excited about this
> functionality and are hoping to see it go in.
>

I guess this patchset is now going to conflict with Divyesh's changes.

Gui, once blkio stat patches (cleaned up ones) are committed may be you
can rebase your changes on top of that. It will become easier for Jens.
(Until and unless he has some objections to this patchset).

Thanks
Vivek

Gui Jianfeng

unread,
Apr 7, 2010, 8:20:01 PM4/7/10
to
Vivek Goyal wrote:
> On Wed, Apr 07, 2010 at 10:12:03AM -0700, Chad Talbott wrote:
>> On Wed, Mar 24, 2010 at 11:28 PM, Gui Jianfeng
>> <guiji...@cn.fujitsu.com> wrote:
>>> Vivek Goyal wrote:
>>>> On Fri, Mar 05, 2010 at 10:25:58AM +0800, Gui Jianfeng wrote:
>>>>> V2->V3 changes:
>>>>> - Move policy_*_node() functions up to get rid of forward declarations
>>>>> - rename related functions by adding prefix "blkio_"
>>>>>
>>>> Thanks for the changes Gui. Looks good to me.
>>>>
>>>> Acked-by: Vivek Goyal <vgo...@redhat.com>
>>> Hi Jens,
>>>
>>> This patchset seems to get lost. Just for reminding. Any comments?
>> Hi Jens,
>>
>> Just want to add a quick note that we're excited about this
>> functionality and are hoping to see it go in.
>>
>
> I guess this patchset is now going to conflict with Divyesh's changes.
>
> Gui, once blkio stat patches (cleaned up ones) are committed may be you
> can rebase your changes on top of that. It will become easier for Jens.
> (Until and unless he has some objections to this patchset).

Sure, will do.

Thanks,
Gui

0 new messages