[PATCH-v3 1/4] idr: Percpu ida

2 views
Skip to first unread message

Nicholas A. Bellinger

unread,
Aug 16, 2013, 7:09:06 PM8/16/13
to target-devel, lf-virt, lkml, kvm-devel, Michael S. Tsirkin, Asias He, Kent Overstreet, Andrew Morton, Jens Axboe, Tejun Heo, Ingo Molnar, Andi Kleen, Christoph Lameter, Oleg Nesterov, Christoph Lameter, Nicholas A. Bellinger
From: Kent Overstreet <k...@daterainc.com>

Percpu frontend for allocating ids. With percpu allocation (that works),
it's impossible to guarantee it will always be possible to allocate all
nr_tags - typically, some will be stuck on a remote percpu freelist
where the current job can't get to them.

We do guarantee that it will always be possible to allocate at least
(nr_tags / 2) tags - this is done by keeping track of which and how many
cpus have tags on their percpu freelists. On allocation failure if
enough cpus have tags that there could potentially be (nr_tags / 2) tags
stuck on remote percpu freelists, we then pick a remote cpu at random to
steal from.

Note that there's no cpu hotplug notifier - we don't care, because
steal_tags() will eventually get the down cpu's tags. We _could_ satisfy
more allocations if we had a notifier - but we'll still meet our
guarantees and it's absolutely not a correctness issue, so I don't think
it's worth the extra code.

Signed-off-by: Kent Overstreet <kover...@google.com>
Cc: Tejun Heo <t...@kernel.org>
Cc: Oleg Nesterov <ol...@redhat.com>
Cc: Christoph Lameter <c...@linux-foundation.org>
Cc: Ingo Molnar <mi...@redhat.com>
Cc: Andi Kleen <an...@firstfloor.org>
Cc: Jens Axboe <ax...@kernel.dk>
Cc: "Nicholas A. Bellinger" <n...@linux-iscsi.org>
Signed-off-by: Nicholas Bellinger <n...@linux-iscsi.org>
---
include/linux/idr.h | 53 +++++++++
lib/idr.c | 316 +++++++++++++++++++++++++++++++++++++++++++++++++--
2 files changed, 361 insertions(+), 8 deletions(-)

diff --git a/include/linux/idr.h b/include/linux/idr.h
index 871a213..f0db12b 100644
--- a/include/linux/idr.h
+++ b/include/linux/idr.h
@@ -16,6 +16,8 @@
#include <linux/bitops.h>
#include <linux/init.h>
#include <linux/rcupdate.h>
+#include <linux/spinlock_types.h>
+#include <linux/wait.h>

/*
* We want shallower trees and thus more bits covered at each layer. 8
@@ -243,4 +245,55 @@ static inline int ida_get_new(struct ida *ida, int *p_id)

void __init idr_init_cache(void);

+/* Percpu IDA/tag allocator */
+
+struct percpu_ida_cpu;
+
+struct percpu_ida {
+ /*
+ * number of tags available to be allocated, as passed to
+ * percpu_ida_init()
+ */
+ unsigned nr_tags;
+
+ struct percpu_ida_cpu __percpu *tag_cpu;
+
+ /*
+ * Bitmap of cpus that (may) have tags on their percpu freelists:
+ * steal_tags() uses this to decide when to steal tags, and which cpus
+ * to try stealing from.
+ *
+ * It's ok for a freelist to be empty when its bit is set - steal_tags()
+ * will just keep looking - but the bitmap _must_ be set whenever a
+ * percpu freelist does have tags.
+ */
+ unsigned long *cpus_have_tags;
+
+ struct {
+ spinlock_t lock;
+ /*
+ * When we go to steal tags from another cpu (see steal_tags()),
+ * we want to pick a cpu at random. Cycling through them every
+ * time we steal is a bit easier and more or less equivalent:
+ */
+ unsigned cpu_last_stolen;
+
+ /* For sleeping on allocation failure */
+ wait_queue_head_t wait;
+
+ /*
+ * Global freelist - it's a stack where nr_free points to the
+ * top
+ */
+ unsigned nr_free;
+ unsigned *freelist;
+ } ____cacheline_aligned_in_smp;
+};
+
+int percpu_ida_alloc(struct percpu_ida *pool, gfp_t gfp);
+void percpu_ida_free(struct percpu_ida *pool, unsigned tag);
+
+void percpu_ida_destroy(struct percpu_ida *pool);
+int percpu_ida_init(struct percpu_ida *pool, unsigned long nr_tags);
+
#endif /* __IDR_H__ */
diff --git a/lib/idr.c b/lib/idr.c
index bfe4db4..57bfabe 100644
--- a/lib/idr.c
+++ b/lib/idr.c
@@ -26,17 +26,20 @@
* with the slab allocator.
*/

-#ifndef TEST // to test in user space...
-#include <linux/slab.h>
-#include <linux/init.h>
-#include <linux/export.h>
-#endif
+#include <linux/bitmap.h>
+#include <linux/bitops.h>
+#include <linux/bug.h>
#include <linux/err.h>
-#include <linux/string.h>
+#include <linux/export.h>
+#include <linux/hardirq.h>
#include <linux/idr.h>
-#include <linux/spinlock.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
#include <linux/percpu.h>
-#include <linux/hardirq.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/spinlock.h>

#define MAX_IDR_SHIFT (sizeof(int) * 8 - 1)
#define MAX_IDR_BIT (1U << MAX_IDR_SHIFT)
@@ -1159,3 +1162,300 @@ void ida_init(struct ida *ida)

}
EXPORT_SYMBOL(ida_init);
+
+/* Percpu IDA */
+
+/*
+ * Number of tags we move between the percpu freelist and the global freelist at
+ * a time
+ */
+#define IDA_PCPU_BATCH_MOVE 32U
+
+/* Max size of percpu freelist, */
+#define IDA_PCPU_SIZE ((IDA_PCPU_BATCH_MOVE * 3) / 2)
+
+struct percpu_ida_cpu {
+ spinlock_t lock;
+ unsigned nr_free;
+ unsigned freelist[];
+};
+
+static inline void move_tags(unsigned *dst, unsigned *dst_nr,
+ unsigned *src, unsigned *src_nr,
+ unsigned nr)
+{
+ *src_nr -= nr;
+ memcpy(dst + *dst_nr, src + *src_nr, sizeof(unsigned) * nr);
+ *dst_nr += nr;
+}
+
+/*
+ * Try to steal tags from a remote cpu's percpu freelist.
+ *
+ * We first check how many percpu freelists have tags - we don't steal tags
+ * unless enough percpu freelists have tags on them that it's possible more than
+ * half the total tags could be stuck on remote percpu freelists.
+ *
+ * Then we iterate through the cpus until we find some tags - we don't attempt
+ * to find the "best" cpu to steal from, to keep cacheline bouncing to a
+ * minimum.
+ */
+static inline void steal_tags(struct percpu_ida *pool,
+ struct percpu_ida_cpu *tags)
+{
+ unsigned cpus_have_tags, cpu = pool->cpu_last_stolen;
+ struct percpu_ida_cpu *remote;
+
+ for (cpus_have_tags = bitmap_weight(pool->cpus_have_tags, nr_cpu_ids);
+ cpus_have_tags * IDA_PCPU_SIZE > pool->nr_tags / 2;
+ cpus_have_tags--) {
+ cpu = find_next_bit(pool->cpus_have_tags, nr_cpu_ids, cpu);
+
+ if (cpu == nr_cpu_ids)
+ cpu = find_first_bit(pool->cpus_have_tags, nr_cpu_ids);
+
+ if (cpu == nr_cpu_ids)
+ BUG();
+
+ pool->cpu_last_stolen = cpu;
+ remote = per_cpu_ptr(pool->tag_cpu, cpu);
+
+ clear_bit(cpu, pool->cpus_have_tags);
+
+ if (remote == tags)
+ continue;
+
+ spin_lock(&remote->lock);
+
+ if (remote->nr_free) {
+ memcpy(tags->freelist,
+ remote->freelist,
+ sizeof(unsigned) * remote->nr_free);
+
+ tags->nr_free = remote->nr_free;
+ remote->nr_free = 0;
+ }
+
+ spin_unlock(&remote->lock);
+
+ if (tags->nr_free)
+ break;
+ }
+}
+
+static inline void alloc_global_tags(struct percpu_ida *pool,
+ struct percpu_ida_cpu *tags)
+{
+ move_tags(tags->freelist, &tags->nr_free,
+ pool->freelist, &pool->nr_free,
+ min(pool->nr_free, IDA_PCPU_BATCH_MOVE));
+}
+
+static inline unsigned alloc_local_tag(struct percpu_ida *pool,
+ struct percpu_ida_cpu *tags)
+{
+ int tag = -ENOSPC;
+
+ spin_lock(&tags->lock);
+ if (tags->nr_free)
+ tag = tags->freelist[--tags->nr_free];
+ spin_unlock(&tags->lock);
+
+ return tag;
+}
+
+/**
+ * percpu_ida_alloc - allocate a tag
+ * @pool: pool to allocate from
+ * @gfp: gfp flags
+ *
+ * Returns a tag - an integer in the range [0..nr_tags) (passed to
+ * tag_pool_init()), or otherwise -ENOSPC on allocation failure.
+ *
+ * Safe to be called from interrupt context (assuming it isn't passed
+ * __GFP_WAIT, of course).
+ *
+ * Will not fail if passed __GFP_WAIT.
+ */
+int percpu_ida_alloc(struct percpu_ida *pool, gfp_t gfp)
+{
+ DEFINE_WAIT(wait);
+ struct percpu_ida_cpu *tags;
+ unsigned long flags;
+ int tag;
+
+ local_irq_save(flags);
+ tags = this_cpu_ptr(pool->tag_cpu);
+
+ /* Fastpath */
+ tag = alloc_local_tag(pool, tags);
+ if (likely(tag >= 0)) {
+ local_irq_restore(flags);
+ return tag;
+ }
+
+ while (1) {
+ spin_lock(&pool->lock);
+
+ /*
+ * prepare_to_wait() must come before steal_tags(), in case
+ * percpu_ida_free() on another cpu flips a bit in
+ * cpus_have_tags
+ *
+ * global lock held and irqs disabled, don't need percpu lock
+ */
+ prepare_to_wait(&pool->wait, &wait, TASK_UNINTERRUPTIBLE);
+
+ if (!tags->nr_free)
+ alloc_global_tags(pool, tags);
+ if (!tags->nr_free)
+ steal_tags(pool, tags);
+
+ if (tags->nr_free) {
+ tag = tags->freelist[--tags->nr_free];
+ if (tags->nr_free)
+ set_bit(smp_processor_id(),
+ pool->cpus_have_tags);
+ }
+
+ spin_unlock(&pool->lock);
+ local_irq_restore(flags);
+
+ if (tag >= 0 || !(gfp & __GFP_WAIT))
+ break;
+
+ schedule();
+
+ local_irq_save(flags);
+ tags = this_cpu_ptr(pool->tag_cpu);
+ }
+
+ finish_wait(&pool->wait, &wait);
+ return tag;
+}
+EXPORT_SYMBOL_GPL(percpu_ida_alloc);
+
+/**
+ * percpu_ida_free - free a tag
+ * @pool: pool @tag was allocated from
+ * @tag: a tag previously allocated with percpu_ida_alloc()
+ *
+ * Safe to be called from interrupt context.
+ */
+void percpu_ida_free(struct percpu_ida *pool, unsigned tag)
+{
+ struct percpu_ida_cpu *tags;
+ unsigned long flags;
+ unsigned nr_free;
+
+ BUG_ON(tag >= pool->nr_tags);
+
+ local_irq_save(flags);
+ tags = this_cpu_ptr(pool->tag_cpu);
+
+ spin_lock(&tags->lock);
+ tags->freelist[tags->nr_free++] = tag;
+
+ nr_free = tags->nr_free;
+ spin_unlock(&tags->lock);
+
+ if (nr_free == 1) {
+ set_bit(smp_processor_id(),
+ pool->cpus_have_tags);
+ wake_up(&pool->wait);
+ }
+
+ if (nr_free == IDA_PCPU_SIZE) {
+ spin_lock(&pool->lock);
+
+ /*
+ * Global lock held and irqs disabled, don't need percpu
+ * lock
+ */
+ if (tags->nr_free == IDA_PCPU_SIZE) {
+ move_tags(pool->freelist, &pool->nr_free,
+ tags->freelist, &tags->nr_free,
+ IDA_PCPU_BATCH_MOVE);
+
+ wake_up(&pool->wait);
+ }
+ spin_unlock(&pool->lock);
+ }
+
+ local_irq_restore(flags);
+}
+EXPORT_SYMBOL_GPL(percpu_ida_free);
+
+/**
+ * percpu_ida_destroy - release a tag pool's resources
+ * @pool: pool to free
+ *
+ * Frees the resources allocated by percpu_ida_init().
+ */
+void percpu_ida_destroy(struct percpu_ida *pool)
+{
+ free_percpu(pool->tag_cpu);
+ kfree(pool->cpus_have_tags);
+ free_pages((unsigned long) pool->freelist,
+ get_order(pool->nr_tags * sizeof(unsigned)));
+}
+EXPORT_SYMBOL_GPL(percpu_ida_destroy);
+
+/**
+ * percpu_ida_init - initialize a percpu tag pool
+ * @pool: pool to initialize
+ * @nr_tags: number of tags that will be available for allocation
+ *
+ * Initializes @pool so that it can be used to allocate tags - integers in the
+ * range [0, nr_tags). Typically, they'll be used by driver code to refer to a
+ * preallocated array of tag structures.
+ *
+ * Allocation is percpu, but sharding is limited by nr_tags - for best
+ * performance, the workload should not span more cpus than nr_tags / 128.
+ */
+int percpu_ida_init(struct percpu_ida *pool, unsigned long nr_tags)
+{
+ unsigned i, cpu, order;
+
+ memset(pool, 0, sizeof(*pool));
+
+ init_waitqueue_head(&pool->wait);
+ spin_lock_init(&pool->lock);
+ pool->nr_tags = nr_tags;
+
+ /* Guard against overflow */
+ if (nr_tags > (unsigned) INT_MAX + 1) {
+ pr_err("tags.c: nr_tags too large\n");
+ return -EINVAL;
+ }
+
+ order = get_order(nr_tags * sizeof(unsigned));
+ pool->freelist = (void *) __get_free_pages(GFP_KERNEL, order);
+ if (!pool->freelist)
+ return -ENOMEM;
+
+ for (i = 0; i < nr_tags; i++)
+ pool->freelist[i] = i;
+
+ pool->nr_free = nr_tags;
+
+ pool->cpus_have_tags = kzalloc(BITS_TO_LONGS(nr_cpu_ids) *
+ sizeof(unsigned long), GFP_KERNEL);
+ if (!pool->cpus_have_tags)
+ goto err;
+
+ pool->tag_cpu = __alloc_percpu(sizeof(struct percpu_ida_cpu) +
+ IDA_PCPU_SIZE * sizeof(unsigned),
+ sizeof(unsigned));
+ if (!pool->tag_cpu)
+ goto err;
+
+ for_each_possible_cpu(cpu)
+ spin_lock_init(&per_cpu_ptr(pool->tag_cpu, cpu)->lock);
+
+ return 0;
+err:
+ percpu_ida_destroy(pool);
+ return -ENOMEM;
+}
+EXPORT_SYMBOL_GPL(percpu_ida_init);
--
1.7.10.4

Nicholas A. Bellinger

unread,
Aug 16, 2013, 7:09:07 PM8/16/13
to target-devel, lf-virt, lkml, kvm-devel, Michael S. Tsirkin, Asias He, Kent Overstreet, Andrew Morton, Jens Axboe, Tejun Heo, Ingo Molnar, Andi Kleen, Christoph Lameter, Nicholas Bellinger
From: Nicholas Bellinger <n...@linux-iscsi.org>

This patch adds lib/idr.c based transport_init_session_tags() logic
that allows fabric drivers to setup a per-cpu se_sess->sess_tag_pool
and associated se_sess->sess_cmd_map for basic tagged pre-allocation
of fabric descriptor sized memory.

v3: Update to percpu-ida usage

Cc: Kent Overstreet <k...@daterainc.com>
Cc: Asias He <as...@redhat.com>
Cc: Michael S. Tsirkin <m...@redhat.com>
Signed-off-by: Nicholas Bellinger <n...@linux-iscsi.org>
---
drivers/target/target_core_transport.c | 33 ++++++++++++++++++++++++++++++++
include/target/target_core_base.h | 5 +++++
include/target/target_core_fabric.h | 1 +
3 files changed, 39 insertions(+)

diff --git a/drivers/target/target_core_transport.c b/drivers/target/target_core_transport.c
index 7172d00..64ceb51 100644
--- a/drivers/target/target_core_transport.c
+++ b/drivers/target/target_core_transport.c
@@ -232,6 +232,35 @@ struct se_session *transport_init_session(void)
}
EXPORT_SYMBOL(transport_init_session);

+struct se_session *transport_init_session_tags(unsigned int tag_num,
+ unsigned int tag_size)
+{
+ struct se_session *se_sess;
+ int rc;
+
+ se_sess = transport_init_session();
+ if (IS_ERR(se_sess))
+ return se_sess;
+
+ se_sess->sess_cmd_map = kzalloc(tag_num * tag_size, GFP_KERNEL);
+ if (!se_sess->sess_cmd_map) {
+ pr_err("Unable to allocate se_sess->sess_cmd_map\n");
+ transport_free_session(se_sess);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ rc = percpu_ida_init(&se_sess->sess_tag_pool, tag_num);
+ if (rc < 0) {
+ pr_err("Unable to init se_sess->sess_tag_pool,"
+ " tag_num: %u\n", tag_num);
+ transport_free_session(se_sess);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ return se_sess;
+}
+EXPORT_SYMBOL(transport_init_session_tags);
+
/*
* Called with spin_lock_irqsave(&struct se_portal_group->session_lock called.
*/
@@ -367,6 +396,10 @@ EXPORT_SYMBOL(transport_deregister_session_configfs);

void transport_free_session(struct se_session *se_sess)
{
+ if (se_sess->sess_cmd_map) {
+ percpu_ida_destroy(&se_sess->sess_tag_pool);
+ kfree(se_sess->sess_cmd_map);
+ }
kmem_cache_free(se_sess_cache, se_sess);
}
EXPORT_SYMBOL(transport_free_session);
diff --git a/include/target/target_core_base.h b/include/target/target_core_base.h
index e34fc90..360e4a3 100644
--- a/include/target/target_core_base.h
+++ b/include/target/target_core_base.h
@@ -5,6 +5,7 @@
#include <linux/configfs.h>
#include <linux/dma-mapping.h>
#include <linux/blkdev.h>
+#include <linux/idr.h>
#include <scsi/scsi_cmnd.h>
#include <net/sock.h>
#include <net/tcp.h>
@@ -415,6 +416,8 @@ struct se_cmd {
enum dma_data_direction data_direction;
/* For SAM Task Attribute */
int sam_task_attr;
+ /* Used for se_sess->sess_tag_pool */
+ unsigned int map_tag;
/* Transport protocol dependent state, see transport_state_table */
enum transport_state_table t_state;
unsigned cmd_wait_set:1;
@@ -536,6 +539,8 @@ struct se_session {
struct list_head sess_wait_list;
spinlock_t sess_cmd_lock;
struct kref sess_kref;
+ void *sess_cmd_map;
+ struct percpu_ida sess_tag_pool;
};

struct se_device;
diff --git a/include/target/target_core_fabric.h b/include/target/target_core_fabric.h
index 7a16178..b13afb6 100644
--- a/include/target/target_core_fabric.h
+++ b/include/target/target_core_fabric.h
@@ -84,6 +84,7 @@ struct target_core_fabric_ops {
};

struct se_session *transport_init_session(void);
+struct se_session *transport_init_session_tags(unsigned int, unsigned int);
void __transport_register_session(struct se_portal_group *,
struct se_node_acl *, struct se_session *, void *);
void transport_register_session(struct se_portal_group *,
--
1.7.10.4

Nicholas A. Bellinger

unread,
Aug 16, 2013, 7:09:08 PM8/16/13
to target-devel, lf-virt, lkml, kvm-devel, Michael S. Tsirkin, Asias He, Kent Overstreet, Andrew Morton, Jens Axboe, Tejun Heo, Ingo Molnar, Andi Kleen, Christoph Lameter, Nicholas Bellinger
From: Nicholas Bellinger <n...@linux-iscsi.org>

This patch changes vhost/scsi to use transport_init_session_tags()
pre-allocation logic for per-cpu session tag pooling with internal
ida_alloc() + ida_free() calls based upon the saved se_cmd->map_tag id.

FIXME: Make transport_init_session_tags() number of tags setup
configurable per vring client setting via configfs

v3 changes:
- Update to percpu-ida usage
- Rebase to v3.11-rc5 code

Cc: Michael S. Tsirkin <m...@redhat.com>
Cc: Asias He <as...@redhat.com>
Cc: Kent Overstreet <k...@daterainc.com>
Signed-off-by: Nicholas Bellinger <n...@linux-iscsi.org>
---
drivers/vhost/scsi.c | 33 +++++++++++++++++++++------------
1 file changed, 21 insertions(+), 12 deletions(-)

diff --git a/drivers/vhost/scsi.c b/drivers/vhost/scsi.c
index 0c27c7d..af178b7 100644
--- a/drivers/vhost/scsi.c
+++ b/drivers/vhost/scsi.c
@@ -48,12 +48,14 @@
#include <linux/virtio_scsi.h>
#include <linux/llist.h>
#include <linux/bitmap.h>
+#include <linux/idr.h>

#include "vhost.h"

#define TCM_VHOST_VERSION "v0.1"
#define TCM_VHOST_NAMELEN 256
#define TCM_VHOST_MAX_CDB_SIZE 32
+#define TCM_VHOST_DEFAULT_TAGS 256

struct vhost_scsi_inflight {
/* Wait for the flush operation to finish */
@@ -450,6 +452,7 @@ static void tcm_vhost_release_cmd(struct se_cmd *se_cmd)
{
struct tcm_vhost_cmd *tv_cmd = container_of(se_cmd,
struct tcm_vhost_cmd, tvc_se_cmd);
+ struct se_session *se_sess = se_cmd->se_sess;

if (tv_cmd->tvc_sgl_count) {
u32 i;
@@ -460,7 +463,7 @@ static void tcm_vhost_release_cmd(struct se_cmd *se_cmd)
}

tcm_vhost_put_inflight(tv_cmd->inflight);
- kfree(tv_cmd);
+ percpu_ida_free(&se_sess->sess_tag_pool, se_cmd->map_tag);
}

static int tcm_vhost_shutdown_session(struct se_session *se_sess)
@@ -704,7 +707,7 @@ static void vhost_scsi_complete_cmd_work(struct vhost_work *work)
}

static struct tcm_vhost_cmd *
-vhost_scsi_allocate_cmd(struct vhost_virtqueue *vq,
+vhost_scsi_get_tag(struct vhost_virtqueue *vq,
struct tcm_vhost_tpg *tpg,
struct virtio_scsi_cmd_req *v_req,
u32 exp_data_len,
@@ -712,18 +715,21 @@ vhost_scsi_allocate_cmd(struct vhost_virtqueue *vq,
{
struct tcm_vhost_cmd *cmd;
struct tcm_vhost_nexus *tv_nexus;
+ struct se_session *se_sess;
+ int tag;

tv_nexus = tpg->tpg_nexus;
if (!tv_nexus) {
pr_err("Unable to locate active struct tcm_vhost_nexus\n");
return ERR_PTR(-EIO);
}
+ se_sess = tv_nexus->tvn_se_sess;

- cmd = kzalloc(sizeof(struct tcm_vhost_cmd), GFP_ATOMIC);
- if (!cmd) {
- pr_err("Unable to allocate struct tcm_vhost_cmd\n");
- return ERR_PTR(-ENOMEM);
- }
+ tag = percpu_ida_alloc(&se_sess->sess_tag_pool, GFP_KERNEL);
+ cmd = &((struct tcm_vhost_cmd *)se_sess->sess_cmd_map)[tag];
+ memset(cmd, 0, sizeof(struct tcm_vhost_cmd));
+
+ cmd->tvc_se_cmd.map_tag = tag;
cmd->tvc_tag = v_req->tag;
cmd->tvc_task_attr = v_req->task_attr;
cmd->tvc_exp_data_len = exp_data_len;
@@ -989,10 +995,10 @@ vhost_scsi_handle_vq(struct vhost_scsi *vs, struct vhost_virtqueue *vq)
for (i = 0; i < data_num; i++)
exp_data_len += vq->iov[data_first + i].iov_len;

- cmd = vhost_scsi_allocate_cmd(vq, tpg, &v_req,
- exp_data_len, data_direction);
+ cmd = vhost_scsi_get_tag(vq, tpg, &v_req,
+ exp_data_len, data_direction);
if (IS_ERR(cmd)) {
- vq_err(vq, "vhost_scsi_allocate_cmd failed %ld\n",
+ vq_err(vq, "vhost_scsi_get_tag failed %ld\n",
PTR_ERR(cmd));
goto err_cmd;
}
@@ -1675,9 +1681,12 @@ static int tcm_vhost_make_nexus(struct tcm_vhost_tpg *tpg,
return -ENOMEM;
}
/*
- * Initialize the struct se_session pointer
+ * Initialize the struct se_session pointer and setup tagpool
+ * for struct tcm_vhost_cmd descriptors
*/
- tv_nexus->tvn_se_sess = transport_init_session();
+ tv_nexus->tvn_se_sess = transport_init_session_tags(
+ TCM_VHOST_DEFAULT_TAGS,
+ sizeof(struct tcm_vhost_cmd));
if (IS_ERR(tv_nexus->tvn_se_sess)) {
mutex_unlock(&tpg->tv_tpg_mutex);
kfree(tv_nexus);
--
1.7.10.4

Nicholas A. Bellinger

unread,
Aug 16, 2013, 7:09:09 PM8/16/13
to target-devel, lf-virt, lkml, kvm-devel, Michael S. Tsirkin, Asias He, Kent Overstreet, Andrew Morton, Jens Axboe, Tejun Heo, Ingo Molnar, Andi Kleen, Christoph Lameter, Nicholas Bellinger
From: Nicholas Bellinger <n...@linux-iscsi.org>

This patch adds support for pre-allocation of per tv_cmd descriptor
scatterlist + user-space page pointer memory using se_sess->sess_cmd_map
within tcm_vhost_make_nexus() code.

This includes sanity checks within vhost_scsi_map_to_sgl()
to reject I/O that exceeds these initial hardcoded values, and
the necessary cleanup in tcm_vhost_make_nexus() failure path +
tcm_vhost_drop_nexus().

v3 changes:
- Rebase to v3.11-rc5 code

Cc: Michael S. Tsirkin <m...@redhat.com>
Cc: Asias He <as...@redhat.com>
Cc: Kent Overstreet <k...@daterainc.com>
Signed-off-by: Nicholas Bellinger <n...@linux-iscsi.org>
---
drivers/vhost/scsi.c | 99 ++++++++++++++++++++++++++++++++++++++++----------
1 file changed, 80 insertions(+), 19 deletions(-)

diff --git a/drivers/vhost/scsi.c b/drivers/vhost/scsi.c
index af178b7..8e8788e 100644
--- a/drivers/vhost/scsi.c
+++ b/drivers/vhost/scsi.c
@@ -56,6 +56,8 @@
#define TCM_VHOST_NAMELEN 256
#define TCM_VHOST_MAX_CDB_SIZE 32
#define TCM_VHOST_DEFAULT_TAGS 256
+#define TCM_VHOST_PREALLOC_SGLS 2048
+#define TCM_VHOST_PREALLOC_PAGES 2048

struct vhost_scsi_inflight {
/* Wait for the flush operation to finish */
@@ -81,6 +83,7 @@ struct tcm_vhost_cmd {
u32 tvc_lun;
/* Pointer to the SGL formatted memory from virtio-scsi */
struct scatterlist *tvc_sgl;
+ struct page **tvc_upages;
/* Pointer to response */
struct virtio_scsi_cmd_resp __user *tvc_resp;
/* Pointer to vhost_scsi for our device */
@@ -458,8 +461,6 @@ static void tcm_vhost_release_cmd(struct se_cmd *se_cmd)
u32 i;
for (i = 0; i < tv_cmd->tvc_sgl_count; i++)
put_page(sg_page(&tv_cmd->tvc_sgl[i]));
-
- kfree(tv_cmd->tvc_sgl);
}

tcm_vhost_put_inflight(tv_cmd->inflight);
@@ -716,6 +717,8 @@ vhost_scsi_get_tag(struct vhost_virtqueue *vq,
struct tcm_vhost_cmd *cmd;
struct tcm_vhost_nexus *tv_nexus;
struct se_session *se_sess;
+ struct scatterlist *sg;
+ struct page **pages;
int tag;

tv_nexus = tpg->tpg_nexus;
@@ -727,8 +730,12 @@ vhost_scsi_get_tag(struct vhost_virtqueue *vq,

tag = percpu_ida_alloc(&se_sess->sess_tag_pool, GFP_KERNEL);
cmd = &((struct tcm_vhost_cmd *)se_sess->sess_cmd_map)[tag];
+ sg = cmd->tvc_sgl;
+ pages = cmd->tvc_upages;
memset(cmd, 0, sizeof(struct tcm_vhost_cmd));

+ cmd->tvc_sgl = sg;
+ cmd->tvc_upages = pages;
cmd->tvc_se_cmd.map_tag = tag;
cmd->tvc_tag = v_req->tag;
cmd->tvc_task_attr = v_req->task_attr;
@@ -746,7 +753,8 @@ vhost_scsi_get_tag(struct vhost_virtqueue *vq,
* Returns the number of scatterlist entries used or -errno on error.
*/
static int
-vhost_scsi_map_to_sgl(struct scatterlist *sgl,
+vhost_scsi_map_to_sgl(struct tcm_vhost_cmd *tv_cmd,
+ struct scatterlist *sgl,
unsigned int sgl_count,
struct iovec *iov,
int write)
@@ -758,13 +766,25 @@ vhost_scsi_map_to_sgl(struct scatterlist *sgl,
struct page **pages;
int ret, i;

+ if (sgl_count > TCM_VHOST_PREALLOC_SGLS) {
+ pr_err("vhost_scsi_map_to_sgl() psgl_count: %u greater than"
+ " preallocated TCM_VHOST_PREALLOC_SGLS: %u\n",
+ sgl_count, TCM_VHOST_PREALLOC_SGLS);
+ return -ENOBUFS;
+ }
+
pages_nr = iov_num_pages(iov);
if (pages_nr > sgl_count)
return -ENOBUFS;

- pages = kmalloc(pages_nr * sizeof(struct page *), GFP_KERNEL);
- if (!pages)
- return -ENOMEM;
+ if (pages_nr > TCM_VHOST_PREALLOC_PAGES) {
+ pr_err("vhost_scsi_map_to_sgl() pages_nr: %u greater than"
+ " preallocated TCM_VHOST_PREALLOC_PAGES: %u\n",
+ pages_nr, TCM_VHOST_PREALLOC_PAGES);
+ return -ENOBUFS;
+ }
+
+ pages = tv_cmd->tvc_upages;

ret = get_user_pages_fast((unsigned long)ptr, pages_nr, write, pages);
/* No pages were pinned */
@@ -789,7 +809,6 @@ vhost_scsi_map_to_sgl(struct scatterlist *sgl,
}

out:
- kfree(pages);
return ret;
}

@@ -813,24 +832,20 @@ vhost_scsi_map_iov_to_sgl(struct tcm_vhost_cmd *cmd,

/* TODO overflow checking */

- sg = kmalloc(sizeof(cmd->tvc_sgl[0]) * sgl_count, GFP_ATOMIC);
- if (!sg)
- return -ENOMEM;
- pr_debug("%s sg %p sgl_count %u is_err %d\n", __func__,
- sg, sgl_count, !sg);
+ sg = cmd->tvc_sgl;
+ pr_debug("%s sg %p sgl_count %u\n", __func__, sg, sgl_count);
sg_init_table(sg, sgl_count);

- cmd->tvc_sgl = sg;
cmd->tvc_sgl_count = sgl_count;

pr_debug("Mapping %u iovecs for %u pages\n", niov, sgl_count);
for (i = 0; i < niov; i++) {
- ret = vhost_scsi_map_to_sgl(sg, sgl_count, &iov[i], write);
+ ret = vhost_scsi_map_to_sgl(cmd, sg, sgl_count, &iov[i],
+ write);
if (ret < 0) {
for (i = 0; i < cmd->tvc_sgl_count; i++)
put_page(sg_page(&cmd->tvc_sgl[i]));
- kfree(cmd->tvc_sgl);
- cmd->tvc_sgl = NULL;
+
cmd->tvc_sgl_count = 0;
return ret;
}
@@ -1660,11 +1675,31 @@ static void tcm_vhost_drop_nodeacl(struct se_node_acl *se_acl)
kfree(nacl);
}

+static void tcm_vhost_free_cmd_map_res(struct tcm_vhost_nexus *nexus,
+ struct se_session *se_sess)
+{
+ struct tcm_vhost_cmd *tv_cmd;
+ unsigned int i;
+
+ if (!se_sess->sess_cmd_map)
+ return;
+
+ for (i = 0; i < TCM_VHOST_DEFAULT_TAGS; i++) {
+ tv_cmd = &((struct tcm_vhost_cmd *)se_sess->sess_cmd_map)[i];
+
+ kfree(tv_cmd->tvc_sgl);
+ kfree(tv_cmd->tvc_upages);
+ }
+}
+
static int tcm_vhost_make_nexus(struct tcm_vhost_tpg *tpg,
const char *name)
{
struct se_portal_group *se_tpg;
+ struct se_session *se_sess;
struct tcm_vhost_nexus *tv_nexus;
+ struct tcm_vhost_cmd *tv_cmd;
+ unsigned int i;

mutex_lock(&tpg->tv_tpg_mutex);
if (tpg->tpg_nexus) {
@@ -1692,6 +1727,26 @@ static int tcm_vhost_make_nexus(struct tcm_vhost_tpg *tpg,
kfree(tv_nexus);
return -ENOMEM;
}
+ se_sess = tv_nexus->tvn_se_sess;
+ for (i = 0; i < TCM_VHOST_DEFAULT_TAGS; i++) {
+ tv_cmd = &((struct tcm_vhost_cmd *)se_sess->sess_cmd_map)[i];
+
+ tv_cmd->tvc_sgl = kzalloc(sizeof(struct scatterlist) *
+ TCM_VHOST_PREALLOC_SGLS, GFP_KERNEL);
+ if (!tv_cmd->tvc_sgl) {
+ mutex_unlock(&tpg->tv_tpg_mutex);
+ pr_err("Unable to allocate tv_cmd->tvc_sgl\n");
+ goto out;
+ }
+
+ tv_cmd->tvc_upages = kzalloc(sizeof(struct page *) *
+ TCM_VHOST_PREALLOC_PAGES, GFP_KERNEL);
+ if (!tv_cmd->tvc_upages) {
+ mutex_unlock(&tpg->tv_tpg_mutex);
+ pr_err("Unable to allocate tv_cmd->tvc_upages\n");
+ goto out;
+ }
+ }
/*
* Since we are running in 'demo mode' this call with generate a
* struct se_node_acl for the tcm_vhost struct se_portal_group with
@@ -1703,9 +1758,7 @@ static int tcm_vhost_make_nexus(struct tcm_vhost_tpg *tpg,
mutex_unlock(&tpg->tv_tpg_mutex);
pr_debug("core_tpg_check_initiator_node_acl() failed"
" for %s\n", name);
- transport_free_session(tv_nexus->tvn_se_sess);
- kfree(tv_nexus);
- return -ENOMEM;
+ goto out;
}
/*
* Now register the TCM vhost virtual I_T Nexus as active with the
@@ -1717,6 +1770,12 @@ static int tcm_vhost_make_nexus(struct tcm_vhost_tpg *tpg,

mutex_unlock(&tpg->tv_tpg_mutex);
return 0;
+
+out:
+ tcm_vhost_free_cmd_map_res(tv_nexus, se_sess);
+ transport_free_session(se_sess);
+ kfree(tv_nexus);
+ return -ENOMEM;
}

static int tcm_vhost_drop_nexus(struct tcm_vhost_tpg *tpg)
@@ -1756,6 +1815,8 @@ static int tcm_vhost_drop_nexus(struct tcm_vhost_tpg *tpg)
pr_debug("TCM_vhost_ConfigFS: Removing I_T Nexus to emulated"
" %s Initiator Port: %s\n", tcm_vhost_dump_proto_id(tpg->tport),
tv_nexus->tvn_se_sess->se_node_acl->initiatorname);
+
+ tcm_vhost_free_cmd_map_res(tv_nexus, se_sess);
/*
* Release the SCSI I_T Nexus to the emulated vhost Target Port
*/
--
1.7.10.4

Reply all
Reply to author
Forward
0 new messages