Google Groups no longer supports new Usenet posts or subscriptions. Historical content remains viewable.
Dismiss

[RFC] iovec in ->aio_read/->aio_write

3 views
Skip to first unread message

Christoph Hellwig

unread,
Oct 15, 2002, 3:50:00 PM10/15/02
to
I've recently looked into implementing the aio read and write
methods for XFS. Although all of read/write readv/writev
and aio_read/aio_write end up calling the exactly same code
in filemap.c (for the generic filesystem I/O code). Filesystems
like XFS that need additional code before calling the generic
functionality have to duplicated it though.

I don't think it makes sense to keep all those interfaces. As
the read/write entry points are used by most drivers I suggest
starting with the other two, that have far less users. The
patch below (compiled but not booted!) changes the aio_read/
aio_write interface to take the same array of iovec like
readv/write and updates all users. Note that we don't support
vectored I/O for the aio interface yet, but it seems like a
logical addition to me.

Proposed next steps: Convert over all readv/writev users
to aio_read/aio_write and remove the methods. Implement
aio_read/aio_write in all filesystems using the generic
pagecache code and kill the "normal" generic_file_read
and generic_file_write.

Comments?

--- 1.22/fs/aio.c Sun Oct 13 17:39:40 2002
+++ edited/fs/aio.c Tue Oct 15 20:45:37 2002
@@ -987,6 +987,28 @@
return -EINVAL;
}

+ssize_t aio_read_single(struct kiocb *iocb, char *buf,
+ size_t count, loff_t pos)
+{
+ struct file *file = iocb->ki_filp;
+
+ iocb->ki_single.iov_base = buf;
+ iocb->ki_single.iov_len = count;
+
+ return file->f_op->aio_read(iocb, &iocb->ki_single, 1, iocb->ki_pos);
+}
+
+ssize_t aio_write_single(struct kiocb *iocb, const char *buf,
+ size_t count, loff_t pos)
+{
+ struct file *file = iocb->ki_filp;
+
+ iocb->ki_single.iov_base = (void *)buf;
+ iocb->ki_single.iov_len = count;
+
+ return file->f_op->aio_write(iocb, &iocb->ki_single, 1, iocb->ki_pos);
+}
+
static int FASTCALL(io_submit_one(struct kioctx *ctx, struct iocb *user_iocb,
struct iocb *iocb));
static int io_submit_one(struct kioctx *ctx, struct iocb *user_iocb,
@@ -1048,7 +1070,7 @@
goto out_put_req;
ret = -EINVAL;
if (file->f_op->aio_read)
- ret = file->f_op->aio_read(req, buf,
+ ret = aio_read_single(req, buf,
iocb->aio_nbytes, req->ki_pos);
break;
case IOCB_CMD_PWRITE:
@@ -1060,7 +1082,7 @@
goto out_put_req;
ret = -EINVAL;
if (file->f_op->aio_write)
- ret = file->f_op->aio_write(req, buf,
+ ret = aio_write_single(req, buf,
iocb->aio_nbytes, req->ki_pos);
break;
case IOCB_CMD_FDSYNC:
--- 1.19/fs/read_write.c Thu Oct 10 23:36:26 2002
+++ edited/fs/read_write.c Tue Oct 15 20:44:24 2002
@@ -184,7 +184,7 @@

init_sync_kiocb(&kiocb, filp);
kiocb.ki_pos = *ppos;
- ret = filp->f_op->aio_read(&kiocb, buf, len, kiocb.ki_pos);
+ ret = aio_read_single(&kiocb, buf, len, kiocb.ki_pos);
if (-EIOCBQUEUED == ret)
ret = wait_on_sync_kiocb(&kiocb);
*ppos = kiocb.ki_pos;
@@ -224,7 +224,7 @@

init_sync_kiocb(&kiocb, filp);
kiocb.ki_pos = *ppos;
- ret = filp->f_op->aio_write(&kiocb, buf, len, kiocb.ki_pos);
+ ret = aio_write_single(&kiocb, buf, len, kiocb.ki_pos);
if (-EIOCBQUEUED == ret)
ret = wait_on_sync_kiocb(&kiocb);
*ppos = kiocb.ki_pos;
@@ -340,6 +340,37 @@
}
return seg;
}
+
+ssize_t do_sync_writev(struct file *filp, const struct iovec *iov,
+ unsigned long nr_segs, loff_t *ppos)
+{
+ struct kiocb kiocb;
+ ssize_t ret;
+
+ init_sync_kiocb(&kiocb, filp);
+ kiocb.ki_pos = *ppos;
+ ret = filp->f_op->aio_write(&kiocb, iov, nr_segs, kiocb.ki_pos);
+ if (-EIOCBQUEUED == ret)
+ ret = wait_on_sync_kiocb(&kiocb);
+ *ppos = kiocb.ki_pos;
+ return ret;
+}
+
+ssize_t do_sync_readv(struct file *filp, const struct iovec *iov,
+ unsigned long nr_segs, loff_t *ppos)
+{
+ struct kiocb kiocb;
+ ssize_t ret;
+
+ init_sync_kiocb(&kiocb, filp);
+ kiocb.ki_pos = *ppos;
+ ret = filp->f_op->aio_write(&kiocb, iov, nr_segs, kiocb.ki_pos);
+ if (-EIOCBQUEUED == ret)
+ ret = wait_on_sync_kiocb(&kiocb);
+ *ppos = kiocb.ki_pos;
+ return ret;
+}
+

static ssize_t do_readv_writev(int type, struct file *file,
const struct iovec * vector,
--- 1.9/fs/ext3/file.c Wed Oct 9 20:32:29 2002
+++ edited/fs/ext3/file.c Tue Oct 15 20:46:54 2002
@@ -61,7 +61,8 @@
*/

static ssize_t
-ext3_file_write(struct kiocb *iocb, const char *buf, size_t count, loff_t pos)
+ext3_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
+ unsigned long nr_segs, loff_t pos)
{
struct file *file = iocb->ki_filp;
struct inode *inode = file->f_dentry->d_inode;
@@ -76,15 +77,15 @@
if (IS_SYNC(inode) || (file->f_flags & O_SYNC))
mark_inode_dirty(inode);

- return generic_file_aio_write(iocb, buf, count, pos);
+ return generic_file_aio_write(iocb, iov, nr_segs, pos);
}

struct file_operations ext3_file_operations = {
.llseek = generic_file_llseek,
.read = do_sync_read,
.write = do_sync_write,
- .aio_read = generic_file_aio_read,
- .aio_write = ext3_file_write,
+ .aio_read = generic_file_aio_read,
+ .aio_write = ext3_file_aio_write,
.readv = generic_file_readv,
.writev = generic_file_writev,
.ioctl = ext3_ioctl,
--- 1.21/fs/nfs/file.c Tue Oct 8 23:37:02 2002
+++ edited/fs/nfs/file.c Tue Oct 15 20:55:56 2002
@@ -35,8 +35,10 @@
#define NFSDBG_FACILITY NFSDBG_FILE

static int nfs_file_mmap(struct file *, struct vm_area_struct *);
-static ssize_t nfs_file_read(struct kiocb *, char *, size_t, loff_t);
-static ssize_t nfs_file_write(struct kiocb *, const char *, size_t, loff_t);
+static ssize_t nfs_file_aio_read(struct kiocb *, const struct iovec *,
+ unsigned long, loff_t);
+static ssize_t nfs_file_aio_write(struct kiocb *, const struct iovec *,
+ unsigned long, loff_t);
static int nfs_file_flush(struct file *);
static int nfs_fsync(struct file *, struct dentry *dentry, int datasync);

@@ -44,8 +46,8 @@
.llseek = remote_llseek,
.read = do_sync_read,
.write = do_sync_write,
- .aio_read = nfs_file_read,
- .aio_write = nfs_file_write,
+ .aio_read = nfs_file_aio_read,
+ .aio_write = nfs_file_aio_write,
.mmap = nfs_file_mmap,
.open = nfs_open,
.flush = nfs_file_flush,
@@ -91,19 +93,20 @@
}

static ssize_t
-nfs_file_read(struct kiocb *iocb, char * buf, size_t count, loff_t pos)
+nfs_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
+ unsigned long nr_segs, loff_t pos)
{
struct dentry * dentry = iocb->ki_filp->f_dentry;
struct inode * inode = dentry->d_inode;
ssize_t result;

- dfprintk(VFS, "nfs: read(%s/%s, %lu@%lu)\n",
+ dfprintk(VFS, "nfs: read(%s/%s, %lu)\n",
dentry->d_parent->d_name.name, dentry->d_name.name,
- (unsigned long) count, (unsigned long) pos);
+ (unsigned long)pos);

result = nfs_revalidate_inode(NFS_SERVER(inode), inode);
if (!result)
- result = generic_file_aio_read(iocb, buf, count, pos);
+ result = generic_file_aio_read(iocb, iov, nr_segs, pos);
return result;
}

@@ -211,15 +214,16 @@
* Write to a file (through the page cache).
*/
static ssize_t
-nfs_file_write(struct kiocb *iocb, const char *buf, size_t count, loff_t pos)
+nfs_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
+ unsigned long nr_segs, loff_t pos)
{
struct dentry * dentry = iocb->ki_filp->f_dentry;
struct inode * inode = dentry->d_inode;
ssize_t result;

- dfprintk(VFS, "nfs: write(%s/%s(%ld), %lu@%lu)\n",
+ dfprintk(VFS, "nfs: write(%s/%s(%ld), %lu)\n",
dentry->d_parent->d_name.name, dentry->d_name.name,
- inode->i_ino, (unsigned long) count, (unsigned long) pos);
+ inode->i_ino, (unsigned long) pos);

result = -EBUSY;
if (IS_SWAPFILE(inode))
@@ -228,11 +232,7 @@
if (result)
goto out;

- result = count;
- if (!count)
- goto out;
-
- result = generic_file_aio_write(iocb, buf, count, pos);
+ result = generic_file_aio_write(iocb, iov, nr_segs, pos);
out:
return result;

--- 1.6/include/linux/aio.h Thu Oct 3 22:19:27 2002
+++ edited/include/linux/aio.h Tue Oct 15 20:38:00 2002
@@ -2,6 +2,7 @@
#define __LINUX__AIO_H

#include <linux/list.h>
+#include <linux/uio.h>
#include <linux/workqueue.h>
#include <linux/aio_abi.h>

@@ -60,6 +61,7 @@
* for cancellation */

void *ki_user_obj; /* pointer to userland's iocb */
+ struct iovec ki_single; /* iovec for non-vectored I/O */
__u64 ki_user_data; /* user's data for completion */
loff_t ki_pos;

@@ -145,6 +147,10 @@
extern void FASTCALL(kick_iocb(struct kiocb *iocb));
extern int FASTCALL(aio_complete(struct kiocb *iocb, long res, long res2));
extern void FASTCALL(__put_ioctx(struct kioctx *ctx));
+extern ssize_t aio_read_single(struct kiocb *iocb, char *buf,
+ size_t count, loff_t pos);
+extern ssize_t aio_write_single(struct kiocb *iocb, const char *buf,
+ size_t count, loff_t pos);
struct mm_struct;
extern void FASTCALL(exit_aio(struct mm_struct *mm));

--- 1.170/include/linux/fs.h Fri Oct 11 10:49:46 2002
+++ edited/include/linux/fs.h Tue Oct 15 20:37:27 2002
@@ -744,9 +744,11 @@
struct module *owner;
loff_t (*llseek) (struct file *, loff_t, int);
ssize_t (*read) (struct file *, char *, size_t, loff_t *);
- ssize_t (*aio_read) (struct kiocb *, char *, size_t, loff_t);
+ ssize_t (*aio_read) (struct kiocb *, const struct iovec *,
+ unsigned long, loff_t);
ssize_t (*write) (struct file *, const char *, size_t, loff_t *);
- ssize_t (*aio_write) (struct kiocb *, const char *, size_t, loff_t);
+ ssize_t (*aio_write) (struct kiocb *, const struct iovec *,
+ unsigned long, loff_t);
int (*readdir) (struct file *, void *, filldir_t);
unsigned int (*poll) (struct file *, struct poll_table_struct *);
int (*ioctl) (struct inode *, struct file *, unsigned int, unsigned long);
@@ -1242,8 +1244,10 @@
extern int file_read_actor(read_descriptor_t * desc, struct page *page, unsigned long offset, unsigned long size);
extern ssize_t generic_file_read(struct file *, char *, size_t, loff_t *);
extern ssize_t generic_file_write(struct file *, const char *, size_t, loff_t *);
-extern ssize_t generic_file_aio_read(struct kiocb *, char *, size_t, loff_t);
-extern ssize_t generic_file_aio_write(struct kiocb *, const char *, size_t, loff_t);
+extern ssize_t generic_file_aio_read(struct kiocb *, const struct iovec *,
+ unsigned long, loff_t);
+extern ssize_t generic_file_aio_write(struct kiocb *, const struct iovec *,
+ unsigned long, loff_t);
extern ssize_t do_sync_read(struct file *filp, char *buf, size_t len, loff_t *ppos);
extern ssize_t do_sync_write(struct file *filp, const char *buf, size_t len, loff_t *ppos);
ssize_t generic_file_write_nolock(struct file *file, const struct iovec *iov,
--- 1.23/include/net/sock.h Fri Oct 11 01:14:45 2002
+++ edited/include/net/sock.h Tue Oct 15 20:06:13 2002
@@ -303,7 +303,6 @@
struct socket *sock;
struct sock *sk;
struct msghdr *msg, async_msg;
- struct iovec async_iov;
struct scm_cookie *scm, async_scm;
};

--- 1.147/mm/filemap.c Sun Oct 13 17:39:40 2002
+++ edited/mm/filemap.c Tue Oct 15 20:41:07 2002
@@ -884,12 +884,11 @@
}

ssize_t
-generic_file_aio_read(struct kiocb *iocb, char *buf, size_t count, loff_t pos)
+generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
+ unsigned long nr_segs, loff_t pos)
{
- struct iovec local_iov = { .iov_base = buf, .iov_len = count };
-
BUG_ON(iocb->ki_pos != pos);
- return __generic_file_aio_read(iocb, &local_iov, 1, &iocb->ki_pos);
+ return __generic_file_aio_read(iocb, iov, nr_segs, &iocb->ki_pos);
}

ssize_t
@@ -1645,10 +1644,10 @@
return err;
}

-ssize_t generic_file_aio_write(struct kiocb *iocb, const char *buf,
- size_t count, loff_t pos)
+ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
+ unsigned long nr_segs, loff_t pos)
{
- return generic_file_write(iocb->ki_filp, buf, count, &iocb->ki_pos);
+ return generic_file_writev(iocb->ki_filp, iov, nr_segs, &iocb->ki_pos);
}

ssize_t generic_file_write(struct file *file, const char *buf,
--- 1.30/net/socket.c Sat Oct 12 09:37:17 2002
+++ edited/net/socket.c Tue Oct 15 21:03:19 2002
@@ -90,10 +90,10 @@
#include <linux/netfilter.h>

static int sock_no_open(struct inode *irrelevant, struct file *dontcare);
-static ssize_t sock_aio_read(struct kiocb *iocb, char *buf,
- size_t size, loff_t pos);
-static ssize_t sock_aio_write(struct kiocb *iocb, const char *buf,
- size_t size, loff_t pos);
+static ssize_t sock_aio_read(struct kiocb *iocb, const struct iovec *iov,
+ unsigned long nr_segs, loff_t pos);
+static ssize_t sock_aio_write(struct kiocb *iocb, const struct iovec *iov,
+ unsigned long nr_segs, loff_t pos);
static int sock_mmap(struct file *file, struct vm_area_struct * vma);

static int sock_close(struct inode *inode, struct file *file);
@@ -586,31 +586,31 @@
* area ubuf...ubuf+size-1 is writable before asking the protocol.
*/

-static ssize_t sock_aio_read(struct kiocb *iocb, char *ubuf,
- size_t size, loff_t pos)
+static ssize_t sock_aio_read(struct kiocb *iocb, const struct iovec *iov,
+ unsigned long nr_segs, loff_t pos)
{
struct sock_iocb *x = kiocb_to_siocb(iocb);
struct socket *sock;
int flags;
+ size_t tot_len = 0;
+ int i;

if (pos != 0)
return -ESPIPE;
- if (size==0) /* Match SYS5 behaviour */
- return 0;

sock = SOCKET_I(iocb->ki_filp->f_dentry->d_inode);

x->async_msg.msg_name = NULL;
x->async_msg.msg_namelen = 0;
- x->async_msg.msg_iov = &x->async_iov;
- x->async_msg.msg_iovlen = 1;
+ x->async_msg.msg_iov = (struct iovec *)iov;
+ x->async_msg.msg_iovlen = nr_segs;
x->async_msg.msg_control = NULL;
x->async_msg.msg_controllen = 0;
- x->async_iov.iov_base = ubuf;
- x->async_iov.iov_len = size;
flags = !(iocb->ki_filp->f_flags & O_NONBLOCK) ? 0 : MSG_DONTWAIT;

- return __sock_recvmsg(iocb, sock, &x->async_msg, size, flags);
+ for (i = 0 ; i < nr_segs ; i++)
+ tot_len += iov[i].iov_len;
+ return __sock_recvmsg(iocb, sock, &x->async_msg, flags, tot_len);
}


@@ -619,32 +619,32 @@
* is readable by the user process.
*/

-static ssize_t sock_aio_write(struct kiocb *iocb, const char *ubuf,
- size_t size, loff_t pos)
+static ssize_t sock_aio_write(struct kiocb *iocb, const struct iovec *iov,
+ unsigned long nr_segs, loff_t pos)
{
struct sock_iocb *x = kiocb_to_siocb(iocb);
struct socket *sock;
+ size_t tot_len = 0;
+ int i;

if (pos != 0)
return -ESPIPE;
- if(size==0) /* Match SYS5 behaviour */
- return 0;

sock = SOCKET_I(iocb->ki_filp->f_dentry->d_inode);

x->async_msg.msg_name = NULL;
x->async_msg.msg_namelen = 0;
- x->async_msg.msg_iov = &x->async_iov;
- x->async_msg.msg_iovlen = 1;
+ x->async_msg.msg_iov = (struct iovec *)iov;
+ x->async_msg.msg_iovlen = nr_segs;
x->async_msg.msg_control = NULL;
x->async_msg.msg_controllen = 0;
x->async_msg.msg_flags = !(iocb->ki_filp->f_flags & O_NONBLOCK) ? 0 : MSG_DONTWAIT;
if (sock->type == SOCK_SEQPACKET)
x->async_msg.msg_flags |= MSG_EOR;
- x->async_iov.iov_base = (void *)ubuf;
- x->async_iov.iov_len = size;

- return __sock_sendmsg(iocb, sock, &x->async_msg, size);
+ for (i = 0 ; i < nr_segs ; i++)
+ tot_len += iov[i].iov_len;
+ return __sock_sendmsg(iocb, sock, &x->async_msg, tot_len);
}

ssize_t sock_sendpage(struct file *file, struct page *page,
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majo...@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/

Janet Morgan

unread,
Oct 16, 2002, 3:04:39 AM10/16/02
to
>
> On Tue, Oct 15, 2002 at 10:33:15PM -0400, Christoph Hellwig wrote:
> > Proposed next steps: Convert over all readv/writev users
> > to aio_read/aio_write and remove the methods. Implement
> > aio_read/aio_write in all filesystems using the generic
> > pagecache code and kill the "normal" generic_file_read
> > and generic_file_write.
> >
> > Comments?
>
> Please not right now? ;-) At least introduce it as aio_readv/aio_writev
> as currently the way we deal with iovecs uglifies a lot of code, with no
> benefit for the vast majority of applications. As for killing
> generic_file_read/write and using the aio counterparts, that is the plan
> pending a bit more testing of things. I want to get there step by step
> without breaking everything along the way.
>
> -ben

Here's a patch for aio readv/writev support. Basically it adds:

- two new opcodes (IOCB_CMD_PREADV and IOCB_CMD_PWRITEV)
- a field to the iocb for the user vector
- aio_readv/writev methods to the file_operations structure
- routine aio.c/io_readv_writev, which borrows heavily from do_readv_writev.

I tested this using the aio dio patch that Badari submitted a while back.
I compared:
readv/writev io_submit for a vector of N iovecs
vs read/write io_submit for N iocbs.

My performance data is only preliminary at this point, but aio readv/writev
appears to outperform aio read/write -- twice as fast in some cases. The
results generally make sense to me: while there is only one io_submit in both
cases, aio readv/writev shortens codepath (one instead of N calls to the
underlying filesystem routine) and should normally result in fewer
bios/callbacks (at least for direct-io). As importantly, aio readv/writev
in my testing also reduces the number of (system) calls to io_getevents.

-Janet


fs/aio.c | 110 ++++++++++++++++++++++++++++++++++++++++++++++++
include/linux/aio_abi.h | 12 +++++
include/linux/fs.h | 2
3 files changed, 124 insertions(+)

--- linux-2.5.42/include/linux/aio_abi.h Fri Oct 11 21:21:33 2002
+++ aiov/include/linux/aio_abi.h Tue Oct 15 22:19:05 2002
@@ -41,6 +41,8 @@
* IOCB_CMD_POLL = 5,
*/
IOCB_CMD_NOOP = 6,
+ IOCB_CMD_PREADV = 7,
+ IOCB_CMD_PWRITEV = 8,
};

/* read() from /dev/aio returns these structures. */
@@ -65,6 +67,12 @@
* proper padding and aio_error abstraction
*/

+struct io_iocb_vector {
+ const struct iovec *vec;
+ __s32 nr;
+ __s64 offset;
+};
+
struct iocb {
/* these are internal to the kernel/libc. */
__u64 aio_data; /* data to be returned in event's data */
@@ -80,6 +88,10 @@
__u64 aio_nbytes;
__s64 aio_offset;

+ union {
+ struct io_iocb_vector v;
+ } u;
+
/* extra parameters */
__u64 aio_reserved2; /* TODO: use this for a (struct sigevent *) */
__u64 aio_reserved3;
--- linux-2.5.42/include/linux/fs.h Fri Oct 11 21:21:35 2002
+++ aiov/include/linux/fs.h Tue Oct 15 22:14:19 2002
@@ -763,6 +763,8 @@
ssize_t (*sendfile) (struct file *, struct file *, loff_t *, size_t);
ssize_t (*sendpage) (struct file *, struct page *, int, size_t, loff_t *, int);
unsigned long (*get_unmapped_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long);
+ ssize_t (*aio_readv) (struct kiocb *, const struct iovec *, unsigned long , loff_t *);
+ ssize_t (*aio_writev) (struct kiocb *, const struct iovec *, unsigned long, loff_t *);
};

struct inode_operations {
--- linux-2.5.42/fs/aio.c Fri Oct 11 21:22:07 2002
+++ aiov/fs/aio.c Tue Oct 15 22:37:29 2002
@@ -33,6 +33,7 @@
#include <linux/module.h>
#include <linux/highmem.h>
#include <linux/workqueue.h>
+#include <linux/dnotify.h>

#include <asm/kmap_types.h>
#include <asm/uaccess.h>
@@ -992,6 +993,97 @@
return -EINVAL;
}

+#define iov(iocb) iocb->u.v.vec
+#define nr_segs(iocb) iocb->u.v.nr
+
+static ssize_t io_readv_writev(int type, struct kiocb *req,
+ const struct iovec * vector, unsigned long nr_segs, loff_t pos)
+{
+ size_t tot_len;
+ struct iovec iovstack[UIO_FASTIOV];
+ struct iovec *iov=iovstack;
+ ssize_t ret;
+ int seg;
+ struct file *file = req->ki_filp;
+ struct inode *inode = file->f_dentry->d_inode;
+
+ /*
+ * SuS says "The readv() function *may* fail if the iovcnt argument
+ * was less than or equal to 0, or greater than {IOV_MAX}. Linux has
+ * traditionally returned zero for zero segments, so...
+ */
+ ret = 0;
+ if (nr_segs == 0)
+ goto out;
+
+ ret = -EINVAL;
+ if ((nr_segs > UIO_MAXIOV) || (nr_segs <= 0))
+ goto out;
+
+ if (nr_segs > UIO_FASTIOV) {
+ ret = -ENOMEM;
+ iov = kmalloc(nr_segs*sizeof(struct iovec), GFP_KERNEL);
+ if (!iov)
+ goto out;
+ }
+
+ if (copy_from_user(iov, vector, nr_segs*sizeof(*vector))) {
+ ret = -EFAULT;
+ goto out;
+ }
+
+ /*
+ * Single unix specification:
+ * We should -EINVAL if an element length is not >= 0 or if
+ * the total length does not fit a ssize_t.
+ *
+ * Be careful here because iov_len is a size_t not a ssize_t.
+ */
+ tot_len = 0;
+ ret = -EINVAL;
+ for (seg = 0 ; seg < nr_segs; seg++) {
+ ssize_t tmp = tot_len;
+ ssize_t len = (ssize_t)iov[seg].iov_len;
+ void * base = iov[seg].iov_base;
+
+ if (len < 0) /* size_t not fitting an ssize_t .. */
+ goto out;
+ tot_len += len;
+ if (tot_len < tmp) /* maths overflow on the ssize_t */
+ goto out;
+
+ if (unlikely(!access_ok(type == IOCB_CMD_PREAD ?
+ VERIFY_WRITE : VERIFY_READ, base, len))) {
+ ret = -EFAULT;
+ goto out;
+ }
+ }
+ /* return zero if total length is zero */
+ if (tot_len == 0) {
+ ret = 0;
+ goto out;
+ }
+
+ ret = locks_verify_area((type == IOCB_CMD_PREADV
+ ? FLOCK_VERIFY_READ : FLOCK_VERIFY_WRITE),
+ inode, file, file->f_pos, tot_len);
+ if (ret)
+ goto out;
+
+ if (type == IOCB_CMD_PREADV)
+ ret = file->f_op->aio_readv(req, iov, nr_segs, &pos);
+ else
+ ret = file->f_op->aio_writev(req, iov, nr_segs, &pos);
+
+out:
+ if (iov != iovstack)
+ kfree(iov);
+ if ((ret + (type == IOCB_CMD_PREADV)) > 0)
+ dnotify_parent(file->f_dentry,
+ (type == IOCB_CMD_PREADV) ? DN_MODIFY : DN_ACCESS);


+ return ret;
+}
+

static int FASTCALL(io_submit_one(struct kioctx *ctx, struct iocb *user_iocb,
struct iocb *iocb));
static int io_submit_one(struct kioctx *ctx, struct iocb *user_iocb,

@@ -1068,6 +1160,24 @@


ret = file->f_op->aio_write(req, buf,

iocb->aio_nbytes, req->ki_pos);
break;

+ case IOCB_CMD_PREADV:
+ ret = -EBADF;
+ if (unlikely(!(file->f_mode & FMODE_READ)))
+ goto out_put_req;
+ ret = -EINVAL;
+ if (file->f_op->aio_readv)
+ ret = io_readv_writev(IOCB_CMD_PREADV, req,
+ iov(iocb), nr_segs(iocb), iocb->aio_offset);
+ break;
+ case IOCB_CMD_PWRITEV:
+ ret = -EBADF;
+ if (unlikely(!(file->f_mode & FMODE_WRITE)))
+ goto out_put_req;
+ ret = -EINVAL;
+ if (file->f_op->aio_writev)
+ ret = io_readv_writev(IOCB_CMD_PWRITEV, req,
+ iov(iocb), nr_segs(iocb), iocb->aio_offset);
+ break;
case IOCB_CMD_FDSYNC:
ret = -EINVAL;
if (file->f_op->aio_fsync)
-
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in

Shailabh Nagar

unread,
Oct 16, 2002, 10:03:53 AM10/16/02
to

Janet Morgan wrote:

> Here's a patch for aio readv/writev support. Basically it adds:
>
> - two new opcodes (IOCB_CMD_PREADV and IOCB_CMD_PWRITEV)
> - a field to the iocb for the user vector
> - aio_readv/writev methods to the file_operations structure

I presume f_op->aio_readv could point to __generic_file_aio_read for most
filesystems.

Would f_op->aio_writev need a new wrapper function for 2.5.42 ?
f_op->aio_write eventually calls generic_file_write which uses a different inode
from generic_file_writev. So f_op->aio_writev might need to point to a function
like generic_file_writev but using the same inode as generic_file_write.


> - routine aio.c/io_readv_writev, which borrows heavily from do_readv_writev.
>
> I tested this using the aio dio patch that Badari submitted a while back.
> I compared:
> readv/writev io_submit for a vector of N iovecs
> vs read/write io_submit for N iocbs.
>
> My performance data is only preliminary at this point, but aio readv/writev
> appears to outperform aio read/write -- twice as fast in some cases. The
> results generally make sense to me: while there is only one io_submit in both
> cases, aio readv/writev shortens codepath (one instead of N calls to the
> underlying filesystem routine) and should normally result in fewer

Twice as fast looks good !

> bios/callbacks (at least for direct-io). As importantly, aio readv/writev
> in my testing also reduces the number of (system) calls to io_getevents.

It would be interesting to see the performance boost when <iov length> events
are retrieved at once, using the min_nr parameter of io_getevents.


--Shailabh

Helen Pang

unread,
Oct 16, 2002, 10:55:00 AM10/16/02
to

Shailabh Nagar wrote:

> It would be interesting to see the performance boost when <iov length>
events
> are retrieved at once, using the min_nr parameter of io_getevents

My experience is that specified minimum number of events (min_nr) of
io_getevents is not quite working yet in kernel 2.4.19.
I haven't started to exercise this in 2.5.42, but if it is working,
logically it will help the performance indeed.

-Helen


Shailabh Nagar <na...@watson.ibm.com>@kvack.org on 10/16/2002 08:41:03 AM

Sent by: owner-l...@kvack.org


To: jane...@beaverton.ibm.com
cc: Benjamin LaHaise <bc...@redhat.com>, Christoph Hellwig <h...@sgi.com>,
ak...@digeo.com, linux-...@vger.kernel.org,
linux-...@vger.kernel.org, linu...@kvack.org
Subject: Re: [RFC] iovec in ->aio_read/->aio_write

Janet Morgan wrote:


--Shailabh

--
To unsubscribe, send a message with 'unsubscribe linux-aio' in
the body to majo...@kvack.org. For more info on Linux AIO,
see: http://www.kvack.org/aio/


-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in


the body of a message to majo...@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html

Janet Morgan

unread,
Oct 17, 2002, 7:37:44 AM10/17/02
to
>
> On Tue, Oct 15, 2002 at 11:51:21PM -0700, Janet Morgan wrote:
> > - two new opcodes (IOCB_CMD_PREADV and IOCB_CMD_PWRITEV)
> > - a field to the iocb for the user vector
> > - aio_readv/writev methods to the file_operations structure
> > - routine aio.c/io_readv_writev, which borrows heavily from do_readv_writev.
>
> A few comments about the interface to userland: don't add fields to the
> iocb, that breaks the abi. Also, the iovec should be pointed to by
> iocb->aio_buf, with aio_nbytes containing the length of the iovec (that
> avoids the need for an additional field). The structure of the iovec
> itself should probably match the iovec struct, but that means we'll need
> different opcodes for the 32 bit and 64 bit variants. Alternatively a
> new iovec that is the same on both (like the iocb) could be defined, but
> that would be inconvenient for userland.
>
> Within the kernel, the loff_t *pos shouldn't be a pointer -- I'm trying to
> get rid of extraneous pointers as that means an additional chunk of memory
> has to be held over the entirety of the aio request. Similarly, the iovec
> passed into the operation itself can never be allocated on the stack. This
> probably works for direct io where the iovec gets translated into a scatter
> gather list of pages, but wouldn't work for something like networking where
> the iovec itself gets used to determine where in the user address space to
> copy data (as this can occur after the submit operation has returned).
>
> Aside from those details, I guess if people really need vectored ios it's
> okay.
>
> -ben
>

Thanks for the really helpful comments. I've attached another version
of the patch.

In addition to (hopefully) addressing the issues you raised, I also
removed the calls to locks_verify_area and dnotify_parent from my original
patch. Please let me know if you think I should re-instate those calls or
if you see any new problems.

-Janet


fs/aio.c | 156 ++++++++++++++++++++++++++++++++++++++++++++++++
include/linux/aio.h | 1
include/linux/aio_abi.h | 14 ++++
include/linux/fs.h | 2
4 files changed, 173 insertions(+)


--- linux-2.5.43/include/linux/aio_abi.h Tue Oct 15 20:27:18 2002
+++ aiov/include/linux/aio_abi.h Wed Oct 16 23:27:40 2002
@@ -41,6 +41,10 @@


* IOCB_CMD_POLL = 5,
*/
IOCB_CMD_NOOP = 6,

+ IOCB_CMD_PREADV32 = 7,
+ IOCB_CMD_PREADV64 = 8,
+ IOCB_CMD_PWRITEV32 = 9,
+ IOCB_CMD_PWRITEV64 = 10,


};

/* read() from /dev/aio returns these structures. */

@@ -85,6 +89,16 @@
__u64 aio_reserved3;
}; /* 64 bytes */

+struct iovec32 {
+ __u32 iov_base;
+ __u32 iov_len;
+};
+
+struct iovec64 {
+ __u64 iov_base;
+ __u64 iov_len;
+};
+
#undef IFBIG
#undef IFLITTLE

--- linux-2.5.43/include/linux/fs.h Tue Oct 15 20:27:45 2002
+++ aiov/include/linux/fs.h Wed Oct 16 20:41:33 2002
@@ -764,6 +764,8 @@


ssize_t (*sendfile) (struct file *, struct file *, loff_t *, size_t);
ssize_t (*sendpage) (struct file *, struct page *, int, size_t, loff_t *, int);
unsigned long (*get_unmapped_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long);

+ ssize_t (*aio_readv) (struct kiocb *, const struct iovec *, unsigned long , loff_t);
+ ssize_t (*aio_writev) (struct kiocb *, const struct iovec *, unsigned long, loff_t);
};

struct inode_operations {
--- linux-2.5.43/include/linux/aio.h Tue Oct 15 20:27:09 2002
+++ aiov/include/linux/aio.h Wed Oct 16 20:41:33 2002
@@ -51,6 +51,7 @@
int ki_users;
unsigned ki_key; /* id of this request */

+ struct iovec *ki_iov; /* io vector for readv/writev */
struct file *ki_filp;
struct kioctx *ki_ctx; /* may be NULL for sync ops */
int (*ki_cancel)(struct kiocb *, struct io_event *);
--- linux-2.5.43/fs/aio.c Tue Oct 15 20:27:57 2002
+++ aiov/fs/aio.c Thu Oct 17 02:40:49 2002
@@ -28,6 +28,7 @@


#include <linux/module.h>
#include <linux/highmem.h>
#include <linux/workqueue.h>
+#include <linux/dnotify.h>

#include <asm/kmap_types.h>
#include <asm/uaccess.h>

@@ -393,6 +394,7 @@
req->ki_users = 1;
req->ki_key = 0;
req->ki_ctx = ctx;
+ req->ki_iov = NULL;
req->ki_cancel = NULL;
req->ki_retry = NULL;
req->ki_user_obj = NULL;
@@ -444,6 +446,9 @@
req->ki_ctx = NULL;
req->ki_filp = NULL;
req->ki_user_obj = NULL;
+ if (req->ki_iov)
+ kfree(req->ki_iov);
+ req->ki_iov = NULL;
kmem_cache_free(kiocb_cachep, req);
ctx->reqs_active--;

@@ -987,6 +992,139 @@
return -EINVAL;
}

+#define _32bit_kiov (sizeof(struct iovec) == sizeof(struct iovec32))
+#define _64bit_kiov (sizeof(struct iovec) == sizeof(struct iovec64))
+
+#define _32bit(cmd) \
+ (((cmd) == IOCB_CMD_PREADV32) || ((cmd) == IOCB_CMD_PWRITEV32))
+#define _64bit(cmd) \
+ (((cmd) == IOCB_CMD_PREADV64) || ((cmd) == IOCB_CMD_PWRITEV64))
+
+#define is_readv(cmd) \
+ (((cmd) == IOCB_CMD_PREADV32) || ((cmd) == IOCB_CMD_PREADV64))
+
+#define can_fast_copy(cmd) \
+ ((_32bit(cmd) && _32bit_kiov) || ((_64bit(cmd) && _64bit_kiov)))
+
+int resize_user_vec(int cmd, struct kiocb *req, const struct iovec *uiov,
+ unsigned long nr_segs)
+{
+ size_t len;
+ struct iovec32 *iov32;
+ struct iovec64 *iov64;
+ struct iovec *kiov = req->ki_iov;
+ int seg, ret = 0;
+
+ /*
+ * copy 32-bit user iovecs to 64-bit kernel iovecs or vice versa
+ */
+ if (_32bit(cmd)) {
+ len = nr_segs*sizeof(struct iovec32);
+ iov32 = kmalloc(len, GFP_KERNEL);
+ if (!iov32)
+ return -ENOMEM;
+ if (copy_from_user(iov32, uiov, len))
+ ret = -EFAULT;
+ else for (seg = 0; seg < nr_segs; seg++) {
+ kiov[seg].iov_base = (void *)iov32[seg].iov_base;
+ kiov[seg].iov_len = iov32[seg].iov_len;
+ }
+ kfree(iov32);
+
+ } else {
+ len = nr_segs*sizeof(struct iovec64);
+ iov64 = kmalloc(len, GFP_KERNEL);
+ if (!iov64)
+ return -ENOMEM;
+ if (copy_from_user(iov64, uiov, len))
+ ret = -EFAULT;
+ else for (seg = 0; seg < nr_segs; seg++) {
+ kiov[seg].iov_base =
+ (void *)(unsigned long)iov64[seg].iov_base;
+ kiov[seg].iov_len = iov64[seg].iov_len;
+ }
+ kfree(iov64);
+ }
+out:


+ return ret;
+}
+

+#define iov(iocb) (const struct iovec *)(unsigned long)iocb->aio_buf
+#define nr_segs(iocb) (unsigned long)iocb->aio_nbytes
+
+static ssize_t io_readv_writev(int cmd, struct kiocb *req, struct iocb *iocb)
+{
+ size_t tot_len;


+ int seg;
+ struct file *file = req->ki_filp;

+ const struct iovec *uiov = iov(iocb);
+ unsigned long nr_segs = nr_segs(iocb);
+ ssize_t ret = 0;
+


+ if (nr_segs == 0)
+ goto out;
+

+ if ((nr_segs > UIO_MAXIOV) || ((ssize_t)nr_segs < 0)) {
+ ret = -EINVAL;


+ goto out;
+ }
+

+ req->ki_iov = kmalloc(nr_segs*sizeof(struct iovec), GFP_KERNEL);
+ if (!req->ki_iov) {
+ ret = -ENOMEM;


+ goto out;
+ }
+

+ ret = -EFAULT;
+ if (can_fast_copy(cmd)) {
+ int len = nr_segs * sizeof(struct iovec);
+ if (copy_from_user(req->ki_iov, uiov, len)) {
+ goto out;
+ }
+ } else if (resize_user_vec(cmd, req, uiov, nr_segs))
+ goto out;
+


+ /*
+ * Single unix specification:
+ * We should -EINVAL if an element length is not >= 0 or if
+ * the total length does not fit a ssize_t.
+ *
+ * Be careful here because iov_len is a size_t not a ssize_t.
+ */
+ tot_len = 0;
+ ret = -EINVAL;
+ for (seg = 0 ; seg < nr_segs; seg++) {
+ ssize_t tmp = tot_len;

+ ssize_t len = (ssize_t)req->ki_iov[seg].iov_len;
+ void * base = req->ki_iov[seg].iov_base;


+
+ if (len < 0) /* size_t not fitting an ssize_t .. */
+ goto out;
+ tot_len += len;
+ if (tot_len < tmp) /* maths overflow on the ssize_t */
+ goto out;
+

+ if (unlikely(!access_ok(is_readv(cmd) ?

+ VERIFY_WRITE : VERIFY_READ, base, len))) {
+ ret = -EFAULT;
+ goto out;
+ }
+ }

+ if (tot_len == 0) {
+ ret = 0;
+ goto out;
+ }
+

+ if (is_readv(cmd))
+ ret = file->f_op->aio_readv(req, req->ki_iov,
+ nr_segs(iocb), iocb->aio_offset);
+ else
+ ret = file->f_op->aio_writev(req, req->ki_iov,
+ nr_segs(iocb), iocb->aio_offset);
+out:


+ return ret;
+}
+
static int FASTCALL(io_submit_one(struct kioctx *ctx, struct iocb *user_iocb,
struct iocb *iocb));
static int io_submit_one(struct kioctx *ctx, struct iocb *user_iocb,

@@ -1063,6 +1201,24 @@


ret = file->f_op->aio_write(req, buf,
iocb->aio_nbytes, req->ki_pos);
break;

+ case IOCB_CMD_PREADV32:
+ case IOCB_CMD_PREADV64:


+ ret = -EBADF;
+ if (unlikely(!(file->f_mode & FMODE_READ)))
+ goto out_put_req;
+ ret = -EINVAL;
+ if (file->f_op->aio_readv)

+ ret = io_readv_writev(iocb->aio_lio_opcode, req, iocb);
+ break;
+ case IOCB_CMD_PWRITEV32:
+ case IOCB_CMD_PWRITEV64:


+ ret = -EBADF;
+ if (unlikely(!(file->f_mode & FMODE_WRITE)))
+ goto out_put_req;
+ ret = -EINVAL;
+ if (file->f_op->aio_writev)

+ ret = io_readv_writev(iocb->aio_lio_opcode, req, iocb);

+ break;
case IOCB_CMD_FDSYNC:
ret = -EINVAL;
if (file->f_op->aio_fsync)

-
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in

0 new messages