
[PATCH] Filesystem aio rdwr patchset


Suparna Bhattacharya

Apr 1, 2003, 11:42:52 AM
On Tue, Apr 01, 2003 at 09:59:57PM +0530, Suparna Bhattacharya wrote:
> 01aioretry.patch : this is the common generic aio
> retry code

--
Suparna Bhattacharya (sup...@in.ibm.com)
Linux Technology Center
IBM Software Labs, India


diff -ur linux-2.5.66/fs/aio.c linux-2.5.66aio/fs/aio.c
--- linux-2.5.66/fs/aio.c Tue Mar 25 03:30:22 2003
+++ linux-2.5.66aio/fs/aio.c Wed Mar 26 20:25:02 2003
@@ -395,6 +396,7 @@
req->ki_cancel = NULL;
req->ki_retry = NULL;
req->ki_user_obj = NULL;
+ INIT_LIST_HEAD(&req->ki_run_list);

/* Check if the completion queue has enough free space to
* accept an event from this io.
@@ -558,46 +560,124 @@
enter_lazy_tlb(mm, current, smp_processor_id());
}

-/* Run on kevent's context. FIXME: needs to be per-cpu and warn if an
- * operation blocks.
- */
-static void aio_kick_handler(void *data)
+static inline int __queue_kicked_iocb(struct kiocb *iocb)
{
- struct kioctx *ctx = data;
+ struct kioctx *ctx = iocb->ki_ctx;

- use_mm(ctx->mm);
+ if (list_empty(&iocb->ki_run_list)) {
+ list_add_tail(&iocb->ki_run_list,
+ &ctx->run_list);
+ return 1;
+ }
+ return 0;
+}

- spin_lock_irq(&ctx->ctx_lock);
- while (!list_empty(&ctx->run_list)) {
- struct kiocb *iocb;
- long ret;
+/* Expects to be called with iocb->ki_ctx->lock held */
+static ssize_t aio_run_iocb(struct kiocb *iocb)
+{
+ struct kioctx *ctx = iocb->ki_ctx;
+ ssize_t (*retry)(struct kiocb *);
+ ssize_t ret;

- iocb = list_entry(ctx->run_list.next, struct kiocb,
- ki_run_list);
- list_del(&iocb->ki_run_list);
- iocb->ki_users ++;
- spin_unlock_irq(&ctx->ctx_lock);
+ if (iocb->ki_retried++ > 1024*1024) {
+ printk("Maximal retry count. Bytes done %d\n",
+ iocb->ki_nbytes - iocb->ki_left);
+ return -EAGAIN;
+ }
+
+ if (!(iocb->ki_retried & 0xff)) {
+ printk("%ld aio retries completed %d bytes of %d\n",
+ iocb->ki_retried,
+ iocb->ki_nbytes - iocb->ki_left, iocb->ki_nbytes);
+ }
+
+ if (!(retry = iocb->ki_retry)) {
+ printk("aio_run_iocb: iocb->ki_retry = NULL\n");
+ return 0;
+ }
+
+ iocb->ki_users ++;
+ kiocbClearKicked(iocb);
+ iocb->ki_run_list.next = iocb->ki_run_list.prev = NULL;
+ iocb->ki_retry = NULL;
+ spin_unlock_irq(&ctx->ctx_lock);
+
+ BUG_ON(current->iocb != NULL);
+
+ current->iocb = iocb;
+ ret = retry(iocb);
+ current->iocb = NULL;

- kiocbClearKicked(iocb);
- ret = iocb->ki_retry(iocb);
- if (-EIOCBQUEUED != ret) {
+ if (-EIOCBQUEUED != ret) {
+ if (list_empty(&iocb->ki_wait.task_list))
aio_complete(iocb, ret, 0);
- iocb = NULL;
- }
+ else
+ printk("can't delete iocb in use\n");
+ } else {
+ if (list_empty(&iocb->ki_wait.task_list))
+ kiocbSetKicked(iocb);
+ }
+ spin_lock_irq(&ctx->ctx_lock);

- spin_lock_irq(&ctx->ctx_lock);
- if (NULL != iocb)
- __aio_put_req(ctx, iocb);
+ iocb->ki_retry = retry;
+ INIT_LIST_HEAD(&iocb->ki_run_list);
+ if (kiocbIsKicked(iocb)) {
+ BUG_ON(ret != -EIOCBQUEUED);
+ __queue_kicked_iocb(iocb);
+ }
+ __aio_put_req(ctx, iocb);
+ return ret;
+}
+
+static void aio_run_iocbs(struct kioctx *ctx)
+{
+ struct kiocb *iocb;
+ ssize_t ret;
+
+ spin_lock_irq(&ctx->ctx_lock);
+ while (!list_empty(&ctx->run_list)) {
+ iocb = list_entry(ctx->run_list.next, struct kiocb,
+ ki_run_list);
+ list_del(&iocb->ki_run_list);
+ ret = aio_run_iocb(iocb);
}
spin_unlock_irq(&ctx->ctx_lock);
+}
+
+/* Run on aiod/kevent's context. FIXME: needs to be per-cpu and warn if an
+ * operation blocks.
+ */
+static void aio_kick_handler(void *data)
+{
+ struct kioctx *ctx = data;

+ use_mm(ctx->mm);
+ aio_run_iocbs(ctx);
unuse_mm(ctx->mm);
}

-void kick_iocb(struct kiocb *iocb)
+
+void queue_kicked_iocb(struct kiocb *iocb)
{
struct kioctx *ctx = iocb->ki_ctx;
+ unsigned long flags;
+ int run = 0;
+
+ WARN_ON((!list_empty(&iocb->ki_wait.task_list)));
+
+ spin_lock_irqsave(&ctx->ctx_lock, flags);
+ run = __queue_kicked_iocb(iocb);
+ spin_unlock_irqrestore(&ctx->ctx_lock, flags);
+ if (run) {
+ if (waitqueue_active(&ctx->wait))
+ wake_up(&ctx->wait);
+ else
+ queue_work(aio_wq, &ctx->wq);
+ }
+}

+void kick_iocb(struct kiocb *iocb)
+{
/* sync iocbs are easy: they can only ever be executing from a
* single context. */
if (is_sync_kiocb(iocb)) {
@@ -607,11 +687,9 @@
}

if (!kiocbTryKick(iocb)) {
- unsigned long flags;
- spin_lock_irqsave(&ctx->ctx_lock, flags);
- list_add_tail(&iocb->ki_run_list, &ctx->run_list);
- spin_unlock_irqrestore(&ctx->ctx_lock, flags);
- schedule_work(&ctx->wq);
+ queue_kicked_iocb(iocb);
+ } else {
+ pr_debug("iocb already kicked or in progress\n");
}
}

@@ -642,13 +720,13 @@
iocb->ki_user_data = res;
if (iocb->ki_users == 1) {
iocb->ki_users = 0;
- return 1;
+ ret = 1;
+ } else {
+ spin_lock_irq(&ctx->ctx_lock);
+ iocb->ki_users--;
+ ret = (0 == iocb->ki_users);
+ spin_unlock_irq(&ctx->ctx_lock);
}
- spin_lock_irq(&ctx->ctx_lock);
- iocb->ki_users--;
- ret = (0 == iocb->ki_users);
- spin_unlock_irq(&ctx->ctx_lock);
-
/* sync iocbs put the task here for us */
wake_up_process(iocb->ki_user_obj);
return ret;
@@ -664,6 +742,9 @@
*/
spin_lock_irqsave(&ctx->ctx_lock, flags);

+ if (iocb->ki_run_list.prev && !list_empty(&iocb->ki_run_list))
+ list_del_init(&iocb->ki_run_list);
+
ring = kmap_atomic(info->ring_pages[0], KM_IRQ1);

tail = info->tail;
@@ -865,6 +946,8 @@
ret = 0;
if (to.timed_out) /* Only check after read evt */
break;
+ /* accelerate kicked iocbs for this ctx */
+ aio_run_iocbs(ctx);
schedule();
if (signal_pending(tsk)) {
ret = -EINTR;
@@ -984,6 +1067,149 @@
return -EINVAL;
}

+ssize_t aio_pread(struct kiocb *iocb)
+{
+ struct file *file = iocb->ki_filp;
+ ssize_t ret = 0;
+
+ ret = file->f_op->aio_read(iocb, iocb->ki_buf,
+ iocb->ki_left, iocb->ki_pos);
+
+ pr_debug("aio_pread: fop ret %d\n", ret);
+
+ /*
+ * Can't just depend on iocb->ki_left to determine
+ * whether we are done. This may have been a short read.
+ */
+ if (ret > 0) {
+ iocb->ki_buf += ret;
+ iocb->ki_left -= ret;
+
+ ret = -EIOCBQUEUED;
+ }
+
+ /* This means we must have transferred all that we could */
+ /* No need to retry anymore */
+ if (ret == 0)
+ ret = iocb->ki_nbytes - iocb->ki_left;
+
+ return ret;
+}
+
+ssize_t aio_pwrite(struct kiocb *iocb)
+{
+ struct file *file = iocb->ki_filp;
+ ssize_t ret = 0;
+
+ ret = file->f_op->aio_write(iocb, iocb->ki_buf,
+ iocb->ki_left, iocb->ki_pos);
+
+ pr_debug("aio_pread: fop ret %d\n", ret);
+
+ /*
+ * TBD: Even if iocb->ki_left = 0, could we need to
+ * wait for data to be sync'd ? Or can we assume
+ * that aio_fdsync/aio_fsync would be called explicitly
+ * as required.
+ */
+ if (ret > 0) {
+ iocb->ki_buf += ret;
+ iocb->ki_left -= ret;
+
+ ret = -EIOCBQUEUED;
+ }
+
+ /* This means we must have transferred all that we could */
+ /* No need to retry anymore */
+ if (ret == 0)
+ ret = iocb->ki_nbytes - iocb->ki_left;
+
+ return ret;
+}
+
+ssize_t aio_fdsync(struct kiocb *iocb)
+{
+ struct file *file = iocb->ki_filp;
+ ssize_t ret = -EINVAL;
+
+ if (file->f_op->aio_fsync)
+ ret = file->f_op->aio_fsync(iocb, 1);
+ return ret;
+}
+
+ssize_t aio_fsync(struct kiocb *iocb)
+{
+ struct file *file = iocb->ki_filp;
+ ssize_t ret = -EINVAL;
+
+ if (file->f_op->aio_fsync)
+ ret = file->f_op->aio_fsync(iocb, 0);
+ return ret;
+}
+
+/* Called during initial submission and subsequent retry operations */
+ssize_t aio_setup_iocb(struct kiocb *iocb)
+{
+ struct file *file = iocb->ki_filp;
+ ssize_t ret = 0;
+
+ switch (iocb->ki_opcode) {
+ case IOCB_CMD_PREAD:
+ ret = -EBADF;
+ if (unlikely(!(file->f_mode & FMODE_READ)))
+ break;
+ ret = -EFAULT;
+ if (unlikely(!access_ok(VERIFY_WRITE, iocb->ki_buf,
+ iocb->ki_left)))
+ break;
+ ret = -EINVAL;
+ if (file->f_op->aio_read)
+ iocb->ki_retry = aio_pread;
+ break;
+ case IOCB_CMD_PWRITE:
+ ret = -EBADF;
+ if (unlikely(!(file->f_mode & FMODE_WRITE)))
+ break;
+ ret = -EFAULT;
+ if (unlikely(!access_ok(VERIFY_READ, iocb->ki_buf,
+ iocb->ki_left)))
+ break;
+ ret = -EINVAL;
+ if (file->f_op->aio_write)
+ iocb->ki_retry = aio_pwrite;
+ break;
+ case IOCB_CMD_FDSYNC:
+ ret = -EINVAL;
+ if (file->f_op->aio_fsync)
+ iocb->ki_retry = aio_fdsync;
+ break;
+ case IOCB_CMD_FSYNC:
+ ret = -EINVAL;
+ if (file->f_op->aio_fsync)
+ iocb->ki_retry = aio_fsync;
+ break;
+ default:
+ dprintk("EINVAL: io_submit: no operation provided\n");
+ ret = -EINVAL;
+ }
+
+ if (!iocb->ki_retry)
+ return ret;
+
+ pr_debug("ki_pos = %llu\n", iocb->ki_pos);
+
+ return 0;
+}
+
+int aio_wake_function(wait_queue_t *wait, unsigned mode, int sync)
+{
+ struct kiocb *iocb = container_of(wait, struct kiocb, ki_wait);
+
+ list_del_init(&wait->task_list);
+ kick_iocb(iocb);
+ return 1;
+}
+
static int FASTCALL(io_submit_one(struct kioctx *ctx, struct iocb *user_iocb,
struct iocb *iocb));
static int io_submit_one(struct kioctx *ctx, struct iocb *user_iocb,
@@ -992,7 +1218,6 @@
struct kiocb *req;
struct file *file;
ssize_t ret;
- char *buf;

/* enforce forwards compatibility on users */
if (unlikely(iocb->aio_reserved1 || iocb->aio_reserved2 ||
@@ -1033,51 +1258,27 @@
req->ki_user_data = iocb->aio_data;
req->ki_pos = iocb->aio_offset;

- buf = (char *)(unsigned long)iocb->aio_buf;
+ req->ki_buf = (char *)(unsigned long)iocb->aio_buf;
+ req->ki_left = req->ki_nbytes = iocb->aio_nbytes;
+ req->ki_opcode = iocb->aio_lio_opcode;
+ init_waitqueue_func_entry(&req->ki_wait, aio_wake_function);
+ INIT_LIST_HEAD(&req->ki_wait.task_list);
+ req->ki_run_list.next = req->ki_run_list.prev = NULL;
+ req->ki_retry = NULL;
+ req->ki_retried = 0;

- switch (iocb->aio_lio_opcode) {
- case IOCB_CMD_PREAD:
- ret = -EBADF;
- if (unlikely(!(file->f_mode & FMODE_READ)))
- goto out_put_req;
- ret = -EFAULT;
- if (unlikely(!access_ok(VERIFY_WRITE, buf, iocb->aio_nbytes)))
- goto out_put_req;
- ret = -EINVAL;
- if (file->f_op->aio_read)
- ret = file->f_op->aio_read(req, buf,
- iocb->aio_nbytes, req->ki_pos);
- break;
- case IOCB_CMD_PWRITE:
- ret = -EBADF;
- if (unlikely(!(file->f_mode & FMODE_WRITE)))
- goto out_put_req;
- ret = -EFAULT;
- if (unlikely(!access_ok(VERIFY_READ, buf, iocb->aio_nbytes)))
- goto out_put_req;
- ret = -EINVAL;
- if (file->f_op->aio_write)
- ret = file->f_op->aio_write(req, buf,
- iocb->aio_nbytes, req->ki_pos);
- break;
- case IOCB_CMD_FDSYNC:
- ret = -EINVAL;
- if (file->f_op->aio_fsync)
- ret = file->f_op->aio_fsync(req, 1);
- break;
- case IOCB_CMD_FSYNC:
- ret = -EINVAL;
- if (file->f_op->aio_fsync)
- ret = file->f_op->aio_fsync(req, 0);
- break;
- default:
- dprintk("EINVAL: io_submit: no operation provided\n");
- ret = -EINVAL;
- }
+ ret = aio_setup_iocb(req);
+
+ if ((-EBADF == ret) || (-EFAULT == ret))
+ goto out_put_req;
+
+ spin_lock_irq(&ctx->ctx_lock);
+ ret = aio_run_iocb(req);
+ spin_unlock_irq(&ctx->ctx_lock);
+
+ if (-EIOCBQUEUED == ret)
+ queue_work(aio_wq, &ctx->wq);

- if (likely(-EIOCBQUEUED == ret))
- return 0;
- aio_complete(req, ret, 0);
return 0;

out_put_req:
diff -ur linux-2.5.66/include/linux/aio.h linux-2.5.66aio/include/linux/aio.h
--- linux-2.5.66/include/linux/aio.h Tue Mar 25 03:29:54 2003
+++ linux-2.5.66aio/include/linux/aio.h Wed Mar 26 18:46:18 2003
@@ -54,7 +54,7 @@
struct file *ki_filp;
struct kioctx *ki_ctx; /* may be NULL for sync ops */
int (*ki_cancel)(struct kiocb *, struct io_event *);
- long (*ki_retry)(struct kiocb *);
+ ssize_t (*ki_retry)(struct kiocb *);

struct list_head ki_list; /* the aio core uses this
* for cancellation */
@@ -62,6 +62,14 @@
void *ki_user_obj; /* pointer to userland's iocb */
__u64 ki_user_data; /* user's data for completion */
loff_t ki_pos;
+
+ /* State that we remember to be able to restart/retry */
+ unsigned short ki_opcode;
+ size_t ki_nbytes; /* copy of iocb->aio_nbytes */
+ char *ki_buf; /* remaining iocb->aio_buf */
+ size_t ki_left; /* remaining bytes */
+ wait_queue_t ki_wait;
+ long ki_retried; /* just for testing */

char private[KIOCB_PRIVATE_SIZE];
};
@@ -77,6 +85,8 @@
(x)->ki_ctx = &tsk->active_mm->default_kioctx; \
(x)->ki_cancel = NULL; \
(x)->ki_user_obj = tsk; \
+ (x)->ki_user_data = 0; \
+ init_wait((&(x)->ki_wait)); \
} while (0)

#define AIO_RING_MAGIC 0xa10a10a1
@@ -151,6 +161,13 @@
#define get_ioctx(kioctx) do { if (unlikely(atomic_read(&(kioctx)->users) <= 0)) BUG(); atomic_inc(&(kioctx)->users); } while (0)
#define put_ioctx(kioctx) do { if (unlikely(atomic_dec_and_test(&(kioctx)->users))) __put_ioctx(kioctx); else if (unlikely(atomic_read(&(kioctx)->users) < 0)) BUG(); } while (0)

+#define do_sync_op(op) do { \
+ struct kiocb *iocb = current->iocb; \
+ current->iocb = NULL; \
+ op; \
+ current->iocb = iocb; \
+ } while (0);
+
#include <linux/aio_abi.h>

static inline struct kiocb *list_kiocb(struct list_head *h)
diff -ur linux-2.5.66/include/linux/init_task.h linux-2.5.66aio/include/linux/init_task.h
--- linux-2.5.66/include/linux/init_task.h Tue Mar 25 03:30:00 2003
+++ linux-2.5.66aio/include/linux/init_task.h Fri Mar 21 14:50:42 2003
@@ -103,6 +103,7 @@
.alloc_lock = SPIN_LOCK_UNLOCKED, \
.switch_lock = SPIN_LOCK_UNLOCKED, \
.journal_info = NULL, \
+ .iocb = NULL, \
}


diff -ur linux-2.5.66/include/linux/sched.h linux-2.5.66aio/include/linux/sched.h
--- linux-2.5.66/include/linux/sched.h Tue Mar 25 03:30:00 2003
+++ linux-2.5.66aio/include/linux/sched.h Fri Mar 21 14:50:42 2003
@@ -438,6 +438,8 @@

unsigned long ptrace_message;
siginfo_t *last_siginfo; /* For ptrace use. */
+/* current aio handle */
+ struct kiocb *iocb;
};

extern void __put_task_struct(struct task_struct *tsk);
diff -ur linux-2.5.66/kernel/fork.c linux-2.5.66aio/kernel/fork.c
--- linux-2.5.66/kernel/fork.c Tue Mar 25 03:30:00 2003
+++ linux-2.5.66aio/kernel/fork.c Wed Mar 26 18:32:24 2003
@@ -856,6 +856,7 @@
p->lock_depth = -1; /* -1 = no lock */
p->start_time = get_jiffies_64();
p->security = NULL;
+ p->iocb = NULL;

retval = -ENOMEM;
if (security_task_alloc(p))
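
The do_sync_op() macro added above is what the follow-on patches use to split each blocking operation into an _async form (which parks current->iocb->ki_wait on a wait queue and returns -EIOCBQUEUED instead of sleeping) and a synchronous wrapper that simply hides current->iocb for the duration. A condensed sketch of that pattern, mirroring the shape of the later wait_on_page_bit_async() change; the my_* names and the ready()/wqh arguments are placeholders, not part of the patch:

/* Illustrative only: the common shape of an aio-aware blocking point. */
static int my_wait_async(wait_queue_head_t *wqh, int (*ready)(void *), void *arg)
{
        DEFINE_WAIT(sync_wait);
        wait_queue_t *wait = &sync_wait;
        int state = TASK_UNINTERRUPTIBLE;

        if (current->iocb) {
                /* aio retry context: park ki_wait, let the wakeup kick_iocb() */
                wait = &current->iocb->ki_wait;
                state = TASK_RUNNING;
        }

        do {
                prepare_to_wait(wqh, wait, state);
                if (!ready(arg)) {
                        if (current->iocb)
                                return -EIOCBQUEUED;
                        io_schedule();
                }
        } while (!ready(arg));
        finish_wait(wqh, wait);
        return 0;
}

/* The synchronous entry point just hides current->iocb around the async one. */
static void my_wait(wait_queue_head_t *wqh, int (*ready)(void *), void *arg)
{
        do_sync_op(my_wait_async(wqh, ready, arg));
}

When the retry path returns -EIOCBQUEUED the wait entry stays queued; the eventual wakeup runs aio_wake_function(), which removes it and calls kick_iocb(), so the iocb's ki_retry method is re-driven from aio_run_iocb().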

Suparna Bhattacharya

Apr 1, 2003, 11:54:00 AM
On Tue, Apr 01, 2003 at 09:59:57PM +0530, Suparna Bhattacharya wrote:
> 02aiordwr.patch : this is the filesystem read+write
> changes for aio using the retry model

>
--
Suparna Bhattacharya (sup...@in.ibm.com)
Linux Technology Center
IBM Software Labs, India


diff -ur linux-2.5.66/drivers/block/ll_rw_blk.c linux-2.5.66aio/drivers/block/ll_rw_blk.c
--- linux-2.5.66/drivers/block/ll_rw_blk.c Tue Mar 25 03:30:00 2003
+++ linux-2.5.66aio/drivers/block/ll_rw_blk.c Tue Apr 1 10:36:53 2003
@@ -1564,17 +1564,33 @@
* If no queues are congested then just wait for the next request to be
* returned.
*/
-void blk_congestion_wait(int rw, long timeout)
+int blk_congestion_wait_async(int rw, long timeout)
{
- DEFINE_WAIT(wait);
+ DEFINE_WAIT(sync_wait);
+ wait_queue_t *wait = &sync_wait;
+ int state = TASK_UNINTERRUPTIBLE;
wait_queue_head_t *wqh = &congestion_wqh[rw];

+ if (current->iocb) {
+ wait = &current->iocb->ki_wait;
+ state = TASK_RUNNING;
+ }
blk_run_queues();
- prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE);
+ prepare_to_wait(wqh, wait, state);
+ if (current->iocb)
+ return -EIOCBQUEUED;
+
io_schedule_timeout(timeout);
- finish_wait(wqh, &wait);
+ finish_wait(wqh, wait);
+ return 0;
}

+void blk_congestion_wait(int rw, long timeout)
+{
+ do_sync_op(blk_congestion_wait_async(rw, timeout));
+}
+
+
/*
* Has to be called with the request spinlock acquired
*/
diff -ur linux-2.5.66/fs/buffer.c linux-2.5.66aio/fs/buffer.c
--- linux-2.5.66/fs/buffer.c Tue Mar 25 03:30:48 2003
+++ linux-2.5.66aio/fs/buffer.c Wed Mar 26 19:11:09 2003
@@ -1821,8 +1860,11 @@
clear_buffer_new(bh);
if (!buffer_mapped(bh)) {
err = get_block(inode, block, bh, 1);
- if (err)
+ if (err) {
+ if (-EIOCBQUEUED == err)
+ pr_debug("get_block queued\n");
goto out;
+ }
if (buffer_new(bh)) {
clear_buffer_new(bh);
unmap_underlying_metadata(bh->b_bdev,
diff -ur linux-2.5.66/include/linux/blkdev.h linux-2.5.66aio/include/linux/blkdev.h
--- linux-2.5.66/include/linux/blkdev.h Tue Mar 25 03:30:09 2003
+++ linux-2.5.66aio/include/linux/blkdev.h Wed Mar 26 20:07:06 2003
@@ -391,6 +391,7 @@
extern void blk_queue_free_tags(request_queue_t *);
extern void blk_queue_invalidate_tags(request_queue_t *);
extern void blk_congestion_wait(int rw, long timeout);
+extern int blk_congestion_wait_async(int rw, long timeout);

#define MAX_PHYS_SEGMENTS 128
#define MAX_HW_SEGMENTS 128
diff -ur linux-2.5.66/include/linux/pagemap.h linux-2.5.66aio/include/linux/pagemap.h
--- linux-2.5.66/include/linux/pagemap.h Tue Mar 25 03:29:54 2003
+++ linux-2.5.66aio/include/linux/pagemap.h Wed Mar 26 19:40:29 2003
@@ -135,6 +135,16 @@
if (TestSetPageLocked(page))
__lock_page(page);
}
+
+extern int FASTCALL(__lock_page_async(struct page *page));
+static inline int lock_page_async(struct page *page)
+{
+ if (TestSetPageLocked(page))
+ return __lock_page_async(page);
+ else
+ return 0;
+}
+

/*
* This is exported only for wait_on_page_locked/wait_on_page_writeback.
@@ -155,6 +165,15 @@
wait_on_page_bit(page, PG_locked);
}

+extern int FASTCALL(wait_on_page_bit_async(struct page *page, int bit_nr));
+static inline int wait_on_page_locked_async(struct page *page)
+{
+ if (PageLocked(page))
+ return wait_on_page_bit_async(page, PG_locked);
+ else
+ return 0;
+}
+

/*
* Wait for a page to complete writeback
*/
diff -ur linux-2.5.66/include/linux/writeback.h linux-2.5.66aio/include/linux/writeback.h
--- linux-2.5.66/include/linux/writeback.h Tue Mar 25 03:30:01 2003
+++ linux-2.5.66aio/include/linux/writeback.h Mon Mar 24 12:00:32 2003
@@ -80,8 +80,8 @@


void page_writeback_init(void);
-void balance_dirty_pages(struct address_space *mapping);
-void balance_dirty_pages_ratelimited(struct address_space *mapping);
+int balance_dirty_pages(struct address_space *mapping);
+int balance_dirty_pages_ratelimited(struct address_space *mapping);
int pdflush_operation(void (*fn)(unsigned long), unsigned long arg0);
int do_writepages(struct address_space *mapping, struct writeback_control *wbc);

diff -ur linux-2.5.66/mm/filemap.c linux-2.5.66aio/mm/filemap.c
--- linux-2.5.66/mm/filemap.c Tue Mar 25 03:30:15 2003
+++ linux-2.5.66aio/mm/filemap.c Wed Mar 26 20:38:03 2003
@@ -254,19 +254,36 @@
return &zone->wait_table[hash_ptr(page, zone->wait_table_bits)];
}

-void wait_on_page_bit(struct page *page, int bit_nr)
+int wait_on_page_bit_async(struct page *page, int bit_nr)
{
wait_queue_head_t *waitqueue = page_waitqueue(page);
- DEFINE_WAIT(wait);
+ DEFINE_WAIT(sync_wait);
+ wait_queue_t *wait = &sync_wait;
+ int state = TASK_UNINTERRUPTIBLE;
+
+ if (current->iocb) {
+ wait = &current->iocb->ki_wait;
+ state = TASK_RUNNING;
+ }

do {
- prepare_to_wait(waitqueue, &wait, TASK_UNINTERRUPTIBLE);
+ prepare_to_wait(waitqueue, wait, state);
if (test_bit(bit_nr, &page->flags)) {
sync_page(page);
+ if (current->iocb)
+ return -EIOCBQUEUED;
io_schedule();
}
} while (test_bit(bit_nr, &page->flags));
- finish_wait(waitqueue, &wait);
+ finish_wait(waitqueue, wait);
+
+ return 0;
+}
+
+EXPORT_SYMBOL(wait_on_page_bit_async);
+
+void wait_on_page_bit(struct page *page, int bit_nr)
+{
+ do_sync_op(wait_on_page_bit_async(page, bit_nr));
}
EXPORT_SYMBOL(wait_on_page_bit);

@@ -322,19 +339,35 @@
* chances are that on the second loop, the block layer's plug list is empty,
* so sync_page() will then return in state TASK_UNINTERRUPTIBLE.
*/
-void __lock_page(struct page *page)
+int __lock_page_async(struct page *page)
{
wait_queue_head_t *wqh = page_waitqueue(page);
- DEFINE_WAIT(wait);
+ DEFINE_WAIT(sync_wait);
+ wait_queue_t *wait = &sync_wait;
+ int state = TASK_UNINTERRUPTIBLE;
+
+ if (current->iocb) {
+ wait = &current->iocb->ki_wait;
+ state = TASK_RUNNING;
+ }

while (TestSetPageLocked(page)) {
- prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE);
+ prepare_to_wait(wqh, wait, state);
if (PageLocked(page)) {
sync_page(page);
+ if (current->iocb)
+ return -EIOCBQUEUED;
io_schedule();
}
}
- finish_wait(wqh, &wait);
+ finish_wait(wqh, wait);
+ return 0;
+}
+EXPORT_SYMBOL(__lock_page_async);
+
+void __lock_page(struct page *page)
+{
+ do_sync_op(__lock_page_async(page));
}
EXPORT_SYMBOL(__lock_page);

@@ -384,7 +417,7 @@
*
* Returns zero if the page was not present. find_lock_page() may sleep.
*/
-struct page *find_lock_page(struct address_space *mapping,
+struct page *find_lock_page_async(struct address_space *mapping,
unsigned long offset)
{
struct page *page;
@@ -396,7 +429,10 @@
page_cache_get(page);
if (TestSetPageLocked(page)) {
read_unlock(&mapping->page_lock);
- lock_page(page);
+ if (-EIOCBQUEUED == lock_page_async(page)) {
+ page_cache_release(page);
+ return ERR_PTR(-EIOCBQUEUED);
+ }
read_lock(&mapping->page_lock);

/* Has the page been truncated while we slept? */
@@ -411,6 +447,19 @@
return page;
}

+struct page *find_lock_page(struct address_space *mapping,
+ unsigned long offset)
+{
+ struct page *page;
+ struct kiocb *iocb = current->iocb;
+
+ current->iocb = NULL;
+ page = find_lock_page_async(mapping, offset);
+ current->iocb = iocb;
+
+ return page;
+}
+
/**
* find_or_create_page - locate or add a pagecache page
*
@@ -607,7 +656,13 @@
goto page_ok;

/* Get exclusive access to the page ... */
- lock_page(page);
+
+ if (lock_page_async(page)) {
+ pr_debug("queued lock page \n");
+ error = -EIOCBQUEUED;
+ /* TBD: should we hold on to the cached page ? */
+ goto sync_error;
+ }

/* Did it get unhashed before we got the lock? */
if (!page->mapping) {
@@ -629,12 +684,19 @@
if (!error) {
if (PageUptodate(page))
goto page_ok;
- wait_on_page_locked(page);
+ if (wait_on_page_locked_async(page)) {
+ pr_debug("queued wait_on_page \n");
+ error = -EIOCBQUEUED;
+ /*TBD:should we hold on to the cached page ?*/
+ goto sync_error;
+ }
+
if (PageUptodate(page))
goto page_ok;
error = -EIO;
}

+sync_error:
/* UHHUH! A synchronous read error occurred. Report it */
desc->error = error;
page_cache_release(page);
@@ -806,6 +868,7 @@
ssize_t ret;

init_sync_kiocb(&kiocb, filp);
+ BUG_ON(current->iocb != NULL);
ret = __generic_file_aio_read(&kiocb, &local_iov, 1, ppos);
if (-EIOCBQUEUED == ret)
ret = wait_on_sync_kiocb(&kiocb);
@@ -837,6 +900,7 @@
{
read_descriptor_t desc;

+ BUG_ON(current->iocb != NULL);

if (!count)
return 0;

@@ -1364,7 +1428,9 @@
int err;
struct page *page;
repeat:
- page = find_lock_page(mapping, index);
+ page = find_lock_page_async(mapping, index);
+ if (IS_ERR(page))
+ return page;
if (!page) {
if (!*cached_page) {
*cached_page = page_cache_alloc(mapping);
@@ -1683,6 +1749,10 @@
fault_in_pages_readable(buf, bytes);

page = __grab_cache_page(mapping,index,&cached_page,&lru_pvec);
+ if (IS_ERR(page)) {
+ status = PTR_ERR(page);
+ break;
+ }
if (!page) {
status = -ENOMEM;
break;
@@ -1690,6 +1760,8 @@

status = a_ops->prepare_write(file, page, offset, offset+bytes);
if (unlikely(status)) {
+ if (-EIOCBQUEUED == status)
+ pr_debug("queued prepare_write\n");
/*
* prepare_write() may have instantiated a few blocks
* outside i_size. Trim these off again.
@@ -1730,7 +1802,11 @@
page_cache_release(page);
if (status < 0)
break;
- balance_dirty_pages_ratelimited(mapping);
+ status = balance_dirty_pages_ratelimited(mapping);
+ if (status < 0) {
+ pr_debug("async balance_dirty_pages\n");
+ break;
+ }
cond_resched();
} while (count);
*ppos = pos;
@@ -1742,9 +1818,10 @@
* For now, when the user asks for O_SYNC, we'll actually give O_DSYNC
*/
if (status >= 0) {
- if ((file->f_flags & O_SYNC) || IS_SYNC(inode))
+ if ((file->f_flags & O_SYNC) || IS_SYNC(inode)) {
status = generic_osync_inode(inode,
OSYNC_METADATA|OSYNC_DATA);
+ }
}

out_status:
diff -ur linux-2.5.66/mm/page-writeback.c linux-2.5.66aio/mm/page-writeback.c
--- linux-2.5.66/mm/page-writeback.c Tue Mar 25 03:30:55 2003
+++ linux-2.5.66aio/mm/page-writeback.c Wed Mar 26 19:37:42 2003
@@ -135,7 +135,7 @@
* If we're over `background_thresh' then pdflush is woken to perform some
* writeout.
*/
-void balance_dirty_pages(struct address_space *mapping)
+int balance_dirty_pages(struct address_space *mapping)
{
struct page_state ps;
long background_thresh;
@@ -152,6 +152,7 @@
.sync_mode = WB_SYNC_NONE,
.older_than_this = NULL,
.nr_to_write = write_chunk,
+ .nonblocking = current->iocb ? 1 : 0,
};

dirty_exceeded = 1;
@@ -165,7 +166,10 @@
pages_written += write_chunk - wbc.nr_to_write;
if (pages_written >= write_chunk)
break; /* We've done our duty */
- blk_congestion_wait(WRITE, HZ/10);
+ if (-EIOCBQUEUED == blk_congestion_wait_async(WRITE, HZ/10)) {
+ pr_debug("async blk congestion wait\n");
+ return -EIOCBQUEUED;
+ }
}

if (ps.nr_dirty + ps.nr_writeback <= dirty_thresh)
@@ -173,6 +177,8 @@

if (!writeback_in_progress(bdi) && ps.nr_dirty > background_thresh)
pdflush_operation(background_writeout, 0);
+
+ return 0;
}

/**
@@ -188,7 +194,7 @@
* decrease the ratelimiting by a lot, to prevent individual processes from
* overshooting the limit by (ratelimit_pages) each.
*/
-void balance_dirty_pages_ratelimited(struct address_space *mapping)
+int balance_dirty_pages_ratelimited(struct address_space *mapping)
{
static DEFINE_PER_CPU(int, ratelimits) = 0;
int cpu;
@@ -202,10 +208,10 @@
if (per_cpu(ratelimits, cpu)++ >= ratelimit) {
per_cpu(ratelimits, cpu) = 0;
put_cpu();
- balance_dirty_pages(mapping);
- return;
+ return balance_dirty_pages(mapping);
}
put_cpu();
+ return 0;
}
EXPORT_SYMBOL_GPL(balance_dirty_pages_ratelimited);


Suparna Bhattacharya

Apr 1, 2003, 11:55:01 AM
On Tue, Apr 01, 2003 at 09:59:57PM +0530, Suparna Bhattacharya wrote:
> 04ext2-aiogetblk.patch : an async get block
> implementation for ext2

>
--
Suparna Bhattacharya (sup...@in.ibm.com)
Linux Technology Center
IBM Software Labs, India

diff -ur linux-2.5.66/fs/ext2/balloc.c linux-2.5.66aio/fs/ext2/balloc.c
--- linux-2.5.66/fs/ext2/balloc.c Tue Mar 25 03:30:18 2003
+++ linux-2.5.66aio/fs/ext2/balloc.c Wed Mar 26 19:50:08 2003
@@ -76,7 +76,7 @@
* Return buffer_head on success or NULL in case of failure.
*/
static struct buffer_head *
-read_block_bitmap(struct super_block *sb, unsigned int block_group)
+read_block_bitmap_async(struct super_block *sb, unsigned int block_group)
{
struct ext2_group_desc * desc;
struct buffer_head * bh = NULL;
@@ -84,7 +84,7 @@
desc = ext2_get_group_desc (sb, block_group, NULL);
if (!desc)
goto error_out;
- bh = sb_bread(sb, le32_to_cpu(desc->bg_block_bitmap));
+ bh = sb_bread_async(sb, le32_to_cpu(desc->bg_block_bitmap));
if (!bh)
ext2_error (sb, "read_block_bitmap",
"Cannot read block bitmap - "
@@ -94,6 +94,15 @@
return bh;
}

+static struct buffer_head *
+read_block_bitmap(struct super_block *sb, unsigned int block_group)
+{
+ struct buffer_head * bh = NULL;
+
+ do_sync_op(bh = read_block_bitmap_async(sb, block_group));
+ return bh;
+}
+
static inline int reserve_blocks(struct super_block *sb, int count)
{
struct ext2_sb_info * sbi = EXT2_SB(sb);
@@ -309,7 +318,7 @@
* bitmap, and then for any free bit if that fails.
* This function also updates quota and i_blocks field.
*/
-int ext2_new_block (struct inode * inode, unsigned long goal,
+int ext2_new_block_async (struct inode * inode, unsigned long goal,
u32 * prealloc_count, u32 * prealloc_block, int * err)
{
struct buffer_head *bitmap_bh = NULL;
@@ -401,7 +410,7 @@
}
brelse(bitmap_bh);
bitmap_bh = read_block_bitmap(sb, group_no);
- if (!bitmap_bh)
+ if (!bitmap_bh || IS_ERR(bitmap_bh))
goto io_error;

ret_block = grab_block(bitmap_bh->b_data, group_size, 0);
@@ -481,10 +490,20 @@
return block;

io_error:
- *err = -EIO;
+ *err = IS_ERR(bitmap_bh) ? PTR_ERR(bitmap_bh) : -EIO;
goto out_release;
}

+int ext2_new_block (struct inode * inode, unsigned long goal,
+ u32 * prealloc_count, u32 * prealloc_block, int * err)
+{
+ int block = 0;
+
+ do_sync_op(block = ext2_new_block_async(inode, goal, prealloc_count,
+ prealloc_block, err));
+ return block;
+}
+
unsigned long ext2_count_free_blocks (struct super_block * sb)
{
#ifdef EXT2FS_DEBUG
diff -ur linux-2.5.66/fs/ext2/ext2.h linux-2.5.66aio/fs/ext2/ext2.h
--- linux-2.5.66/fs/ext2/ext2.h Tue Mar 25 03:31:48 2003
+++ linux-2.5.66aio/fs/ext2/ext2.h Tue Mar 25 13:42:02 2003
@@ -74,6 +74,8 @@
extern unsigned long ext2_bg_num_gdb(struct super_block *sb, int group);
extern int ext2_new_block (struct inode *, unsigned long,
__u32 *, __u32 *, int *);
+extern int ext2_new_block_async (struct inode *, unsigned long,
+ __u32 *, __u32 *, int *);
extern void ext2_free_blocks (struct inode *, unsigned long,
unsigned long);
extern unsigned long ext2_count_free_blocks (struct super_block *);
diff -ur linux-2.5.66/fs/ext2/inode.c linux-2.5.66aio/fs/ext2/inode.c
--- linux-2.5.66/fs/ext2/inode.c Tue Mar 25 03:29:57 2003
+++ linux-2.5.66aio/fs/ext2/inode.c Mon Mar 31 21:16:08 2003
@@ -98,7 +98,8 @@
#endif
}

-static int ext2_alloc_block (struct inode * inode, unsigned long goal, int *err)
+static int ext2_alloc_block_async (struct inode * inode, unsigned long goal,
+ int *err)
{
#ifdef EXT2FS_DEBUG
static unsigned long alloc_hits = 0, alloc_attempts = 0;
@@ -123,18 +124,26 @@
ext2_debug ("preallocation miss (%lu/%lu).\n",
alloc_hits, ++alloc_attempts);
if (S_ISREG(inode->i_mode))
- result = ext2_new_block (inode, goal,
+ result = ext2_new_block_async (inode, goal,
&ei->i_prealloc_count,
&ei->i_prealloc_block, err);
else
result = ext2_new_block (inode, goal, 0, 0, err);
}
#else
- result = ext2_new_block (inode, goal, 0, 0, err);
+ result = ext2_new_block_async (inode, goal, 0, 0, err);
#endif
return result;
}

+static int ext2_alloc_block (struct inode * inode, unsigned long goal, int *err)
+{
+ int result;
+
+ do_sync_op(result = ext2_alloc_block_async(inode, goal, err));
+ return result;
+}
+
typedef struct {
u32 *p;
u32 key;
@@ -252,7 +261,7 @@
* or when it reads all @depth-1 indirect blocks successfully and finds
* the whole chain, all way to the data (returns %NULL, *err == 0).
*/
-static Indirect *ext2_get_branch(struct inode *inode,
+static Indirect *ext2_get_branch_async(struct inode *inode,
int depth,
int *offsets,
Indirect chain[4],
@@ -268,8 +277,8 @@
if (!p->key)
goto no_block;
while (--depth) {
- bh = sb_bread(sb, le32_to_cpu(p->key));
- if (!bh)
+ bh = sb_bread_async(sb, le32_to_cpu(p->key));
+ if (!bh || IS_ERR(bh))
goto failure;
read_lock(&EXT2_I(inode)->i_meta_lock);
if (!verify_chain(chain, p))
@@ -287,11 +296,24 @@
*err = -EAGAIN;
goto no_block;
failure:
- *err = -EIO;
+ *err = IS_ERR(bh) ? PTR_ERR(bh) : -EIO;
no_block:
return p;
}

+static Indirect *ext2_get_branch(struct inode *inode,
+ int depth,
+ int *offsets,
+ Indirect chain[4],
+ int *err)
+{
+ Indirect *p;
+
+ do_sync_op(p = ext2_get_branch_async(inode, depth, offsets, chain,
+ err));
+ return p;
+}
+
/**
* ext2_find_near - find a place for allocation with sufficient locality
* @inode: owner
@@ -406,7 +428,7 @@
* as described above and return 0.
*/

-static int ext2_alloc_branch(struct inode *inode,
+static int ext2_alloc_branch_async(struct inode *inode,
int num,
unsigned long goal,
int *offsets,
@@ -422,7 +444,7 @@
if (parent) for (n = 1; n < num; n++) {
struct buffer_head *bh;
/* Allocate the next block */
- int nr = ext2_alloc_block(inode, parent, &err);
+ int nr = ext2_alloc_block_async(inode, parent, &err);
if (!nr)
break;
branch[n].key = cpu_to_le32(nr);
@@ -458,6 +480,19 @@
return err;
}

+static int ext2_alloc_branch(struct inode *inode,
+ int num,
+ unsigned long goal,
+ int *offsets,
+ Indirect *branch)
+{
+ int err;
+
+ do_sync_op(err = ext2_alloc_branch_async(inode, num, goal,
+ offsets, branch));
+ return err;
+}
+
/**
* ext2_splice_branch - splice the allocated branch onto inode.
* @inode: owner
@@ -531,7 +566,7 @@
* reachable from inode.
*/

-static int ext2_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create)
+static int ext2_get_block_async(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create)
{
int err = -EIO;
int offsets[4];
@@ -546,7 +581,7 @@
goto out;

reread:
- partial = ext2_get_branch(inode, depth, offsets, chain, &err);
+ partial = ext2_get_branch_async(inode, depth, offsets, chain, &err);

/* Simplest case - block found, no allocation needed */
if (!partial) {
@@ -560,7 +595,7 @@
}

/* Next simple case - plain lookup or failed read of indirect block */
- if (!create || err == -EIO) {
+ if (!create || err == -EIO || err == -EIOCBQUEUED) {
cleanup:
while (partial > chain) {
brelse(partial->bh);
@@ -582,7 +617,7 @@
goto changed;

left = (chain + depth) - partial;
- err = ext2_alloc_branch(inode, left, goal,
+ err = ext2_alloc_branch_async(inode, left, goal,
offsets+(partial-chain), partial);
if (err)
goto cleanup;
@@ -601,6 +636,15 @@
goto reread;
}

+static int ext2_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create)
+{
+ int err;
+
+ do_sync_op(err = ext2_get_block_async(inode, iblock, bh_result,
+ create));
+ return err;
+}
+
static int ext2_writepage(struct page *page, struct writeback_control *wbc)
{
return block_write_full_page(page, ext2_get_block, wbc);
@@ -622,7 +666,7 @@
ext2_prepare_write(struct file *file, struct page *page,
unsigned from, unsigned to)
{
- return block_prepare_write(page,from,to,ext2_get_block);
+ return block_prepare_write(page,from,to,ext2_get_block_async);
}

static int

Suparna Bhattacharya

Apr 1, 2003, 11:55:02 AM
On Tue, Apr 01, 2003 at 09:59:57PM +0530, Suparna Bhattacharya wrote:
> 03aiobread.patch : code for async breads which can
> be used by filesystems for providing async get block
> implementation

--
Suparna Bhattacharya (sup...@in.ibm.com)
Linux Technology Center
IBM Software Labs, India

diff -ur linux-2.5.66/fs/buffer.c linux-2.5.66aio/fs/buffer.c


--- linux-2.5.66/fs/buffer.c Tue Mar 25 03:30:48 2003
+++ linux-2.5.66aio/fs/buffer.c Wed Mar 26 19:11:09 2003

@@ -118,21 +118,38 @@
* from becoming locked again - you have to lock it yourself
* if you want to preserve its state.
*/
-void __wait_on_buffer(struct buffer_head * bh)
+int __wait_on_buffer_async(struct buffer_head * bh)
{
wait_queue_head_t *wqh = bh_waitq_head(bh);
- DEFINE_WAIT(wait);
+ DEFINE_WAIT(sync_wait);
+ wait_queue_t *wait = &sync_wait;
+ int state = TASK_UNINTERRUPTIBLE;
+
+ if (current->iocb) {
+ wait = &current->iocb->ki_wait;
+ state = TASK_RUNNING;
+ }

get_bh(bh);
do {
- prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE);
+ prepare_to_wait(wqh, wait, state);

if (buffer_locked(bh)) {
blk_run_queues();
+ if (current->iocb) {
+ put_bh(bh); /* TBD: is this correct ? */
+ return -EIOCBQUEUED;
+ }
io_schedule();
}
} while (buffer_locked(bh));
put_bh(bh);
- finish_wait(wqh, &wait);
+ finish_wait(wqh, wait);
+ return 0;
+}
+
+void __wait_on_buffer(struct buffer_head * bh)
+{
+ do_sync_op(__wait_on_buffer_async(bh));
}

static void
@@ -1188,9 +1205,11 @@
__brelse(bh);
}

-static struct buffer_head *__bread_slow(struct buffer_head *bh)
+static struct buffer_head *__bread_slow_async(struct buffer_head *bh)
{
- lock_buffer(bh);
+ if (-EIOCBQUEUED == lock_buffer_async(bh))
+ return ERR_PTR(-EIOCBQUEUED);
+
if (buffer_uptodate(bh)) {
unlock_buffer(bh);
return bh;
@@ -1200,7 +1219,8 @@
get_bh(bh);
bh->b_end_io = end_buffer_io_sync;
submit_bh(READ, bh);
- wait_on_buffer(bh);
+ if (-EIOCBQUEUED == wait_on_buffer_async(bh))
+ return ERR_PTR(-EIOCBQUEUED);
if (buffer_uptodate(bh))
return bh;
}
@@ -1208,6 +1228,14 @@
return NULL;
}

+static inline struct buffer_head *__bread_slow(struct buffer_head *bh)
+{
+ struct buffer_head *ret_bh;
+
+ do_sync_op(ret_bh = __bread_slow_async(bh));
+ return ret_bh;
+}
+
/*
* Per-cpu buffer LRU implementation. To reduce the cost of __find_get_block().
* The bhs[] array is sorted - newest buffer is at bhs[0]. Buffers have their
@@ -1382,6 +1410,17 @@
bh = __bread_slow(bh);
return bh;
}
+
+
+struct buffer_head *
+__bread_async(struct block_device *bdev, sector_t block, int size)
+{
+ struct buffer_head *bh = __getblk(bdev, block, size);
+
+ if (!buffer_uptodate(bh))
+ bh = __bread_slow_async(bh);
+ return bh;
+}
EXPORT_SYMBOL(__bread);

/*
diff -ur linux-2.5.66/include/linux/buffer_head.h linux-2.5.66aio/include/linux/buffer_head.h
--- linux-2.5.66/include/linux/buffer_head.h Tue Mar 25 03:29:54 2003
+++ linux-2.5.66aio/include/linux/buffer_head.h Wed Mar 26 19:26:42 2003
@@ -156,6 +156,7 @@
void __invalidate_buffers(kdev_t dev, int);
int sync_blockdev(struct block_device *bdev);
void __wait_on_buffer(struct buffer_head *);
+int __wait_on_buffer_async(struct buffer_head *);
wait_queue_head_t *bh_waitq_head(struct buffer_head *bh);
void wake_up_buffer(struct buffer_head *bh);
int fsync_bdev(struct block_device *);
@@ -166,6 +167,8 @@
void __brelse(struct buffer_head *);
void __bforget(struct buffer_head *);
struct buffer_head *__bread(struct block_device *, sector_t block, int size);
+struct buffer_head *__bread_async(struct block_device *, sector_t block,
+ int size);
struct buffer_head *alloc_buffer_head(void);
void free_buffer_head(struct buffer_head * bh);
void FASTCALL(unlock_buffer(struct buffer_head *bh));
@@ -237,6 +240,12 @@
return __bread(sb->s_bdev, block, sb->s_blocksize);
}

+static inline struct buffer_head *sb_bread_async(struct super_block *sb,
+ sector_t block)
+{
+ return __bread_async(sb->s_bdev, block, sb->s_blocksize);
+}
+
static inline struct buffer_head *sb_getblk(struct super_block *sb, sector_t block)
{
return __getblk(sb->s_bdev, block, sb->s_blocksize);
@@ -262,12 +271,28 @@
__wait_on_buffer(bh);
}

+static inline int wait_on_buffer_async(struct buffer_head *bh)
+{
+ if (buffer_locked(bh))
+ return __wait_on_buffer_async(bh);
+
+ return 0;
+}
+

static inline void lock_buffer(struct buffer_head *bh)
{
while (test_set_buffer_locked(bh))
__wait_on_buffer(bh);
}

+static inline int lock_buffer_async(struct buffer_head *bh)
+{
+ if (test_set_buffer_locked(bh))
+ return __wait_on_buffer_async(bh);
+
+ return 0;
+}
+

/*
* Debug
*/

Christoph Hellwig

Apr 1, 2003, 12:10:05 PM
> +int blk_congestion_wait_async(int rw, long timeout)

Isn't the name a bit silly? :)

Suparna Bhattacharya

Apr 2, 2003, 5:27:19 AM
On Tue, Apr 01, 2003 at 03:27:13PM -0500, Benjamin LaHaise wrote:
> On Tue, Apr 01, 2003 at 09:59:57PM +0530, Suparna Bhattacharya wrote:
> > I would really appreciate comments and review feedback
> > from the perspective of fs developers especially on
> > the latter 2 patches in terms of whether this seems a
> > sound approach or if I'm missing something very crucial
> > (which I just well might be)
> > Is this easy to do for other filesystems as well ?
>
> I disagree with putting the iocb pointer in the task_struct: it feels
> completely bogus as it modifies semantics behind the scenes without
> fixing APIs.

You mean we could pass the iocb as a parameter all the way down
for the async versions of the ops and do_sync_op() could just do
a wait_for_sync_iocb() ?

That was what I'd originally intended to do.
But then I experimented with the current->iocb alternative
because:

1. I wasn't sure how much API fixing we could do at this stage.
(it is after all pretty late in the 2.5 cycle)
If you notice, I've been trying to tread very carefully in
terms of the modifications to interfaces, especially anything
that requires changes to all filesystems.
2. I wanted to quickly have something we could play with and run
performance tests on, with minimal changes/impact on existing
code paths and sync i/o operations. Additionally current->iocb
gave me a simple way to detect blocking operations (schedules)
during aio, no matter how deep a subroutine we are in. (I have
been using those indicators to prioritize which blocking
points to tackle)
3. After a first pass of trying to use retries for sync ops
as well, it seemed like being able to continue from a blocking
point directly as we do today would be more efficient (In
this case, we do care more about latency than we do for async
ops). So that meant a switch between returning -EIOCBQUEUED and
blocking depending on whether this was an async or sync
context. I could do that with an is_sync_iocb() check as
well (vs current->iocb), but even that would be changing
semantics.

So if (1) is sorted out, i.e. we still have the opportunity
to alter some APIs, then we could do it that way.
Do we?
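
For comparison, the explicit-parameter alternative raised at the top of this mail would look roughly as below -- the kiocb travels down the call chain instead of living in current->iocb, and the switch described in (3) becomes an is_sync_kiocb() test. Purely illustrative; none of this is in the posted patches, and the my_* names are placeholders:

/* Hypothetical: the iocb is passed explicitly rather than via current->iocb. */
static int my_wait_async(wait_queue_head_t *wqh, int (*ready)(void *),
                         void *arg, struct kiocb *iocb)
{
        DEFINE_WAIT(sync_wait);
        wait_queue_t *wait = is_sync_kiocb(iocb) ? &sync_wait : &iocb->ki_wait;
        int state = is_sync_kiocb(iocb) ? TASK_UNINTERRUPTIBLE : TASK_RUNNING;

        do {
                prepare_to_wait(wqh, wait, state);
                if (!ready(arg)) {
                        if (!is_sync_kiocb(iocb))
                                return -EIOCBQUEUED;
                        io_schedule();
                }
        } while (!ready(arg));
        finish_wait(wqh, wait);
        return 0;
}

The price is that every function between the entry point and the blocking point grows a struct kiocb * argument, which is exactly the kind of interface churn point (1) is worried about.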

Regards
Suparna

--
Suparna Bhattacharya (sup...@in.ibm.com)
Linux Technology Center
IBM Software Labs, India


Suparna Bhattacharya

Apr 3, 2003, 10:00:40 PM
On Wed, Apr 02, 2003 at 03:49:01PM +0530, Suparna Bhattacharya wrote:
> On Tue, Apr 01, 2003 at 03:27:13PM -0500, Benjamin LaHaise wrote:
> > On Tue, Apr 01, 2003 at 09:59:57PM +0530, Suparna Bhattacharya wrote:
> > > I would really appreciate comments and review feedback
> > > from the perspective of fs developers especially on
> > > the latter 2 patches in terms of whether this seems a
> > > sound approach or if I'm missing something very crucial
> > > (which I just well might be)
> > > Is this easy to do for other filesystems as well ?
> >
> > I disagree with putting the iocb pointer in the task_struct: it feels
> > completely bogus as it modifies semantics behind the scenes without
> > fixing APIs.

I later remembered one more reason why I'd tried this out -- it
enabled me to play with async handling of page faults (i.e. an
async fault_in_pages .. or a retriable copy_xxx_user). I didn't
want to include that code until/unless I saw some real gains, so it's
not an important consideration, but nevertheless it was an
added flexibility.

BTW, does making this a wait queue entry pointer rather than an iocb
pointer sound any better (i.e. tsk->io_wait instead of tsk->iocb)? The
code turns out to be cleaner, and the semantics feel a little
more natural ... (though maybe it's just because I've become used
to it :))
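
Very roughly, with a wait queue entry pointer in the task_struct the blocking points would not need to know about kiocbs at all; they would only pick up a wait entry. The sketch below is hypothetical -- current->io_wait is an invented field for illustration, and the posted patches only add current->iocb:

/* Hypothetical tsk->io_wait variant; nothing here is in the posted patches. */
static inline wait_queue_t *async_wait_entry(wait_queue_t *sync_wait, int *state)
{
        if (current->io_wait) {
                /* async context: don't sleep, caller returns -EIOCBQUEUED */
                *state = TASK_RUNNING;
                return current->io_wait;
        }
        *state = TASK_UNINTERRUPTIBLE;
        return sync_wait;
}

A blocking point would then call this in place of the open-coded "if (current->iocb)" checks in the earlier patches, and only the aio core would ever dereference the kiocb behind the wait entry.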

Regards
Suparna

