[PATCH 2/2] NFSv4.1: Fix a race in nfs4_write

Trond Myklebust

unread,

Jan 13, 2014, 1:50:01 PM1/13/14

to

nfs4_write_inode() must not be allowed to exit until the layoutcommit
is done. That means that both NFS_INO_LAYOUTCOMMIT and
NFS_INO_LAYOUTCOMMITTING have to be cleared.

Signed-off-by: Trond Myklebust <trond.m...@primarydata.com>
---
fs/nfs/nfs4super.c | 14 +++---------
fs/nfs/pnfs.c | 67 ++++++++++++++++++++++++++++--------------------------
2 files changed, 38 insertions(+), 43 deletions(-)

diff --git a/fs/nfs/nfs4super.c b/fs/nfs/nfs4super.c
index 65ab0a0ca1c4..808f29574412 100644
--- a/fs/nfs/nfs4super.c
+++ b/fs/nfs/nfs4super.c
@@ -77,17 +77,9 @@ static int nfs4_write_inode(struct inode *inode, struct writeback_control *wbc)
{
int ret = nfs_write_inode(inode, wbc);

- if (ret >= 0 && test_bit(NFS_INO_LAYOUTCOMMIT, &NFS_I(inode)->flags)) {
- int status;
- bool sync = true;
-
- if (wbc->sync_mode == WB_SYNC_NONE)
- sync = false;
-
- status = pnfs_layoutcommit_inode(inode, sync);
- if (status < 0)
- return status;
- }
+ if (ret == 0)
+ ret = pnfs_layoutcommit_inode(inode,
+ wbc->sync_mode == WB_SYNC_ALL);
return ret;
}

diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index d75d938d36cb..4755858e37a0 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -1790,6 +1790,15 @@ pnfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc)
}
EXPORT_SYMBOL_GPL(pnfs_generic_pg_readpages);

+static void pnfs_clear_layoutcommitting(struct inode *inode)
+{
+ unsigned long *bitlock = &NFS_I(inode)->flags;
+
+ clear_bit_unlock(NFS_INO_LAYOUTCOMMITTING, bitlock);
+ smp_mb__after_clear_bit();
+ wake_up_bit(bitlock, NFS_INO_LAYOUTCOMMITTING);
+}
+
/*
* There can be multiple RW segments.
*/
@@ -1807,7 +1816,6 @@ static void pnfs_list_write_lseg(struct inode *inode, struct list_head *listp)
static void pnfs_list_write_lseg_done(struct inode *inode, struct list_head *listp)
{
struct pnfs_layout_segment *lseg, *tmp;
- unsigned long *bitlock = &NFS_I(inode)->flags;

/* Matched by references in pnfs_set_layoutcommit */
list_for_each_entry_safe(lseg, tmp, listp, pls_lc_list) {
@@ -1815,9 +1823,7 @@ static void pnfs_list_write_lseg_done(struct inode *inode, struct list_head *lis
pnfs_put_lseg(lseg);
}

- clear_bit_unlock(NFS_INO_LAYOUTCOMMITTING, bitlock);
- smp_mb__after_clear_bit();
- wake_up_bit(bitlock, NFS_INO_LAYOUTCOMMITTING);
+ pnfs_clear_layoutcommitting(inode);
}

void pnfs_set_lo_fail(struct pnfs_layout_segment *lseg)
@@ -1881,43 +1887,37 @@ pnfs_layoutcommit_inode(struct inode *inode, bool sync)
struct nfs4_layoutcommit_data *data;
struct nfs_inode *nfsi = NFS_I(inode);
loff_t end_pos;
- int status = 0;
+ int status;

- dprintk("--> %s inode %lu\n", __func__, inode->i_ino);
-
- if (!test_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags))
+ if (!pnfs_layoutcommit_outstanding(inode))
return 0;

- /* Note kzalloc ensures data->res.seq_res.sr_slot == NULL */
- data = kzalloc(sizeof(*data), GFP_NOFS);
- if (!data) {
- status = -ENOMEM;
- goto out;
- }
-
- if (!test_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags))
- goto out_free;
+ dprintk("--> %s inode %lu\n", __func__, inode->i_ino);

+ status = -EAGAIN;
if (test_and_set_bit(NFS_INO_LAYOUTCOMMITTING, &nfsi->flags)) {
- if (!sync) {
- status = -EAGAIN;
- goto out_free;
- }
- status = wait_on_bit_lock(&nfsi->flags, NFS_INO_LAYOUTCOMMITTING,
- nfs_wait_bit_killable, TASK_KILLABLE);
+ if (!sync)
+ goto out;
+ status = wait_on_bit_lock(&nfsi->flags,
+ NFS_INO_LAYOUTCOMMITTING,
+ nfs_wait_bit_killable,
+ TASK_KILLABLE);
if (status)
- goto out_free;
+ goto out;
}

- INIT_LIST_HEAD(&data->lseg_list);
+ status = -ENOMEM;
+ /* Note kzalloc ensures data->res.seq_res.sr_slot == NULL */
+ data = kzalloc(sizeof(*data), GFP_NOFS);
+ if (!data)
+ goto clear_layoutcommitting;
+
+ status = 0;
spin_lock(&inode->i_lock);
- if (!test_and_clear_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) {
- clear_bit(NFS_INO_LAYOUTCOMMITTING, &nfsi->flags);
- spin_unlock(&inode->i_lock);
- wake_up_bit(&nfsi->flags, NFS_INO_LAYOUTCOMMITTING);
- goto out_free;
- }
+ if (!test_and_clear_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags))
+ goto out_unlock;

+ INIT_LIST_HEAD(&data->lseg_list);
pnfs_list_write_lseg(inode, &data->lseg_list);

end_pos = nfsi->layout->plh_lwb;
@@ -1940,8 +1940,11 @@ out:
mark_inode_dirty_sync(inode);
dprintk("<-- %s status %d\n", __func__, status);
return status;
-out_free:
+out_unlock:
+ spin_unlock(&inode->i_lock);
kfree(data);
+clear_layoutcommitting:
+ pnfs_clear_layoutcommitting(inode);
goto out;
}

--
1.8.4.2

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majo...@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/

Trond Myklebust

unread,

Jan 13, 2014, 1:50:02 PM1/13/14

to

If a LAYOUTCOMMIT is outstanding, then chances are that the metadata
server may still be returning incorrect values for the change attribute,
ctime, mtime and/or size.
Just ignore those attributes for now, and wait for the LAYOUTCOMMIT
rpc call to finish.

Reported-by: shaobingqing <shaobi...@bwstor.com.cn>

Signed-off-by: Trond Myklebust <trond.m...@primarydata.com>
---

fs/nfs/inode.c | 19 +++++++++++++++++--
fs/nfs/nfs4proc.c | 5 ++---
fs/nfs/pnfs.h | 16 ++++++++++++++++
3 files changed, 35 insertions(+), 5 deletions(-)

diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 5feec233895d..c63e15224466 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -1283,12 +1283,28 @@ static int nfs_inode_attrs_need_update(const struct inode *inode, const struct n
((long)nfsi->attr_gencount - (long)nfs_read_attr_generation_counter() > 0);
}

+/*
+ * Don't trust the change_attribute, mtime, ctime or size if
+ * a pnfs LAYOUTCOMMIT is outstanding
+ */
+static void nfs_inode_attrs_handle_layoutcommit(struct inode *inode,
+ struct nfs_fattr *fattr)
+{
+ if (pnfs_layoutcommit_outstanding(inode))
+ fattr->valid &= ~(NFS_ATTR_FATTR_CHANGE |
+ NFS_ATTR_FATTR_MTIME |
+ NFS_ATTR_FATTR_CTIME |
+ NFS_ATTR_FATTR_SIZE);
+}
+
static int nfs_refresh_inode_locked(struct inode *inode, struct nfs_fattr *fattr)
{
int ret;

trace_nfs_refresh_inode_enter(inode);

+ nfs_inode_attrs_handle_layoutcommit(inode, fattr);
+
if (nfs_inode_attrs_need_update(inode, fattr))
ret = nfs_update_inode(inode, fattr);
else
@@ -1518,8 +1534,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
if (new_isize != cur_isize) {
/* Do we perhaps have any outstanding writes, or has
* the file grown beyond our last write? */
- if ((nfsi->npages == 0 && !test_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) ||
- new_isize > cur_isize) {
+ if ((nfsi->npages == 0) || new_isize > cur_isize) {
i_size_write(inode, new_isize);
invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA;
}
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 15052b81df42..f4908eb40a21 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -7780,10 +7780,7 @@ nfs4_layoutcommit_done(struct rpc_task *task, void *calldata)
case -NFS4ERR_BADLAYOUT: /* no layout */
case -NFS4ERR_GRACE: /* loca_recalim always false */
task->tk_status = 0;
- break;
case 0:
- nfs_post_op_update_inode_force_wcc(data->args.inode,
- data->res.fattr);
break;
default:
if (nfs4_async_handle_error(task, server, NULL) == -EAGAIN) {
@@ -7798,6 +7795,8 @@ static void nfs4_layoutcommit_release(void *calldata)
struct nfs4_layoutcommit_data *data = calldata;

pnfs_cleanup_layoutcommit(data);
+ nfs_post_op_update_inode_force_wcc(data->args.inode,
+ data->res.fattr);
put_rpccred(data->cred);
kfree(data);
}
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index a4f41810a7f4..023793909778 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -359,6 +359,15 @@ pnfs_ld_layoutret_on_setattr(struct inode *inode)
PNFS_LAYOUTRET_ON_SETATTR;
}

+static inline bool
+pnfs_layoutcommit_outstanding(struct inode *inode)
+{
+ struct nfs_inode *nfsi = NFS_I(inode);
+
+ return test_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags) != 0 ||
+ test_bit(NFS_INO_LAYOUTCOMMITTING, &nfsi->flags) != 0;
+}
+
static inline int pnfs_return_layout(struct inode *ino)
{
struct nfs_inode *nfsi = NFS_I(ino);
@@ -515,6 +524,13 @@ pnfs_use_threshold(struct nfs4_threshold **dst, struct nfs4_threshold *src,
return false;
}

+static inline bool
+pnfs_layoutcommit_outstanding(struct inode *inode)
+{
+ return false;
+}
+
+
static inline struct nfs4_threshold *pnfs_mdsthreshold_alloc(void)
{
return NULL;

Peng Tao

unread,

Jan 16, 2014, 10:50:03 AM1/16/14

to

On Tue, Jan 14, 2014 at 2:45 AM, Trond Myklebust
<trond.m...@primarydata.com> wrote:
> nfs4_write_inode() must not be allowed to exit until the layoutcommit
> is done. That means that both NFS_INO_LAYOUTCOMMIT and
> NFS_INO_LAYOUTCOMMITTING have to be cleared.
>
> Signed-off-by: Trond Myklebust <trond.m...@primarydata.com>
> ---
> fs/nfs/nfs4super.c | 14 +++---------
> fs/nfs/pnfs.c | 67 ++++++++++++++++++++++++++++--------------------------
> 2 files changed, 38 insertions(+), 43 deletions(-)
>
> diff --git a/fs/nfs/nfs4super.c b/fs/nfs/nfs4super.c
> index 65ab0a0ca1c4..808f29574412 100644
> --- a/fs/nfs/nfs4super.c
> +++ b/fs/nfs/nfs4super.c
> @@ -77,17 +77,9 @@ static int nfs4_write_inode(struct inode *inode, struct writeback_control *wbc)
> {
> int ret = nfs_write_inode(inode, wbc);
>
> - if (ret >= 0 && test_bit(NFS_INO_LAYOUTCOMMIT, &NFS_I(inode)->flags)) {
> - int status;

> - bool sync = true;!test_and_clear_bit

This might be a problem. If nfsi->flags has !NFS_INO_LAYOUTCOMMIT and
NFS_INO_LAYOUTCOMMITTING, client cannot issue a new layoutcommit after
the inflight one finishes. It might not be an issue for file layout as
long as we only use layoutcommit to update time, but it can cause data
corruption for block layout.

Thanks,
Tao

Trond Myklebust

unread,

Jan 16, 2014, 12:20:02 PM1/16/14

to

I donï¿½t understand.

With the new patch, if _either_ NFS_INO_LAYOUTCOMMIT or NFS_INO_LAYOUTCOMMITTING are set, then the client will wait until NFS_INO_LAYOUTCOMMITTING can be locked, it will test for NFS_INO_LAYOUTCOMMIT, and then either issue a new layout commit or exit. How can that cause new breakage for blocks?

The only issues that Iï¿½m aware of with the blocks layout and LAYOUTCOMMIT today are:
1. encode_pnfs_block_layoutupdate() runs out of XDR buffer space after 4-5 iterations in the list_for_each_entry_safe() loop. That is because nobody has yet added support for preallocating a page buffer to store the (potentially very large) array of extents. BTW: that array looks like a perfect candidate for xdr_encode_array2() if we could teach the latter about xdr_stream...
2. the blocks layout also needs to be able handle the case where the list of extents is so large that a single LAYOUTCOMMIT is not sufficient. There is no reason why it should not be able to send multiple LAYOUTCOMMIT rpc calls when the size exceeds the session forward channel's negotiated max_rqst_sz.

--
Trond Myklebust
Linux NFS client maintainer

Peng Tao

unread,

Jan 17, 2014, 8:10:02 AM1/17/14

to

On Fri, Jan 17, 2014 at 1:11 AM, Trond Myklebust

<trond.m...@primarydata.com> wrote:
>
> On Jan 16, 2014, at 10:49, Peng Tao <berg...@gmail.com> wrote:
>> On Tue, Jan 14, 2014 at 2:45 AM, Trond Myklebust
>> <trond.m...@primarydata.com> wrote:
>>> void pnfs_set_lo_fail(struct pnfs_layout_segment *lseg)
>>> @@ -1881,43 +1887,37 @@ pnfs_layoutcommit_inode(struct inode *inode, bool sync)
>>> struct nfs4_layoutcommit_data *data;
>>> struct nfs_inode *nfsi = NFS_I(inode);
>>> loff_t end_pos;
>>> - int status = 0;
>>> + int status;
>>>
>>> - dprintk("--> %s inode %lu\n", __func__, inode->i_ino);
>>> -
>>> - if (!test_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags))
>>> + if (!pnfs_layoutcommit_outstanding(inode))
>> This might be a problem. If nfsi->flags has !NFS_INO_LAYOUTCOMMIT and
>> NFS_INO_LAYOUTCOMMITTING, client cannot issue a new layoutcommit after
>> the inflight one finishes. It might not be an issue for file layout as
>> long as we only use layoutcommit to update time, but it can cause data
>> corruption for block layout.
>

> I don’t understand.

>
> With the new patch, if _either_ NFS_INO_LAYOUTCOMMIT or NFS_INO_LAYOUTCOMMITTING are set, then the client will wait until NFS_INO_LAYOUTCOMMITTING can be locked, it will test for NFS_INO_LAYOUTCOMMIT, and then either issue a new layout commit or exit. How can that cause new breakage for blocks?
>

ah, sorry, I read the old code as your patch... yeah, so you actually
fixed the issue instead of introducing it.

> The only issues that I’m aware of with the blocks layout and LAYOUTCOMMIT today are:

> 1. encode_pnfs_block_layoutupdate() runs out of XDR buffer space after 4-5 iterations in the list_for_each_entry_safe() loop. That is because nobody has yet added support for preallocating a page buffer to store the (potentially very large) array of extents. BTW: that array looks like a perfect candidate for xdr_encode_array2() if we could teach the latter about xdr_stream...

This can also be fixed by limiting the number of extents allowed to be
sent in one single layoutcommit.

> 2. the blocks layout also needs to be able handle the case where the list of extents is so large that a single LAYOUTCOMMIT is not sufficient. There is no reason why it should not be able to send multiple LAYOUTCOMMIT rpc calls when the size exceeds the session forward channel's negotiated max_rqst_sz.
>

You are absolutely right.

Thanks,
Tao

[PATCH 2/2] NFSv4.1: Fix a race in nfs4_write_inode

Trond Myklebust

Trond Myklebust

Peng Tao

Trond Myklebust

Peng Tao