From 38511722446993d926861696194c39ef135d85a4 Mon Sep 17 00:00:00 2001 From: Fred Isaman Date: Thu, 3 Feb 2011 18:28:50 +0000 Subject: [PATCH 01/54] pnfs: avoid incorrect use of layout stateid The code could violate the following from RFC5661, section 12.5.3: "Once a client has no more layouts on a file, the layout stateid is no longer valid and MUST NOT be used." This can occur when a layout already has a lseg, starts another non-everlapping LAYOUTGET, and a CB_LAYOUTRECALL for the existing lseg is processed before we hit pnfs_layout_process(). Solve by setting, each time the client has no more lsegs for a file, a flag which blocks further use of the layout and triggers its removal. This also fixes a second bug which occurs in the same instance as above. If we actually use pnfs_layout_process, we add the new lseg to the layout, but the layout has been removed from the nfs_client list by the intervening CB_LAYOUTRECALL and will not be added back. Thus the newly acquired lseg will not be properly returned in the event of a subsequent CB_LAYOUTRECALL. Signed-off-by: Fred Isaman Signed-off-by: Trond Myklebust --- fs/nfs/pnfs.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 1b1bc1a0fb0a..c8d9b2148cb1 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -255,6 +255,9 @@ put_lseg_locked(struct pnfs_layout_segment *lseg, list_del_init(&lseg->pls_layout->plh_layouts); spin_unlock(&clp->cl_lock); clear_bit(NFS_LAYOUT_BULK_RECALL, &lseg->pls_layout->plh_flags); + set_bit(NFS_LAYOUT_DESTROYED, &lseg->pls_layout->plh_flags); + /* Matched by initial refcount set in alloc_init_layout_hdr */ + put_layout_hdr_locked(lseg->pls_layout); } rpc_wake_up(&NFS_SERVER(ino)->roc_rpcwaitq); list_add(&lseg->pls_list, tmp_list); @@ -299,6 +302,11 @@ mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo, dprintk("%s:Begin lo %p\n", __func__, lo); + if (list_empty(&lo->plh_segs)) { + if (!test_and_set_bit(NFS_LAYOUT_DESTROYED, &lo->plh_flags)) + put_layout_hdr_locked(lo); + return 0; + } list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list) if (should_free_lseg(lseg->pls_range.iomode, iomode)) { dprintk("%s: freeing lseg %p iomode %d " @@ -332,10 +340,8 @@ pnfs_destroy_layout(struct nfs_inode *nfsi) spin_lock(&nfsi->vfs_inode.i_lock); lo = nfsi->layout; if (lo) { - set_bit(NFS_LAYOUT_DESTROYED, &nfsi->layout->plh_flags); + lo->plh_block_lgets++; /* permanently block new LAYOUTGETs */ mark_matching_lsegs_invalid(lo, &tmp_list, IOMODE_ANY); - /* Matched by refcount set to 1 in alloc_init_layout_hdr */ - put_layout_hdr_locked(lo); } spin_unlock(&nfsi->vfs_inode.i_lock); pnfs_free_lseg_list(&tmp_list); @@ -403,6 +409,7 @@ pnfs_layoutgets_blocked(struct pnfs_layout_hdr *lo, nfs4_stateid *stateid, (int)(lo->plh_barrier - be32_to_cpu(stateid->stateid.seqid)) >= 0) return true; return lo->plh_block_lgets || + test_bit(NFS_LAYOUT_DESTROYED, &lo->plh_flags) || test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags) || (list_empty(&lo->plh_segs) && (atomic_read(&lo->plh_outstanding) > lget)); From 9f52c2525e09854ed6aa4cbd83915a56226d86c1 Mon Sep 17 00:00:00 2001 From: Fred Isaman Date: Thu, 3 Feb 2011 18:28:51 +0000 Subject: [PATCH 02/54] pnfs: do not need to clear NFS_LAYOUT_BULK_RECALL flag We do not need to clear the NFS_LAYOUT_BULK_RECALL, as setting it guarantees that NFS_LAYOUT_DESTROYED will be set once any outstanding io is finished. Signed-off-by: Fred Isaman Signed-off-by: Trond Myklebust --- fs/nfs/pnfs.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index c8d9b2148cb1..c17edfbbaebf 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -254,7 +254,6 @@ put_lseg_locked(struct pnfs_layout_segment *lseg, /* List does not take a reference, so no need for put here */ list_del_init(&lseg->pls_layout->plh_layouts); spin_unlock(&clp->cl_lock); - clear_bit(NFS_LAYOUT_BULK_RECALL, &lseg->pls_layout->plh_flags); set_bit(NFS_LAYOUT_DESTROYED, &lseg->pls_layout->plh_flags); /* Matched by initial refcount set in alloc_init_layout_hdr */ put_layout_hdr_locked(lseg->pls_layout); @@ -754,7 +753,6 @@ pnfs_update_layout(struct inode *ino, spin_lock(&clp->cl_lock); list_del_init(&lo->plh_layouts); spin_unlock(&clp->cl_lock); - clear_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags); } spin_unlock(&ino->i_lock); } From f49f9baac8f63de9cbc17a0a84e04060496e8e76 Mon Sep 17 00:00:00 2001 From: Fred Isaman Date: Thu, 3 Feb 2011 18:28:52 +0000 Subject: [PATCH 03/54] pnfs: fix pnfs lock inversion of i_lock and cl_lock The pnfs code was using throughout the lock order i_lock, cl_lock. This conflicts with the nfs delegation code. Rework the pnfs code to avoid taking both locks simultaneously. Currently the code takes the double lock to add/remove the layout to a nfs_client list, while atomically checking that the list of lsegs is empty. To avoid this, we rely on existing serializations. When a layout is initialized with lseg count equal zero, LAYOUTGET's openstateid serialization is in effect, making it safe to assume it stays zero unless we change it. And once a layout's lseg count drops to zero, it is set as DESTROYED and so will stay at zero. Signed-off-by: Fred Isaman Signed-off-by: Trond Myklebust --- fs/nfs/callback_proc.c | 2 +- fs/nfs/pnfs.c | 42 +++++++++++++++++++++++++----------------- 2 files changed, 26 insertions(+), 18 deletions(-) diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c index 89587573fe50..2f41dccea18e 100644 --- a/fs/nfs/callback_proc.c +++ b/fs/nfs/callback_proc.c @@ -188,10 +188,10 @@ static u32 initiate_bulk_draining(struct nfs_client *clp, rv = NFS4ERR_DELAY; list_del_init(&lo->plh_bulk_recall); spin_unlock(&ino->i_lock); + pnfs_free_lseg_list(&free_me_list); put_layout_hdr(lo); iput(ino); } - pnfs_free_lseg_list(&free_me_list); return rv; } diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index c17edfbbaebf..0f5b66f90d17 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -247,13 +247,6 @@ put_lseg_locked(struct pnfs_layout_segment *lseg, BUG_ON(test_bit(NFS_LSEG_VALID, &lseg->pls_flags)); list_del(&lseg->pls_list); if (list_empty(&lseg->pls_layout->plh_segs)) { - struct nfs_client *clp; - - clp = NFS_SERVER(ino)->nfs_client; - spin_lock(&clp->cl_lock); - /* List does not take a reference, so no need for put here */ - list_del_init(&lseg->pls_layout->plh_layouts); - spin_unlock(&clp->cl_lock); set_bit(NFS_LAYOUT_DESTROYED, &lseg->pls_layout->plh_flags); /* Matched by initial refcount set in alloc_init_layout_hdr */ put_layout_hdr_locked(lseg->pls_layout); @@ -319,11 +312,27 @@ mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo, return invalid - removed; } +/* note free_me must contain lsegs from a single layout_hdr */ void pnfs_free_lseg_list(struct list_head *free_me) { struct pnfs_layout_segment *lseg, *tmp; + struct pnfs_layout_hdr *lo; + if (list_empty(free_me)) + return; + + lo = list_first_entry(free_me, struct pnfs_layout_segment, + pls_list)->pls_layout; + + if (test_bit(NFS_LAYOUT_DESTROYED, &lo->plh_flags)) { + struct nfs_client *clp; + + clp = NFS_SERVER(lo->plh_inode)->nfs_client; + spin_lock(&clp->cl_lock); + list_del_init(&lo->plh_layouts); + spin_unlock(&clp->cl_lock); + } list_for_each_entry_safe(lseg, tmp, free_me, pls_list) { list_del(&lseg->pls_list); free_lseg(lseg); @@ -705,6 +714,7 @@ pnfs_update_layout(struct inode *ino, struct nfs_client *clp = NFS_SERVER(ino)->nfs_client; struct pnfs_layout_hdr *lo; struct pnfs_layout_segment *lseg = NULL; + bool first = false; if (!pnfs_enabled_sb(NFS_SERVER(ino))) return NULL; @@ -735,7 +745,10 @@ pnfs_update_layout(struct inode *ino, atomic_inc(&lo->plh_outstanding); get_layout_hdr(lo); - if (list_empty(&lo->plh_segs)) { + if (list_empty(&lo->plh_segs)) + first = true; + spin_unlock(&ino->i_lock); + if (first) { /* The lo must be on the clp list if there is any * chance of a CB_LAYOUTRECALL(FILE) coming in. */ @@ -744,17 +757,12 @@ pnfs_update_layout(struct inode *ino, list_add_tail(&lo->plh_layouts, &clp->cl_layouts); spin_unlock(&clp->cl_lock); } - spin_unlock(&ino->i_lock); lseg = send_layoutget(lo, ctx, iomode); - if (!lseg) { - spin_lock(&ino->i_lock); - if (list_empty(&lo->plh_segs)) { - spin_lock(&clp->cl_lock); - list_del_init(&lo->plh_layouts); - spin_unlock(&clp->cl_lock); - } - spin_unlock(&ino->i_lock); + if (!lseg && first) { + spin_lock(&clp->cl_lock); + list_del_init(&lo->plh_layouts); + spin_unlock(&clp->cl_lock); } atomic_dec(&lo->plh_outstanding); put_layout_hdr(lo); From cee6a5372f8804f58acc87f07816f64db36718e2 Mon Sep 17 00:00:00 2001 From: Fred Isaman Date: Fri, 11 Feb 2011 15:42:35 +0000 Subject: [PATCH 04/54] RPC: remove check for impossible condition in rpc_make_runnable queue_work() only returns 0 or 1, never a negative value. Signed-off-by: Fred Isaman Signed-off-by: Trond Myklebust --- net/sunrpc/sched.c | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c index 59e599498e37..93107265256d 100644 --- a/net/sunrpc/sched.c +++ b/net/sunrpc/sched.c @@ -299,15 +299,8 @@ static void rpc_make_runnable(struct rpc_task *task) if (rpc_test_and_set_running(task)) return; if (RPC_IS_ASYNC(task)) { - int status; - INIT_WORK(&task->u.tk_work, rpc_async_schedule); - status = queue_work(rpciod_workqueue, &task->u.tk_work); - if (status < 0) { - printk(KERN_WARNING "RPC: failed to add task to queue: error: %d!\n", status); - task->tk_status = status; - return; - } + queue_work(rpciod_workqueue, &task->u.tk_work); } else wake_up_bit(&task->tk_runstate, RPC_TASK_QUEUED); } From eabf5baaaaf41b6a0273043cfb06d53dca67acef Mon Sep 17 00:00:00 2001 From: Fred Isaman Date: Fri, 11 Feb 2011 15:42:36 +0000 Subject: [PATCH 05/54] RPC: clarify rpc_run_task error handling rpc_run_task can only fail if it is not passed in a preallocated task. However, that is not at all clear with the current code. So remove several impossible to occur failure checks. Signed-off-by: Fred Isaman Signed-off-by: Trond Myklebust --- net/sunrpc/clnt.c | 6 ------ net/sunrpc/sched.c | 6 ------ 2 files changed, 12 deletions(-) diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 57d344cf2256..8b5a6b40d37c 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -636,12 +636,6 @@ struct rpc_task *rpc_run_task(const struct rpc_task_setup *task_setup_data) rpc_task_set_client(task, task_setup_data->rpc_client); rpc_task_set_rpc_message(task, task_setup_data->rpc_message); - if (task->tk_status != 0) { - int ret = task->tk_status; - rpc_put_task(task); - return ERR_PTR(ret); - } - if (task->tk_action == NULL) rpc_call_start(task); diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c index 93107265256d..5681c6a12d20 100644 --- a/net/sunrpc/sched.c +++ b/net/sunrpc/sched.c @@ -836,12 +836,6 @@ struct rpc_task *rpc_new_task(const struct rpc_task_setup *setup_data) } rpc_init_task(task, setup_data); - if (task->tk_status < 0) { - int err = task->tk_status; - rpc_put_task(task); - return ERR_PTR(err); - } - task->tk_flags |= flags; dprintk("RPC: allocated task %p\n", task); return task; From 83762c56c1ba7c5b4b92fb32d570661633228bc6 Mon Sep 17 00:00:00 2001 From: Fred Isaman Date: Fri, 11 Feb 2011 15:42:37 +0000 Subject: [PATCH 06/54] NFS: remove pointless if statement in nfs_direct_write_result The code was doing nothing more in either branch of the if. Signed-off-by: Fred Isaman Signed-off-by: Trond Myklebust --- fs/nfs/direct.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index 9943a75bb6d1..f493bdd74f78 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -649,8 +649,7 @@ static void nfs_direct_write_result(struct rpc_task *task, void *calldata) { struct nfs_write_data *data = calldata; - if (nfs_writeback_done(task, data) != 0) - return; + nfs_writeback_done(task, data); } /* From 136028967a283929c6f01518d0700b73fa622d56 Mon Sep 17 00:00:00 2001 From: Fred Isaman Date: Fri, 11 Feb 2011 15:42:38 +0000 Subject: [PATCH 07/54] NFS: change nfs_writeback_done to return void The return values are not used by any callers. Signed-off-by: Fred Isaman Signed-off-by: Trond Myklebust --- fs/nfs/write.c | 8 ++++---- include/linux/nfs_fs.h | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 42b92d7a9cc4..ae528b98b804 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -1132,7 +1132,7 @@ static const struct rpc_call_ops nfs_write_full_ops = { /* * This function is called when the WRITE call is complete. */ -int nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data) +void nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data) { struct nfs_writeargs *argp = &data->args; struct nfs_writeres *resp = &data->res; @@ -1151,7 +1151,7 @@ int nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data) */ status = NFS_PROTO(data->inode)->write_done(task, data); if (status != 0) - return status; + return; nfs_add_stats(data->inode, NFSIOS_SERVERWRITTENBYTES, resp->count); #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4) @@ -1196,7 +1196,7 @@ int nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data) argp->stable = NFS_FILE_SYNC; } nfs_restart_rpc(task, server->nfs_client); - return -EAGAIN; + return; } if (time_before(complain, jiffies)) { printk(KERN_WARNING @@ -1207,7 +1207,7 @@ int nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data) /* Can't do anything about it except throw an error. */ task->tk_status = -EIO; } - return 0; + return; } diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index 6023efa9f5d9..f88522b10a38 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -501,7 +501,7 @@ extern int nfs_writepage(struct page *page, struct writeback_control *wbc); extern int nfs_writepages(struct address_space *, struct writeback_control *); extern int nfs_flush_incompatible(struct file *file, struct page *page); extern int nfs_updatepage(struct file *, struct page *, unsigned int, unsigned int); -extern int nfs_writeback_done(struct rpc_task *, struct nfs_write_data *); +extern void nfs_writeback_done(struct rpc_task *, struct nfs_write_data *); /* * Try to write back everything synchronously (but check the From 6f78befc417dd7122249706b49520da29ba58451 Mon Sep 17 00:00:00 2001 From: Andy Adamson Date: Tue, 1 Mar 2011 01:34:06 +0000 Subject: [PATCH 08/54] NFSv4: remove CONFIG_NFS_V4 from nfs_read_data Cleanup nfs_read_data. We also won't use CONFIG_NFS_V4_1 for additional NFSv4.1 fields in subsequent patches. Signed-off-by: Andy Adamson Signed-off-by: Trond Myklebust --- include/linux/nfs_xdr.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index b0068579bec2..51bfadbe24e2 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -1016,9 +1016,7 @@ struct nfs_read_data { unsigned int npages; /* Max length of pagevec */ struct nfs_readargs args; struct nfs_readres res; -#ifdef CONFIG_NFS_V4 unsigned long timestamp; /* For lease renewal */ -#endif struct page *page_array[NFS_PAGEVEC_SIZE]; }; From bf9c1387ca80deac792c9ecf1c64dfcc5d1cc768 Mon Sep 17 00:00:00 2001 From: Andy Adamson Date: Tue, 1 Mar 2011 01:34:07 +0000 Subject: [PATCH 09/54] NFSv4.1: put_layout_hdr can remove nfsi->layout Prevents an Oops triggered by CB_LAYOUTRECALL and LAYOUTGET race on a pnfs_layout_hdr first pnfs_layout_segment. Signed-off-by: Andy Adamson Signed-off-by: Trond Myklebust --- fs/nfs/pnfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 0f5b66f90d17..7d031cd7d920 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -768,7 +768,7 @@ pnfs_update_layout(struct inode *ino, put_layout_hdr(lo); out: dprintk("%s end, state 0x%lx lseg %p\n", __func__, - nfsi->layout->plh_flags, lseg); + nfsi->layout ? nfsi->layout->plh_flags : -1, lseg); return lseg; out_unlock: spin_unlock(&ino->i_lock); From 45a52a02072b2a7e265f024cfdb00127e08dd9f2 Mon Sep 17 00:00:00 2001 From: Andy Adamson Date: Tue, 1 Mar 2011 01:34:08 +0000 Subject: [PATCH 10/54] NFS move nfs_client initialization into nfs_get_client Now nfs_get_client returns an nfs_client ready to be used no matter if it was found or created. Signed-off-by: Andy Adamson Signed-off-by: Trond Myklebust --- fs/nfs/client.c | 56 ++++++++++++++++++++++------------------- fs/nfs/internal.h | 9 +++++++ fs/nfs/nfs3proc.c | 1 + fs/nfs/nfs4proc.c | 1 + fs/nfs/proc.c | 1 + include/linux/nfs_xdr.h | 3 +++ 6 files changed, 45 insertions(+), 26 deletions(-) diff --git a/fs/nfs/client.c b/fs/nfs/client.c index bd3ca32879e7..b9ed2a8bc26a 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -481,7 +481,12 @@ static struct nfs_client *nfs_match_client(const struct nfs_client_initdata *dat * Look up a client by IP address and protocol version * - creates a new record if one doesn't yet exist */ -static struct nfs_client *nfs_get_client(const struct nfs_client_initdata *cl_init) +static struct nfs_client * +nfs_get_client(const struct nfs_client_initdata *cl_init, + const struct rpc_timeout *timeparms, + const char *ip_addr, + rpc_authflavor_t authflavour, + int noresvport) { struct nfs_client *clp, *new = NULL; int error; @@ -512,6 +517,13 @@ install_client: clp = new; list_add(&clp->cl_share_link, &nfs_client_list); spin_unlock(&nfs_client_lock); + + error = cl_init->rpc_ops->init_client(clp, timeparms, ip_addr, + authflavour, noresvport); + if (error < 0) { + nfs_put_client(clp); + return ERR_PTR(error); + } dprintk("--> nfs_get_client() = %p [new]\n", clp); return clp; @@ -767,9 +779,9 @@ static int nfs_init_server_rpcclient(struct nfs_server *server, /* * Initialise an NFS2 or NFS3 client */ -static int nfs_init_client(struct nfs_client *clp, - const struct rpc_timeout *timeparms, - const struct nfs_parsed_mount_data *data) +int nfs_init_client(struct nfs_client *clp, const struct rpc_timeout *timeparms, + const char *ip_addr, rpc_authflavor_t authflavour, + int noresvport) { int error; @@ -784,7 +796,7 @@ static int nfs_init_client(struct nfs_client *clp, * - RFC 2623, sec 2.3.2 */ error = nfs_create_rpc_client(clp, timeparms, RPC_AUTH_UNIX, - 0, data->flags & NFS_MOUNT_NORESVPORT); + 0, noresvport); if (error < 0) goto error; nfs_mark_client_ready(clp, NFS_CS_READY); @@ -820,19 +832,17 @@ static int nfs_init_server(struct nfs_server *server, cl_init.rpc_ops = &nfs_v3_clientops; #endif + nfs_init_timeout_values(&timeparms, data->nfs_server.protocol, + data->timeo, data->retrans); + /* Allocate or find a client reference we can use */ - clp = nfs_get_client(&cl_init); + clp = nfs_get_client(&cl_init, &timeparms, NULL, RPC_AUTH_UNIX, + data->flags & NFS_MOUNT_NORESVPORT); if (IS_ERR(clp)) { dprintk("<-- nfs_init_server() = error %ld\n", PTR_ERR(clp)); return PTR_ERR(clp); } - nfs_init_timeout_values(&timeparms, data->nfs_server.protocol, - data->timeo, data->retrans); - error = nfs_init_client(clp, &timeparms, data); - if (error < 0) - goto error; - server->nfs_client = clp; /* Initialise the client representation from the mount data */ @@ -1307,11 +1317,11 @@ static int nfs4_init_client_minor_version(struct nfs_client *clp) /* * Initialise an NFS4 client record */ -static int nfs4_init_client(struct nfs_client *clp, - const struct rpc_timeout *timeparms, - const char *ip_addr, - rpc_authflavor_t authflavour, - int flags) +int nfs4_init_client(struct nfs_client *clp, + const struct rpc_timeout *timeparms, + const char *ip_addr, + rpc_authflavor_t authflavour, + int noresvport) { int error; @@ -1325,7 +1335,7 @@ static int nfs4_init_client(struct nfs_client *clp, clp->rpc_ops = &nfs_v4_clientops; error = nfs_create_rpc_client(clp, timeparms, authflavour, - 1, flags & NFS_MOUNT_NORESVPORT); + 1, noresvport); if (error < 0) goto error; strlcpy(clp->cl_ipaddr, ip_addr, sizeof(clp->cl_ipaddr)); @@ -1378,22 +1388,16 @@ static int nfs4_set_client(struct nfs_server *server, dprintk("--> nfs4_set_client()\n"); /* Allocate or find a client reference we can use */ - clp = nfs_get_client(&cl_init); + clp = nfs_get_client(&cl_init, timeparms, ip_addr, authflavour, + server->flags & NFS_MOUNT_NORESVPORT); if (IS_ERR(clp)) { error = PTR_ERR(clp); goto error; } - error = nfs4_init_client(clp, timeparms, ip_addr, authflavour, - server->flags); - if (error < 0) - goto error_put; server->nfs_client = clp; dprintk("<-- nfs4_set_client() = 0 [new %p]\n", clp); return 0; - -error_put: - nfs_put_client(clp); error: dprintk("<-- nfs4_set_client() = xerror %d\n", error); return error; diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index cf9fdbdabc67..4d7b3a97e522 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -215,6 +215,10 @@ extern struct rpc_procinfo nfs4_procedures[]; /* proc.c */ void nfs_close_context(struct nfs_open_context *ctx, int is_sync); +extern int nfs_init_client(struct nfs_client *clp, + const struct rpc_timeout *timeparms, + const char *ip_addr, rpc_authflavor_t authflavour, + int noresvport); /* dir.c */ extern int nfs_access_cache_shrinker(struct shrinker *shrink, @@ -274,6 +278,11 @@ extern int nfs_migrate_page(struct address_space *, #endif /* nfs4proc.c */ +extern int nfs4_init_client(struct nfs_client *clp, + const struct rpc_timeout *timeparms, + const char *ip_addr, + rpc_authflavor_t authflavour, + int noresvport); extern int _nfs4_call_sync(struct nfs_server *server, struct rpc_message *msg, struct nfs4_sequence_args *args, diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c index ce939c062a52..d0c80d8b3f96 100644 --- a/fs/nfs/nfs3proc.c +++ b/fs/nfs/nfs3proc.c @@ -885,4 +885,5 @@ const struct nfs_rpc_ops nfs_v3_clientops = { .lock = nfs3_proc_lock, .clear_acl_cache = nfs3_forget_cached_acls, .close_context = nfs_close_context, + .init_client = nfs_init_client, }; diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 0a07e353a961..55a8fc2f3df4 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -5648,6 +5648,7 @@ const struct nfs_rpc_ops nfs_v4_clientops = { .clear_acl_cache = nfs4_zap_acl_attr, .close_context = nfs4_close_context, .open_context = nfs4_atomic_open, + .init_client = nfs4_init_client, }; static const struct xattr_handler nfs4_xattr_nfs4_acl_handler = { diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c index 77d5e21c4ad6..b8ec170f2a0f 100644 --- a/fs/nfs/proc.c +++ b/fs/nfs/proc.c @@ -741,4 +741,5 @@ const struct nfs_rpc_ops nfs_v2_clientops = { .lock = nfs_proc_lock, .lock_check_bounds = nfs_lock_check_bounds, .close_context = nfs_close_context, + .init_client = nfs_init_client, }; diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 51bfadbe24e2..d159fe733381 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -1040,6 +1040,7 @@ struct nfs_write_data { }; struct nfs_access_entry; +struct nfs_client; /* * RPC procedure vector for NFSv2/NFSv3 demuxing @@ -1104,6 +1105,8 @@ struct nfs_rpc_ops { struct nfs_open_context *ctx, int open_flags, struct iattr *iattr); + int (*init_client) (struct nfs_client *, const struct rpc_timeout *, + const char *, rpc_authflavor_t, int); }; /* From 89d1ea65798953b251e399b17f32d31033889ae0 Mon Sep 17 00:00:00 2001 From: Andy Adamson Date: Tue, 1 Mar 2011 01:34:09 +0000 Subject: [PATCH 11/54] NFSv4.1: send zero stateid seqid on v4.1 i/o Data servers require a zero stateid seqid, and there is no advantage to not doing the same for all NFSv4.1 Signed-off-by: Andy Adamson Signed-off-by: Trond Myklebust --- fs/nfs/nfs4xdr.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 94d50e86a124..a656b6e179b0 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -1384,7 +1384,7 @@ static void encode_putrootfh(struct xdr_stream *xdr, struct compound_hdr *hdr) hdr->replen += decode_putrootfh_maxsz; } -static void encode_stateid(struct xdr_stream *xdr, const struct nfs_open_context *ctx, const struct nfs_lock_context *l_ctx) +static void encode_stateid(struct xdr_stream *xdr, const struct nfs_open_context *ctx, const struct nfs_lock_context *l_ctx, int zero_seqid) { nfs4_stateid stateid; __be32 *p; @@ -1392,6 +1392,8 @@ static void encode_stateid(struct xdr_stream *xdr, const struct nfs_open_context p = reserve_space(xdr, NFS4_STATEID_SIZE); if (ctx->state != NULL) { nfs4_copy_stateid(&stateid, ctx->state, l_ctx->lockowner, l_ctx->pid); + if (zero_seqid) + stateid.stateid.seqid = 0; xdr_encode_opaque_fixed(p, stateid.data, NFS4_STATEID_SIZE); } else xdr_encode_opaque_fixed(p, zero_stateid.data, NFS4_STATEID_SIZE); @@ -1404,7 +1406,8 @@ static void encode_read(struct xdr_stream *xdr, const struct nfs_readargs *args, p = reserve_space(xdr, 4); *p = cpu_to_be32(OP_READ); - encode_stateid(xdr, args->context, args->lock_context); + encode_stateid(xdr, args->context, args->lock_context, + hdr->minorversion); p = reserve_space(xdr, 12); p = xdr_encode_hyper(p, args->offset); @@ -1592,7 +1595,8 @@ static void encode_write(struct xdr_stream *xdr, const struct nfs_writeargs *arg p = reserve_space(xdr, 4); *p = cpu_to_be32(OP_WRITE); - encode_stateid(xdr, args->context, args->lock_context); + encode_stateid(xdr, args->context, args->lock_context, + hdr->minorversion); p = reserve_space(xdr, 16); p = xdr_encode_hyper(p, args->offset); From d3b4c9d76738df49a7db7682c2518a0ef9f7391d Mon Sep 17 00:00:00 2001 From: Andy Adamson Date: Tue, 1 Mar 2011 01:34:10 +0000 Subject: [PATCH 12/54] NFSv4.1: new flag for state renewal check Data servers not sharing a session with the mount MDS always have an empty cl_superblocks list. Replace the cl_superblocks empty list check to see if it is time to shut down renewd with the NFS_CS_STOP_RENEW bit which is not set by such a data server. Signed-off-by: Andy Adamson Signed-off-by: Trond Myklebust --- fs/nfs/client.c | 5 +++++ fs/nfs/nfs4renewd.c | 6 +----- include/linux/nfs_fs_sb.h | 1 + 3 files changed, 7 insertions(+), 5 deletions(-) diff --git a/fs/nfs/client.c b/fs/nfs/client.c index b9ed2a8bc26a..a86698cd82fd 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -1019,14 +1019,19 @@ static void nfs_server_insert_lists(struct nfs_server *server) spin_lock(&nfs_client_lock); list_add_tail_rcu(&server->client_link, &clp->cl_superblocks); list_add_tail(&server->master_link, &nfs_volume_list); + clear_bit(NFS_CS_STOP_RENEW, &clp->cl_res_state); spin_unlock(&nfs_client_lock); } static void nfs_server_remove_lists(struct nfs_server *server) { + struct nfs_client *clp = server->nfs_client; + spin_lock(&nfs_client_lock); list_del_rcu(&server->client_link); + if (clp && list_empty(&clp->cl_superblocks)) + set_bit(NFS_CS_STOP_RENEW, &clp->cl_res_state); list_del(&server->master_link); spin_unlock(&nfs_client_lock); diff --git a/fs/nfs/nfs4renewd.c b/fs/nfs/nfs4renewd.c index 402143d75fc5..df8e7f3ca56d 100644 --- a/fs/nfs/nfs4renewd.c +++ b/fs/nfs/nfs4renewd.c @@ -64,12 +64,8 @@ nfs4_renew_state(struct work_struct *work) ops = clp->cl_mvops->state_renewal_ops; dprintk("%s: start\n", __func__); - rcu_read_lock(); - if (list_empty(&clp->cl_superblocks)) { - rcu_read_unlock(); + if (test_bit(NFS_CS_STOP_RENEW, &clp->cl_res_state)) goto out; - } - rcu_read_unlock(); spin_lock(&clp->cl_lock); lease = clp->cl_lease_time; diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h index 3e112de12d8d..0bac4176e505 100644 --- a/include/linux/nfs_fs_sb.h +++ b/include/linux/nfs_fs_sb.h @@ -30,6 +30,7 @@ struct nfs_client { #define NFS_CS_CALLBACK 1 /* - callback started */ #define NFS_CS_IDMAP 2 /* - idmap started */ #define NFS_CS_RENEWD 3 /* - renewd started */ +#define NFS_CS_STOP_RENEW 4 /* no more state to renew */ struct sockaddr_storage cl_addr; /* server identifier */ size_t cl_addrlen; char * cl_hostname; /* hostname of server */ From d6fb79d433d0a34c36bdf74eaf90857193a6261f Mon Sep 17 00:00:00 2001 From: Andy Adamson Date: Tue, 1 Mar 2011 01:34:11 +0000 Subject: [PATCH 13/54] NFSv4.1: new flag for lease time check Data servers cannot send nfs4_proc_get_lease_time. but still need to setup state renewal. Add the NFS_CS_CHECK_LEASE_TIME bit to indicate if the lease time can be checked. Signed-off-by: Andy Adamson Signed-off-by: Trond Myklebust --- fs/nfs/client.c | 9 +++++++++ fs/nfs/nfs4state.c | 5 +++++ include/linux/nfs_fs_sb.h | 1 + 3 files changed, 15 insertions(+) diff --git a/fs/nfs/client.c b/fs/nfs/client.c index a86698cd82fd..280d41f64a57 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -1400,6 +1400,15 @@ static int nfs4_set_client(struct nfs_server *server, goto error; } + /* + * Query for the lease time on clientid setup or renewal + * + * Note that this will be set on nfs_clients that were created + * only for the DS role and did not set this bit, but now will + * serve a dual role. + */ + set_bit(NFS_CS_CHECK_LEASE_TIME, &clp->cl_res_state); + server->nfs_client = clp; dprintk("<-- nfs4_set_client() = 0 [new %p]\n", clp); return 0; diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 0592288f9f06..69c836373124 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -153,6 +153,11 @@ static int nfs41_setup_state_renewal(struct nfs_client *clp) int status; struct nfs_fsinfo fsinfo; + if (!test_bit(NFS_CS_CHECK_LEASE_TIME, &clp->cl_res_state)) { + nfs4_schedule_state_renewal(clp); + return 0; + } + status = nfs4_proc_get_lease_time(clp, &fsinfo); if (status == 0) { /* Update lease time and schedule renewal */ diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h index 0bac4176e505..c00d4ec47ec3 100644 --- a/include/linux/nfs_fs_sb.h +++ b/include/linux/nfs_fs_sb.h @@ -31,6 +31,7 @@ struct nfs_client { #define NFS_CS_IDMAP 2 /* - idmap started */ #define NFS_CS_RENEWD 3 /* - renewd started */ #define NFS_CS_STOP_RENEW 4 /* no more state to renew */ +#define NFS_CS_CHECK_LEASE_TIME 5 /* need to check lease time */ struct sockaddr_storage cl_addr; /* server identifier */ size_t cl_addrlen; char * cl_hostname; /* hostname of server */ From 94de8b27d0dcb2608d56a7e5c2941b87e6da7ce3 Mon Sep 17 00:00:00 2001 From: Andy Adamson Date: Tue, 1 Mar 2011 01:34:12 +0000 Subject: [PATCH 14/54] NFSv4.1: add MDS mount DS only check The DS only role cannot be used to mount. Signed-off-by: Andy Adamson Signed-off-by: Trond Myklebust --- fs/nfs/client.c | 4 ++++ fs/nfs/nfs4_fs.h | 13 +++++++++++++ 2 files changed, 17 insertions(+) diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 280d41f64a57..d5c5bdfa4231 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -1453,6 +1453,10 @@ static int nfs4_server_common_setup(struct nfs_server *server, BUG_ON(!server->nfs_client->rpc_ops); BUG_ON(!server->nfs_client->rpc_ops->file_inode_ops); + /* data servers support only a subset of NFSv4.1 */ + if (is_ds_only_client(server->nfs_client)) + return -EPROTONOSUPPORT; + fattr = nfs_alloc_fattr(); if (fattr == NULL) return -ENOMEM; diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index 1be36cf65bfc..d4cfacc40003 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -259,6 +259,13 @@ extern int nfs4_proc_destroy_session(struct nfs4_session *); extern int nfs4_init_session(struct nfs_server *server); extern int nfs4_proc_get_lease_time(struct nfs_client *clp, struct nfs_fsinfo *fsinfo); + +static inline bool +is_ds_only_client(struct nfs_client *clp) +{ + return (clp->cl_exchange_flags & EXCHGID4_FLAG_MASK_PNFS) == + EXCHGID4_FLAG_USE_PNFS_DS; +} #else /* CONFIG_NFS_v4_1 */ static inline struct nfs4_session *nfs4_get_session(const struct nfs_server *server) { @@ -276,6 +283,12 @@ static inline int nfs4_init_session(struct nfs_server *server) { return 0; } + +static inline bool +is_ds_only_client(struct nfs_client *clp) +{ + return false; +} #endif /* CONFIG_NFS_V4_1 */ extern const struct nfs4_minor_version_ops *nfs_v4_minor_ops[]; From d684d2ae10a4f95d3035abf698d7d611ff2cd279 Mon Sep 17 00:00:00 2001 From: Fred Isaman Date: Tue, 1 Mar 2011 01:34:13 +0000 Subject: [PATCH 15/54] NFSv4.1: lseg refcounting Prepare put_lseg and get_lseg to be called from the pNFS I/O code. Pull common code from pnfs_lseg_locked to call from pnfs_lseg. Inline pnfs_lseg_locked into it's only caller. Signed-off-by: Fred Isaman Signed-off-by: Benny Halevy Signed-off-by: Trond Myklebust --- fs/nfs/pnfs.c | 62 ++++++++++++++++++++++++++++++++------------------- fs/nfs/pnfs.h | 20 +++++++++++++++++ 2 files changed, 59 insertions(+), 23 deletions(-) diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 7d031cd7d920..3afa82e45438 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -230,32 +230,41 @@ static void free_lseg(struct pnfs_layout_segment *lseg) put_layout_hdr(NFS_I(ino)->layout); } -/* The use of tmp_list is necessary because pnfs_curr_ld->free_lseg - * could sleep, so must be called outside of the lock. - * Returns 1 if object was removed, otherwise return 0. - */ -static int -put_lseg_locked(struct pnfs_layout_segment *lseg, - struct list_head *tmp_list) +static void +put_lseg_common(struct pnfs_layout_segment *lseg) { + struct inode *inode = lseg->pls_layout->plh_inode; + + BUG_ON(test_bit(NFS_LSEG_VALID, &lseg->pls_flags)); + list_del_init(&lseg->pls_list); + if (list_empty(&lseg->pls_layout->plh_segs)) { + set_bit(NFS_LAYOUT_DESTROYED, &lseg->pls_layout->plh_flags); + /* Matched by initial refcount set in alloc_init_layout_hdr */ + put_layout_hdr_locked(lseg->pls_layout); + } + rpc_wake_up(&NFS_SERVER(inode)->roc_rpcwaitq); +} + +static void +put_lseg(struct pnfs_layout_segment *lseg) +{ + struct inode *inode; + + if (!lseg) + return; + dprintk("%s: lseg %p ref %d valid %d\n", __func__, lseg, atomic_read(&lseg->pls_refcount), test_bit(NFS_LSEG_VALID, &lseg->pls_flags)); - if (atomic_dec_and_test(&lseg->pls_refcount)) { - struct inode *ino = lseg->pls_layout->plh_inode; + inode = lseg->pls_layout->plh_inode; + if (atomic_dec_and_lock(&lseg->pls_refcount, &inode->i_lock)) { + LIST_HEAD(free_me); - BUG_ON(test_bit(NFS_LSEG_VALID, &lseg->pls_flags)); - list_del(&lseg->pls_list); - if (list_empty(&lseg->pls_layout->plh_segs)) { - set_bit(NFS_LAYOUT_DESTROYED, &lseg->pls_layout->plh_flags); - /* Matched by initial refcount set in alloc_init_layout_hdr */ - put_layout_hdr_locked(lseg->pls_layout); - } - rpc_wake_up(&NFS_SERVER(ino)->roc_rpcwaitq); - list_add(&lseg->pls_list, tmp_list); - return 1; + put_lseg_common(lseg); + list_add(&lseg->pls_list, &free_me); + spin_unlock(&inode->i_lock); + pnfs_free_lseg_list(&free_me); } - return 0; } static bool @@ -276,7 +285,13 @@ static int mark_lseg_invalid(struct pnfs_layout_segment *lseg, * list. It will now be removed when all * outstanding io is finished. */ - rv = put_lseg_locked(lseg, tmp_list); + dprintk("%s: lseg %p ref %d\n", __func__, lseg, + atomic_read(&lseg->pls_refcount)); + if (atomic_dec_and_test(&lseg->pls_refcount)) { + put_lseg_common(lseg); + list_add(&lseg->pls_list, tmp_list); + rv = 1; + } } return rv; } @@ -689,7 +704,7 @@ pnfs_find_lseg(struct pnfs_layout_hdr *lo, u32 iomode) list_for_each_entry(lseg, &lo->plh_segs, pls_list) { if (test_bit(NFS_LSEG_VALID, &lseg->pls_flags) && is_matching_lseg(lseg, iomode)) { - ret = lseg; + ret = get_lseg(lseg); break; } if (cmp_layout(iomode, lseg->pls_range.iomode) > 0) @@ -769,6 +784,7 @@ pnfs_update_layout(struct inode *ino, out: dprintk("%s end, state 0x%lx lseg %p\n", __func__, nfsi->layout ? nfsi->layout->plh_flags : -1, lseg); + put_lseg(lseg); /* STUB - callers currently ignore return value */ return lseg; out_unlock: spin_unlock(&ino->i_lock); @@ -821,7 +837,7 @@ pnfs_layout_process(struct nfs4_layoutget *lgp) } init_lseg(lo, lseg); lseg->pls_range = res->range; - *lgp->lsegpp = lseg; + *lgp->lsegpp = get_lseg(lseg); pnfs_insert_layout(lo, lseg); if (res->return_on_close) { diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index e2612ea0cbed..9a994bc9899f 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -177,6 +177,16 @@ static inline int lo_fail_bit(u32 iomode) NFS_LAYOUT_RW_FAILED : NFS_LAYOUT_RO_FAILED; } +static inline struct pnfs_layout_segment * +get_lseg(struct pnfs_layout_segment *lseg) +{ + if (lseg) { + atomic_inc(&lseg->pls_refcount); + smp_mb__after_atomic_inc(); + } + return lseg; +} + /* Return true if a layout driver is being used for this mountpoint */ static inline int pnfs_enabled_sb(struct nfs_server *nfss) { @@ -193,6 +203,16 @@ static inline void pnfs_destroy_layout(struct nfs_inode *nfsi) { } +static inline struct pnfs_layout_segment * +get_lseg(struct pnfs_layout_segment *lseg) +{ + return NULL; +} + +static inline void put_lseg(struct pnfs_layout_segment *lseg) +{ +} + static inline struct pnfs_layout_segment * pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx, enum pnfs_iomode access_type) From 94ad1c80e28f9700c84b4d28d1e5302ddf63a6fd Mon Sep 17 00:00:00 2001 From: Fred Isaman Date: Tue, 1 Mar 2011 01:34:14 +0000 Subject: [PATCH 16/54] NFSv4.1: coelesce across layout stripes Add a pg_test layout driver hook which is used to avoid coelescing I/O across layout stripes. Signed-off-by: Andy Adamson Signed-off-by: Andy Adamson Signed-off-by: Dean Hildebrand Signed-off-by: Fred Isaman Signed-off-by: Fred Isaman Signed-off-by: Benny Halevy Signed-off-by: Boaz Harrosh Signed-off-by: Oleg Drokin Signed-off-by: Tao Guo Signed-off-by: Trond Myklebust --- fs/nfs/nfs4filelayout.c | 26 ++++++++++++++++++++++++++ fs/nfs/pagelist.c | 12 ++++++++++-- fs/nfs/pnfs.c | 16 ++++++++++++++++ fs/nfs/pnfs.h | 12 ++++++++++++ fs/nfs/read.c | 1 + fs/nfs/write.c | 3 +++ include/linux/nfs_page.h | 2 ++ 7 files changed, 70 insertions(+), 2 deletions(-) diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c index 23f930caf1e2..0efe8cbd9e3c 100644 --- a/fs/nfs/nfs4filelayout.c +++ b/fs/nfs/nfs4filelayout.c @@ -252,6 +252,31 @@ filelayout_free_lseg(struct pnfs_layout_segment *lseg) _filelayout_free_lseg(fl); } +/* + * filelayout_pg_test(). Called by nfs_can_coalesce_requests() + * + * return 1 : coalesce page + * return 0 : don't coalesce page + */ +int +filelayout_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, + struct nfs_page *req) +{ + u64 p_stripe, r_stripe; + u32 stripe_unit; + + if (!pgio->pg_lseg) + return 1; + p_stripe = (u64)prev->wb_index << PAGE_CACHE_SHIFT; + r_stripe = (u64)req->wb_index << PAGE_CACHE_SHIFT; + stripe_unit = FILELAYOUT_LSEG(pgio->pg_lseg)->stripe_unit; + + do_div(p_stripe, stripe_unit); + do_div(r_stripe, stripe_unit); + + return (p_stripe == r_stripe); +} + static struct pnfs_layoutdriver_type filelayout_type = { .id = LAYOUT_NFSV4_1_FILES, .name = "LAYOUT_NFSV4_1_FILES", @@ -260,6 +285,7 @@ static struct pnfs_layoutdriver_type filelayout_type = { .clear_layoutdriver = filelayout_clear_layoutdriver, .alloc_lseg = filelayout_alloc_lseg, .free_lseg = filelayout_free_lseg, + .pg_test = filelayout_pg_test, }; static int __init nfs4filelayout_init(void) diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index e1164e3f9e69..9b9a65c9bb4f 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -226,6 +226,7 @@ void nfs_pageio_init(struct nfs_pageio_descriptor *desc, desc->pg_doio = doio; desc->pg_ioflags = io_flags; desc->pg_error = 0; + desc->pg_lseg = NULL; } /** @@ -240,7 +241,8 @@ void nfs_pageio_init(struct nfs_pageio_descriptor *desc, * Return 'true' if this is the case, else return 'false'. */ static int nfs_can_coalesce_requests(struct nfs_page *prev, - struct nfs_page *req) + struct nfs_page *req, + struct nfs_pageio_descriptor *pgio) { if (req->wb_context->cred != prev->wb_context->cred) return 0; @@ -254,6 +256,12 @@ static int nfs_can_coalesce_requests(struct nfs_page *prev, return 0; if (prev->wb_pgbase + prev->wb_bytes != PAGE_CACHE_SIZE) return 0; + /* + * Non-whole file layouts need to check that req is inside of + * pgio->pg_lseg. + */ + if (pgio->pg_test && !pgio->pg_test(pgio, prev, req)) + return 0; return 1; } @@ -286,7 +294,7 @@ static int nfs_pageio_do_add_request(struct nfs_pageio_descriptor *desc, if (newlen > desc->pg_bsize) return 0; prev = nfs_list_entry(desc->pg_list.prev); - if (!nfs_can_coalesce_requests(prev, req)) + if (!nfs_can_coalesce_requests(prev, req, desc)) return 0; } else desc->pg_base = req->wb_pgbase; diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 3afa82e45438..330cee115de0 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -858,6 +858,22 @@ out_forget_reply: goto out; } +static void +pnfs_set_pg_test(struct inode *inode, struct nfs_pageio_descriptor *pgio) +{ + struct pnfs_layoutdriver_type *ld; + + ld = NFS_SERVER(inode)->pnfs_curr_ld; + pgio->pg_test = (ld ? ld->pg_test : NULL); +} + +void +pnfs_pageio_init_read(struct nfs_pageio_descriptor *pgio, + struct inode *inode) +{ + pnfs_set_pg_test(inode, pgio); +} + /* * Device ID cache. Currently supports one layout type per struct nfs_client. * Add layout type to the lookup key to expand to support multiple types. diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index 9a994bc9899f..db52d9658570 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -30,6 +30,8 @@ #ifndef FS_NFS_PNFS_H #define FS_NFS_PNFS_H +#include + enum { NFS_LSEG_VALID = 0, /* cleared when lseg is recalled/returned */ NFS_LSEG_ROC, /* roc bit received from server */ @@ -65,6 +67,9 @@ struct pnfs_layoutdriver_type { int (*clear_layoutdriver) (struct nfs_server *); struct pnfs_layout_segment * (*alloc_lseg) (struct pnfs_layout_hdr *layoutid, struct nfs4_layoutget_res *lgr); void (*free_lseg) (struct pnfs_layout_segment *lseg); + + /* test for nfs page cache coalescing */ + int (*pg_test)(struct nfs_pageio_descriptor *, struct nfs_page *, struct nfs_page *); }; struct pnfs_layout_hdr { @@ -151,6 +156,7 @@ pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx, enum pnfs_iomode access_type); void set_pnfs_layoutdriver(struct nfs_server *, u32 id); void unset_pnfs_layoutdriver(struct nfs_server *); +void pnfs_pageio_init_read(struct nfs_pageio_descriptor *, struct inode *); int pnfs_layout_process(struct nfs4_layoutget *lgp); void pnfs_free_lseg_list(struct list_head *tmp_list); void pnfs_destroy_layout(struct nfs_inode *); @@ -250,6 +256,12 @@ static inline void unset_pnfs_layoutdriver(struct nfs_server *s) { } +static inline void +pnfs_pageio_init_read(struct nfs_pageio_descriptor *pgio, struct inode *ino) +{ + pgio->pg_test = NULL; +} + #endif /* CONFIG_NFS_V4_1 */ #endif /* FS_NFS_PNFS_H */ diff --git a/fs/nfs/read.c b/fs/nfs/read.c index aedcaa7f291f..2a2765975e1f 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -626,6 +626,7 @@ int nfs_readpages(struct file *filp, struct address_space *mapping, goto read_complete; /* all pages were read */ pnfs_update_layout(inode, desc.ctx, IOMODE_READ); + pnfs_pageio_init_read(&pgio, inode); if (rsize < PAGE_CACHE_SIZE) nfs_pageio_init(&pgio, inode, nfs_pagein_multi, rsize, 0); else diff --git a/fs/nfs/write.c b/fs/nfs/write.c index ae528b98b804..40143c4747a5 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -28,6 +28,7 @@ #include "iostat.h" #include "nfs4_fs.h" #include "fscache.h" +#include "pnfs.h" #define NFSDBG_FACILITY NFSDBG_PAGECACHE @@ -982,6 +983,8 @@ static void nfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, { size_t wsize = NFS_SERVER(inode)->wsize; + pgio->pg_test = NULL; + if (wsize < PAGE_CACHE_SIZE) nfs_pageio_init(pgio, inode, nfs_flush_multi, wsize, ioflags); else diff --git a/include/linux/nfs_page.h b/include/linux/nfs_page.h index d55cee73f634..4eaf27a1282d 100644 --- a/include/linux/nfs_page.h +++ b/include/linux/nfs_page.h @@ -62,6 +62,8 @@ struct nfs_pageio_descriptor { int (*pg_doio)(struct inode *, struct list_head *, unsigned int, size_t, int); int pg_ioflags; int pg_error; + struct pnfs_layout_segment *pg_lseg; + int (*pg_test)(struct nfs_pageio_descriptor *, struct nfs_page *, struct nfs_page *); }; #define NFS_WBACK_BUSY(req) (test_bit(PG_BUSY,&(req)->wb_flags)) From bae724ef95b0d0a1f4518f5451e7c8aabc41f820 Mon Sep 17 00:00:00 2001 From: Fred Isaman Date: Tue, 1 Mar 2011 01:34:15 +0000 Subject: [PATCH 17/54] NFSv4.1: shift pnfs_update_layout locations Move the pnfs_update_layout call location to nfs_pageio_do_add_request(). Grab the lseg sent in the doio function to nfs_read_rpcsetup and attach it to each nfs_read_data so it can be sent to the layout driver. Signed-off-by: Andy Adamson Signed-off-by: Andy Adamson Signed-off-by: Dean Hildebrand Signed-off-by: Fred Isaman Signed-off-by: Fred Isaman Signed-off-by: Benny Halevy Signed-off-by: Boaz Harrosh Signed-off-by: Oleg Drokin Signed-off-by: Tao Guo Signed-off-by: Trond Myklebust --- fs/nfs/file.c | 4 ---- fs/nfs/pagelist.c | 7 +++++-- fs/nfs/pnfs.c | 29 +++++++++++++++++------------ fs/nfs/pnfs.h | 1 + fs/nfs/read.c | 40 ++++++++++++++++++++++++---------------- fs/nfs/write.c | 4 ++-- include/linux/nfs_page.h | 4 ++-- include/linux/nfs_xdr.h | 1 + 8 files changed, 52 insertions(+), 38 deletions(-) diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 7bf029ef4084..d85a534b15cd 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -387,10 +387,6 @@ static int nfs_write_begin(struct file *file, struct address_space *mapping, file->f_path.dentry->d_name.name, mapping->host->i_ino, len, (long long) pos); - pnfs_update_layout(mapping->host, - nfs_file_open_context(file), - IOMODE_RW); - start: /* * Prevent starvation issues if someone is doing a consistency diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index 9b9a65c9bb4f..45b0fb8add39 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -20,6 +20,7 @@ #include #include "internal.h" +#include "pnfs.h" static struct kmem_cache *nfs_page_cachep; @@ -213,7 +214,7 @@ nfs_wait_on_request(struct nfs_page *req) */ void nfs_pageio_init(struct nfs_pageio_descriptor *desc, struct inode *inode, - int (*doio)(struct inode *, struct list_head *, unsigned int, size_t, int), + int (*doio)(struct inode *, struct list_head *, unsigned int, size_t, int, struct pnfs_layout_segment *), size_t bsize, int io_flags) { @@ -315,7 +316,9 @@ static void nfs_pageio_doio(struct nfs_pageio_descriptor *desc) nfs_page_array_len(desc->pg_base, desc->pg_count), desc->pg_count, - desc->pg_ioflags); + desc->pg_ioflags, + desc->pg_lseg); + desc->pg_lseg = NULL; if (error < 0) desc->pg_error = error; else diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 330cee115de0..77966ecb0a2c 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -245,7 +245,7 @@ put_lseg_common(struct pnfs_layout_segment *lseg) rpc_wake_up(&NFS_SERVER(inode)->roc_rpcwaitq); } -static void +void put_lseg(struct pnfs_layout_segment *lseg) { struct inode *inode; @@ -784,7 +784,6 @@ pnfs_update_layout(struct inode *ino, out: dprintk("%s end, state 0x%lx lseg %p\n", __func__, nfsi->layout ? nfsi->layout->plh_flags : -1, lseg); - put_lseg(lseg); /* STUB - callers currently ignore return value */ return lseg; out_unlock: spin_unlock(&ino->i_lock); @@ -858,20 +857,26 @@ out_forget_reply: goto out; } -static void -pnfs_set_pg_test(struct inode *inode, struct nfs_pageio_descriptor *pgio) +static int pnfs_read_pg_test(struct nfs_pageio_descriptor *pgio, + struct nfs_page *prev, + struct nfs_page *req) +{ + if (pgio->pg_count == prev->wb_bytes) { + /* This is first coelesce call for a series of nfs_pages */ + pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode, + prev->wb_context, + IOMODE_READ); + } + return NFS_SERVER(pgio->pg_inode)->pnfs_curr_ld->pg_test(pgio, prev, req); +} + +void +pnfs_pageio_init_read(struct nfs_pageio_descriptor *pgio, struct inode *inode) { struct pnfs_layoutdriver_type *ld; ld = NFS_SERVER(inode)->pnfs_curr_ld; - pgio->pg_test = (ld ? ld->pg_test : NULL); -} - -void -pnfs_pageio_init_read(struct nfs_pageio_descriptor *pgio, - struct inode *inode) -{ - pnfs_set_pg_test(inode, pgio); + pgio->pg_test = (ld && ld->pg_test) ? pnfs_read_pg_test : NULL; } /* diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index db52d9658570..5107d14db485 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -151,6 +151,7 @@ extern int nfs4_proc_layoutget(struct nfs4_layoutget *lgp); /* pnfs.c */ void get_layout_hdr(struct pnfs_layout_hdr *lo); +void put_lseg(struct pnfs_layout_segment *lseg); struct pnfs_layout_segment * pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx, enum pnfs_iomode access_type); diff --git a/fs/nfs/read.c b/fs/nfs/read.c index 2a2765975e1f..6dc9eaf00e5c 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -20,17 +20,17 @@ #include #include +#include "pnfs.h" #include "nfs4_fs.h" #include "internal.h" #include "iostat.h" #include "fscache.h" -#include "pnfs.h" #define NFSDBG_FACILITY NFSDBG_PAGECACHE -static int nfs_pagein_multi(struct inode *, struct list_head *, unsigned int, size_t, int); -static int nfs_pagein_one(struct inode *, struct list_head *, unsigned int, size_t, int); +static int nfs_pagein_multi(struct inode *, struct list_head *, unsigned int, size_t, int, struct pnfs_layout_segment *); +static int nfs_pagein_one(struct inode *, struct list_head *, unsigned int, size_t, int, struct pnfs_layout_segment *); static const struct rpc_call_ops nfs_read_partial_ops; static const struct rpc_call_ops nfs_read_full_ops; @@ -69,6 +69,7 @@ void nfs_readdata_free(struct nfs_read_data *p) static void nfs_readdata_release(struct nfs_read_data *rdata) { + put_lseg(rdata->lseg); put_nfs_open_context(rdata->args.context); nfs_readdata_free(rdata); } @@ -121,7 +122,6 @@ int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode, len = nfs_page_length(page); if (len == 0) return nfs_return_empty_page(page); - pnfs_update_layout(inode, ctx, IOMODE_READ); new = nfs_create_request(ctx, inode, page, 0, len); if (IS_ERR(new)) { unlock_page(page); @@ -132,9 +132,9 @@ int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode, nfs_list_add_request(new, &one_request); if (NFS_SERVER(inode)->rsize < PAGE_CACHE_SIZE) - nfs_pagein_multi(inode, &one_request, 1, len, 0); + nfs_pagein_multi(inode, &one_request, 1, len, 0, NULL); else - nfs_pagein_one(inode, &one_request, 1, len, 0); + nfs_pagein_one(inode, &one_request, 1, len, 0, NULL); return 0; } @@ -160,7 +160,8 @@ static void nfs_readpage_release(struct nfs_page *req) */ static int nfs_read_rpcsetup(struct nfs_page *req, struct nfs_read_data *data, const struct rpc_call_ops *call_ops, - unsigned int count, unsigned int offset) + unsigned int count, unsigned int offset, + struct pnfs_layout_segment *lseg) { struct inode *inode = req->wb_context->path.dentry->d_inode; int swap_flags = IS_SWAPFILE(inode) ? NFS_RPC_SWAPFLAGS : 0; @@ -183,6 +184,7 @@ static int nfs_read_rpcsetup(struct nfs_page *req, struct nfs_read_data *data, data->req = req; data->inode = inode; data->cred = msg.rpc_cred; + data->lseg = get_lseg(lseg); data->args.fh = NFS_FH(inode); data->args.offset = req_offset(req) + offset; @@ -240,7 +242,7 @@ nfs_async_read_error(struct list_head *head) * won't see the new data until our attribute cache is updated. This is more * or less conventional NFS client behavior. */ -static int nfs_pagein_multi(struct inode *inode, struct list_head *head, unsigned int npages, size_t count, int flags) +static int nfs_pagein_multi(struct inode *inode, struct list_head *head, unsigned int npages, size_t count, int flags, struct pnfs_layout_segment *lseg) { struct nfs_page *req = nfs_list_entry(head->next); struct page *page = req->wb_page; @@ -266,6 +268,8 @@ static int nfs_pagein_multi(struct inode *inode, struct list_head *head, unsigne } while(nbytes != 0); atomic_set(&req->wb_complete, requests); + /* We know lseg==NULL */ + lseg = pnfs_update_layout(inode, req->wb_context, IOMODE_READ); ClearPageError(page); offset = 0; nbytes = count; @@ -280,12 +284,13 @@ static int nfs_pagein_multi(struct inode *inode, struct list_head *head, unsigne if (nbytes < rsize) rsize = nbytes; ret2 = nfs_read_rpcsetup(req, data, &nfs_read_partial_ops, - rsize, offset); + rsize, offset, lseg); if (ret == 0) ret = ret2; offset += rsize; nbytes -= rsize; } while (nbytes != 0); + put_lseg(lseg); return ret; @@ -300,7 +305,7 @@ out_bad: return -ENOMEM; } -static int nfs_pagein_one(struct inode *inode, struct list_head *head, unsigned int npages, size_t count, int flags) +static int nfs_pagein_one(struct inode *inode, struct list_head *head, unsigned int npages, size_t count, int flags, struct pnfs_layout_segment *lseg) { struct nfs_page *req; struct page **pages; @@ -308,8 +313,10 @@ static int nfs_pagein_one(struct inode *inode, struct list_head *head, unsigned int ret = -ENOMEM; data = nfs_readdata_alloc(npages); - if (!data) - goto out_bad; + if (!data) { + nfs_async_read_error(head); + goto out; + } pages = data->pagevec; while (!list_empty(head)) { @@ -320,10 +327,12 @@ static int nfs_pagein_one(struct inode *inode, struct list_head *head, unsigned *pages++ = req->wb_page; } req = nfs_list_entry(data->pages.next); + if ((!lseg) && list_is_singular(&data->pages)) + lseg = pnfs_update_layout(inode, req->wb_context, IOMODE_READ); - return nfs_read_rpcsetup(req, data, &nfs_read_full_ops, count, 0); -out_bad: - nfs_async_read_error(head); + ret = nfs_read_rpcsetup(req, data, &nfs_read_full_ops, count, 0, lseg); +out: + put_lseg(lseg); return ret; } @@ -625,7 +634,6 @@ int nfs_readpages(struct file *filp, struct address_space *mapping, if (ret == 0) goto read_complete; /* all pages were read */ - pnfs_update_layout(inode, desc.ctx, IOMODE_READ); pnfs_pageio_init_read(&pgio, inode); if (rsize < PAGE_CACHE_SIZE) nfs_pageio_init(&pgio, inode, nfs_pagein_multi, rsize, 0); diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 40143c4747a5..f033fa0d7d33 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -880,7 +880,7 @@ static void nfs_redirty_request(struct nfs_page *req) * Generate multiple small requests to write out a single * contiguous dirty area on one page. */ -static int nfs_flush_multi(struct inode *inode, struct list_head *head, unsigned int npages, size_t count, int how) +static int nfs_flush_multi(struct inode *inode, struct list_head *head, unsigned int npages, size_t count, int how, struct pnfs_layout_segment *lseg) { struct nfs_page *req = nfs_list_entry(head->next); struct page *page = req->wb_page; @@ -947,7 +947,7 @@ out_bad: * This is the case if nfs_updatepage detects a conflicting request * that has been written but not committed. */ -static int nfs_flush_one(struct inode *inode, struct list_head *head, unsigned int npages, size_t count, int how) +static int nfs_flush_one(struct inode *inode, struct list_head *head, unsigned int npages, size_t count, int how, struct pnfs_layout_segment *lseg) { struct nfs_page *req; struct page **pages; diff --git a/include/linux/nfs_page.h b/include/linux/nfs_page.h index 4eaf27a1282d..ba88ff4f8186 100644 --- a/include/linux/nfs_page.h +++ b/include/linux/nfs_page.h @@ -59,7 +59,7 @@ struct nfs_pageio_descriptor { unsigned int pg_base; struct inode *pg_inode; - int (*pg_doio)(struct inode *, struct list_head *, unsigned int, size_t, int); + int (*pg_doio)(struct inode *, struct list_head *, unsigned int, size_t, int, struct pnfs_layout_segment *); int pg_ioflags; int pg_error; struct pnfs_layout_segment *pg_lseg; @@ -81,7 +81,7 @@ extern int nfs_scan_list(struct nfs_inode *nfsi, struct list_head *dst, pgoff_t idx_start, unsigned int npages, int tag); extern void nfs_pageio_init(struct nfs_pageio_descriptor *desc, struct inode *inode, - int (*doio)(struct inode *, struct list_head *, unsigned int, size_t, int), + int (*doio)(struct inode *, struct list_head *, unsigned int, size_t, int, struct pnfs_layout_segment *), size_t bsize, int how); extern int nfs_pageio_add_request(struct nfs_pageio_descriptor *, diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index d159fe733381..560923e28723 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -1017,6 +1017,7 @@ struct nfs_read_data { struct nfs_readargs args; struct nfs_readres res; unsigned long timestamp; /* For lease renewal */ + struct pnfs_layout_segment *lseg; struct page *page_array[NFS_PAGEVEC_SIZE]; }; From 64419a9b20938d9070fdd8c58c2fa23c911915f8 Mon Sep 17 00:00:00 2001 From: Andy Adamson Date: Tue, 1 Mar 2011 01:34:16 +0000 Subject: [PATCH 18/54] NFSv4.1: generic read Separate the rpc run portion of nfs_read_rpcsetup into a new function nfs_initiate_read that is called for normal NFS I/O. Add a pNFS read_pagelist function that is called instead of nfs_intitate_read for pNFS reads. Signed-off-by: Andy Adamson Signed-off-by: Boaz Harrosh Signed-off-by: Dean Hildebrand Signed-off-by: Fred Isaman Signed-off-by: Fred Isaman Signed-off-by: Mike Sager Signed-off-by: Mingyang Guo Signed-off-by: Ricardo Labiaga Signed-off-by: Tao Guo Signed-off-by: Andy Adamson Signed-off-by: Benny Halevy Signed-off-by: Trond Myklebust --- fs/nfs/pnfs.c | 28 ++++++++++++++ fs/nfs/pnfs.h | 20 ++++++++++ fs/nfs/read.c | 77 +++++++++++++++++++++++--------------- include/linux/nfs_iostat.h | 1 + include/linux/nfs_xdr.h | 1 + 5 files changed, 96 insertions(+), 31 deletions(-) diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 77966ecb0a2c..86c154bad5db 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -30,6 +30,7 @@ #include #include "internal.h" #include "pnfs.h" +#include "iostat.h" #define NFSDBG_FACILITY NFSDBG_PNFS @@ -879,6 +880,33 @@ pnfs_pageio_init_read(struct nfs_pageio_descriptor *pgio, struct inode *inode) pgio->pg_test = (ld && ld->pg_test) ? pnfs_read_pg_test : NULL; } +/* + * Call the appropriate parallel I/O subsystem read function. + */ +enum pnfs_try_status +pnfs_try_to_read_data(struct nfs_read_data *rdata, + const struct rpc_call_ops *call_ops) +{ + struct inode *inode = rdata->inode; + struct nfs_server *nfss = NFS_SERVER(inode); + enum pnfs_try_status trypnfs; + + rdata->mds_ops = call_ops; + + dprintk("%s: Reading ino:%lu %u@%llu\n", + __func__, inode->i_ino, rdata->args.count, rdata->args.offset); + + trypnfs = nfss->pnfs_curr_ld->read_pagelist(rdata); + if (trypnfs == PNFS_NOT_ATTEMPTED) { + put_lseg(rdata->lseg); + rdata->lseg = NULL; + } else { + nfs_inc_stats(inode, NFSIOS_PNFS_READ); + } + dprintk("%s End (trypnfs:%d)\n", __func__, trypnfs); + return trypnfs; +} + /* * Device ID cache. Currently supports one layout type per struct nfs_client. * Add layout type to the lookup key to expand to support multiple types. diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index 5107d14db485..585023fabb55 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -45,6 +45,11 @@ struct pnfs_layout_segment { struct pnfs_layout_hdr *pls_layout; }; +enum pnfs_try_status { + PNFS_ATTEMPTED = 0, + PNFS_NOT_ATTEMPTED = 1, +}; + #ifdef CONFIG_NFS_V4_1 #define LAYOUT_NFSV4_1_MODULE_PREFIX "nfs-layouttype4" @@ -70,6 +75,12 @@ struct pnfs_layoutdriver_type { /* test for nfs page cache coalescing */ int (*pg_test)(struct nfs_pageio_descriptor *, struct nfs_page *, struct nfs_page *); + + /* + * Return PNFS_ATTEMPTED to indicate the layout code has attempted + * I/O, else return PNFS_NOT_ATTEMPTED to fall back to normal NFS + */ + enum pnfs_try_status (*read_pagelist) (struct nfs_read_data *nfs_data); }; struct pnfs_layout_hdr { @@ -157,6 +168,8 @@ pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx, enum pnfs_iomode access_type); void set_pnfs_layoutdriver(struct nfs_server *, u32 id); void unset_pnfs_layoutdriver(struct nfs_server *); +enum pnfs_try_status pnfs_try_to_read_data(struct nfs_read_data *, + const struct rpc_call_ops *); void pnfs_pageio_init_read(struct nfs_pageio_descriptor *, struct inode *); int pnfs_layout_process(struct nfs4_layoutget *lgp); void pnfs_free_lseg_list(struct list_head *tmp_list); @@ -227,6 +240,13 @@ pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx, return NULL; } +static inline enum pnfs_try_status +pnfs_try_to_read_data(struct nfs_read_data *data, + const struct rpc_call_ops *call_ops) +{ + return PNFS_NOT_ATTEMPTED; +} + static inline bool pnfs_roc(struct inode *ino) { diff --git a/fs/nfs/read.c b/fs/nfs/read.c index 6dc9eaf00e5c..4127a1c0eec6 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -18,6 +18,8 @@ #include #include #include +#include +#include #include #include "pnfs.h" @@ -155,6 +157,45 @@ static void nfs_readpage_release(struct nfs_page *req) nfs_release_request(req); } +static int nfs_initiate_read(struct nfs_read_data *data, struct rpc_clnt *clnt, + const struct rpc_call_ops *call_ops) +{ + struct inode *inode = data->inode; + int swap_flags = IS_SWAPFILE(inode) ? NFS_RPC_SWAPFLAGS : 0; + struct rpc_task *task; + struct rpc_message msg = { + .rpc_argp = &data->args, + .rpc_resp = &data->res, + .rpc_cred = data->cred, + }; + struct rpc_task_setup task_setup_data = { + .task = &data->task, + .rpc_client = clnt, + .rpc_message = &msg, + .callback_ops = call_ops, + .callback_data = data, + .workqueue = nfsiod_workqueue, + .flags = RPC_TASK_ASYNC | swap_flags, + }; + + /* Set up the initial task struct. */ + NFS_PROTO(inode)->read_setup(data, &msg); + + dprintk("NFS: %5u initiated read call (req %s/%lld, %u bytes @ " + "offset %llu)\n", + data->task.tk_pid, + inode->i_sb->s_id, + (long long)NFS_FILEID(inode), + data->args.count, + (unsigned long long)data->args.offset); + + task = rpc_run_task(&task_setup_data); + if (IS_ERR(task)) + return PTR_ERR(task); + rpc_put_task(task); + return 0; +} + /* * Set up the NFS read request struct */ @@ -164,26 +205,10 @@ static int nfs_read_rpcsetup(struct nfs_page *req, struct nfs_read_data *data, struct pnfs_layout_segment *lseg) { struct inode *inode = req->wb_context->path.dentry->d_inode; - int swap_flags = IS_SWAPFILE(inode) ? NFS_RPC_SWAPFLAGS : 0; - struct rpc_task *task; - struct rpc_message msg = { - .rpc_argp = &data->args, - .rpc_resp = &data->res, - .rpc_cred = req->wb_context->cred, - }; - struct rpc_task_setup task_setup_data = { - .task = &data->task, - .rpc_client = NFS_CLIENT(inode), - .rpc_message = &msg, - .callback_ops = call_ops, - .callback_data = data, - .workqueue = nfsiod_workqueue, - .flags = RPC_TASK_ASYNC | swap_flags, - }; data->req = req; data->inode = inode; - data->cred = msg.rpc_cred; + data->cred = req->wb_context->cred; data->lseg = get_lseg(lseg); data->args.fh = NFS_FH(inode); @@ -199,21 +224,11 @@ static int nfs_read_rpcsetup(struct nfs_page *req, struct nfs_read_data *data, data->res.eof = 0; nfs_fattr_init(&data->fattr); - /* Set up the initial task struct. */ - NFS_PROTO(inode)->read_setup(data, &msg); + if (data->lseg && + (pnfs_try_to_read_data(data, call_ops) == PNFS_ATTEMPTED)) + return 0; - dprintk("NFS: %5u initiated read call (req %s/%Ld, %u bytes @ offset %Lu)\n", - data->task.tk_pid, - inode->i_sb->s_id, - (long long)NFS_FILEID(inode), - count, - (unsigned long long)data->args.offset); - - task = rpc_run_task(&task_setup_data); - if (IS_ERR(task)) - return PTR_ERR(task); - rpc_put_task(task); - return 0; + return nfs_initiate_read(data, NFS_CLIENT(inode), call_ops); } static void diff --git a/include/linux/nfs_iostat.h b/include/linux/nfs_iostat.h index 68b10f5f8907..37a143732d02 100644 --- a/include/linux/nfs_iostat.h +++ b/include/linux/nfs_iostat.h @@ -113,6 +113,7 @@ enum nfs_stat_eventcounters { NFSIOS_SHORTREAD, NFSIOS_SHORTWRITE, NFSIOS_DELAY, + NFSIOS_PNFS_READ, __NFSIOS_COUNTSMAX, }; diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 560923e28723..9d2b9dae277d 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -1018,6 +1018,7 @@ struct nfs_read_data { struct nfs_readres res; unsigned long timestamp; /* For lease renewal */ struct pnfs_layout_segment *lseg; + const struct rpc_call_ops *mds_ops; struct page *page_array[NFS_PAGEVEC_SIZE]; }; From d83217c13531fd59730d77b5c2284e90e56c0a50 Mon Sep 17 00:00:00 2001 From: Andy Adamson Date: Tue, 1 Mar 2011 01:34:17 +0000 Subject: [PATCH 19/54] NFSv4.1: data server connection Introduce a data server set_client and init session following the nfs4_set_client and nfs4_init_session convention. Once a new nfs_client is on the nfs_client_list, the nfs_client cl_cons_state serializes access to creating an nfs_client struct with matching properties. Use the new nfs_get_client() that initializes new clients. Signed-off-by: Andy Adamson Signed-off-by: Trond Myklebust --- fs/nfs/client.c | 41 +++++++++++++++++++++++++ fs/nfs/internal.h | 5 ++++ fs/nfs/nfs4_fs.h | 12 ++++++++ fs/nfs/nfs4filelayoutdev.c | 61 ++++++++++++++++++++++++++++++++++++++ fs/nfs/nfs4proc.c | 29 ++++++++++++++++-- include/linux/nfs_xdr.h | 1 + 6 files changed, 147 insertions(+), 2 deletions(-) diff --git a/fs/nfs/client.c b/fs/nfs/client.c index d5c5bdfa4231..6dd50ac5b545 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -1417,6 +1417,47 @@ error: return error; } +/* + * Set up a pNFS Data Server client. + * + * Return any existing nfs_client that matches server address,port,version + * and minorversion. + * + * For a new nfs_client, use a soft mount (default), a low retrans and a + * low timeout interval so that if a connection is lost, we retry through + * the MDS. + */ +struct nfs_client *nfs4_set_ds_client(struct nfs_client* mds_clp, + const struct sockaddr *ds_addr, + int ds_addrlen, int ds_proto) +{ + struct nfs_client_initdata cl_init = { + .addr = ds_addr, + .addrlen = ds_addrlen, + .rpc_ops = &nfs_v4_clientops, + .proto = ds_proto, + .minorversion = mds_clp->cl_minorversion, + }; + struct rpc_timeout ds_timeout = { + .to_initval = 15 * HZ, + .to_maxval = 15 * HZ, + .to_retries = 1, + .to_exponential = 1, + }; + struct nfs_client *clp; + + /* + * Set an authflavor equual to the MDS value. Use the MDS nfs_client + * cl_ipaddr so as to use the same EXCHANGE_ID co_ownerid as the MDS + * (section 13.1 RFC 5661). + */ + clp = nfs_get_client(&cl_init, &ds_timeout, mds_clp->cl_ipaddr, + mds_clp->cl_rpcclient->cl_auth->au_flavor, 0); + + dprintk("<-- %s %p\n", __func__, clp); + return clp; +} +EXPORT_SYMBOL(nfs4_set_ds_client); /* * Session has been established, and the client marked ready. diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 4d7b3a97e522..5cc92014259e 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -148,6 +148,9 @@ extern struct nfs_server *nfs_clone_server(struct nfs_server *, struct nfs_fattr *); extern void nfs_mark_client_ready(struct nfs_client *clp, int state); extern int nfs4_check_client_ready(struct nfs_client *clp); +extern struct nfs_client *nfs4_set_ds_client(struct nfs_client* mds_clp, + const struct sockaddr *ds_addr, + int ds_addrlen, int ds_proto); #ifdef CONFIG_PROC_FS extern int __init nfs_fs_proc_init(void); extern void nfs_fs_proc_exit(void); @@ -213,6 +216,8 @@ extern const u32 nfs41_maxwrite_overhead; extern struct rpc_procinfo nfs4_procedures[]; #endif +extern int nfs4_init_ds_session(struct nfs_client *clp); + /* proc.c */ void nfs_close_context(struct nfs_open_context *ctx, int is_sync); extern int nfs_init_client(struct nfs_client *clp, diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index d4cfacc40003..7058a9f75e7f 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -266,6 +266,12 @@ is_ds_only_client(struct nfs_client *clp) return (clp->cl_exchange_flags & EXCHGID4_FLAG_MASK_PNFS) == EXCHGID4_FLAG_USE_PNFS_DS; } + +static inline bool +is_ds_client(struct nfs_client *clp) +{ + return clp->cl_exchange_flags & EXCHGID4_FLAG_USE_PNFS_DS; +} #else /* CONFIG_NFS_v4_1 */ static inline struct nfs4_session *nfs4_get_session(const struct nfs_server *server) { @@ -289,6 +295,12 @@ is_ds_only_client(struct nfs_client *clp) { return false; } + +static inline bool +is_ds_client(struct nfs_client *clp) +{ + return false; +} #endif /* CONFIG_NFS_V4_1 */ extern const struct nfs4_minor_version_ops *nfs_v4_minor_ops[]; diff --git a/fs/nfs/nfs4filelayoutdev.c b/fs/nfs/nfs4filelayoutdev.c index b73c34375f60..8bc91fb8b6fa 100644 --- a/fs/nfs/nfs4filelayoutdev.c +++ b/fs/nfs/nfs4filelayoutdev.c @@ -104,6 +104,67 @@ _data_server_lookup_locked(u32 ip_addr, u32 port) return NULL; } +/* + * Create an rpc connection to the nfs4_pnfs_ds data server + * Currently only support IPv4 + */ +static int +nfs4_ds_connect(struct nfs_server *mds_srv, struct nfs4_pnfs_ds *ds) +{ + struct nfs_client *clp; + struct sockaddr_in sin; + int status = 0; + + dprintk("--> %s ip:port %x:%hu au_flavor %d\n", __func__, + ntohl(ds->ds_ip_addr), ntohs(ds->ds_port), + mds_srv->nfs_client->cl_rpcclient->cl_auth->au_flavor); + + sin.sin_family = AF_INET; + sin.sin_addr.s_addr = ds->ds_ip_addr; + sin.sin_port = ds->ds_port; + + clp = nfs4_set_ds_client(mds_srv->nfs_client, (struct sockaddr *)&sin, + sizeof(sin), IPPROTO_TCP); + if (IS_ERR(clp)) { + status = PTR_ERR(clp); + goto out; + } + + if ((clp->cl_exchange_flags & EXCHGID4_FLAG_MASK_PNFS) != 0) { + if (!is_ds_client(clp)) { + status = -ENODEV; + goto out_put; + } + ds->ds_clp = clp; + dprintk("%s [existing] ip=%x, port=%hu\n", __func__, + ntohl(ds->ds_ip_addr), ntohs(ds->ds_port)); + goto out; + } + + /* + * Do not set NFS_CS_CHECK_LEASE_TIME instead set the DS lease to + * be equal to the MDS lease. Renewal is scheduled in create_session. + */ + spin_lock(&mds_srv->nfs_client->cl_lock); + clp->cl_lease_time = mds_srv->nfs_client->cl_lease_time; + spin_unlock(&mds_srv->nfs_client->cl_lock); + clp->cl_last_renewal = jiffies; + + /* New nfs_client */ + status = nfs4_init_ds_session(clp); + if (status) + goto out_put; + + ds->ds_clp = clp; + dprintk("%s [new] ip=%x, port=%hu\n", __func__, ntohl(ds->ds_ip_addr), + ntohs(ds->ds_port)); +out: + return status; +out_put: + nfs_put_client(clp); + goto out; +} + static void destroy_ds(struct nfs4_pnfs_ds *ds) { diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 55a8fc2f3df4..07d1a43f40f5 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -1573,9 +1573,8 @@ static int _nfs4_proc_open(struct nfs4_opendata *data) return 0; } -static int nfs4_recover_expired_lease(struct nfs_server *server) +static int nfs4_client_recover_expired_lease(struct nfs_client *clp) { - struct nfs_client *clp = server->nfs_client; unsigned int loop; int ret; @@ -1592,6 +1591,11 @@ static int nfs4_recover_expired_lease(struct nfs_server *server) return ret; } +static int nfs4_recover_expired_lease(struct nfs_server *server) +{ + return nfs4_client_recover_expired_lease(server->nfs_client); +} + /* * OPEN_EXPIRED: * reclaim state on the server after a network partition. @@ -5118,6 +5122,27 @@ int nfs4_init_session(struct nfs_server *server) return ret; } +int nfs4_init_ds_session(struct nfs_client *clp) +{ + struct nfs4_session *session = clp->cl_session; + int ret; + + if (!test_and_clear_bit(NFS4_SESSION_INITING, &session->session_state)) + return 0; + + ret = nfs4_client_recover_expired_lease(clp); + if (!ret) + /* Test for the DS role */ + if (!is_ds_client(clp)) + ret = -ENODEV; + if (!ret) + ret = nfs4_check_client_ready(clp); + return ret; + +} +EXPORT_SYMBOL_GPL(nfs4_init_ds_session); + + /* * Renew the cl_session lease. */ diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 9d2b9dae277d..c66ff7fe1b6b 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -1018,6 +1018,7 @@ struct nfs_read_data { struct nfs_readres res; unsigned long timestamp; /* For lease renewal */ struct pnfs_layout_segment *lseg; + struct nfs_client *ds_clp; /* pNFS data server */ const struct rpc_call_ops *mds_ops; struct page *page_array[NFS_PAGEVEC_SIZE]; }; From cfe7f4120f8b1b9465c333d1e42efd4669b1799f Mon Sep 17 00:00:00 2001 From: Fred Isaman Date: Tue, 1 Mar 2011 01:34:18 +0000 Subject: [PATCH 20/54] NFSv4.1: filelayout i/o helpers Prepare for filelayout_read_pagelist with helper functions that find the correct data server, filehandle, and offset. Signed-off-by: Andy Adamson Signed-off-by: Dean Hildebrand Signed-off-by: Fred Isaman Signed-off-by: Marc Eshel Signed-off-by: Mike Sager Signed-off-by: Oleg Drokin Signed-off-by: Tao Guo Signed-off-by: Tigran Mkrtchyan Signed-off-by: Tigran Mkrtchyan Signed-off-by: Andy Adamson Signed-off-by: Benny Halevy Signed-off-by: Trond Myklebust --- fs/nfs/nfs4filelayout.c | 34 +++++++++++++++++++ fs/nfs/nfs4filelayout.h | 7 ++++ fs/nfs/nfs4filelayoutdev.c | 67 ++++++++++++++++++++++++++++++++++++++ 3 files changed, 108 insertions(+) diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c index 0efe8cbd9e3c..ed833705dcee 100644 --- a/fs/nfs/nfs4filelayout.c +++ b/fs/nfs/nfs4filelayout.c @@ -66,6 +66,40 @@ filelayout_clear_layoutdriver(struct nfs_server *nfss) return 0; } +static loff_t +filelayout_get_dense_offset(struct nfs4_filelayout_segment *flseg, + loff_t offset) +{ + u32 stripe_width = flseg->stripe_unit * flseg->dsaddr->stripe_count; + u64 tmp; + + offset -= flseg->pattern_offset; + tmp = offset; + do_div(tmp, stripe_width); + + return tmp * flseg->stripe_unit + do_div(offset, flseg->stripe_unit); +} + +/* This function is used by the layout driver to calculate the + * offset of the file on the dserver based on whether the + * layout type is STRIPE_DENSE or STRIPE_SPARSE + */ +static loff_t +filelayout_get_dserver_offset(struct pnfs_layout_segment *lseg, loff_t offset) +{ + struct nfs4_filelayout_segment *flseg = FILELAYOUT_LSEG(lseg); + + switch (flseg->stripe_type) { + case STRIPE_SPARSE: + return offset; + + case STRIPE_DENSE: + return filelayout_get_dense_offset(flseg, offset); + } + + BUG(); +} + /* * filelayout_check_layout() * diff --git a/fs/nfs/nfs4filelayout.h b/fs/nfs/nfs4filelayout.h index bbf60dd2ab9d..9fef76e04936 100644 --- a/fs/nfs/nfs4filelayout.h +++ b/fs/nfs/nfs4filelayout.h @@ -83,9 +83,16 @@ FILELAYOUT_LSEG(struct pnfs_layout_segment *lseg) generic_hdr); } +extern struct nfs_fh * +nfs4_fl_select_ds_fh(struct pnfs_layout_segment *lseg, u32 j); + extern void nfs4_fl_free_deviceid_callback(struct pnfs_deviceid_node *); extern void print_ds(struct nfs4_pnfs_ds *ds); extern void print_deviceid(struct nfs4_deviceid *dev_id); +u32 nfs4_fl_calc_j_index(struct pnfs_layout_segment *lseg, loff_t offset); +u32 nfs4_fl_calc_ds_index(struct pnfs_layout_segment *lseg, u32 j); +struct nfs4_pnfs_ds *nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg, + u32 ds_idx); extern struct nfs4_file_layout_dsaddr * nfs4_fl_find_get_deviceid(struct nfs_client *, struct nfs4_deviceid *dev_id); struct nfs4_file_layout_dsaddr * diff --git a/fs/nfs/nfs4filelayoutdev.c b/fs/nfs/nfs4filelayoutdev.c index 8bc91fb8b6fa..f466fed2f466 100644 --- a/fs/nfs/nfs4filelayoutdev.c +++ b/fs/nfs/nfs4filelayoutdev.c @@ -516,3 +516,70 @@ nfs4_fl_find_get_deviceid(struct nfs_client *clp, struct nfs4_deviceid *id) return (d == NULL) ? NULL : container_of(d, struct nfs4_file_layout_dsaddr, deviceid); } + +/* + * Want res = (offset - layout->pattern_offset)/ layout->stripe_unit + * Then: ((res + fsi) % dsaddr->stripe_count) + */ +u32 +nfs4_fl_calc_j_index(struct pnfs_layout_segment *lseg, loff_t offset) +{ + struct nfs4_filelayout_segment *flseg = FILELAYOUT_LSEG(lseg); + u64 tmp; + + tmp = offset - flseg->pattern_offset; + do_div(tmp, flseg->stripe_unit); + tmp += flseg->first_stripe_index; + return do_div(tmp, flseg->dsaddr->stripe_count); +} + +u32 +nfs4_fl_calc_ds_index(struct pnfs_layout_segment *lseg, u32 j) +{ + return FILELAYOUT_LSEG(lseg)->dsaddr->stripe_indices[j]; +} + +struct nfs_fh * +nfs4_fl_select_ds_fh(struct pnfs_layout_segment *lseg, u32 j) +{ + struct nfs4_filelayout_segment *flseg = FILELAYOUT_LSEG(lseg); + u32 i; + + if (flseg->stripe_type == STRIPE_SPARSE) { + if (flseg->num_fh == 1) + i = 0; + else if (flseg->num_fh == 0) + /* Use the MDS OPEN fh set in nfs_read_rpcsetup */ + return NULL; + else + i = nfs4_fl_calc_ds_index(lseg, j); + } else + i = j; + return flseg->fh_array[i]; +} + +struct nfs4_pnfs_ds * +nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx) +{ + struct nfs4_file_layout_dsaddr *dsaddr = FILELAYOUT_LSEG(lseg)->dsaddr; + struct nfs4_pnfs_ds *ds = dsaddr->ds_list[ds_idx]; + + if (ds == NULL) { + printk(KERN_ERR "%s: No data server for offset index %d\n", + __func__, ds_idx); + return NULL; + } + + if (!ds->ds_clp) { + int err; + + err = nfs4_ds_connect(NFS_SERVER(lseg->pls_layout->plh_inode), + dsaddr->ds_list[ds_idx]); + if (err) { + printk(KERN_ERR "%s nfs4_ds_connect error %d\n", + __func__, err); + return NULL; + } + } + return ds; +} From dc70d7b3189597f313df7bd2da849cfc39063b15 Mon Sep 17 00:00:00 2001 From: Andy Adamson Date: Tue, 1 Mar 2011 01:34:19 +0000 Subject: [PATCH 21/54] NFSv4.1: filelayout read Attempt a pNFS file layout read by setting up the nfs_read_data struct and calling nfs_initiate_read with the data server rpc client and the filelayout rpc call ops. Error handling is implemented in a subsequent patch. Signed-off-by: Andy Adamson Signed-off-by: Dean Hildebrand Signed-off-by: Fred Isaman Signed-off-by: Fred Isaman Signed-off-by: Mingyang Guo Signed-off-by: Oleg Drokin Signed-off-by: Ricardo Labiaga Tested-by: Guo Mingyang Signed-off-by: Andy Adamson Signed-off-by: Benny Halevy Signed-off-by: Trond Myklebust --- fs/nfs/internal.h | 2 + fs/nfs/nfs4_fs.h | 3 ++ fs/nfs/nfs4filelayout.c | 82 +++++++++++++++++++++++++++++++++++++++++ fs/nfs/nfs4proc.c | 3 +- fs/nfs/read.c | 3 +- include/linux/nfs_xdr.h | 1 + 6 files changed, 92 insertions(+), 2 deletions(-) diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 5cc92014259e..5e9df992cd73 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -271,6 +271,8 @@ extern int nfs4_get_rootfh(struct nfs_server *server, struct nfs_fh *mntfh); #endif /* read.c */ +extern int nfs_initiate_read(struct nfs_read_data *data, struct rpc_clnt *clnt, + const struct rpc_call_ops *call_ops); extern void nfs_read_prepare(struct rpc_task *task, void *calldata); /* write.c */ diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index 7058a9f75e7f..c64be1cff080 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -252,6 +252,9 @@ static inline struct nfs4_session *nfs4_get_session(const struct nfs_server *ser extern int nfs4_setup_sequence(const struct nfs_server *server, struct nfs4_sequence_args *args, struct nfs4_sequence_res *res, int cache_reply, struct rpc_task *task); +extern int nfs41_setup_sequence(struct nfs4_session *session, + struct nfs4_sequence_args *args, struct nfs4_sequence_res *res, + int cache_reply, struct rpc_task *task); extern void nfs4_destroy_session(struct nfs4_session *session); extern struct nfs4_session *nfs4_alloc_session(struct nfs_client *clp); extern int nfs4_proc_create_session(struct nfs_client *); diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c index ed833705dcee..3608411653dc 100644 --- a/fs/nfs/nfs4filelayout.c +++ b/fs/nfs/nfs4filelayout.c @@ -100,6 +100,87 @@ filelayout_get_dserver_offset(struct pnfs_layout_segment *lseg, loff_t offset) BUG(); } +/* + * Call ops for the async read/write cases + * In the case of dense layouts, the offset needs to be reset to its + * original value. + */ +static void filelayout_read_prepare(struct rpc_task *task, void *data) +{ + struct nfs_read_data *rdata = (struct nfs_read_data *)data; + + if (nfs41_setup_sequence(rdata->ds_clp->cl_session, + &rdata->args.seq_args, &rdata->res.seq_res, + 0, task)) + return; + + rpc_call_start(task); +} + +static void filelayout_read_call_done(struct rpc_task *task, void *data) +{ + struct nfs_read_data *rdata = (struct nfs_read_data *)data; + + dprintk("--> %s task->tk_status %d\n", __func__, task->tk_status); + + /* Note this may cause RPC to be resent */ + rdata->mds_ops->rpc_call_done(task, data); +} + +static void filelayout_read_release(void *data) +{ + struct nfs_read_data *rdata = (struct nfs_read_data *)data; + + rdata->mds_ops->rpc_release(data); +} + +struct rpc_call_ops filelayout_read_call_ops = { + .rpc_call_prepare = filelayout_read_prepare, + .rpc_call_done = filelayout_read_call_done, + .rpc_release = filelayout_read_release, +}; + +static enum pnfs_try_status +filelayout_read_pagelist(struct nfs_read_data *data) +{ + struct pnfs_layout_segment *lseg = data->lseg; + struct nfs4_pnfs_ds *ds; + loff_t offset = data->args.offset; + u32 j, idx; + struct nfs_fh *fh; + int status; + + dprintk("--> %s ino %lu pgbase %u req %Zu@%llu\n", + __func__, data->inode->i_ino, + data->args.pgbase, (size_t)data->args.count, offset); + + /* Retrieve the correct rpc_client for the byte range */ + j = nfs4_fl_calc_j_index(lseg, offset); + idx = nfs4_fl_calc_ds_index(lseg, j); + ds = nfs4_fl_prepare_ds(lseg, idx); + if (!ds) { + printk(KERN_ERR "%s: prepare_ds failed, use MDS\n", __func__); + return PNFS_NOT_ATTEMPTED; + } + dprintk("%s USE DS:ip %x %hu\n", __func__, + ntohl(ds->ds_ip_addr), ntohs(ds->ds_port)); + + /* No multipath support. Use first DS */ + data->ds_clp = ds->ds_clp; + fh = nfs4_fl_select_ds_fh(lseg, j); + if (fh) + data->args.fh = fh; + + data->args.offset = filelayout_get_dserver_offset(lseg, offset); + data->mds_offset = offset; + + /* Perform an asynchronous read to ds */ + status = nfs_initiate_read(data, ds->ds_clp->cl_rpcclient, + &filelayout_read_call_ops); + BUG_ON(status != 0); + return PNFS_ATTEMPTED; +} + /* * filelayout_check_layout() * @@ -320,6 +401,7 @@ static struct pnfs_layoutdriver_type filelayout_type = { .alloc_lseg = filelayout_alloc_lseg, .free_lseg = filelayout_free_lseg, .pg_test = filelayout_pg_test, + .read_pagelist = filelayout_read_pagelist, }; static int __init nfs4filelayout_init(void) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 07d1a43f40f5..d09623933302 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -505,7 +505,7 @@ out: return ret_id; } -static int nfs41_setup_sequence(struct nfs4_session *session, +int nfs41_setup_sequence(struct nfs4_session *session, struct nfs4_sequence_args *args, struct nfs4_sequence_res *res, int cache_reply, @@ -571,6 +571,7 @@ static int nfs41_setup_sequence(struct nfs4_session *session, res->sr_status = 1; return 0; } +EXPORT_SYMBOL_GPL(nfs41_setup_sequence); int nfs4_setup_sequence(const struct nfs_server *server, struct nfs4_sequence_args *args, diff --git a/fs/nfs/read.c b/fs/nfs/read.c index 4127a1c0eec6..f4d0fcffcb5a 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -157,7 +157,7 @@ static void nfs_readpage_release(struct nfs_page *req) nfs_release_request(req); } -static int nfs_initiate_read(struct nfs_read_data *data, struct rpc_clnt *clnt, +int nfs_initiate_read(struct nfs_read_data *data, struct rpc_clnt *clnt, const struct rpc_call_ops *call_ops) { struct inode *inode = data->inode; @@ -195,6 +195,7 @@ static int nfs_initiate_read(struct nfs_read_data *data, struct rpc_clnt *clnt, rpc_put_task(task); return 0; } +EXPORT_SYMBOL_GPL(nfs_initiate_read); /* * Set up the NFS read request struct diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index c66ff7fe1b6b..b63faef5052e 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -1020,6 +1020,7 @@ struct nfs_read_data { struct pnfs_layout_segment *lseg; struct nfs_client *ds_clp; /* pNFS data server */ const struct rpc_call_ops *mds_ops; + __u64 mds_offset; struct page *page_array[NFS_PAGEVEC_SIZE]; }; From cbdabc7f8bf14ca1d40ab1cb86f64b3bc09716e8 Mon Sep 17 00:00:00 2001 From: Andy Adamson Date: Tue, 1 Mar 2011 01:34:20 +0000 Subject: [PATCH 22/54] NFSv4.1: filelayout async error handler Use our own async error handler. Mark the layout as failed and retry i/o through the MDS on specified errors. Update the mds_offset in nfs_readpage_retry so that a failed short-read retry to a DS gets correctly resent through the MDS. Signed-off-by: Andy Adamson Signed-off-by: Trond Myklebust --- fs/nfs/internal.h | 1 + fs/nfs/nfs4filelayout.c | 81 +++++++++++++++++++++++++++++++++++++ fs/nfs/nfs4proc.c | 35 +++++++++++++--- fs/nfs/nfs4state.c | 1 + fs/nfs/read.c | 1 + include/linux/nfs_xdr.h | 1 + include/linux/sunrpc/clnt.h | 1 + net/sunrpc/clnt.c | 8 ++++ 8 files changed, 123 insertions(+), 6 deletions(-) diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 5e9df992cd73..1a3228e9ea22 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -285,6 +285,7 @@ extern int nfs_migrate_page(struct address_space *, #endif /* nfs4proc.c */ +extern void nfs4_reset_read(struct rpc_task *task, struct nfs_read_data *data); extern int nfs4_init_client(struct nfs_client *clp, const struct rpc_timeout *timeparms, const char *ip_addr, diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c index 3608411653dc..6a424c19abea 100644 --- a/fs/nfs/nfs4filelayout.c +++ b/fs/nfs/nfs4filelayout.c @@ -40,6 +40,8 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Dean Hildebrand "); MODULE_DESCRIPTION("The NFSv4 file layout driver"); +#define FILELAYOUT_POLL_RETRY_MAX (15*HZ) + static int filelayout_set_layoutdriver(struct nfs_server *nfss) { @@ -100,6 +102,83 @@ filelayout_get_dserver_offset(struct pnfs_layout_segment *lseg, loff_t offset) BUG(); } +/* For data server errors we don't recover from */ +static void +filelayout_set_lo_fail(struct pnfs_layout_segment *lseg) +{ + if (lseg->pls_range.iomode == IOMODE_RW) { + dprintk("%s Setting layout IOMODE_RW fail bit\n", __func__); + set_bit(lo_fail_bit(IOMODE_RW), &lseg->pls_layout->plh_flags); + } else { + dprintk("%s Setting layout IOMODE_READ fail bit\n", __func__); + set_bit(lo_fail_bit(IOMODE_READ), &lseg->pls_layout->plh_flags); + } +} + +static int filelayout_async_handle_error(struct rpc_task *task, + struct nfs4_state *state, + struct nfs_client *clp, + int *reset) +{ + if (task->tk_status >= 0) + return 0; + + *reset = 0; + + switch (task->tk_status) { + case -NFS4ERR_BADSESSION: + case -NFS4ERR_BADSLOT: + case -NFS4ERR_BAD_HIGH_SLOT: + case -NFS4ERR_DEADSESSION: + case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION: + case -NFS4ERR_SEQ_FALSE_RETRY: + case -NFS4ERR_SEQ_MISORDERED: + dprintk("%s ERROR %d, Reset session. Exchangeid " + "flags 0x%x\n", __func__, task->tk_status, + clp->cl_exchange_flags); + nfs4_schedule_session_recovery(clp->cl_session); + break; + case -NFS4ERR_DELAY: + case -NFS4ERR_GRACE: + case -EKEYEXPIRED: + rpc_delay(task, FILELAYOUT_POLL_RETRY_MAX); + break; + default: + dprintk("%s DS error. Retry through MDS %d\n", __func__, + task->tk_status); + *reset = 1; + break; + } + task->tk_status = 0; + return -EAGAIN; +} + +/* NFS_PROTO call done callback routines */ + +static int filelayout_read_done_cb(struct rpc_task *task, + struct nfs_read_data *data) +{ + struct nfs_client *clp = data->ds_clp; + int reset = 0; + + dprintk("%s DS read\n", __func__); + + if (filelayout_async_handle_error(task, data->args.context->state, + data->ds_clp, &reset) == -EAGAIN) { + dprintk("%s calling restart ds_clp %p ds_clp->cl_session %p\n", + __func__, data->ds_clp, data->ds_clp->cl_session); + if (reset) { + filelayout_set_lo_fail(data->lseg); + nfs4_reset_read(task, data); + clp = NFS_SERVER(data->inode)->nfs_client; + } + nfs_restart_rpc(task, clp); + return -EAGAIN; + } + + return 0; +} + /* * Call ops for the async read/write cases * In the case of dense layouts, the offset needs to be reset to its @@ -109,6 +188,8 @@ static void filelayout_read_prepare(struct rpc_task *task, void *data) { struct nfs_read_data *rdata = (struct nfs_read_data *)data; + rdata->read_done_cb = filelayout_read_done_cb; + if (nfs41_setup_sequence(rdata->ds_clp->cl_session, &rdata->args.seq_args, &rdata->res.seq_res, 0, task)) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index d09623933302..1dc809039448 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -3074,15 +3074,10 @@ static int nfs4_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle, return err; } -static int nfs4_read_done(struct rpc_task *task, struct nfs_read_data *data) +static int nfs4_read_done_cb(struct rpc_task *task, struct nfs_read_data *data) { struct nfs_server *server = NFS_SERVER(data->inode); - dprintk("--> %s\n", __func__); - - if (!nfs4_sequence_done(task, &data->res.seq_res)) - return -EAGAIN; - if (nfs4_async_handle_error(task, server, data->args.context->state) == -EAGAIN) { nfs_restart_rpc(task, server->nfs_client); return -EAGAIN; @@ -3094,12 +3089,40 @@ static int nfs4_read_done(struct rpc_task *task, struct nfs_read_data *data) return 0; } +static int nfs4_read_done(struct rpc_task *task, struct nfs_read_data *data) +{ + + dprintk("--> %s\n", __func__); + + if (!nfs4_sequence_done(task, &data->res.seq_res)) + return -EAGAIN; + + return data->read_done_cb(task, data); +} + static void nfs4_proc_read_setup(struct nfs_read_data *data, struct rpc_message *msg) { data->timestamp = jiffies; + data->read_done_cb = nfs4_read_done_cb; msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READ]; } +/* Reset the the nfs_read_data to send the read to the MDS. */ +void nfs4_reset_read(struct rpc_task *task, struct nfs_read_data *data) +{ + dprintk("%s Reset task for i/o through\n", __func__); + put_lseg(data->lseg); + data->lseg = NULL; + /* offsets will differ in the dense stripe case */ + data->args.offset = data->mds_offset; + data->ds_clp = NULL; + data->args.fh = NFS_FH(data->inode); + data->read_done_cb = nfs4_read_done_cb; + task->tk_ops = data->mds_ops; + rpc_task_reset_client(task, NFS_CLIENT(data->inode)); +} +EXPORT_SYMBOL_GPL(nfs4_reset_read); + static int nfs4_write_done(struct rpc_task *task, struct nfs_write_data *data) { struct inode *inode = data->inode; diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 69c836373124..ab1bf5bb021f 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -1453,6 +1453,7 @@ void nfs4_schedule_session_recovery(struct nfs4_session *session) { nfs4_schedule_lease_recovery(session->clp); } +EXPORT_SYMBOL_GPL(nfs4_schedule_session_recovery); void nfs41_handle_recall_slot(struct nfs_client *clp) { diff --git a/fs/nfs/read.c b/fs/nfs/read.c index f4d0fcffcb5a..f40c7f4dc16b 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -391,6 +391,7 @@ static void nfs_readpage_retry(struct rpc_task *task, struct nfs_read_data *data return; /* Yes, so retry the read at the end of the data */ + data->mds_offset += resp->count; argp->offset += resp->count; argp->pgbase += resp->count; argp->count -= resp->count; diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index b63faef5052e..eb0e87084353 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -1020,6 +1020,7 @@ struct nfs_read_data { struct pnfs_layout_segment *lseg; struct nfs_client *ds_clp; /* pNFS data server */ const struct rpc_call_ops *mds_ops; + int (*read_done_cb) (struct rpc_task *task, struct nfs_read_data *data); __u64 mds_offset; struct page *page_array[NFS_PAGEVEC_SIZE]; }; diff --git a/include/linux/sunrpc/clnt.h b/include/linux/sunrpc/clnt.h index ef9476a36ff7..db7bcaf7c5bd 100644 --- a/include/linux/sunrpc/clnt.h +++ b/include/linux/sunrpc/clnt.h @@ -129,6 +129,7 @@ struct rpc_create_args { struct rpc_clnt *rpc_create(struct rpc_create_args *args); struct rpc_clnt *rpc_bind_new_program(struct rpc_clnt *, struct rpc_program *, u32); +void rpc_task_reset_client(struct rpc_task *task, struct rpc_clnt *clnt); struct rpc_clnt *rpc_clone_client(struct rpc_clnt *); void rpc_shutdown_client(struct rpc_clnt *); void rpc_release_client(struct rpc_clnt *); diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 8b5a6b40d37c..edaf56e2ed29 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -597,6 +597,14 @@ void rpc_task_set_client(struct rpc_task *task, struct rpc_clnt *clnt) } } +void rpc_task_reset_client(struct rpc_task *task, struct rpc_clnt *clnt) +{ + rpc_task_release_client(task); + rpc_task_set_client(task, clnt); +} +EXPORT_SYMBOL_GPL(rpc_task_reset_client); + + static void rpc_task_set_rpc_message(struct rpc_task *task, const struct rpc_message *msg) { From ea8eecdd11ee6becd09c095c8efa88aa7df95961 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 1 Mar 2011 01:34:21 +0000 Subject: [PATCH 23/54] NFSv4.1 move deviceid cache to filelayout driver No need for generic cache with only one user. Keep a simple hash of deviceids in the filelayout driver. Signed-off-by: Christoph Hellwig Acked-by: Andy Adamson Signed-off-by: Trond Myklebust --- fs/nfs/nfs4filelayout.c | 46 ++---------- fs/nfs/nfs4filelayout.h | 8 +- fs/nfs/nfs4filelayoutdev.c | 106 ++++++++++++++++++-------- fs/nfs/pnfs.c | 147 +------------------------------------ fs/nfs/pnfs.h | 48 ------------ include/linux/nfs_fs_sb.h | 1 - 6 files changed, 92 insertions(+), 264 deletions(-) diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c index 6a424c19abea..a922e75af42e 100644 --- a/fs/nfs/nfs4filelayout.c +++ b/fs/nfs/nfs4filelayout.c @@ -42,32 +42,6 @@ MODULE_DESCRIPTION("The NFSv4 file layout driver"); #define FILELAYOUT_POLL_RETRY_MAX (15*HZ) -static int -filelayout_set_layoutdriver(struct nfs_server *nfss) -{ - int status = pnfs_alloc_init_deviceid_cache(nfss->nfs_client, - nfs4_fl_free_deviceid_callback); - if (status) { - printk(KERN_WARNING "%s: deviceid cache could not be " - "initialized\n", __func__); - return status; - } - dprintk("%s: deviceid cache has been initialized successfully\n", - __func__); - return 0; -} - -/* Clear out the layout by destroying its device list */ -static int -filelayout_clear_layoutdriver(struct nfs_server *nfss) -{ - dprintk("--> %s\n", __func__); - - if (nfss->nfs_client->cl_devid_cache) - pnfs_put_deviceid_cache(nfss->nfs_client); - return 0; -} - static loff_t filelayout_get_dense_offset(struct nfs4_filelayout_segment *flseg, loff_t offset) @@ -295,7 +269,7 @@ filelayout_check_layout(struct pnfs_layout_hdr *lo, } /* find and reference the deviceid */ - dsaddr = nfs4_fl_find_get_deviceid(nfss->nfs_client, id); + dsaddr = nfs4_fl_find_get_deviceid(id); if (dsaddr == NULL) { dsaddr = get_device_info(lo->plh_inode, id); if (dsaddr == NULL) @@ -330,7 +304,7 @@ out: dprintk("--> %s returns %d\n", __func__, status); return status; out_put: - pnfs_put_deviceid(nfss->nfs_client->cl_devid_cache, &dsaddr->deviceid); + nfs4_fl_put_deviceid(dsaddr); goto out; } @@ -439,12 +413,10 @@ filelayout_alloc_lseg(struct pnfs_layout_hdr *layoutid, static void filelayout_free_lseg(struct pnfs_layout_segment *lseg) { - struct nfs_server *nfss = NFS_SERVER(lseg->pls_layout->plh_inode); struct nfs4_filelayout_segment *fl = FILELAYOUT_LSEG(lseg); dprintk("--> %s\n", __func__); - pnfs_put_deviceid(nfss->nfs_client->cl_devid_cache, - &fl->dsaddr->deviceid); + nfs4_fl_put_deviceid(fl->dsaddr); _filelayout_free_lseg(fl); } @@ -474,13 +446,11 @@ filelayout_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, } static struct pnfs_layoutdriver_type filelayout_type = { - .id = LAYOUT_NFSV4_1_FILES, - .name = "LAYOUT_NFSV4_1_FILES", - .owner = THIS_MODULE, - .set_layoutdriver = filelayout_set_layoutdriver, - .clear_layoutdriver = filelayout_clear_layoutdriver, - .alloc_lseg = filelayout_alloc_lseg, - .free_lseg = filelayout_free_lseg, + .id = LAYOUT_NFSV4_1_FILES, + .name = "LAYOUT_NFSV4_1_FILES", + .owner = THIS_MODULE, + .alloc_lseg = filelayout_alloc_lseg, + .free_lseg = filelayout_free_lseg, .pg_test = filelayout_pg_test, .read_pagelist = filelayout_read_pagelist, }; diff --git a/fs/nfs/nfs4filelayout.h b/fs/nfs/nfs4filelayout.h index 9fef76e04936..23f1e1e2a0f5 100644 --- a/fs/nfs/nfs4filelayout.h +++ b/fs/nfs/nfs4filelayout.h @@ -56,7 +56,9 @@ struct nfs4_pnfs_ds { }; struct nfs4_file_layout_dsaddr { - struct pnfs_deviceid_node deviceid; + struct hlist_node node; + struct nfs4_deviceid deviceid; + atomic_t ref; u32 stripe_count; u8 *stripe_indices; u32 ds_num; @@ -86,7 +88,6 @@ FILELAYOUT_LSEG(struct pnfs_layout_segment *lseg) extern struct nfs_fh * nfs4_fl_select_ds_fh(struct pnfs_layout_segment *lseg, u32 j); -extern void nfs4_fl_free_deviceid_callback(struct pnfs_deviceid_node *); extern void print_ds(struct nfs4_pnfs_ds *ds); extern void print_deviceid(struct nfs4_deviceid *dev_id); u32 nfs4_fl_calc_j_index(struct pnfs_layout_segment *lseg, loff_t offset); @@ -94,7 +95,8 @@ u32 nfs4_fl_calc_ds_index(struct pnfs_layout_segment *lseg, u32 j); struct nfs4_pnfs_ds *nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx); extern struct nfs4_file_layout_dsaddr * -nfs4_fl_find_get_deviceid(struct nfs_client *, struct nfs4_deviceid *dev_id); +nfs4_fl_find_get_deviceid(struct nfs4_deviceid *dev_id); +extern void nfs4_fl_put_deviceid(struct nfs4_file_layout_dsaddr *dsaddr); struct nfs4_file_layout_dsaddr * get_device_info(struct inode *inode, struct nfs4_deviceid *dev_id); diff --git a/fs/nfs/nfs4filelayoutdev.c b/fs/nfs/nfs4filelayoutdev.c index f466fed2f466..f594ca35a996 100644 --- a/fs/nfs/nfs4filelayoutdev.c +++ b/fs/nfs/nfs4filelayoutdev.c @@ -36,6 +36,30 @@ #define NFSDBG_FACILITY NFSDBG_PNFS_LD +/* + * Device ID RCU cache. A device ID is unique per client ID and layout type. + */ +#define NFS4_FL_DEVICE_ID_HASH_BITS 5 +#define NFS4_FL_DEVICE_ID_HASH_SIZE (1 << NFS4_FL_DEVICE_ID_HASH_BITS) +#define NFS4_FL_DEVICE_ID_HASH_MASK (NFS4_FL_DEVICE_ID_HASH_SIZE - 1) + +static inline u32 +nfs4_fl_deviceid_hash(struct nfs4_deviceid *id) +{ + unsigned char *cptr = (unsigned char *)id->data; + unsigned int nbytes = NFS4_DEVICEID4_SIZE; + u32 x = 0; + + while (nbytes--) { + x *= 37; + x += *cptr++; + } + return x & NFS4_FL_DEVICE_ID_HASH_MASK; +} + +static struct hlist_head filelayout_deviceid_cache[NFS4_FL_DEVICE_ID_HASH_SIZE]; +static DEFINE_SPINLOCK(filelayout_deviceid_lock); + /* * Data server cache * @@ -183,7 +207,7 @@ nfs4_fl_free_deviceid(struct nfs4_file_layout_dsaddr *dsaddr) struct nfs4_pnfs_ds *ds; int i; - print_deviceid(&dsaddr->deviceid.de_id); + print_deviceid(&dsaddr->deviceid); for (i = 0; i < dsaddr->ds_num; i++) { ds = dsaddr->ds_list[i]; @@ -200,15 +224,6 @@ nfs4_fl_free_deviceid(struct nfs4_file_layout_dsaddr *dsaddr) kfree(dsaddr); } -void -nfs4_fl_free_deviceid_callback(struct pnfs_deviceid_node *device) -{ - struct nfs4_file_layout_dsaddr *dsaddr = - container_of(device, struct nfs4_file_layout_dsaddr, deviceid); - - nfs4_fl_free_deviceid(dsaddr); -} - static struct nfs4_pnfs_ds * nfs4_pnfs_ds_add(struct inode *inode, u32 ip_addr, u32 port) { @@ -361,7 +376,7 @@ decode_device(struct inode *ino, struct pnfs_device *pdev) dsaddr->stripe_count = cnt; dsaddr->ds_num = num; - memcpy(&dsaddr->deviceid.de_id, &pdev->dev_id, sizeof(pdev->dev_id)); + memcpy(&dsaddr->deviceid, &pdev->dev_id, sizeof(pdev->dev_id)); /* Go back an read stripe indices */ p = indicesp; @@ -411,28 +426,37 @@ out_err: } /* - * Decode the opaque device specified in 'dev' - * and add it to the list of available devices. - * If the deviceid is already cached, nfs4_add_deviceid will return - * a pointer to the cached struct and throw away the new. + * Decode the opaque device specified in 'dev' and add it to the cache of + * available devices. */ -static struct nfs4_file_layout_dsaddr* +static struct nfs4_file_layout_dsaddr * decode_and_add_device(struct inode *inode, struct pnfs_device *dev) { - struct nfs4_file_layout_dsaddr *dsaddr; - struct pnfs_deviceid_node *d; + struct nfs4_file_layout_dsaddr *d, *new; + long hash; - dsaddr = decode_device(inode, dev); - if (!dsaddr) { + new = decode_device(inode, dev); + if (!new) { printk(KERN_WARNING "%s: Could not decode or add device\n", __func__); return NULL; } - d = pnfs_add_deviceid(NFS_SERVER(inode)->nfs_client->cl_devid_cache, - &dsaddr->deviceid); + spin_lock(&filelayout_deviceid_lock); + d = nfs4_fl_find_get_deviceid(&new->deviceid); + if (d) { + spin_unlock(&filelayout_deviceid_lock); + nfs4_fl_free_deviceid(new); + return d; + } - return container_of(d, struct nfs4_file_layout_dsaddr, deviceid); + INIT_HLIST_NODE(&new->node); + atomic_set(&new->ref, 1); + hash = nfs4_fl_deviceid_hash(&new->deviceid); + hlist_add_head_rcu(&new->node, &filelayout_deviceid_cache[hash]); + spin_unlock(&filelayout_deviceid_lock); + + return new; } /* @@ -507,14 +531,38 @@ out_free: return dsaddr; } -struct nfs4_file_layout_dsaddr * -nfs4_fl_find_get_deviceid(struct nfs_client *clp, struct nfs4_deviceid *id) +void +nfs4_fl_put_deviceid(struct nfs4_file_layout_dsaddr *dsaddr) { - struct pnfs_deviceid_node *d; + if (atomic_dec_and_lock(&dsaddr->ref, &filelayout_deviceid_lock)) { + hlist_del_rcu(&dsaddr->node); + spin_unlock(&filelayout_deviceid_lock); - d = pnfs_find_get_deviceid(clp->cl_devid_cache, id); - return (d == NULL) ? NULL : - container_of(d, struct nfs4_file_layout_dsaddr, deviceid); + synchronize_rcu(); + nfs4_fl_free_deviceid(dsaddr); + } +} + +struct nfs4_file_layout_dsaddr * +nfs4_fl_find_get_deviceid(struct nfs4_deviceid *id) +{ + struct nfs4_file_layout_dsaddr *d; + struct hlist_node *n; + long hash = nfs4_fl_deviceid_hash(id); + + + rcu_read_lock(); + hlist_for_each_entry_rcu(d, n, &filelayout_deviceid_cache[hash], node) { + if (!memcmp(&d->deviceid, id, sizeof(*id))) { + if (!atomic_inc_not_zero(&d->ref)) + goto fail; + rcu_read_unlock(); + return d; + } + } +fail: + rcu_read_unlock(); + return NULL; } /* diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 86c154bad5db..1f4c153441a1 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -75,10 +75,8 @@ find_pnfs_driver(u32 id) void unset_pnfs_layoutdriver(struct nfs_server *nfss) { - if (nfss->pnfs_curr_ld) { - nfss->pnfs_curr_ld->clear_layoutdriver(nfss); + if (nfss->pnfs_curr_ld) module_put(nfss->pnfs_curr_ld->owner); - } nfss->pnfs_curr_ld = NULL; } @@ -116,13 +114,7 @@ set_pnfs_layoutdriver(struct nfs_server *server, u32 id) goto out_no_driver; } server->pnfs_curr_ld = ld_type; - if (ld_type->set_layoutdriver(server)) { - printk(KERN_ERR - "%s: Error initializing mount point for layout driver %u.\n", - __func__, id); - module_put(ld_type->owner); - goto out_no_driver; - } + dprintk("%s: pNFS module for %u set\n", __func__, id); return; @@ -906,138 +898,3 @@ pnfs_try_to_read_data(struct nfs_read_data *rdata, dprintk("%s End (trypnfs:%d)\n", __func__, trypnfs); return trypnfs; } - -/* - * Device ID cache. Currently supports one layout type per struct nfs_client. - * Add layout type to the lookup key to expand to support multiple types. - */ -int -pnfs_alloc_init_deviceid_cache(struct nfs_client *clp, - void (*free_callback)(struct pnfs_deviceid_node *)) -{ - struct pnfs_deviceid_cache *c; - - c = kzalloc(sizeof(struct pnfs_deviceid_cache), GFP_KERNEL); - if (!c) - return -ENOMEM; - spin_lock(&clp->cl_lock); - if (clp->cl_devid_cache != NULL) { - atomic_inc(&clp->cl_devid_cache->dc_ref); - dprintk("%s [kref [%d]]\n", __func__, - atomic_read(&clp->cl_devid_cache->dc_ref)); - kfree(c); - } else { - /* kzalloc initializes hlists */ - spin_lock_init(&c->dc_lock); - atomic_set(&c->dc_ref, 1); - c->dc_free_callback = free_callback; - clp->cl_devid_cache = c; - dprintk("%s [new]\n", __func__); - } - spin_unlock(&clp->cl_lock); - return 0; -} -EXPORT_SYMBOL_GPL(pnfs_alloc_init_deviceid_cache); - -/* - * Called from pnfs_layoutdriver_type->free_lseg - * last layout segment reference frees deviceid - */ -void -pnfs_put_deviceid(struct pnfs_deviceid_cache *c, - struct pnfs_deviceid_node *devid) -{ - struct nfs4_deviceid *id = &devid->de_id; - struct pnfs_deviceid_node *d; - struct hlist_node *n; - long h = nfs4_deviceid_hash(id); - - dprintk("%s [%d]\n", __func__, atomic_read(&devid->de_ref)); - if (!atomic_dec_and_lock(&devid->de_ref, &c->dc_lock)) - return; - - hlist_for_each_entry_rcu(d, n, &c->dc_deviceids[h], de_node) - if (!memcmp(&d->de_id, id, sizeof(*id))) { - hlist_del_rcu(&d->de_node); - spin_unlock(&c->dc_lock); - synchronize_rcu(); - c->dc_free_callback(devid); - return; - } - spin_unlock(&c->dc_lock); - /* Why wasn't it found in the list? */ - BUG(); -} -EXPORT_SYMBOL_GPL(pnfs_put_deviceid); - -/* Find and reference a deviceid */ -struct pnfs_deviceid_node * -pnfs_find_get_deviceid(struct pnfs_deviceid_cache *c, struct nfs4_deviceid *id) -{ - struct pnfs_deviceid_node *d; - struct hlist_node *n; - long hash = nfs4_deviceid_hash(id); - - dprintk("--> %s hash %ld\n", __func__, hash); - rcu_read_lock(); - hlist_for_each_entry_rcu(d, n, &c->dc_deviceids[hash], de_node) { - if (!memcmp(&d->de_id, id, sizeof(*id))) { - if (!atomic_inc_not_zero(&d->de_ref)) { - goto fail; - } else { - rcu_read_unlock(); - return d; - } - } - } -fail: - rcu_read_unlock(); - return NULL; -} -EXPORT_SYMBOL_GPL(pnfs_find_get_deviceid); - -/* - * Add a deviceid to the cache. - * GETDEVICEINFOs for same deviceid can race. If deviceid is found, discard new - */ -struct pnfs_deviceid_node * -pnfs_add_deviceid(struct pnfs_deviceid_cache *c, struct pnfs_deviceid_node *new) -{ - struct pnfs_deviceid_node *d; - long hash = nfs4_deviceid_hash(&new->de_id); - - dprintk("--> %s hash %ld\n", __func__, hash); - spin_lock(&c->dc_lock); - d = pnfs_find_get_deviceid(c, &new->de_id); - if (d) { - spin_unlock(&c->dc_lock); - dprintk("%s [discard]\n", __func__); - c->dc_free_callback(new); - return d; - } - INIT_HLIST_NODE(&new->de_node); - atomic_set(&new->de_ref, 1); - hlist_add_head_rcu(&new->de_node, &c->dc_deviceids[hash]); - spin_unlock(&c->dc_lock); - dprintk("%s [new]\n", __func__); - return new; -} -EXPORT_SYMBOL_GPL(pnfs_add_deviceid); - -void -pnfs_put_deviceid_cache(struct nfs_client *clp) -{ - struct pnfs_deviceid_cache *local = clp->cl_devid_cache; - - dprintk("--> %s ({%d})\n", __func__, atomic_read(&local->dc_ref)); - if (atomic_dec_and_lock(&local->dc_ref, &clp->cl_lock)) { - int i; - /* Verify cache is empty */ - for (i = 0; i < NFS4_DEVICE_ID_HASH_SIZE; i++) - BUG_ON(!hlist_empty(&local->dc_deviceids[i])); - clp->cl_devid_cache = NULL; - spin_unlock(&clp->cl_lock); - kfree(local); - } -} -EXPORT_SYMBOL_GPL(pnfs_put_deviceid_cache); diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index 585023fabb55..acbb77802075 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -68,8 +68,6 @@ struct pnfs_layoutdriver_type { const u32 id; const char *name; struct module *owner; - int (*set_layoutdriver) (struct nfs_server *); - int (*clear_layoutdriver) (struct nfs_server *); struct pnfs_layout_segment * (*alloc_lseg) (struct pnfs_layout_hdr *layoutid, struct nfs4_layoutget_res *lgr); void (*free_lseg) (struct pnfs_layout_segment *lseg); @@ -106,52 +104,6 @@ struct pnfs_device { unsigned int pglen; }; -/* - * Device ID RCU cache. A device ID is unique per client ID and layout type. - */ -#define NFS4_DEVICE_ID_HASH_BITS 5 -#define NFS4_DEVICE_ID_HASH_SIZE (1 << NFS4_DEVICE_ID_HASH_BITS) -#define NFS4_DEVICE_ID_HASH_MASK (NFS4_DEVICE_ID_HASH_SIZE - 1) - -static inline u32 -nfs4_deviceid_hash(struct nfs4_deviceid *id) -{ - unsigned char *cptr = (unsigned char *)id->data; - unsigned int nbytes = NFS4_DEVICEID4_SIZE; - u32 x = 0; - - while (nbytes--) { - x *= 37; - x += *cptr++; - } - return x & NFS4_DEVICE_ID_HASH_MASK; -} - -struct pnfs_deviceid_node { - struct hlist_node de_node; - struct nfs4_deviceid de_id; - atomic_t de_ref; -}; - -struct pnfs_deviceid_cache { - spinlock_t dc_lock; - atomic_t dc_ref; - void (*dc_free_callback)(struct pnfs_deviceid_node *); - struct hlist_head dc_deviceids[NFS4_DEVICE_ID_HASH_SIZE]; -}; - -extern int pnfs_alloc_init_deviceid_cache(struct nfs_client *, - void (*free_callback)(struct pnfs_deviceid_node *)); -extern void pnfs_put_deviceid_cache(struct nfs_client *); -extern struct pnfs_deviceid_node *pnfs_find_get_deviceid( - struct pnfs_deviceid_cache *, - struct nfs4_deviceid *); -extern struct pnfs_deviceid_node *pnfs_add_deviceid( - struct pnfs_deviceid_cache *, - struct pnfs_deviceid_node *); -extern void pnfs_put_deviceid(struct pnfs_deviceid_cache *c, - struct pnfs_deviceid_node *devid); - extern int pnfs_register_layoutdriver(struct pnfs_layoutdriver_type *); extern void pnfs_unregister_layoutdriver(struct pnfs_layoutdriver_type *); diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h index c00d4ec47ec3..0cbf109a4056 100644 --- a/include/linux/nfs_fs_sb.h +++ b/include/linux/nfs_fs_sb.h @@ -77,7 +77,6 @@ struct nfs_client { u32 cl_exchange_flags; struct nfs4_session *cl_session; /* sharred session */ struct list_head cl_layouts; - struct pnfs_deviceid_cache *cl_devid_cache; /* pNFS deviceid cache */ #endif /* CONFIG_NFS_V4 */ #ifdef CONFIG_NFS_FSCACHE From 568e8c494ded95a28c5dd8b79b4d3ffb95b6d845 Mon Sep 17 00:00:00 2001 From: Andy Adamson Date: Tue, 1 Mar 2011 01:34:22 +0000 Subject: [PATCH 24/54] NFSv4.1: turn off pNFS on ds connection failure If a data server is unavailable, go through MDS. Mark the deviceid containing the data server as a negative cache entry. Do not try to connect to any data server on a deviceid marked as a negative cache entry. Mark any layout that tries to use the marked deviceid as failed. Inodes with a layout marked as fails will not use the layout for I/O, and will not perform any more layoutgets. Inodes without a layout will still do layoutget, but the layout will get marked immediately. Signed-off-by: Andy Adamson Signed-off-by: Trond Myklebust --- fs/nfs/nfs4filelayout.c | 4 +++- fs/nfs/nfs4filelayout.h | 4 ++++ fs/nfs/nfs4filelayoutdev.c | 28 ++++++++++++++++++++++++---- fs/nfs/pnfs.c | 9 +++++---- 4 files changed, 36 insertions(+), 9 deletions(-) diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c index a922e75af42e..0040a5ee6208 100644 --- a/fs/nfs/nfs4filelayout.c +++ b/fs/nfs/nfs4filelayout.c @@ -214,7 +214,9 @@ filelayout_read_pagelist(struct nfs_read_data *data) idx = nfs4_fl_calc_ds_index(lseg, j); ds = nfs4_fl_prepare_ds(lseg, idx); if (!ds) { - printk(KERN_ERR "%s: prepare_ds failed, use MDS\n", __func__); + /* Either layout fh index faulty, or ds connect failed */ + set_bit(lo_fail_bit(IOMODE_RW), &lseg->pls_layout->plh_flags); + set_bit(lo_fail_bit(IOMODE_READ), &lseg->pls_layout->plh_flags); return PNFS_NOT_ATTEMPTED; } dprintk("%s USE DS:ip %x %hu\n", __func__, diff --git a/fs/nfs/nfs4filelayout.h b/fs/nfs/nfs4filelayout.h index 23f1e1e2a0f5..ee0c907742b5 100644 --- a/fs/nfs/nfs4filelayout.h +++ b/fs/nfs/nfs4filelayout.h @@ -55,10 +55,14 @@ struct nfs4_pnfs_ds { atomic_t ds_count; }; +/* nfs4_file_layout_dsaddr flags */ +#define NFS4_DEVICE_ID_NEG_ENTRY 0x00000001 + struct nfs4_file_layout_dsaddr { struct hlist_node node; struct nfs4_deviceid deviceid; atomic_t ref; + unsigned long flags; u32 stripe_count; u8 *stripe_indices; u32 ds_num; diff --git a/fs/nfs/nfs4filelayoutdev.c b/fs/nfs/nfs4filelayoutdev.c index f594ca35a996..68143c162e3b 100644 --- a/fs/nfs/nfs4filelayoutdev.c +++ b/fs/nfs/nfs4filelayoutdev.c @@ -606,6 +606,21 @@ nfs4_fl_select_ds_fh(struct pnfs_layout_segment *lseg, u32 j) return flseg->fh_array[i]; } +static void +filelayout_mark_devid_negative(struct nfs4_file_layout_dsaddr *dsaddr, + int err, u32 ds_addr) +{ + u32 *p = (u32 *)&dsaddr->deviceid; + + printk(KERN_ERR "NFS: data server %x connection error %d." + " Deviceid [%x%x%x%x] marked out of use.\n", + ds_addr, err, p[0], p[1], p[2], p[3]); + + spin_lock(&filelayout_deviceid_lock); + dsaddr->flags |= NFS4_DEVICE_ID_NEG_ENTRY; + spin_unlock(&filelayout_deviceid_lock); +} + struct nfs4_pnfs_ds * nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx) { @@ -619,13 +634,18 @@ nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx) } if (!ds->ds_clp) { + struct nfs_server *s = NFS_SERVER(lseg->pls_layout->plh_inode); int err; - err = nfs4_ds_connect(NFS_SERVER(lseg->pls_layout->plh_inode), - dsaddr->ds_list[ds_idx]); + if (dsaddr->flags & NFS4_DEVICE_ID_NEG_ENTRY) { + /* Already tried to connect, don't try again */ + dprintk("%s Deviceid marked out of use\n", __func__); + return NULL; + } + err = nfs4_ds_connect(s, ds); if (err) { - printk(KERN_ERR "%s nfs4_ds_connect error %d\n", - __func__, err); + filelayout_mark_devid_negative(dsaddr, err, + ntohl(ds->ds_ip_addr)); return NULL; } } diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 1f4c153441a1..3e545144a0b2 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -739,15 +739,16 @@ pnfs_update_layout(struct inode *ino, dprintk("%s matches recall, use MDS\n", __func__); goto out_unlock; } - /* Check to see if the layout for the given range already exists */ - lseg = pnfs_find_lseg(lo, iomode); - if (lseg) - goto out_unlock; /* if LAYOUTGET already failed once we don't try again */ if (test_bit(lo_fail_bit(iomode), &nfsi->layout->plh_flags)) goto out_unlock; + /* Check to see if the layout for the given range already exists */ + lseg = pnfs_find_lseg(lo, iomode); + if (lseg) + goto out_unlock; + if (pnfs_layoutgets_blocked(lo, NULL, 0)) goto out_unlock; atomic_inc(&lo->plh_outstanding); From 80fe2b192dbc53261e385dc26d90f5195f1c62e7 Mon Sep 17 00:00:00 2001 From: Fred Isaman Date: Tue, 1 Mar 2011 01:34:23 +0000 Subject: [PATCH 25/54] NFSv4.1: lseg documentation Signed-off-by: Fred Isaman Signed-off-by: Trond Myklebust --- Documentation/filesystems/nfs/pnfs.txt | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/Documentation/filesystems/nfs/pnfs.txt b/Documentation/filesystems/nfs/pnfs.txt index bc0b9cfe095b..983e14abe7e9 100644 --- a/Documentation/filesystems/nfs/pnfs.txt +++ b/Documentation/filesystems/nfs/pnfs.txt @@ -46,3 +46,10 @@ data server cache file driver devices refer to data servers, which are kept in a module level cache. Its reference is held over the lifetime of the deviceid pointing to it. + +lseg +---- +lseg maintains an extra reference corresponding to the NFS_LSEG_VALID +bit which holds it in the pnfs_layout_hdr's list. When the final lseg +is removed from the pnfs_layout_hdr's list, the NFS_LAYOUT_DESTROYED +bit is set, preventing any new lsegs from being added. From d138d5d17be6a60d883e8bd4e22bc218d3adfab3 Mon Sep 17 00:00:00 2001 From: Andy Adamson Date: Thu, 3 Mar 2011 15:13:41 +0000 Subject: [PATCH 26/54] NFSv4.1: rearrange nfs_write_rpcsetup Reorder nfs_write_rpcsetup, preparing for a pnfs entry point. Signed-off-by: Andy Adamson Signed-off-by: Fred Isaman Signed-off-by: Trond Myklebust --- fs/nfs/write.c | 82 ++++++++++++++++++++++++++++---------------------- 1 file changed, 46 insertions(+), 36 deletions(-) diff --git a/fs/nfs/write.c b/fs/nfs/write.c index f033fa0d7d33..ae035990941a 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -782,25 +782,21 @@ static int flush_task_priority(int how) return RPC_PRIORITY_NORMAL; } -/* - * Set up the argument/result storage required for the RPC call. - */ -static int nfs_write_rpcsetup(struct nfs_page *req, - struct nfs_write_data *data, - const struct rpc_call_ops *call_ops, - unsigned int count, unsigned int offset, - int how) +static int nfs_initiate_write(struct nfs_write_data *data, + struct rpc_clnt *clnt, + const struct rpc_call_ops *call_ops, + int how) { - struct inode *inode = req->wb_context->path.dentry->d_inode; + struct inode *inode = data->inode; int priority = flush_task_priority(how); struct rpc_task *task; struct rpc_message msg = { .rpc_argp = &data->args, .rpc_resp = &data->res, - .rpc_cred = req->wb_context->cred, + .rpc_cred = data->cred, }; struct rpc_task_setup task_setup_data = { - .rpc_client = NFS_CLIENT(inode), + .rpc_client = clnt, .task = &data->task, .rpc_message = &msg, .callback_ops = call_ops, @@ -811,12 +807,49 @@ static int nfs_write_rpcsetup(struct nfs_page *req, }; int ret = 0; + /* Set up the initial task struct. */ + NFS_PROTO(inode)->write_setup(data, &msg); + + dprintk("NFS: %5u initiated write call " + "(req %s/%lld, %u bytes @ offset %llu)\n", + data->task.tk_pid, + inode->i_sb->s_id, + (long long)NFS_FILEID(inode), + data->args.count, + (unsigned long long)data->args.offset); + + task = rpc_run_task(&task_setup_data); + if (IS_ERR(task)) { + ret = PTR_ERR(task); + goto out; + } + if (how & FLUSH_SYNC) { + ret = rpc_wait_for_completion_task(task); + if (ret == 0) + ret = task->tk_status; + } + rpc_put_task(task); +out: + return ret; +} + +/* + * Set up the argument/result storage required for the RPC call. + */ +static int nfs_write_rpcsetup(struct nfs_page *req, + struct nfs_write_data *data, + const struct rpc_call_ops *call_ops, + unsigned int count, unsigned int offset, + int how) +{ + struct inode *inode = req->wb_context->path.dentry->d_inode; + /* Set up the RPC argument and reply structs * NB: take care not to mess about with data->commit et al. */ data->req = req; data->inode = inode = req->wb_context->path.dentry->d_inode; - data->cred = msg.rpc_cred; + data->cred = req->wb_context->cred; data->args.fh = NFS_FH(inode); data->args.offset = req_offset(req) + offset; @@ -837,30 +870,7 @@ static int nfs_write_rpcsetup(struct nfs_page *req, data->res.verf = &data->verf; nfs_fattr_init(&data->fattr); - /* Set up the initial task struct. */ - NFS_PROTO(inode)->write_setup(data, &msg); - - dprintk("NFS: %5u initiated write call " - "(req %s/%lld, %u bytes @ offset %llu)\n", - data->task.tk_pid, - inode->i_sb->s_id, - (long long)NFS_FILEID(inode), - count, - (unsigned long long)data->args.offset); - - task = rpc_run_task(&task_setup_data); - if (IS_ERR(task)) { - ret = PTR_ERR(task); - goto out; - } - if (how & FLUSH_SYNC) { - ret = rpc_wait_for_completion_task(task); - if (ret == 0) - ret = task->tk_status; - } - rpc_put_task(task); -out: - return ret; + return nfs_initiate_write(data, NFS_CLIENT(inode), call_ops, how); } /* If a nfs_flush_* function fails, it should remove reqs from @head and From b029bc9b0880cbaf999f580c0ea8f06dd274fc77 Mon Sep 17 00:00:00 2001 From: Fred Isaman Date: Thu, 3 Mar 2011 15:13:42 +0000 Subject: [PATCH 27/54] NFSv4.1: add callback to nfs4_write_done Add callback that pnfs layout driver can use to do its own handling of data server WRITE response. Signed-off-by: Fred Isaman Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 14 ++++++++++---- include/linux/nfs_xdr.h | 1 + 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 1dc809039448..15248549c89f 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -3123,13 +3123,10 @@ void nfs4_reset_read(struct rpc_task *task, struct nfs_read_data *data) } EXPORT_SYMBOL_GPL(nfs4_reset_read); -static int nfs4_write_done(struct rpc_task *task, struct nfs_write_data *data) +static int nfs4_write_done_cb(struct rpc_task *task, struct nfs_write_data *data) { struct inode *inode = data->inode; - if (!nfs4_sequence_done(task, &data->res.seq_res)) - return -EAGAIN; - if (nfs4_async_handle_error(task, NFS_SERVER(inode), data->args.context->state) == -EAGAIN) { nfs_restart_rpc(task, NFS_SERVER(inode)->nfs_client); return -EAGAIN; @@ -3141,11 +3138,20 @@ static int nfs4_write_done(struct rpc_task *task, struct nfs_write_data *data) return 0; } +static int nfs4_write_done(struct rpc_task *task, struct nfs_write_data *data) +{ + if (!nfs4_sequence_done(task, &data->res.seq_res)) + return -EAGAIN; + return data->write_done_cb(task, data); +} + static void nfs4_proc_write_setup(struct nfs_write_data *data, struct rpc_message *msg) { struct nfs_server *server = NFS_SERVER(data->inode); data->args.bitmask = server->cache_consistency_bitmask; + if (!data->write_done_cb) + data->write_done_cb = nfs4_write_done_cb; data->res.server = server; data->timestamp = jiffies; diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index eb0e87084353..21cd41ddbca6 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -1038,6 +1038,7 @@ struct nfs_write_data { unsigned int npages; /* Max length of pagevec */ struct nfs_writeargs args; /* argument struct */ struct nfs_writeres res; /* result struct */ + int (*write_done_cb) (struct rpc_task *task, struct nfs_write_data *data); #ifdef CONFIG_NFS_V4 unsigned long timestamp; /* For lease renewal */ #endif From 5053aa568d4017aeb1fa35247d4ad96be262920f Mon Sep 17 00:00:00 2001 From: Fred Isaman Date: Thu, 3 Mar 2011 15:13:43 +0000 Subject: [PATCH 28/54] NFSv4.1: Send lseg down into nfs_write_rpcsetup We grab the lseg sent in from the doio function and attach it to each struct nfs_write_data created. This is how the lseg will be sent to the layout driver. Signed-off-by: Fred Isaman Signed-off-by: Trond Myklebust --- fs/nfs/write.c | 7 +++++-- include/linux/nfs_xdr.h | 1 + 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/fs/nfs/write.c b/fs/nfs/write.c index ae035990941a..72b0ec0bb0e1 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -97,6 +97,7 @@ void nfs_writedata_free(struct nfs_write_data *p) static void nfs_writedata_release(struct nfs_write_data *wdata) { + put_lseg(wdata->lseg); put_nfs_open_context(wdata->args.context); nfs_writedata_free(wdata); } @@ -840,6 +841,7 @@ static int nfs_write_rpcsetup(struct nfs_page *req, struct nfs_write_data *data, const struct rpc_call_ops *call_ops, unsigned int count, unsigned int offset, + struct pnfs_layout_segment *lseg, int how) { struct inode *inode = req->wb_context->path.dentry->d_inode; @@ -850,6 +852,7 @@ static int nfs_write_rpcsetup(struct nfs_page *req, data->req = req; data->inode = inode = req->wb_context->path.dentry->d_inode; data->cred = req->wb_context->cred; + data->lseg = get_lseg(lseg); data->args.fh = NFS_FH(inode); data->args.offset = req_offset(req) + offset; @@ -930,7 +933,7 @@ static int nfs_flush_multi(struct inode *inode, struct list_head *head, unsigned if (nbytes < wsize) wsize = nbytes; ret2 = nfs_write_rpcsetup(req, data, &nfs_write_partial_ops, - wsize, offset, how); + wsize, offset, lseg, how); if (ret == 0) ret = ret2; offset += wsize; @@ -978,7 +981,7 @@ static int nfs_flush_one(struct inode *inode, struct list_head *head, unsigned i req = nfs_list_entry(data->pages.next); /* Set up the argument struct */ - return nfs_write_rpcsetup(req, data, &nfs_write_full_ops, count, 0, how); + return nfs_write_rpcsetup(req, data, &nfs_write_full_ops, count, 0, lseg, how); out_bad: while (!list_empty(head)) { req = nfs_list_entry(head->next); diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 21cd41ddbca6..09d96812d6d0 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -1038,6 +1038,7 @@ struct nfs_write_data { unsigned int npages; /* Max length of pagevec */ struct nfs_writeargs args; /* argument struct */ struct nfs_writeres res; /* result struct */ + struct pnfs_layout_segment *lseg; int (*write_done_cb) (struct rpc_task *task, struct nfs_write_data *data); #ifdef CONFIG_NFS_V4 unsigned long timestamp; /* For lease renewal */ From 44b83799a922a153957c65ccfc985a8c902958c8 Mon Sep 17 00:00:00 2001 From: Fred Isaman Date: Thu, 3 Mar 2011 15:13:44 +0000 Subject: [PATCH 29/54] NFSv4.1: trigger LAYOUTGET for writes Signed-off-by: Fred Isaman Signed-off-by: Trond Myklebust --- fs/nfs/pnfs.c | 22 ++++++++++++++++++++++ fs/nfs/pnfs.h | 7 +++++++ fs/nfs/write.c | 32 ++++++++++++++++++++------------ 3 files changed, 49 insertions(+), 12 deletions(-) diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 3e545144a0b2..5f205d31d96c 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -873,6 +873,28 @@ pnfs_pageio_init_read(struct nfs_pageio_descriptor *pgio, struct inode *inode) pgio->pg_test = (ld && ld->pg_test) ? pnfs_read_pg_test : NULL; } +static int pnfs_write_pg_test(struct nfs_pageio_descriptor *pgio, + struct nfs_page *prev, + struct nfs_page *req) +{ + if (pgio->pg_count == prev->wb_bytes) { + /* This is first coelesce call for a series of nfs_pages */ + pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode, + prev->wb_context, + IOMODE_RW); + } + return NFS_SERVER(pgio->pg_inode)->pnfs_curr_ld->pg_test(pgio, prev, req); +} + +void +pnfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, struct inode *inode) +{ + struct pnfs_layoutdriver_type *ld; + + ld = NFS_SERVER(inode)->pnfs_curr_ld; + pgio->pg_test = (ld && ld->pg_test) ? pnfs_write_pg_test : NULL; +} + /* * Call the appropriate parallel I/O subsystem read function. */ diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index acbb77802075..1d4e6317fa95 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -123,6 +123,7 @@ void unset_pnfs_layoutdriver(struct nfs_server *); enum pnfs_try_status pnfs_try_to_read_data(struct nfs_read_data *, const struct rpc_call_ops *); void pnfs_pageio_init_read(struct nfs_pageio_descriptor *, struct inode *); +void pnfs_pageio_init_write(struct nfs_pageio_descriptor *, struct inode *); int pnfs_layout_process(struct nfs4_layoutget *lgp); void pnfs_free_lseg_list(struct list_head *tmp_list); void pnfs_destroy_layout(struct nfs_inode *); @@ -235,6 +236,12 @@ pnfs_pageio_init_read(struct nfs_pageio_descriptor *pgio, struct inode *ino) pgio->pg_test = NULL; } +static inline void +pnfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, struct inode *ino) +{ + pgio->pg_test = NULL; +} + #endif /* CONFIG_NFS_V4_1 */ #endif /* FS_NFS_PNFS_H */ diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 72b0ec0bb0e1..49c4784c24e5 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -919,6 +919,8 @@ static int nfs_flush_multi(struct inode *inode, struct list_head *head, unsigned } while (nbytes != 0); atomic_set(&req->wb_complete, requests); + BUG_ON(lseg); + lseg = pnfs_update_layout(inode, req->wb_context, IOMODE_RW); ClearPageError(page); offset = 0; nbytes = count; @@ -940,6 +942,7 @@ static int nfs_flush_multi(struct inode *inode, struct list_head *head, unsigned nbytes -= wsize; } while (nbytes != 0); + put_lseg(lseg); return ret; out_bad: @@ -965,11 +968,18 @@ static int nfs_flush_one(struct inode *inode, struct list_head *head, unsigned i struct nfs_page *req; struct page **pages; struct nfs_write_data *data; + int ret; data = nfs_writedata_alloc(npages); - if (!data) - goto out_bad; - + if (!data) { + while (!list_empty(head)) { + req = nfs_list_entry(head->next); + nfs_list_remove_request(req); + nfs_redirty_request(req); + } + ret = -ENOMEM; + goto out; + } pages = data->pagevec; while (!list_empty(head)) { req = nfs_list_entry(head->next); @@ -979,16 +989,14 @@ static int nfs_flush_one(struct inode *inode, struct list_head *head, unsigned i *pages++ = req->wb_page; } req = nfs_list_entry(data->pages.next); + if ((!lseg) && list_is_singular(&data->pages)) + lseg = pnfs_update_layout(inode, req->wb_context, IOMODE_RW); /* Set up the argument struct */ - return nfs_write_rpcsetup(req, data, &nfs_write_full_ops, count, 0, lseg, how); - out_bad: - while (!list_empty(head)) { - req = nfs_list_entry(head->next); - nfs_list_remove_request(req); - nfs_redirty_request(req); - } - return -ENOMEM; + ret = nfs_write_rpcsetup(req, data, &nfs_write_full_ops, count, 0, lseg, how); +out: + put_lseg(lseg); /* Cleans any gotten in ->pg_test */ + return ret; } static void nfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, @@ -996,7 +1004,7 @@ static void nfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, { size_t wsize = NFS_SERVER(inode)->wsize; - pgio->pg_test = NULL; + pnfs_pageio_init_write(pgio, inode); if (wsize < PAGE_CACHE_SIZE) nfs_pageio_init(pgio, inode, nfs_flush_multi, wsize, ioflags); From 0382b74409c6b9ef12c952b50bb44f557a361a43 Mon Sep 17 00:00:00 2001 From: Andy Adamson Date: Thu, 3 Mar 2011 15:13:45 +0000 Subject: [PATCH 30/54] NFSv4.1: implement generic pnfs layer write switch Signed-off-by: Andy Adamson Signed-off-by: Boaz Harrosh Signed-off-by: Dean Hildebrand Signed-off-by: Fred Isaman Signed-off-by: J. Bruce Fields Signed-off-by: Mike Sager Signed-off-by: Ricardo Labiaga Signed-off-by: Tao Guo Signed-off-by: Andy Adamson Signed-off-by: Benny Halevy Signed-off-by: Fred Isaman Signed-off-by: Trond Myklebust --- fs/nfs/nfs4filelayout.c | 7 +++++++ fs/nfs/pnfs.c | 24 ++++++++++++++++++++++++ fs/nfs/pnfs.h | 10 ++++++++++ fs/nfs/write.c | 4 ++++ include/linux/nfs_iostat.h | 1 + include/linux/nfs_xdr.h | 1 + 6 files changed, 47 insertions(+) diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c index 0040a5ee6208..9d21bfeec88f 100644 --- a/fs/nfs/nfs4filelayout.c +++ b/fs/nfs/nfs4filelayout.c @@ -238,6 +238,12 @@ filelayout_read_pagelist(struct nfs_read_data *data) return PNFS_ATTEMPTED; } +static enum pnfs_try_status +filelayout_write_pagelist(struct nfs_write_data *data, int sync) +{ + return PNFS_NOT_ATTEMPTED; +} + /* * filelayout_check_layout() * @@ -455,6 +461,7 @@ static struct pnfs_layoutdriver_type filelayout_type = { .free_lseg = filelayout_free_lseg, .pg_test = filelayout_pg_test, .read_pagelist = filelayout_read_pagelist, + .write_pagelist = filelayout_write_pagelist, }; static int __init nfs4filelayout_init(void) diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 5f205d31d96c..f38813a0a295 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -895,6 +895,30 @@ pnfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, struct inode *inode) pgio->pg_test = (ld && ld->pg_test) ? pnfs_write_pg_test : NULL; } +enum pnfs_try_status +pnfs_try_to_write_data(struct nfs_write_data *wdata, + const struct rpc_call_ops *call_ops, int how) +{ + struct inode *inode = wdata->inode; + enum pnfs_try_status trypnfs; + struct nfs_server *nfss = NFS_SERVER(inode); + + wdata->mds_ops = call_ops; + + dprintk("%s: Writing ino:%lu %u@%llu (how %d)\n", __func__, + inode->i_ino, wdata->args.count, wdata->args.offset, how); + + trypnfs = nfss->pnfs_curr_ld->write_pagelist(wdata, how); + if (trypnfs == PNFS_NOT_ATTEMPTED) { + put_lseg(wdata->lseg); + wdata->lseg = NULL; + } else + nfs_inc_stats(inode, NFSIOS_PNFS_WRITE); + + dprintk("%s End (trypnfs:%d)\n", __func__, trypnfs); + return trypnfs; +} + /* * Call the appropriate parallel I/O subsystem read function. */ diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index 1d4e6317fa95..6380b9405bcd 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -79,6 +79,7 @@ struct pnfs_layoutdriver_type { * I/O, else return PNFS_NOT_ATTEMPTED to fall back to normal NFS */ enum pnfs_try_status (*read_pagelist) (struct nfs_read_data *nfs_data); + enum pnfs_try_status (*write_pagelist) (struct nfs_write_data *nfs_data, int how); }; struct pnfs_layout_hdr { @@ -120,6 +121,8 @@ pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx, enum pnfs_iomode access_type); void set_pnfs_layoutdriver(struct nfs_server *, u32 id); void unset_pnfs_layoutdriver(struct nfs_server *); +enum pnfs_try_status pnfs_try_to_write_data(struct nfs_write_data *, + const struct rpc_call_ops *, int); enum pnfs_try_status pnfs_try_to_read_data(struct nfs_read_data *, const struct rpc_call_ops *); void pnfs_pageio_init_read(struct nfs_pageio_descriptor *, struct inode *); @@ -200,6 +203,13 @@ pnfs_try_to_read_data(struct nfs_read_data *data, return PNFS_NOT_ATTEMPTED; } +static inline enum pnfs_try_status +pnfs_try_to_write_data(struct nfs_write_data *data, + const struct rpc_call_ops *call_ops, int how) +{ + return PNFS_NOT_ATTEMPTED; +} + static inline bool pnfs_roc(struct inode *ino) { diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 49c4784c24e5..df99c5b0ee65 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -873,6 +873,10 @@ static int nfs_write_rpcsetup(struct nfs_page *req, data->res.verf = &data->verf; nfs_fattr_init(&data->fattr); + if (data->lseg && + (pnfs_try_to_write_data(data, call_ops, how) == PNFS_ATTEMPTED)) + return 0; + return nfs_initiate_write(data, NFS_CLIENT(inode), call_ops, how); } diff --git a/include/linux/nfs_iostat.h b/include/linux/nfs_iostat.h index 37a143732d02..8866bb3502ee 100644 --- a/include/linux/nfs_iostat.h +++ b/include/linux/nfs_iostat.h @@ -114,6 +114,7 @@ enum nfs_stat_eventcounters { NFSIOS_SHORTWRITE, NFSIOS_DELAY, NFSIOS_PNFS_READ, + NFSIOS_PNFS_WRITE, __NFSIOS_COUNTSMAX, }; diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 09d96812d6d0..c82ad33cffe1 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -1039,6 +1039,7 @@ struct nfs_write_data { struct nfs_writeargs args; /* argument struct */ struct nfs_writeres res; /* result struct */ struct pnfs_layout_segment *lseg; + const struct rpc_call_ops *mds_ops; int (*write_done_cb) (struct rpc_task *task, struct nfs_write_data *data); #ifdef CONFIG_NFS_V4 unsigned long timestamp; /* For lease renewal */ From 7ffd10640dc008f6d5a375bd6450755745c63c7d Mon Sep 17 00:00:00 2001 From: Fred Isaman Date: Thu, 3 Mar 2011 15:13:46 +0000 Subject: [PATCH 31/54] NFSv4.1: remove GETATTR from ds writes Any WRITE compound directed to a data server needs to have the GETATTR calls suppressed. Signed-off-by: Fred Isaman Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 6 +++++- fs/nfs/nfs4xdr.c | 8 +++++--- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 15248549c89f..da902123ec53 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -3149,7 +3149,11 @@ static void nfs4_proc_write_setup(struct nfs_write_data *data, struct rpc_messag { struct nfs_server *server = NFS_SERVER(data->inode); - data->args.bitmask = server->cache_consistency_bitmask; + if (data->lseg) { + data->args.bitmask = NULL; + data->res.fattr = NULL; + } else + data->args.bitmask = server->cache_consistency_bitmask; if (!data->write_done_cb) data->write_done_cb = nfs4_write_done_cb; data->res.server = server; diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index a656b6e179b0..0f2dcfb41f29 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -2275,7 +2275,8 @@ static void nfs4_xdr_enc_write(struct rpc_rqst *req, struct xdr_stream *xdr, encode_putfh(xdr, args->fh, &hdr); encode_write(xdr, args, &hdr); req->rq_snd_buf.flags |= XDRBUF_WRITE; - encode_getfattr(xdr, args->bitmask, &hdr); + if (args->bitmask) + encode_getfattr(xdr, args->bitmask, &hdr); encode_nops(&hdr); } @@ -5694,8 +5695,9 @@ static int nfs4_xdr_dec_write(struct rpc_rqst *rqstp, struct xdr_stream *xdr, status = decode_write(xdr, res); if (status) goto out; - decode_getfattr(xdr, res->fattr, res->server, - !RPC_IS_ASYNC(rqstp->rq_task)); + if (res->fattr) + decode_getfattr(xdr, res->fattr, res->server, + !RPC_IS_ASYNC(rqstp->rq_task)); if (!status) status = res->count; out: From a69aef1496726ed88386dad65abfcc8cd3195304 Mon Sep 17 00:00:00 2001 From: Fred Isaman Date: Thu, 3 Mar 2011 15:13:47 +0000 Subject: [PATCH 32/54] NFSv4.1: pnfs filelayout driver write Allows the pnfs filelayout driver to write to the data servers. Note that COMMIT to data servers will be implemented in a future patch. To avoid improper behavior, for the moment any WRITE to a data server that would also require a COMMIT to the data server is sent NFS_FILE_SYNC. Signed-off-by: Andy Adamson Signed-off-by: Dean Hildebrand Signed-off-by: Fred Isaman Signed-off-by: Mingyang Guo Signed-off-by: Oleg Drokin Signed-off-by: Ricardo Labiaga Signed-off-by: Andy Adamson Signed-off-by: Benny Halevy Signed-off-by: Fred Isaman Signed-off-by: Trond Myklebust --- fs/nfs/internal.h | 5 ++ fs/nfs/nfs4filelayout.c | 101 +++++++++++++++++++++++++++++++++++++++- fs/nfs/nfs4proc.c | 17 +++++++ fs/nfs/write.c | 5 +- include/linux/nfs_xdr.h | 2 + 5 files changed, 128 insertions(+), 2 deletions(-) diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 1a3228e9ea22..d1ddc23c404d 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -276,6 +276,10 @@ extern int nfs_initiate_read(struct nfs_read_data *data, struct rpc_clnt *clnt, extern void nfs_read_prepare(struct rpc_task *task, void *calldata); /* write.c */ +extern int nfs_initiate_write(struct nfs_write_data *data, + struct rpc_clnt *clnt, + const struct rpc_call_ops *call_ops, + int how); extern void nfs_write_prepare(struct rpc_task *task, void *calldata); #ifdef CONFIG_MIGRATION extern int nfs_migrate_page(struct address_space *, @@ -291,6 +295,7 @@ extern int nfs4_init_client(struct nfs_client *clp, const char *ip_addr, rpc_authflavor_t authflavour, int noresvport); +extern void nfs4_reset_write(struct rpc_task *task, struct nfs_write_data *data); extern int _nfs4_call_sync(struct nfs_server *server, struct rpc_message *msg, struct nfs4_sequence_args *args, diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c index 9d21bfeec88f..7e1d4571b7b2 100644 --- a/fs/nfs/nfs4filelayout.c +++ b/fs/nfs/nfs4filelayout.c @@ -189,12 +189,69 @@ static void filelayout_read_release(void *data) rdata->mds_ops->rpc_release(data); } +static int filelayout_write_done_cb(struct rpc_task *task, + struct nfs_write_data *data) +{ + int reset = 0; + + if (filelayout_async_handle_error(task, data->args.context->state, + data->ds_clp, &reset) == -EAGAIN) { + struct nfs_client *clp; + + dprintk("%s calling restart ds_clp %p ds_clp->cl_session %p\n", + __func__, data->ds_clp, data->ds_clp->cl_session); + if (reset) { + filelayout_set_lo_fail(data->lseg); + nfs4_reset_write(task, data); + clp = NFS_SERVER(data->inode)->nfs_client; + } else + clp = data->ds_clp; + nfs_restart_rpc(task, clp); + return -EAGAIN; + } + + return 0; +} + +static void filelayout_write_prepare(struct rpc_task *task, void *data) +{ + struct nfs_write_data *wdata = (struct nfs_write_data *)data; + + if (nfs41_setup_sequence(wdata->ds_clp->cl_session, + &wdata->args.seq_args, &wdata->res.seq_res, + 0, task)) + return; + + rpc_call_start(task); +} + +static void filelayout_write_call_done(struct rpc_task *task, void *data) +{ + struct nfs_write_data *wdata = (struct nfs_write_data *)data; + + /* Note this may cause RPC to be resent */ + wdata->mds_ops->rpc_call_done(task, data); +} + +static void filelayout_write_release(void *data) +{ + struct nfs_write_data *wdata = (struct nfs_write_data *)data; + + wdata->mds_ops->rpc_release(data); +} + struct rpc_call_ops filelayout_read_call_ops = { .rpc_call_prepare = filelayout_read_prepare, .rpc_call_done = filelayout_read_call_done, .rpc_release = filelayout_read_release, }; +struct rpc_call_ops filelayout_write_call_ops = { + .rpc_call_prepare = filelayout_write_prepare, + .rpc_call_done = filelayout_write_call_done, + .rpc_release = filelayout_write_release, +}; + static enum pnfs_try_status filelayout_read_pagelist(struct nfs_read_data *data) { @@ -238,10 +295,52 @@ filelayout_read_pagelist(struct nfs_read_data *data) return PNFS_ATTEMPTED; } +/* Perform async writes. */ static enum pnfs_try_status filelayout_write_pagelist(struct nfs_write_data *data, int sync) { - return PNFS_NOT_ATTEMPTED; + struct pnfs_layout_segment *lseg = data->lseg; + struct nfs4_pnfs_ds *ds; + loff_t offset = data->args.offset; + u32 j, idx; + struct nfs_fh *fh; + int status; + + /* Retrieve the correct rpc_client for the byte range */ + j = nfs4_fl_calc_j_index(lseg, offset); + idx = nfs4_fl_calc_ds_index(lseg, j); + ds = nfs4_fl_prepare_ds(lseg, idx); + if (!ds) { + printk(KERN_ERR "%s: prepare_ds failed, use MDS\n", __func__); + set_bit(lo_fail_bit(IOMODE_RW), &lseg->pls_layout->plh_flags); + set_bit(lo_fail_bit(IOMODE_READ), &lseg->pls_layout->plh_flags); + return PNFS_NOT_ATTEMPTED; + } + dprintk("%s ino %lu sync %d req %Zu@%llu DS:%x:%hu\n", __func__, + data->inode->i_ino, sync, (size_t) data->args.count, offset, + ntohl(ds->ds_ip_addr), ntohs(ds->ds_port)); + + /* We can't handle commit to ds yet */ + if (!FILELAYOUT_LSEG(lseg)->commit_through_mds) + data->args.stable = NFS_FILE_SYNC; + + data->write_done_cb = filelayout_write_done_cb; + data->ds_clp = ds->ds_clp; + fh = nfs4_fl_select_ds_fh(lseg, j); + if (fh) + data->args.fh = fh; + /* + * Get the file offset on the dserver. Set the write offset to + * this offset and save the original offset. + */ + data->args.offset = filelayout_get_dserver_offset(lseg, offset); + data->mds_offset = offset; + + /* Perform an asynchronous write */ + status = nfs_initiate_write(data, ds->ds_clp->cl_rpcclient, + &filelayout_write_call_ops, sync); + BUG_ON(status != 0); + return PNFS_ATTEMPTED; } /* diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index da902123ec53..7b4b9f3e9842 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -3145,6 +3145,23 @@ static int nfs4_write_done(struct rpc_task *task, struct nfs_write_data *data) return data->write_done_cb(task, data); } +/* Reset the the nfs_write_data to send the write to the MDS. */ +void nfs4_reset_write(struct rpc_task *task, struct nfs_write_data *data) +{ + dprintk("%s Reset task for i/o through\n", __func__); + put_lseg(data->lseg); + data->lseg = NULL; + data->ds_clp = NULL; + data->write_done_cb = nfs4_write_done_cb; + data->args.fh = NFS_FH(data->inode); + data->args.bitmask = data->res.server->cache_consistency_bitmask; + data->args.offset = data->mds_offset; + data->res.fattr = &data->fattr; + task->tk_ops = data->mds_ops; + rpc_task_reset_client(task, NFS_CLIENT(data->inode)); +} +EXPORT_SYMBOL_GPL(nfs4_reset_write); + static void nfs4_proc_write_setup(struct nfs_write_data *data, struct rpc_message *msg) { struct nfs_server *server = NFS_SERVER(data->inode); diff --git a/fs/nfs/write.c b/fs/nfs/write.c index df99c5b0ee65..ee62ddf60e7a 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -783,7 +783,7 @@ static int flush_task_priority(int how) return RPC_PRIORITY_NORMAL; } -static int nfs_initiate_write(struct nfs_write_data *data, +int nfs_initiate_write(struct nfs_write_data *data, struct rpc_clnt *clnt, const struct rpc_call_ops *call_ops, int how) @@ -833,6 +833,7 @@ static int nfs_initiate_write(struct nfs_write_data *data, out: return ret; } +EXPORT_SYMBOL_GPL(nfs_initiate_write); /* * Set up the argument/result storage required for the RPC call. @@ -1194,6 +1195,7 @@ void nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data) */ static unsigned long complain; + /* Note this will print the MDS for a DS write */ if (time_before(complain, jiffies)) { dprintk("NFS: faulty NFS server %s:" " (committed = %d) != (stable = %d)\n", @@ -1214,6 +1216,7 @@ void nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data) /* Was this an NFSv2 write or an NFSv3 stable write? */ if (resp->verf->committed != NFS_UNSTABLE) { /* Resend from where the server left off */ + data->mds_offset += resp->count; argp->offset += resp->count; argp->pgbase += resp->count; argp->count -= resp->count; diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index c82ad33cffe1..3440f5ab0f54 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -1039,11 +1039,13 @@ struct nfs_write_data { struct nfs_writeargs args; /* argument struct */ struct nfs_writeres res; /* result struct */ struct pnfs_layout_segment *lseg; + struct nfs_client *ds_clp; /* pNFS data server */ const struct rpc_call_ops *mds_ops; int (*write_done_cb) (struct rpc_task *task, struct nfs_write_data *data); #ifdef CONFIG_NFS_V4 unsigned long timestamp; /* For lease renewal */ #endif + __u64 mds_offset; /* Filelayout dense stripe */ struct page *page_array[NFS_PAGEVEC_SIZE]; }; From c76069bda0f17cd3e153e54d9ac01242909c6b15 Mon Sep 17 00:00:00 2001 From: Fred Isaman Date: Thu, 3 Mar 2011 15:13:48 +0000 Subject: [PATCH 33/54] NFSv4.1: rearrange ->doio args This will make it possible to clear the lseg pointer in the same function as it is put, instead of in the caller nfs_pageio_doio(). Signed-off-by: Fred Isaman Signed-off-by: Trond Myklebust --- fs/nfs/pagelist.c | 10 ++-------- fs/nfs/read.c | 42 ++++++++++++++++++++++++---------------- fs/nfs/write.c | 28 +++++++++++++++------------ include/linux/nfs_page.h | 4 ++-- 4 files changed, 45 insertions(+), 39 deletions(-) diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index 45b0fb8add39..9f628746f5c8 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -214,7 +214,7 @@ nfs_wait_on_request(struct nfs_page *req) */ void nfs_pageio_init(struct nfs_pageio_descriptor *desc, struct inode *inode, - int (*doio)(struct inode *, struct list_head *, unsigned int, size_t, int, struct pnfs_layout_segment *), + int (*doio)(struct nfs_pageio_descriptor *), size_t bsize, int io_flags) { @@ -311,13 +311,7 @@ static int nfs_pageio_do_add_request(struct nfs_pageio_descriptor *desc, static void nfs_pageio_doio(struct nfs_pageio_descriptor *desc) { if (!list_empty(&desc->pg_list)) { - int error = desc->pg_doio(desc->pg_inode, - &desc->pg_list, - nfs_page_array_len(desc->pg_base, - desc->pg_count), - desc->pg_count, - desc->pg_ioflags, - desc->pg_lseg); + int error = desc->pg_doio(desc); desc->pg_lseg = NULL; if (error < 0) desc->pg_error = error; diff --git a/fs/nfs/read.c b/fs/nfs/read.c index f40c7f4dc16b..ab9c7768b7c6 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -31,8 +31,8 @@ #define NFSDBG_FACILITY NFSDBG_PAGECACHE -static int nfs_pagein_multi(struct inode *, struct list_head *, unsigned int, size_t, int, struct pnfs_layout_segment *); -static int nfs_pagein_one(struct inode *, struct list_head *, unsigned int, size_t, int, struct pnfs_layout_segment *); +static int nfs_pagein_multi(struct nfs_pageio_descriptor *desc); +static int nfs_pagein_one(struct nfs_pageio_descriptor *desc); static const struct rpc_call_ops nfs_read_partial_ops; static const struct rpc_call_ops nfs_read_full_ops; @@ -117,9 +117,9 @@ static void nfs_readpage_truncate_uninitialised_page(struct nfs_read_data *data) int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode, struct page *page) { - LIST_HEAD(one_request); struct nfs_page *new; unsigned int len; + struct nfs_pageio_descriptor pgio; len = nfs_page_length(page); if (len == 0) @@ -132,11 +132,14 @@ int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode, if (len < PAGE_CACHE_SIZE) zero_user_segment(page, len, PAGE_CACHE_SIZE); - nfs_list_add_request(new, &one_request); + nfs_pageio_init(&pgio, inode, NULL, 0, 0); + nfs_list_add_request(new, &pgio.pg_list); + pgio.pg_count = len; + if (NFS_SERVER(inode)->rsize < PAGE_CACHE_SIZE) - nfs_pagein_multi(inode, &one_request, 1, len, 0, NULL); + nfs_pagein_multi(&pgio); else - nfs_pagein_one(inode, &one_request, 1, len, 0, NULL); + nfs_pagein_one(&pgio); return 0; } @@ -258,20 +261,21 @@ nfs_async_read_error(struct list_head *head) * won't see the new data until our attribute cache is updated. This is more * or less conventional NFS client behavior. */ -static int nfs_pagein_multi(struct inode *inode, struct list_head *head, unsigned int npages, size_t count, int flags, struct pnfs_layout_segment *lseg) +static int nfs_pagein_multi(struct nfs_pageio_descriptor *desc) { - struct nfs_page *req = nfs_list_entry(head->next); + struct nfs_page *req = nfs_list_entry(desc->pg_list.next); struct page *page = req->wb_page; struct nfs_read_data *data; - size_t rsize = NFS_SERVER(inode)->rsize, nbytes; + size_t rsize = NFS_SERVER(desc->pg_inode)->rsize, nbytes; unsigned int offset; int requests = 0; int ret = 0; + struct pnfs_layout_segment *lseg; LIST_HEAD(list); nfs_list_remove_request(req); - nbytes = count; + nbytes = desc->pg_count; do { size_t len = min(nbytes,rsize); @@ -284,11 +288,11 @@ static int nfs_pagein_multi(struct inode *inode, struct list_head *head, unsigne } while(nbytes != 0); atomic_set(&req->wb_complete, requests); - /* We know lseg==NULL */ - lseg = pnfs_update_layout(inode, req->wb_context, IOMODE_READ); + BUG_ON(desc->pg_lseg != NULL); + lseg = pnfs_update_layout(desc->pg_inode, req->wb_context, IOMODE_READ); ClearPageError(page); offset = 0; - nbytes = count; + nbytes = desc->pg_count; do { int ret2; @@ -321,14 +325,17 @@ out_bad: return -ENOMEM; } -static int nfs_pagein_one(struct inode *inode, struct list_head *head, unsigned int npages, size_t count, int flags, struct pnfs_layout_segment *lseg) +static int nfs_pagein_one(struct nfs_pageio_descriptor *desc) { struct nfs_page *req; struct page **pages; struct nfs_read_data *data; + struct list_head *head = &desc->pg_list; + struct pnfs_layout_segment *lseg = desc->pg_lseg; int ret = -ENOMEM; - data = nfs_readdata_alloc(npages); + data = nfs_readdata_alloc(nfs_page_array_len(desc->pg_base, + desc->pg_count)); if (!data) { nfs_async_read_error(head); goto out; @@ -344,9 +351,10 @@ static int nfs_pagein_one(struct inode *inode, struct list_head *head, unsigned } req = nfs_list_entry(data->pages.next); if ((!lseg) && list_is_singular(&data->pages)) - lseg = pnfs_update_layout(inode, req->wb_context, IOMODE_READ); + lseg = pnfs_update_layout(desc->pg_inode, req->wb_context, IOMODE_READ); - ret = nfs_read_rpcsetup(req, data, &nfs_read_full_ops, count, 0, lseg); + ret = nfs_read_rpcsetup(req, data, &nfs_read_full_ops, desc->pg_count, + 0, lseg); out: put_lseg(lseg); return ret; diff --git a/fs/nfs/write.c b/fs/nfs/write.c index ee62ddf60e7a..b74200a2f753 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -898,20 +898,21 @@ static void nfs_redirty_request(struct nfs_page *req) * Generate multiple small requests to write out a single * contiguous dirty area on one page. */ -static int nfs_flush_multi(struct inode *inode, struct list_head *head, unsigned int npages, size_t count, int how, struct pnfs_layout_segment *lseg) +static int nfs_flush_multi(struct nfs_pageio_descriptor *desc) { - struct nfs_page *req = nfs_list_entry(head->next); + struct nfs_page *req = nfs_list_entry(desc->pg_list.next); struct page *page = req->wb_page; struct nfs_write_data *data; - size_t wsize = NFS_SERVER(inode)->wsize, nbytes; + size_t wsize = NFS_SERVER(desc->pg_inode)->wsize, nbytes; unsigned int offset; int requests = 0; int ret = 0; + struct pnfs_layout_segment *lseg; LIST_HEAD(list); nfs_list_remove_request(req); - nbytes = count; + nbytes = desc->pg_count; do { size_t len = min(nbytes, wsize); @@ -924,11 +925,11 @@ static int nfs_flush_multi(struct inode *inode, struct list_head *head, unsigned } while (nbytes != 0); atomic_set(&req->wb_complete, requests); - BUG_ON(lseg); - lseg = pnfs_update_layout(inode, req->wb_context, IOMODE_RW); + BUG_ON(desc->pg_lseg); + lseg = pnfs_update_layout(desc->pg_inode, req->wb_context, IOMODE_RW); ClearPageError(page); offset = 0; - nbytes = count; + nbytes = desc->pg_count; do { int ret2; @@ -940,7 +941,7 @@ static int nfs_flush_multi(struct inode *inode, struct list_head *head, unsigned if (nbytes < wsize) wsize = nbytes; ret2 = nfs_write_rpcsetup(req, data, &nfs_write_partial_ops, - wsize, offset, lseg, how); + wsize, offset, lseg, desc->pg_ioflags); if (ret == 0) ret = ret2; offset += wsize; @@ -968,14 +969,17 @@ out_bad: * This is the case if nfs_updatepage detects a conflicting request * that has been written but not committed. */ -static int nfs_flush_one(struct inode *inode, struct list_head *head, unsigned int npages, size_t count, int how, struct pnfs_layout_segment *lseg) +static int nfs_flush_one(struct nfs_pageio_descriptor *desc) { struct nfs_page *req; struct page **pages; struct nfs_write_data *data; + struct list_head *head = &desc->pg_list; + struct pnfs_layout_segment *lseg = desc->pg_lseg; int ret; - data = nfs_writedata_alloc(npages); + data = nfs_writedata_alloc(nfs_page_array_len(desc->pg_base, + desc->pg_count)); if (!data) { while (!list_empty(head)) { req = nfs_list_entry(head->next); @@ -995,10 +999,10 @@ static int nfs_flush_one(struct inode *inode, struct list_head *head, unsigned i } req = nfs_list_entry(data->pages.next); if ((!lseg) && list_is_singular(&data->pages)) - lseg = pnfs_update_layout(inode, req->wb_context, IOMODE_RW); + lseg = pnfs_update_layout(desc->pg_inode, req->wb_context, IOMODE_RW); /* Set up the argument struct */ - ret = nfs_write_rpcsetup(req, data, &nfs_write_full_ops, count, 0, lseg, how); + ret = nfs_write_rpcsetup(req, data, &nfs_write_full_ops, desc->pg_count, 0, lseg, desc->pg_ioflags); out: put_lseg(lseg); /* Cleans any gotten in ->pg_test */ return ret; diff --git a/include/linux/nfs_page.h b/include/linux/nfs_page.h index ba88ff4f8186..90907ada6d52 100644 --- a/include/linux/nfs_page.h +++ b/include/linux/nfs_page.h @@ -59,7 +59,7 @@ struct nfs_pageio_descriptor { unsigned int pg_base; struct inode *pg_inode; - int (*pg_doio)(struct inode *, struct list_head *, unsigned int, size_t, int, struct pnfs_layout_segment *); + int (*pg_doio)(struct nfs_pageio_descriptor *); int pg_ioflags; int pg_error; struct pnfs_layout_segment *pg_lseg; @@ -81,7 +81,7 @@ extern int nfs_scan_list(struct nfs_inode *nfsi, struct list_head *dst, pgoff_t idx_start, unsigned int npages, int tag); extern void nfs_pageio_init(struct nfs_pageio_descriptor *desc, struct inode *inode, - int (*doio)(struct inode *, struct list_head *, unsigned int, size_t, int, struct pnfs_layout_segment *), + int (*doio)(struct nfs_pageio_descriptor *desc), size_t bsize, int how); extern int nfs_pageio_add_request(struct nfs_pageio_descriptor *, From 36fe432d33e078caee5c954e15e929819c2cacae Mon Sep 17 00:00:00 2001 From: Fred Isaman Date: Thu, 3 Mar 2011 15:13:49 +0000 Subject: [PATCH 34/54] NFSv4.1: Clear lseg pointer in ->doio function Now that we have access to the pointer, clear it immediately after the put, instead of in caller. Signed-off-by: Fred Isaman Signed-off-by: Trond Myklebust --- fs/nfs/pagelist.c | 1 - fs/nfs/read.c | 2 ++ fs/nfs/write.c | 2 ++ 3 files changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index 9f628746f5c8..23e794410669 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -312,7 +312,6 @@ static void nfs_pageio_doio(struct nfs_pageio_descriptor *desc) { if (!list_empty(&desc->pg_list)) { int error = desc->pg_doio(desc); - desc->pg_lseg = NULL; if (error < 0) desc->pg_error = error; else diff --git a/fs/nfs/read.c b/fs/nfs/read.c index ab9c7768b7c6..4b764c6048db 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -311,6 +311,7 @@ static int nfs_pagein_multi(struct nfs_pageio_descriptor *desc) nbytes -= rsize; } while (nbytes != 0); put_lseg(lseg); + desc->pg_lseg = NULL; return ret; @@ -357,6 +358,7 @@ static int nfs_pagein_one(struct nfs_pageio_descriptor *desc) 0, lseg); out: put_lseg(lseg); + desc->pg_lseg = NULL; return ret; } diff --git a/fs/nfs/write.c b/fs/nfs/write.c index b74200a2f753..47a3ad63e0d5 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -949,6 +949,7 @@ static int nfs_flush_multi(struct nfs_pageio_descriptor *desc) } while (nbytes != 0); put_lseg(lseg); + desc->pg_lseg = NULL; return ret; out_bad: @@ -1005,6 +1006,7 @@ static int nfs_flush_one(struct nfs_pageio_descriptor *desc) ret = nfs_write_rpcsetup(req, data, &nfs_write_full_ops, desc->pg_count, 0, lseg, desc->pg_ioflags); out: put_lseg(lseg); /* Cleans any gotten in ->pg_test */ + desc->pg_lseg = NULL; return ret; } From 75247affd7930cc3dcf57f850f0d7898379ef3b3 Mon Sep 17 00:00:00 2001 From: Benny Halevy Date: Tue, 22 Feb 2011 15:56:01 -0800 Subject: [PATCH 35/54] NFSv4.1: reject zero layout with zeroed stripe unit Allowing stripe_unit==0 causes the client to crash later on when dividing by zero. Reported-by: Marc Eshel Signed-off-by: Benny Halevy Signed-off-by: Trond Myklebust --- fs/nfs/nfs4filelayout.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c index 7e1d4571b7b2..428558464817 100644 --- a/fs/nfs/nfs4filelayout.c +++ b/fs/nfs/nfs4filelayout.c @@ -369,8 +369,8 @@ filelayout_check_layout(struct pnfs_layout_hdr *lo, goto out; } - if (fl->stripe_unit % PAGE_SIZE) { - dprintk("%s Stripe unit (%u) not page aligned\n", + if (!fl->stripe_unit || fl->stripe_unit % PAGE_SIZE) { + dprintk("%s Invalid stripe unit (%u)\n", __func__, fl->stripe_unit); goto out; } From 5cf36cfdc8caa2724738ad0842c5c3dd02f309dc Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 22 Feb 2011 15:44:31 -0800 Subject: [PATCH 36/54] NFSv4: If the server sends us a numeric uid/gid then accept it Signed-off-by: Trond Myklebust --- fs/nfs/idmap.c | 28 ++++++++++++++++++++++++++-- 1 file changed, 26 insertions(+), 2 deletions(-) diff --git a/fs/nfs/idmap.c b/fs/nfs/idmap.c index 18696882f1c6..cbe6e2fa8cef 100644 --- a/fs/nfs/idmap.c +++ b/fs/nfs/idmap.c @@ -33,6 +33,24 @@ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ +#include +#include +#include + +static int nfs_map_string_to_numeric(const char *name, size_t namelen, __u32 *res) +{ + unsigned long val; + char buf[16]; + + if (memchr(name, '@', namelen) != NULL || namelen >= sizeof(buf)) + return 0; + memcpy(buf, name, namelen); + buf[namelen] = '\0'; + if (strict_strtoul(buf, 0, &val) != 0) + return 0; + *res = val; + return 1; +} #ifdef CONFIG_NFS_USE_NEW_IDMAPPER @@ -42,7 +60,6 @@ #include #include #include -#include #include #include @@ -221,11 +238,15 @@ static int nfs_idmap_lookup_id(const char *name, size_t namelen, int nfs_map_name_to_uid(struct nfs_client *clp, const char *name, size_t namelen, __u32 *uid) { + if (nfs_map_string_to_numeric(name, namelen, uid)) + return 0; return nfs_idmap_lookup_id(name, namelen, "uid", uid); } int nfs_map_group_to_gid(struct nfs_client *clp, const char *name, size_t namelen, __u32 *gid) { + if (nfs_map_string_to_numeric(name, namelen, gid)) + return 0; return nfs_idmap_lookup_id(name, namelen, "gid", gid); } @@ -243,7 +264,6 @@ int nfs_map_gid_to_group(struct nfs_client *clp, __u32 gid, char *buf, size_t bu #include #include #include -#include #include #include #include @@ -699,6 +719,8 @@ int nfs_map_name_to_uid(struct nfs_client *clp, const char *name, size_t namelen { struct idmap *idmap = clp->cl_idmap; + if (nfs_map_string_to_numeric(name, namelen, uid)) + return 0; return nfs_idmap_id(idmap, &idmap->idmap_user_hash, name, namelen, uid); } @@ -706,6 +728,8 @@ int nfs_map_group_to_gid(struct nfs_client *clp, const char *name, size_t namele { struct idmap *idmap = clp->cl_idmap; + if (nfs_map_string_to_numeric(name, namelen, uid)) + return 0; return nfs_idmap_id(idmap, &idmap->idmap_group_hash, name, namelen, uid); } From f0b851689a5da2354f19bcbbac30cd2cab45c4a1 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 22 Feb 2011 15:44:31 -0800 Subject: [PATCH 37/54] NFSv4: Send unmapped uid/gids to the server if the idmapper fails Signed-off-by: Trond Myklebust --- fs/nfs/idmap.c | 30 ++++++++++++++++++++++++++---- 1 file changed, 26 insertions(+), 4 deletions(-) diff --git a/fs/nfs/idmap.c b/fs/nfs/idmap.c index cbe6e2fa8cef..8518573c3ffc 100644 --- a/fs/nfs/idmap.c +++ b/fs/nfs/idmap.c @@ -52,6 +52,11 @@ static int nfs_map_string_to_numeric(const char *name, size_t namelen, __u32 *re return 1; } +static int nfs_map_numeric_to_string(__u32 id, char *buf, size_t buflen) +{ + return snprintf(buf, buflen, "%u", id); +} + #ifdef CONFIG_NFS_USE_NEW_IDMAPPER #include @@ -252,11 +257,20 @@ int nfs_map_group_to_gid(struct nfs_client *clp, const char *name, size_t namele int nfs_map_uid_to_name(struct nfs_client *clp, __u32 uid, char *buf, size_t buflen) { - return nfs_idmap_lookup_name(uid, "user", buf, buflen); + int ret; + ret = nfs_idmap_lookup_name(uid, "user", buf, buflen); + if (ret < 0) + ret = nfs_map_numeric_to_string(uid, buf, buflen); + return ret; } int nfs_map_gid_to_group(struct nfs_client *clp, __u32 gid, char *buf, size_t buflen) { - return nfs_idmap_lookup_name(gid, "group", buf, buflen); + int ret; + + ret = nfs_idmap_lookup_name(gid, "group", buf, buflen); + if (ret < 0) + ret = nfs_map_numeric_to_string(gid, buf, buflen); + return ret; } #else /* CONFIG_NFS_USE_NEW_IDMAPPER not defined */ @@ -736,14 +750,22 @@ int nfs_map_group_to_gid(struct nfs_client *clp, const char *name, size_t namele int nfs_map_uid_to_name(struct nfs_client *clp, __u32 uid, char *buf, size_t buflen) { struct idmap *idmap = clp->cl_idmap; + int ret; - return nfs_idmap_name(idmap, &idmap->idmap_user_hash, uid, buf); + ret = nfs_idmap_name(idmap, &idmap->idmap_user_hash, uid, buf); + if (ret < 0) + ret = nfs_map_numeric_to_string(uid, buf, buflen); + return ret; } int nfs_map_gid_to_group(struct nfs_client *clp, __u32 uid, char *buf, size_t buflen) { struct idmap *idmap = clp->cl_idmap; + int ret; - return nfs_idmap_name(idmap, &idmap->idmap_group_hash, uid, buf); + ret = nfs_idmap_name(idmap, &idmap->idmap_group_hash, uid, buf); + if (ret < 0) + ret = nfs_map_numeric_to_string(uid, buf, buflen); + return ret; } #endif /* CONFIG_NFS_USE_NEW_IDMAPPER */ From e4fd72a17d2703cfd626c55893ac4ca7e7d81ce9 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 22 Feb 2011 15:44:31 -0800 Subject: [PATCH 38/54] NFSv4: cleanup idmapper functions to take an nfs_server argument ...instead of the nfs_client. Signed-off-by: Trond Myklebust --- fs/nfs/idmap.c | 24 ++++++++++++------------ fs/nfs/nfs4xdr.c | 18 ++++++++---------- include/linux/nfs_idmap.h | 9 +++++---- 3 files changed, 25 insertions(+), 26 deletions(-) diff --git a/fs/nfs/idmap.c b/fs/nfs/idmap.c index 8518573c3ffc..e2d579d458f1 100644 --- a/fs/nfs/idmap.c +++ b/fs/nfs/idmap.c @@ -241,21 +241,21 @@ static int nfs_idmap_lookup_id(const char *name, size_t namelen, return ret; } -int nfs_map_name_to_uid(struct nfs_client *clp, const char *name, size_t namelen, __u32 *uid) +int nfs_map_name_to_uid(const struct nfs_server *server, const char *name, size_t namelen, __u32 *uid) { if (nfs_map_string_to_numeric(name, namelen, uid)) return 0; return nfs_idmap_lookup_id(name, namelen, "uid", uid); } -int nfs_map_group_to_gid(struct nfs_client *clp, const char *name, size_t namelen, __u32 *gid) +int nfs_map_group_to_gid(const struct nfs_server *server, const char *name, size_t namelen, __u32 *gid) { if (nfs_map_string_to_numeric(name, namelen, gid)) return 0; return nfs_idmap_lookup_id(name, namelen, "gid", gid); } -int nfs_map_uid_to_name(struct nfs_client *clp, __u32 uid, char *buf, size_t buflen) +int nfs_map_uid_to_name(const struct nfs_server *server, __u32 uid, char *buf, size_t buflen) { int ret; ret = nfs_idmap_lookup_name(uid, "user", buf, buflen); @@ -263,7 +263,7 @@ int nfs_map_uid_to_name(struct nfs_client *clp, __u32 uid, char *buf, size_t buf ret = nfs_map_numeric_to_string(uid, buf, buflen); return ret; } -int nfs_map_gid_to_group(struct nfs_client *clp, __u32 gid, char *buf, size_t buflen) +int nfs_map_gid_to_group(const struct nfs_server *server, __u32 gid, char *buf, size_t buflen) { int ret; @@ -729,27 +729,27 @@ static unsigned int fnvhash32(const void *buf, size_t buflen) return hash; } -int nfs_map_name_to_uid(struct nfs_client *clp, const char *name, size_t namelen, __u32 *uid) +int nfs_map_name_to_uid(const struct nfs_server *server, const char *name, size_t namelen, __u32 *uid) { - struct idmap *idmap = clp->cl_idmap; + struct idmap *idmap = server->nfs_client->cl_idmap; if (nfs_map_string_to_numeric(name, namelen, uid)) return 0; return nfs_idmap_id(idmap, &idmap->idmap_user_hash, name, namelen, uid); } -int nfs_map_group_to_gid(struct nfs_client *clp, const char *name, size_t namelen, __u32 *uid) +int nfs_map_group_to_gid(const struct nfs_server *server, const char *name, size_t namelen, __u32 *uid) { - struct idmap *idmap = clp->cl_idmap; + struct idmap *idmap = server->nfs_client->cl_idmap; if (nfs_map_string_to_numeric(name, namelen, uid)) return 0; return nfs_idmap_id(idmap, &idmap->idmap_group_hash, name, namelen, uid); } -int nfs_map_uid_to_name(struct nfs_client *clp, __u32 uid, char *buf, size_t buflen) +int nfs_map_uid_to_name(const struct nfs_server *server, __u32 uid, char *buf, size_t buflen) { - struct idmap *idmap = clp->cl_idmap; + struct idmap *idmap = server->nfs_client->cl_idmap; int ret; ret = nfs_idmap_name(idmap, &idmap->idmap_user_hash, uid, buf); @@ -757,9 +757,9 @@ int nfs_map_uid_to_name(struct nfs_client *clp, __u32 uid, char *buf, size_t buf ret = nfs_map_numeric_to_string(uid, buf, buflen); return ret; } -int nfs_map_gid_to_group(struct nfs_client *clp, __u32 uid, char *buf, size_t buflen) +int nfs_map_gid_to_group(const struct nfs_server *server, __u32 uid, char *buf, size_t buflen) { - struct idmap *idmap = clp->cl_idmap; + struct idmap *idmap = server->nfs_client->cl_idmap; int ret; ret = nfs_idmap_name(idmap, &idmap->idmap_group_hash, uid, buf); diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 0f2dcfb41f29..686c21d8c523 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -844,7 +844,7 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const if (iap->ia_valid & ATTR_MODE) len += 4; if (iap->ia_valid & ATTR_UID) { - owner_namelen = nfs_map_uid_to_name(server->nfs_client, iap->ia_uid, owner_name, IDMAP_NAMESZ); + owner_namelen = nfs_map_uid_to_name(server, iap->ia_uid, owner_name, IDMAP_NAMESZ); if (owner_namelen < 0) { dprintk("nfs: couldn't resolve uid %d to string\n", iap->ia_uid); @@ -856,7 +856,7 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const len += 4 + (XDR_QUADLEN(owner_namelen) << 2); } if (iap->ia_valid & ATTR_GID) { - owner_grouplen = nfs_map_gid_to_group(server->nfs_client, iap->ia_gid, owner_group, IDMAP_NAMESZ); + owner_grouplen = nfs_map_gid_to_group(server, iap->ia_gid, owner_group, IDMAP_NAMESZ); if (owner_grouplen < 0) { dprintk("nfs: couldn't resolve gid %d to string\n", iap->ia_gid); @@ -3387,7 +3387,7 @@ out_overflow: } static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap, - struct nfs_client *clp, uint32_t *uid, int may_sleep) + const struct nfs_server *server, uint32_t *uid, int may_sleep) { uint32_t len; __be32 *p; @@ -3407,7 +3407,7 @@ static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap, if (!may_sleep) { /* do nothing */ } else if (len < XDR_MAX_NETOBJ) { - if (nfs_map_name_to_uid(clp, (char *)p, len, uid) == 0) + if (nfs_map_name_to_uid(server, (char *)p, len, uid) == 0) ret = NFS_ATTR_FATTR_OWNER; else dprintk("%s: nfs_map_name_to_uid failed!\n", @@ -3425,7 +3425,7 @@ out_overflow: } static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap, - struct nfs_client *clp, uint32_t *gid, int may_sleep) + const struct nfs_server *server, uint32_t *gid, int may_sleep) { uint32_t len; __be32 *p; @@ -3445,7 +3445,7 @@ static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap, if (!may_sleep) { /* do nothing */ } else if (len < XDR_MAX_NETOBJ) { - if (nfs_map_group_to_gid(clp, (char *)p, len, gid) == 0) + if (nfs_map_group_to_gid(server, (char *)p, len, gid) == 0) ret = NFS_ATTR_FATTR_GROUP; else dprintk("%s: nfs_map_group_to_gid failed!\n", @@ -3944,14 +3944,12 @@ static int decode_getfattr_attrs(struct xdr_stream *xdr, uint32_t *bitmap, goto xdr_error; fattr->valid |= status; - status = decode_attr_owner(xdr, bitmap, server->nfs_client, - &fattr->uid, may_sleep); + status = decode_attr_owner(xdr, bitmap, server, &fattr->uid, may_sleep); if (status < 0) goto xdr_error; fattr->valid |= status; - status = decode_attr_group(xdr, bitmap, server->nfs_client, - &fattr->gid, may_sleep); + status = decode_attr_group(xdr, bitmap, server, &fattr->gid, may_sleep); if (status < 0) goto xdr_error; fattr->valid |= status; diff --git a/include/linux/nfs_idmap.h b/include/linux/nfs_idmap.h index e8352dc5afb5..ae7d6a380dae 100644 --- a/include/linux/nfs_idmap.h +++ b/include/linux/nfs_idmap.h @@ -65,6 +65,7 @@ struct idmap_msg { /* Forward declaration to make this header independent of others */ struct nfs_client; +struct nfs_server; #ifdef CONFIG_NFS_USE_NEW_IDMAPPER @@ -96,10 +97,10 @@ void nfs_idmap_delete(struct nfs_client *); #endif /* CONFIG_NFS_USE_NEW_IDMAPPER */ -int nfs_map_name_to_uid(struct nfs_client *, const char *, size_t, __u32 *); -int nfs_map_group_to_gid(struct nfs_client *, const char *, size_t, __u32 *); -int nfs_map_uid_to_name(struct nfs_client *, __u32, char *, size_t); -int nfs_map_gid_to_group(struct nfs_client *, __u32, char *, size_t); +int nfs_map_name_to_uid(const struct nfs_server *, const char *, size_t, __u32 *); +int nfs_map_group_to_gid(const struct nfs_server *, const char *, size_t, __u32 *); +int nfs_map_uid_to_name(const struct nfs_server *, __u32, char *, size_t); +int nfs_map_gid_to_group(const struct nfs_server *, __u32, char *, size_t); extern unsigned int nfs_idmap_cache_timeout; #endif /* __KERNEL__ */ From 3ddeb7c5c61d0d6bfd837487d3454ffdb788bb91 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 22 Feb 2011 15:44:31 -0800 Subject: [PATCH 39/54] NFSv4: Propagate the error NFS4ERR_BADOWNER to nfs4_do_setattr This will be required in order to switch uid/gid mapping back on if the admin has tried to disable it. Note that we also propagate NFS4ERR_BADNAME at the same time, in order to work around a Linux server bug. Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 3 +++ fs/nfs/nfs4xdr.c | 2 -- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 7b4b9f3e9842..8f3ada04ea19 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -85,6 +85,9 @@ static int nfs4_map_errors(int err) switch (err) { case -NFS4ERR_RESOURCE: return -EREMOTEIO; + case -NFS4ERR_BADOWNER: + case -NFS4ERR_BADNAME: + return -EINVAL; default: dprintk("%s could not handle NFSv4 error %d\n", __func__, -err); diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 686c21d8c523..0cf560f77884 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -6171,8 +6171,6 @@ static struct { { NFS4ERR_DQUOT, -EDQUOT }, { NFS4ERR_STALE, -ESTALE }, { NFS4ERR_BADHANDLE, -EBADHANDLE }, - { NFS4ERR_BADOWNER, -EINVAL }, - { NFS4ERR_BADNAME, -EINVAL }, { NFS4ERR_BAD_COOKIE, -EBADCOOKIE }, { NFS4ERR_NOTSUPP, -ENOTSUPP }, { NFS4ERR_TOOSMALL, -ETOOSMALL }, From b064eca2cf6440bf9d5843b24cc4010624031694 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 22 Feb 2011 15:44:32 -0800 Subject: [PATCH 40/54] NFSv4: Send unmapped uid/gids to the server when using auth_sys The new behaviour is enabled using the new module parameter 'nfs4_disable_idmapping'. Note that if the server rejects an unmapped uid or gid, then the client will automatically switch back to using the idmapper. Signed-off-by: Trond Myklebust --- Documentation/kernel-parameters.txt | 8 ++++++++ fs/nfs/client.c | 16 ++++++++++++++++ fs/nfs/idmap.c | 24 ++++++++++++++++-------- fs/nfs/nfs4proc.c | 15 ++++++++++++++- include/linux/nfs_fs_sb.h | 1 + 5 files changed, 55 insertions(+), 9 deletions(-) diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index f4a04c0c7edc..14dcf1b7dd9c 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -1580,6 +1580,14 @@ bytes respectively. Such letter suffixes can also be entirely omitted. of returning the full 64-bit number. The default is to return 64-bit inode numbers. + nfs.nfs4_disable_idmapping= + [NFSv4] When set, this option disables the NFSv4 + idmapper on the client, but only if the mount + is using the 'sec=sys' security flavour. This may + make migration from legacy NFSv2/v3 systems easier + provided that the server has the appropriate support. + The default is to always enable NFSv4 idmapping. + nmi_debug= [KNL,AVR32,SH] Specify one or more actions to take when a NMI is triggered. Format: [state][,regs][,debounce][,die] diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 6dd50ac5b545..139be9647d80 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -81,6 +81,11 @@ retry: } #endif /* CONFIG_NFS_V4 */ +/* + * Turn off NFSv4 uid/gid mapping when using AUTH_SYS + */ +static int nfs4_disable_idmapping = 0; + /* * RPC cruft for NFS */ @@ -1567,6 +1572,13 @@ static int nfs4_init_server(struct nfs_server *server, if (error < 0) goto error; + /* + * Don't use NFS uid/gid mapping if we're using AUTH_SYS or lower + * authentication. + */ + if (nfs4_disable_idmapping && data->auth_flavors[0] == RPC_AUTH_UNIX) + server->caps |= NFS_CAP_UIDGID_NOMAP; + if (data->rsize) server->rsize = nfs_block_size(data->rsize, NULL); if (data->wsize) @@ -1984,3 +1996,7 @@ void nfs_fs_proc_exit(void) } #endif /* CONFIG_PROC_FS */ + +module_param(nfs4_disable_idmapping, bool, 0644); +MODULE_PARM_DESC(nfs4_disable_idmapping, + "Turn off NFSv4 idmapping when using 'sec=sys'"); diff --git a/fs/nfs/idmap.c b/fs/nfs/idmap.c index e2d579d458f1..79664a1025af 100644 --- a/fs/nfs/idmap.c +++ b/fs/nfs/idmap.c @@ -61,6 +61,9 @@ static int nfs_map_numeric_to_string(__u32 id, char *buf, size_t buflen) #include #include +#include +#include +#include #include #include #include @@ -257,17 +260,20 @@ int nfs_map_group_to_gid(const struct nfs_server *server, const char *name, size int nfs_map_uid_to_name(const struct nfs_server *server, __u32 uid, char *buf, size_t buflen) { - int ret; - ret = nfs_idmap_lookup_name(uid, "user", buf, buflen); + int ret = -EINVAL; + + if (!(server->caps & NFS_CAP_UIDGID_NOMAP)) + ret = nfs_idmap_lookup_name(uid, "user", buf, buflen); if (ret < 0) ret = nfs_map_numeric_to_string(uid, buf, buflen); return ret; } int nfs_map_gid_to_group(const struct nfs_server *server, __u32 gid, char *buf, size_t buflen) { - int ret; + int ret = -EINVAL; - ret = nfs_idmap_lookup_name(gid, "group", buf, buflen); + if (!(server->caps & NFS_CAP_UIDGID_NOMAP)) + ret = nfs_idmap_lookup_name(gid, "group", buf, buflen); if (ret < 0) ret = nfs_map_numeric_to_string(gid, buf, buflen); return ret; @@ -750,9 +756,10 @@ int nfs_map_group_to_gid(const struct nfs_server *server, const char *name, size int nfs_map_uid_to_name(const struct nfs_server *server, __u32 uid, char *buf, size_t buflen) { struct idmap *idmap = server->nfs_client->cl_idmap; - int ret; + int ret = -EINVAL; - ret = nfs_idmap_name(idmap, &idmap->idmap_user_hash, uid, buf); + if (!(server->caps & NFS_CAP_UIDGID_NOMAP)) + ret = nfs_idmap_name(idmap, &idmap->idmap_user_hash, uid, buf); if (ret < 0) ret = nfs_map_numeric_to_string(uid, buf, buflen); return ret; @@ -760,9 +767,10 @@ int nfs_map_uid_to_name(const struct nfs_server *server, __u32 uid, char *buf, s int nfs_map_gid_to_group(const struct nfs_server *server, __u32 uid, char *buf, size_t buflen) { struct idmap *idmap = server->nfs_client->cl_idmap; - int ret; + int ret = -EINVAL; - ret = nfs_idmap_name(idmap, &idmap->idmap_group_hash, uid, buf); + if (!(server->caps & NFS_CAP_UIDGID_NOMAP)) + ret = nfs_idmap_name(idmap, &idmap->idmap_group_hash, uid, buf); if (ret < 0) ret = nfs_map_numeric_to_string(uid, buf, buflen); return ret; diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 8f3ada04ea19..1d84e7088af9 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -244,7 +244,7 @@ static int nfs4_delay(struct rpc_clnt *clnt, long *timeout) /* This is the error handling routine for processes that are allowed * to sleep. */ -static int nfs4_handle_exception(const struct nfs_server *server, int errorcode, struct nfs4_exception *exception) +static int nfs4_handle_exception(struct nfs_server *server, int errorcode, struct nfs4_exception *exception) { struct nfs_client *clp = server->nfs_client; struct nfs4_state *state = exception->state; @@ -296,6 +296,19 @@ static int nfs4_handle_exception(const struct nfs_server *server, int errorcode, break; case -NFS4ERR_OLD_STATEID: exception->retry = 1; + break; + case -NFS4ERR_BADOWNER: + /* The following works around a Linux server bug! */ + case -NFS4ERR_BADNAME: + if (server->caps & NFS_CAP_UIDGID_NOMAP) { + server->caps &= ~NFS_CAP_UIDGID_NOMAP; + exception->retry = 1; + printk(KERN_WARNING "NFS: v4 server %s " + "does not accept raw " + "uid/gids. " + "Reenabling the idmapper.\n", + server->nfs_client->cl_hostname); + } } /* We failed to handle the error */ return nfs4_map_errors(ret); diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h index 0cbf109a4056..216cea5db0aa 100644 --- a/include/linux/nfs_fs_sb.h +++ b/include/linux/nfs_fs_sb.h @@ -177,6 +177,7 @@ struct nfs_server { #define NFS_CAP_CTIME (1U << 12) #define NFS_CAP_MTIME (1U << 13) #define NFS_CAP_POSIX_LOCK (1U << 14) +#define NFS_CAP_UIDGID_NOMAP (1U << 15) /* maximum number of slots to use */ From bd7ea31b9e8a342be76e0fe8d638343886c2d8c5 Mon Sep 17 00:00:00 2001 From: Tom Tucker Date: Wed, 9 Feb 2011 19:45:28 +0000 Subject: [PATCH 41/54] RPCRDMA: Fix to XDR page base interpretation in marshalling logic. The RPCRDMA marshalling logic assumed that xdr->page_base was an offset into the first page of xdr->page_list. It is in fact an offset into the xdr->page_list itself, that is, it selects the first page in the page_list and the offset into that page. The symptom depended in part on the rpc_memreg_strategy, if it was FRMR, or some other one-shot mapping mode, the connection would get torn down on a base and bounds error. When the badly marshalled RPC was retransmitted it would reconnect, get the error, and tear down the connection again in a loop forever. This resulted in a hung-mount. For the other modes, it would result in silent data corruption. This bug is most easily reproduced by writing more data than the filesystem has space for. This fix corrects the page_base assumption and otherwise simplifies the iov mapping logic. Signed-off-by: Tom Tucker Signed-off-by: Trond Myklebust --- net/sunrpc/xprtrdma/rpc_rdma.c | 86 +++++++++++++++++----------------- 1 file changed, 42 insertions(+), 44 deletions(-) diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index 2ac3f6e8adff..554d0814c875 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -87,6 +87,8 @@ rpcrdma_convert_iovs(struct xdr_buf *xdrbuf, unsigned int pos, enum rpcrdma_chunktype type, struct rpcrdma_mr_seg *seg, int nsegs) { int len, n = 0, p; + int page_base; + struct page **ppages; if (pos == 0 && xdrbuf->head[0].iov_len) { seg[n].mr_page = NULL; @@ -95,34 +97,32 @@ rpcrdma_convert_iovs(struct xdr_buf *xdrbuf, unsigned int pos, ++n; } - if (xdrbuf->page_len && (xdrbuf->pages[0] != NULL)) { - if (n == nsegs) - return 0; - seg[n].mr_page = xdrbuf->pages[0]; - seg[n].mr_offset = (void *)(unsigned long) xdrbuf->page_base; - seg[n].mr_len = min_t(u32, - PAGE_SIZE - xdrbuf->page_base, xdrbuf->page_len); - len = xdrbuf->page_len - seg[n].mr_len; + len = xdrbuf->page_len; + ppages = xdrbuf->pages + (xdrbuf->page_base >> PAGE_SHIFT); + page_base = xdrbuf->page_base & ~PAGE_MASK; + p = 0; + while (len && n < nsegs) { + seg[n].mr_page = ppages[p]; + seg[n].mr_offset = (void *)(unsigned long) page_base; + seg[n].mr_len = min_t(u32, PAGE_SIZE - page_base, len); + BUG_ON(seg[n].mr_len > PAGE_SIZE); + len -= seg[n].mr_len; ++n; - p = 1; - while (len > 0) { - if (n == nsegs) - return 0; - seg[n].mr_page = xdrbuf->pages[p]; - seg[n].mr_offset = NULL; - seg[n].mr_len = min_t(u32, PAGE_SIZE, len); - len -= seg[n].mr_len; - ++n; - ++p; - } + ++p; + page_base = 0; /* page offset only applies to first page */ } + /* Message overflows the seg array */ + if (len && n == nsegs) + return 0; + if (xdrbuf->tail[0].iov_len) { /* the rpcrdma protocol allows us to omit any trailing * xdr pad bytes, saving the server an RDMA operation. */ if (xdrbuf->tail[0].iov_len < 4 && xprt_rdma_pad_optimize) return n; if (n == nsegs) + /* Tail remains, but we're out of segments */ return 0; seg[n].mr_page = NULL; seg[n].mr_offset = xdrbuf->tail[0].iov_base; @@ -296,6 +296,8 @@ rpcrdma_inline_pullup(struct rpc_rqst *rqst, int pad) int copy_len; unsigned char *srcp, *destp; struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(rqst->rq_xprt); + int page_base; + struct page **ppages; destp = rqst->rq_svec[0].iov_base; curlen = rqst->rq_svec[0].iov_len; @@ -324,28 +326,25 @@ rpcrdma_inline_pullup(struct rpc_rqst *rqst, int pad) __func__, destp + copy_len, curlen); rqst->rq_svec[0].iov_len += curlen; } - r_xprt->rx_stats.pullup_copy_count += copy_len; - npages = PAGE_ALIGN(rqst->rq_snd_buf.page_base+copy_len) >> PAGE_SHIFT; + + page_base = rqst->rq_snd_buf.page_base; + ppages = rqst->rq_snd_buf.pages + (page_base >> PAGE_SHIFT); + page_base &= ~PAGE_MASK; + npages = PAGE_ALIGN(page_base+copy_len) >> PAGE_SHIFT; for (i = 0; copy_len && i < npages; i++) { - if (i == 0) - curlen = PAGE_SIZE - rqst->rq_snd_buf.page_base; - else - curlen = PAGE_SIZE; + curlen = PAGE_SIZE - page_base; if (curlen > copy_len) curlen = copy_len; dprintk("RPC: %s: page %d destp 0x%p len %d curlen %d\n", __func__, i, destp, copy_len, curlen); - srcp = kmap_atomic(rqst->rq_snd_buf.pages[i], - KM_SKB_SUNRPC_DATA); - if (i == 0) - memcpy(destp, srcp+rqst->rq_snd_buf.page_base, curlen); - else - memcpy(destp, srcp, curlen); + srcp = kmap_atomic(ppages[i], KM_SKB_SUNRPC_DATA); + memcpy(destp, srcp+page_base, curlen); kunmap_atomic(srcp, KM_SKB_SUNRPC_DATA); rqst->rq_svec[0].iov_len += curlen; destp += curlen; copy_len -= curlen; + page_base = 0; } /* header now contains entire send message */ return pad; @@ -606,6 +605,8 @@ rpcrdma_inline_fixup(struct rpc_rqst *rqst, char *srcp, int copy_len, int pad) { int i, npages, curlen, olen; char *destp; + struct page **ppages; + int page_base; curlen = rqst->rq_rcv_buf.head[0].iov_len; if (curlen > copy_len) { /* write chunk header fixup */ @@ -624,32 +625,29 @@ rpcrdma_inline_fixup(struct rpc_rqst *rqst, char *srcp, int copy_len, int pad) olen = copy_len; i = 0; rpcx_to_rdmax(rqst->rq_xprt)->rx_stats.fixup_copy_count += olen; + page_base = rqst->rq_rcv_buf.page_base; + ppages = rqst->rq_rcv_buf.pages + (page_base >> PAGE_SHIFT); + page_base &= ~PAGE_MASK; + if (copy_len && rqst->rq_rcv_buf.page_len) { - npages = PAGE_ALIGN(rqst->rq_rcv_buf.page_base + + npages = PAGE_ALIGN(page_base + rqst->rq_rcv_buf.page_len) >> PAGE_SHIFT; for (; i < npages; i++) { - if (i == 0) - curlen = PAGE_SIZE - rqst->rq_rcv_buf.page_base; - else - curlen = PAGE_SIZE; + curlen = PAGE_SIZE - page_base; if (curlen > copy_len) curlen = copy_len; dprintk("RPC: %s: page %d" " srcp 0x%p len %d curlen %d\n", __func__, i, srcp, copy_len, curlen); - destp = kmap_atomic(rqst->rq_rcv_buf.pages[i], - KM_SKB_SUNRPC_DATA); - if (i == 0) - memcpy(destp + rqst->rq_rcv_buf.page_base, - srcp, curlen); - else - memcpy(destp, srcp, curlen); - flush_dcache_page(rqst->rq_rcv_buf.pages[i]); + destp = kmap_atomic(ppages[i], KM_SKB_SUNRPC_DATA); + memcpy(destp + page_base, srcp, curlen); + flush_dcache_page(ppages[i]); kunmap_atomic(destp, KM_SKB_SUNRPC_DATA); srcp += curlen; copy_len -= curlen; if (copy_len == 0) break; + page_base = 0; } rqst->rq_rcv_buf.page_len = olen - copy_len; } else From 5c635e09cec0feeeb310968e51dad01040244851 Mon Sep 17 00:00:00 2001 From: Tom Tucker Date: Wed, 9 Feb 2011 19:45:34 +0000 Subject: [PATCH 42/54] RPCRDMA: Fix FRMR registration/invalidate handling. When the rpc_memreg_strategy is 5, FRMR are used to map RPC data. This mode uses an FRMR to map the RPC data, then invalidates (i.e. unregisers) the data in xprt_rdma_free. These FRMR are used across connections on the same mount, i.e. if the connection goes away on an idle timeout and reconnects later, the FRMR are not destroyed and recreated. This creates a problem for transport errors because the WR that invalidate an FRMR may be flushed (i.e. fail) leaving the FRMR valid. When the FRMR is later used to map an RPC it will fail, tearing down the transport and starting over. Over time, more and more of the FRMR pool end up in the wrong state resulting in seemingly random disconnects. This fix keeps track of the FRMR state explicitly by setting it's state based on the successful completion of a reg/inv WR. If the FRMR is ever used and found to be in the wrong state, an invalidate WR is prepended, re-syncing the FRMR state and avoiding the connection loss. Signed-off-by: Tom Tucker Signed-off-by: Trond Myklebust --- net/sunrpc/xprtrdma/verbs.c | 52 ++++++++++++++++++++++++++++----- net/sunrpc/xprtrdma/xprt_rdma.h | 1 + 2 files changed, 45 insertions(+), 8 deletions(-) diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 5f4c7b3bc711..570f08dc0b03 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -144,6 +144,7 @@ rpcrdma_cq_async_error_upcall(struct ib_event *event, void *context) static inline void rpcrdma_event_process(struct ib_wc *wc) { + struct rpcrdma_mw *frmr; struct rpcrdma_rep *rep = (struct rpcrdma_rep *)(unsigned long) wc->wr_id; @@ -154,15 +155,23 @@ void rpcrdma_event_process(struct ib_wc *wc) return; if (IB_WC_SUCCESS != wc->status) { - dprintk("RPC: %s: %s WC status %X, connection lost\n", - __func__, (wc->opcode & IB_WC_RECV) ? "recv" : "send", - wc->status); + dprintk("RPC: %s: WC opcode %d status %X, connection lost\n", + __func__, wc->opcode, wc->status); rep->rr_len = ~0U; - rpcrdma_schedule_tasklet(rep); + if (wc->opcode != IB_WC_FAST_REG_MR && wc->opcode != IB_WC_LOCAL_INV) + rpcrdma_schedule_tasklet(rep); return; } switch (wc->opcode) { + case IB_WC_FAST_REG_MR: + frmr = (struct rpcrdma_mw *)(unsigned long)wc->wr_id; + frmr->r.frmr.state = FRMR_IS_VALID; + break; + case IB_WC_LOCAL_INV: + frmr = (struct rpcrdma_mw *)(unsigned long)wc->wr_id; + frmr->r.frmr.state = FRMR_IS_INVALID; + break; case IB_WC_RECV: rep->rr_len = wc->byte_len; ib_dma_sync_single_for_cpu( @@ -1450,6 +1459,11 @@ rpcrdma_map_one(struct rpcrdma_ia *ia, struct rpcrdma_mr_seg *seg, int writing) seg->mr_dma = ib_dma_map_single(ia->ri_id->device, seg->mr_offset, seg->mr_dmalen, seg->mr_dir); + if (ib_dma_mapping_error(ia->ri_id->device, seg->mr_dma)) { + dprintk("RPC: %s: mr_dma %llx mr_offset %p mr_dma_len %zu\n", + __func__, + seg->mr_dma, seg->mr_offset, seg->mr_dmalen); + } } static void @@ -1469,7 +1483,8 @@ rpcrdma_register_frmr_external(struct rpcrdma_mr_seg *seg, struct rpcrdma_xprt *r_xprt) { struct rpcrdma_mr_seg *seg1 = seg; - struct ib_send_wr frmr_wr, *bad_wr; + struct ib_send_wr invalidate_wr, frmr_wr, *bad_wr, *post_wr; + u8 key; int len, pageoff; int i, rc; @@ -1484,6 +1499,7 @@ rpcrdma_register_frmr_external(struct rpcrdma_mr_seg *seg, rpcrdma_map_one(ia, seg, writing); seg1->mr_chunk.rl_mw->r.frmr.fr_pgl->page_list[i] = seg->mr_dma; len += seg->mr_len; + BUG_ON(seg->mr_len > PAGE_SIZE); ++seg; ++i; /* Check for holes */ @@ -1494,26 +1510,45 @@ rpcrdma_register_frmr_external(struct rpcrdma_mr_seg *seg, dprintk("RPC: %s: Using frmr %p to map %d segments\n", __func__, seg1->mr_chunk.rl_mw, i); + if (unlikely(seg1->mr_chunk.rl_mw->r.frmr.state == FRMR_IS_VALID)) { + dprintk("RPC: %s: frmr %x left valid, posting invalidate.\n", + __func__, + seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey); + /* Invalidate before using. */ + memset(&invalidate_wr, 0, sizeof invalidate_wr); + invalidate_wr.wr_id = (unsigned long)(void *)seg1->mr_chunk.rl_mw; + invalidate_wr.next = &frmr_wr; + invalidate_wr.opcode = IB_WR_LOCAL_INV; + invalidate_wr.send_flags = IB_SEND_SIGNALED; + invalidate_wr.ex.invalidate_rkey = + seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey; + DECR_CQCOUNT(&r_xprt->rx_ep); + post_wr = &invalidate_wr; + } else + post_wr = &frmr_wr; + /* Bump the key */ key = (u8)(seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey & 0x000000FF); ib_update_fast_reg_key(seg1->mr_chunk.rl_mw->r.frmr.fr_mr, ++key); /* Prepare FRMR WR */ memset(&frmr_wr, 0, sizeof frmr_wr); + frmr_wr.wr_id = (unsigned long)(void *)seg1->mr_chunk.rl_mw; frmr_wr.opcode = IB_WR_FAST_REG_MR; - frmr_wr.send_flags = 0; /* unsignaled */ + frmr_wr.send_flags = IB_SEND_SIGNALED; frmr_wr.wr.fast_reg.iova_start = seg1->mr_dma; frmr_wr.wr.fast_reg.page_list = seg1->mr_chunk.rl_mw->r.frmr.fr_pgl; frmr_wr.wr.fast_reg.page_list_len = i; frmr_wr.wr.fast_reg.page_shift = PAGE_SHIFT; frmr_wr.wr.fast_reg.length = i << PAGE_SHIFT; + BUG_ON(frmr_wr.wr.fast_reg.length < len); frmr_wr.wr.fast_reg.access_flags = (writing ? IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE : IB_ACCESS_REMOTE_READ); frmr_wr.wr.fast_reg.rkey = seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey; DECR_CQCOUNT(&r_xprt->rx_ep); - rc = ib_post_send(ia->ri_id->qp, &frmr_wr, &bad_wr); + rc = ib_post_send(ia->ri_id->qp, post_wr, &bad_wr); if (rc) { dprintk("RPC: %s: failed ib_post_send for register," @@ -1542,8 +1577,9 @@ rpcrdma_deregister_frmr_external(struct rpcrdma_mr_seg *seg, rpcrdma_unmap_one(ia, seg++); memset(&invalidate_wr, 0, sizeof invalidate_wr); + invalidate_wr.wr_id = (unsigned long)(void *)seg1->mr_chunk.rl_mw; invalidate_wr.opcode = IB_WR_LOCAL_INV; - invalidate_wr.send_flags = 0; /* unsignaled */ + invalidate_wr.send_flags = IB_SEND_SIGNALED; invalidate_wr.ex.invalidate_rkey = seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey; DECR_CQCOUNT(&r_xprt->rx_ep); diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index c7a7eba991bc..cae761a8536c 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -164,6 +164,7 @@ struct rpcrdma_mr_seg { /* chunk descriptors */ struct { struct ib_fast_reg_page_list *fr_pgl; struct ib_mr *fr_mr; + enum { FRMR_IS_INVALID, FRMR_IS_VALID } state; } frmr; } r; struct list_head mw_list; From f8628220bb395104697be9c447c1085846dfc97c Mon Sep 17 00:00:00 2001 From: Kevin Coffman Date: Thu, 3 Mar 2011 00:51:41 +0000 Subject: [PATCH 43/54] gss:krb5 only include enctype numbers in gm_upcall_enctypes Make the value in gm_upcall_enctypes just the enctype values. This allows the values to be used more easily elsewhere. Signed-off-by: Kevin Coffman Signed-off-by: Trond Myklebust --- net/sunrpc/auth_gss/auth_gss.c | 2 +- net/sunrpc/auth_gss/gss_krb5_mech.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c index 45dbf1521b9a..f3914d0c5079 100644 --- a/net/sunrpc/auth_gss/auth_gss.c +++ b/net/sunrpc/auth_gss/auth_gss.c @@ -417,7 +417,7 @@ static void gss_encode_v1_msg(struct gss_upcall_msg *gss_msg, gss_msg->msg.len += len; } if (mech->gm_upcall_enctypes) { - len = sprintf(p, mech->gm_upcall_enctypes); + len = sprintf(p, "enctypes=%s ", mech->gm_upcall_enctypes); p += len; gss_msg->msg.len += len; } diff --git a/net/sunrpc/auth_gss/gss_krb5_mech.c b/net/sunrpc/auth_gss/gss_krb5_mech.c index f375decc024b..9022f0a6503e 100644 --- a/net/sunrpc/auth_gss/gss_krb5_mech.c +++ b/net/sunrpc/auth_gss/gss_krb5_mech.c @@ -750,7 +750,7 @@ static struct gss_api_mech gss_kerberos_mech = { .gm_ops = &gss_kerberos_ops, .gm_pf_num = ARRAY_SIZE(gss_kerberos_pfs), .gm_pfs = gss_kerberos_pfs, - .gm_upcall_enctypes = "enctypes=18,17,16,23,3,1,2 ", + .gm_upcall_enctypes = "18,17,16,23,3,1,2", }; static int __init init_kerberos_module(void) From 7ec10f26e1fd5fcceb9c96e508c1292a816199f7 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Tue, 22 Feb 2011 00:28:34 +0300 Subject: [PATCH 44/54] NFS: account direct-io into task io accounting Account NFS direct-io reads and writes into Task I/O Accounting. Do it before complition to handle aio. NFS have unusual direct-io implementation, thus accounting in generic code does not work. Signed-off-by: Konstantin Khlebnikov Signed-off-by: Trond Myklebust --- fs/nfs/direct.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index f493bdd74f78..8eea25366717 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -45,6 +45,7 @@ #include #include #include +#include #include #include @@ -937,6 +938,8 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, const struct iovec *iov, if (retval) goto out; + task_io_account_read(count); + retval = nfs_direct_read(iocb, iov, nr_segs, pos); if (retval > 0) iocb->ki_pos = pos + retval; @@ -998,6 +1001,8 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, const struct iovec *iov, if (retval) goto out; + task_io_account_write(count); + retval = nfs_direct_write(iocb, iov, nr_segs, pos, count); if (retval > 0) From c12bacec458bef16d843c052f38422862f3da8fe Mon Sep 17 00:00:00 2001 From: Rob Landley Date: Wed, 9 Mar 2011 15:54:13 -0600 Subject: [PATCH 45/54] cleanup: save 60 lines/100 bytes by combining two mostly duplicate functions. Eliminate two mostly duplicate functions (nfs_parse_simple_hostname() and nfs_parse_protected_hostname()) and instead just make the calling function (nfs_parse_devname()) do everything. Signed-off-by: Rob Landley Signed-off-by: Trond Myklebust --- fs/nfs/super.c | 137 +++++++++++++------------------------------------ 1 file changed, 37 insertions(+), 100 deletions(-) diff --git a/fs/nfs/super.c b/fs/nfs/super.c index b68c8607770f..a74e9740190f 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -1665,39 +1665,55 @@ static int nfs_try_mount(struct nfs_parsed_mount_data *args, return nfs_walk_authlist(args, &request); } -static int nfs_parse_simple_hostname(const char *dev_name, - char **hostname, size_t maxnamlen, - char **export_path, size_t maxpathlen) +/* + * Split "dev_name" into "hostname:export_path". + * + * The leftmost colon demarks the split between the server's hostname + * and the export path. If the hostname starts with a left square + * bracket, then it may contain colons. + * + * Note: caller frees hostname and export path, even on error. + */ +static int nfs_parse_devname(const char *dev_name, + char **hostname, size_t maxnamlen, + char **export_path, size_t maxpathlen) { size_t len; - char *colon, *comma; + char *end; - colon = strchr(dev_name, ':'); - if (colon == NULL) - goto out_bad_devname; + /* Is the host name protected with square brakcets? */ + if (*dev_name == '[') { + end = strchr(++dev_name, ']'); + if (end == NULL || end[1] != ':') + goto out_bad_devname; + + len = end - dev_name; + end++; + } else { + char *comma; + + end = strchr(dev_name, ':'); + if (end == NULL) + goto out_bad_devname; + len = end - dev_name; + + /* kill possible hostname list: not supported */ + comma = strchr(dev_name, ','); + if (comma != NULL && comma < end) + *comma = 0; + } - len = colon - dev_name; if (len > maxnamlen) goto out_hostname; /* N.B. caller will free nfs_server.hostname in all cases */ *hostname = kstrndup(dev_name, len, GFP_KERNEL); - if (!*hostname) + if (*hostname == NULL) goto out_nomem; - - /* kill possible hostname list: not supported */ - comma = strchr(*hostname, ','); - if (comma != NULL) { - if (comma == *hostname) - goto out_bad_devname; - *comma = '\0'; - } - - colon++; - len = strlen(colon); + len = strlen(++end); if (len > maxpathlen) goto out_path; - *export_path = kstrndup(colon, len, GFP_KERNEL); + *export_path = kstrndup(end, len, GFP_KERNEL); if (!*export_path) goto out_nomem; @@ -1721,85 +1737,6 @@ out_path: return -ENAMETOOLONG; } -/* - * Hostname has square brackets around it because it contains one or - * more colons. We look for the first closing square bracket, and a - * colon must follow it. - */ -static int nfs_parse_protected_hostname(const char *dev_name, - char **hostname, size_t maxnamlen, - char **export_path, size_t maxpathlen) -{ - size_t len; - char *start, *end; - - start = (char *)(dev_name + 1); - - end = strchr(start, ']'); - if (end == NULL) - goto out_bad_devname; - if (*(end + 1) != ':') - goto out_bad_devname; - - len = end - start; - if (len > maxnamlen) - goto out_hostname; - - /* N.B. caller will free nfs_server.hostname in all cases */ - *hostname = kstrndup(start, len, GFP_KERNEL); - if (*hostname == NULL) - goto out_nomem; - - end += 2; - len = strlen(end); - if (len > maxpathlen) - goto out_path; - *export_path = kstrndup(end, len, GFP_KERNEL); - if (!*export_path) - goto out_nomem; - - return 0; - -out_bad_devname: - dfprintk(MOUNT, "NFS: device name not in host:path format\n"); - return -EINVAL; - -out_nomem: - dfprintk(MOUNT, "NFS: not enough memory to parse device name\n"); - return -ENOMEM; - -out_hostname: - dfprintk(MOUNT, "NFS: server hostname too long\n"); - return -ENAMETOOLONG; - -out_path: - dfprintk(MOUNT, "NFS: export pathname too long\n"); - return -ENAMETOOLONG; -} - -/* - * Split "dev_name" into "hostname:export_path". - * - * The leftmost colon demarks the split between the server's hostname - * and the export path. If the hostname starts with a left square - * bracket, then it may contain colons. - * - * Note: caller frees hostname and export path, even on error. - */ -static int nfs_parse_devname(const char *dev_name, - char **hostname, size_t maxnamlen, - char **export_path, size_t maxpathlen) -{ - if (*dev_name == '[') - return nfs_parse_protected_hostname(dev_name, - hostname, maxnamlen, - export_path, maxpathlen); - - return nfs_parse_simple_hostname(dev_name, - hostname, maxnamlen, - export_path, maxpathlen); -} - /* * Validate the NFS2/NFS3 mount data * - fills in the mount root filehandle From c5cb09b6f898609922f9b873661f6cbc26cb29e1 Mon Sep 17 00:00:00 2001 From: Rob Landley Date: Wed, 9 Mar 2011 16:02:37 -0600 Subject: [PATCH 46/54] Cleanup: Factor out some cut-and-paste code. Factor out some cut-and-paste code in options parsing. Saves about 800 bytes on x86-64. Signed-off-by: Rob Landley Signed-off-by: Trond Myklebust --- fs/nfs/super.c | 155 ++++++++++++++----------------------------------- 1 file changed, 44 insertions(+), 111 deletions(-) diff --git a/fs/nfs/super.c b/fs/nfs/super.c index a74e9740190f..7e13e1a6b391 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -979,6 +979,27 @@ static int nfs_parse_security_flavors(char *value, return 1; } +static int nfs_get_option_str(substring_t args[], char **option) +{ + kfree(*option); + *option = match_strdup(args); + return !option; +} + +static int nfs_get_option_ul(substring_t args[], unsigned long *option) +{ + int rc; + char *string; + + string = match_strdup(args); + if (string == NULL) + return -ENOMEM; + rc = strict_strtoul(string, 10, option); + kfree(string); + + return rc; +} + /* * Error-check and convert a string of mount options from user space into * a data structure. The whole mount string is processed; bad options are @@ -1127,155 +1148,82 @@ static int nfs_parse_mount_options(char *raw, * options that take numeric values */ case Opt_port: - string = match_strdup(args); - if (string == NULL) - goto out_nomem; - rc = strict_strtoul(string, 10, &option); - kfree(string); - if (rc != 0 || option > USHRT_MAX) + if (nfs_get_option_ul(args, &option) || + option > USHRT_MAX) goto out_invalid_value; mnt->nfs_server.port = option; break; case Opt_rsize: - string = match_strdup(args); - if (string == NULL) - goto out_nomem; - rc = strict_strtoul(string, 10, &option); - kfree(string); - if (rc != 0) + if (nfs_get_option_ul(args, &option)) goto out_invalid_value; mnt->rsize = option; break; case Opt_wsize: - string = match_strdup(args); - if (string == NULL) - goto out_nomem; - rc = strict_strtoul(string, 10, &option); - kfree(string); - if (rc != 0) + if (nfs_get_option_ul(args, &option)) goto out_invalid_value; mnt->wsize = option; break; case Opt_bsize: - string = match_strdup(args); - if (string == NULL) - goto out_nomem; - rc = strict_strtoul(string, 10, &option); - kfree(string); - if (rc != 0) + if (nfs_get_option_ul(args, &option)) goto out_invalid_value; mnt->bsize = option; break; case Opt_timeo: - string = match_strdup(args); - if (string == NULL) - goto out_nomem; - rc = strict_strtoul(string, 10, &option); - kfree(string); - if (rc != 0 || option == 0) + if (nfs_get_option_ul(args, &option) || option == 0) goto out_invalid_value; mnt->timeo = option; break; case Opt_retrans: - string = match_strdup(args); - if (string == NULL) - goto out_nomem; - rc = strict_strtoul(string, 10, &option); - kfree(string); - if (rc != 0 || option == 0) + if (nfs_get_option_ul(args, &option) || option == 0) goto out_invalid_value; mnt->retrans = option; break; case Opt_acregmin: - string = match_strdup(args); - if (string == NULL) - goto out_nomem; - rc = strict_strtoul(string, 10, &option); - kfree(string); - if (rc != 0) + if (nfs_get_option_ul(args, &option)) goto out_invalid_value; mnt->acregmin = option; break; case Opt_acregmax: - string = match_strdup(args); - if (string == NULL) - goto out_nomem; - rc = strict_strtoul(string, 10, &option); - kfree(string); - if (rc != 0) + if (nfs_get_option_ul(args, &option)) goto out_invalid_value; mnt->acregmax = option; break; case Opt_acdirmin: - string = match_strdup(args); - if (string == NULL) - goto out_nomem; - rc = strict_strtoul(string, 10, &option); - kfree(string); - if (rc != 0) + if (nfs_get_option_ul(args, &option)) goto out_invalid_value; mnt->acdirmin = option; break; case Opt_acdirmax: - string = match_strdup(args); - if (string == NULL) - goto out_nomem; - rc = strict_strtoul(string, 10, &option); - kfree(string); - if (rc != 0) + if (nfs_get_option_ul(args, &option)) goto out_invalid_value; mnt->acdirmax = option; break; case Opt_actimeo: - string = match_strdup(args); - if (string == NULL) - goto out_nomem; - rc = strict_strtoul(string, 10, &option); - kfree(string); - if (rc != 0) + if (nfs_get_option_ul(args, &option)) goto out_invalid_value; mnt->acregmin = mnt->acregmax = mnt->acdirmin = mnt->acdirmax = option; break; case Opt_namelen: - string = match_strdup(args); - if (string == NULL) - goto out_nomem; - rc = strict_strtoul(string, 10, &option); - kfree(string); - if (rc != 0) + if (nfs_get_option_ul(args, &option)) goto out_invalid_value; mnt->namlen = option; break; case Opt_mountport: - string = match_strdup(args); - if (string == NULL) - goto out_nomem; - rc = strict_strtoul(string, 10, &option); - kfree(string); - if (rc != 0 || option > USHRT_MAX) + if (nfs_get_option_ul(args, &option) || + option > USHRT_MAX) goto out_invalid_value; mnt->mount_server.port = option; break; case Opt_mountvers: - string = match_strdup(args); - if (string == NULL) - goto out_nomem; - rc = strict_strtoul(string, 10, &option); - kfree(string); - if (rc != 0 || + if (nfs_get_option_ul(args, &option) || option < NFS_MNT_VERSION || option > NFS_MNT3_VERSION) goto out_invalid_value; mnt->mount_server.version = option; break; case Opt_nfsvers: - string = match_strdup(args); - if (string == NULL) - goto out_nomem; - rc = strict_strtoul(string, 10, &option); - kfree(string); - if (rc != 0) + if (nfs_get_option_ul(args, &option)) goto out_invalid_value; switch (option) { case NFS2_VERSION: @@ -1295,12 +1243,7 @@ static int nfs_parse_mount_options(char *raw, } break; case Opt_minorversion: - string = match_strdup(args); - if (string == NULL) - goto out_nomem; - rc = strict_strtoul(string, 10, &option); - kfree(string); - if (rc != 0) + if (nfs_get_option_ul(args, &option)) goto out_invalid_value; if (option > NFS4_MAX_MINOR_VERSION) goto out_invalid_value; @@ -1336,21 +1279,18 @@ static int nfs_parse_mount_options(char *raw, case Opt_xprt_udp: mnt->flags &= ~NFS_MOUNT_TCP; mnt->nfs_server.protocol = XPRT_TRANSPORT_UDP; - kfree(string); break; case Opt_xprt_tcp6: protofamily = AF_INET6; case Opt_xprt_tcp: mnt->flags |= NFS_MOUNT_TCP; mnt->nfs_server.protocol = XPRT_TRANSPORT_TCP; - kfree(string); break; case Opt_xprt_rdma: /* vector side protocols to TCP */ mnt->flags |= NFS_MOUNT_TCP; mnt->nfs_server.protocol = XPRT_TRANSPORT_RDMA; xprt_load_transport(string); - kfree(string); break; default: dfprintk(MOUNT, "NFS: unrecognized " @@ -1358,6 +1298,7 @@ static int nfs_parse_mount_options(char *raw, kfree(string); return 0; } + kfree(string); break; case Opt_mountproto: string = match_strdup(args); @@ -1400,18 +1341,13 @@ static int nfs_parse_mount_options(char *raw, goto out_invalid_address; break; case Opt_clientaddr: - string = match_strdup(args); - if (string == NULL) + if (nfs_get_option_str(args, &mnt->client_address)) goto out_nomem; - kfree(mnt->client_address); - mnt->client_address = string; break; case Opt_mounthost: - string = match_strdup(args); - if (string == NULL) + if (nfs_get_option_str(args, + &mnt->mount_server.hostname)) goto out_nomem; - kfree(mnt->mount_server.hostname); - mnt->mount_server.hostname = string; break; case Opt_mountaddr: string = match_strdup(args); @@ -1451,11 +1387,8 @@ static int nfs_parse_mount_options(char *raw, }; break; case Opt_fscache_uniq: - string = match_strdup(args); - if (string == NULL) + if (nfs_get_option_str(args, &mnt->fscache_uniq)) goto out_nomem; - kfree(mnt->fscache_uniq); - mnt->fscache_uniq = string; mnt->options |= NFS_OPTION_FSCACHE; break; case Opt_local_lock: From e0dca7a05df4e23a8f5b07742e99e2a6f7d67db1 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 14 Mar 2011 18:20:01 -0400 Subject: [PATCH 47/54] NFS: Fix a warning in fs/nfs/idmap.c MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit 45a52a02072b2a7e265f024cfdb00127e08dd9f2 (NFS move nfs_client initialization into nfs_get_client) introduces a new warning in fs/nfs/idmap.c: ‘struct rpc_timeout’ declared inside parameter list Fix it by adding a forward declaration for the struct rpc_timeout in include/linux/nfs_xdr.h Signed-off-by: Trond Myklebust --- include/linux/nfs_xdr.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 3440f5ab0f54..2c2c67d2eb42 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -1051,6 +1051,7 @@ struct nfs_write_data { struct nfs_access_entry; struct nfs_client; +struct rpc_timeout; /* * RPC procedure vector for NFSv2/NFSv3 demuxing From 8f68cd42d85f31fb58dd2cabf3ff4aad0a2bafd9 Mon Sep 17 00:00:00 2001 From: Stephen Rothwell Date: Tue, 15 Mar 2011 18:37:09 +1100 Subject: [PATCH 48/54] nfs: BKL is no longer needed, so remove the include Signed-off-by: Stephen Rothwell Signed-off-by: Trond Myklebust --- fs/nfs/read.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/nfs/read.c b/fs/nfs/read.c index 4b764c6048db..7cded2b12a05 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -18,7 +18,6 @@ #include #include #include -#include #include #include From 4d4a76f3309edc671918a767b336492fbc80a16d Mon Sep 17 00:00:00 2001 From: "j223yang@asset.uwaterloo.ca" Date: Thu, 10 Mar 2011 12:40:28 -0500 Subject: [PATCH 49/54] xprt: remove redundant null check 'req' is dereferenced before checked for NULL. The patch simply removes the check. Signed-off-by: Jinqiu Yang Signed-off-by: Trond Myklebust --- net/sunrpc/xprt.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index 856274d7e85c..8bdcdbe07b98 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -202,10 +202,9 @@ int xprt_reserve_xprt(struct rpc_task *task) goto out_sleep; } xprt->snd_task = task; - if (req) { - req->rq_bytes_sent = 0; - req->rq_ntrans++; - } + req->rq_bytes_sent = 0; + req->rq_ntrans++; + return 1; out_sleep: From 986d4abbddf9e76184f6cabf66654ea8e61bcde5 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Tue, 15 Mar 2011 17:11:59 -0700 Subject: [PATCH 50/54] sunrpc: fix printk format warning Fix printk format build warning: net/sunrpc/xprtrdma/verbs.c:1463: warning: format '%llx' expects type 'long long unsigned int', but argument 3 has type 'dma_addr_t' Signed-off-by: Randy Dunlap Signed-off-by: Trond Myklebust --- net/sunrpc/xprtrdma/verbs.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 570f08dc0b03..d4297dc43dc4 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -1462,7 +1462,8 @@ rpcrdma_map_one(struct rpcrdma_ia *ia, struct rpcrdma_mr_seg *seg, int writing) if (ib_dma_mapping_error(ia->ri_id->device, seg->mr_dma)) { dprintk("RPC: %s: mr_dma %llx mr_offset %p mr_dma_len %zu\n", __func__, - seg->mr_dma, seg->mr_offset, seg->mr_dmalen); + (unsigned long long)seg->mr_dma, + seg->mr_offset, seg->mr_dmalen); } } From e020c6800c9621a77223bf2c1ff68180e41e8ebf Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 15 Mar 2011 19:56:30 -0400 Subject: [PATCH 51/54] SUNRPC: Ensure we always run the tk_callback before tk_action This fixes a race in which the task->tk_callback() puts the rpc_task to sleep, setting a new callback. Under certain circumstances, the current code may end up executing the task->tk_action before it gets round to the callback. Signed-off-by: Trond Myklebust Cc: stable@kernel.org --- net/sunrpc/sched.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c index 5681c6a12d20..2e9387b23841 100644 --- a/net/sunrpc/sched.c +++ b/net/sunrpc/sched.c @@ -630,14 +630,12 @@ static void __rpc_execute(struct rpc_task *task) save_callback = task->tk_callback; task->tk_callback = NULL; save_callback(task); - } - - /* - * Perform the next FSM step. - * tk_action may be NULL when the task has been killed - * by someone else. - */ - if (!RPC_IS_QUEUED(task)) { + } else { + /* + * Perform the next FSM step. + * tk_action may be NULL when the task has been killed + * by someone else. + */ if (task->tk_action == NULL) break; task->tk_action(task); From a8de240a9074b72b156d9e6d53f00076e6cd5f03 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 15 Mar 2011 19:56:30 -0400 Subject: [PATCH 52/54] SUNRPC: Convert struct rpc_xprt to use atomic_t counters Signed-off-by: Trond Myklebust --- include/linux/sunrpc/xprt.h | 3 +-- net/sunrpc/xprt.c | 16 ++++++++-------- 2 files changed, 9 insertions(+), 10 deletions(-) diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h index bef0f535f746..a0f998c07c65 100644 --- a/include/linux/sunrpc/xprt.h +++ b/include/linux/sunrpc/xprt.h @@ -12,7 +12,6 @@ #include #include #include -#include #include #include #include @@ -146,7 +145,7 @@ enum xprt_transports { }; struct rpc_xprt { - struct kref kref; /* Reference count */ + atomic_t count; /* Reference count */ struct rpc_xprt_ops * ops; /* transport methods */ const struct rpc_timeout *timeout; /* timeout parms */ diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index 8bdcdbe07b98..4499b5a51763 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -964,7 +964,7 @@ struct rpc_xprt *xprt_alloc(struct net *net, int size, int max_req) xprt = kzalloc(size, GFP_KERNEL); if (xprt == NULL) goto out; - kref_init(&xprt->kref); + atomic_set(&xprt->count, 1); xprt->max_reqs = max_req; xprt->slot = kcalloc(max_req, sizeof(struct rpc_rqst), GFP_KERNEL); @@ -1144,13 +1144,11 @@ found: /** * xprt_destroy - destroy an RPC transport, killing off all requests. - * @kref: kref for the transport to destroy + * @xprt: transport to destroy * */ -static void xprt_destroy(struct kref *kref) +static void xprt_destroy(struct rpc_xprt *xprt) { - struct rpc_xprt *xprt = container_of(kref, struct rpc_xprt, kref); - dprintk("RPC: destroying transport %p\n", xprt); xprt->shutdown = 1; del_timer_sync(&xprt->timer); @@ -1174,7 +1172,8 @@ static void xprt_destroy(struct kref *kref) */ void xprt_put(struct rpc_xprt *xprt) { - kref_put(&xprt->kref, xprt_destroy); + if (atomic_dec_and_test(&xprt->count)) + xprt_destroy(xprt); } /** @@ -1184,6 +1183,7 @@ void xprt_put(struct rpc_xprt *xprt) */ struct rpc_xprt *xprt_get(struct rpc_xprt *xprt) { - kref_get(&xprt->kref); - return xprt; + if (atomic_inc_not_zero(&xprt->count)) + return xprt; + return NULL; } From ba3c578de274a5438bafbce03f9225936698051c Mon Sep 17 00:00:00 2001 From: "j223yang@asset.uwaterloo.ca" Date: Wed, 16 Mar 2011 11:16:22 -0400 Subject: [PATCH 53/54] xprt: remove redundant check remove redundant check. Signed-off-by: Jinqiu Yang Signed-off-by: Trond Myklebust --- net/sunrpc/xprt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index 4499b5a51763..9494c3767356 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -212,7 +212,7 @@ out_sleep: task->tk_pid, xprt); task->tk_timeout = 0; task->tk_status = -EAGAIN; - if (req && req->rq_ntrans) + if (req->rq_ntrans) rpc_sleep_on(&xprt->resend, task, NULL); else rpc_sleep_on(&xprt->sending, task, NULL); From 8e26de238fd794c8ea56a5c98bf67c40cfeb051d Mon Sep 17 00:00:00 2001 From: Stanislav Kinsbursky Date: Thu, 17 Mar 2011 18:54:23 +0300 Subject: [PATCH 54/54] RPC: killing RPC tasks races fixed RPC task RPC_TASK_QUEUED bit is set must be checked before trying to wake up task rpc_killall_tasks() because task->tk_waitqueue can not be set (equal to NULL). Also, as Trond Myklebust mentioned, such approach (instead of checking tk_waitqueue to NULL) allows us to "optimise away the call to rpc_wake_up_queued_task() altogether for those tasks that aren't queued". Here is an example of dereferencing of tk_waitqueue equal to NULL: CPU 0 CPU 1 CPU 2 -------------------- --------------------- -------------------------- nfs4_run_open_task rpc_run_task rpc_execute rpc_set_active rpc_make_runnable (waiting) rpc_async_schedule nfs4_open_prepare nfs_wait_on_sequence nfs_umount_begin rpc_killall_tasks rpc_wake_up_task rpc_wake_up_queued_task spin_lock(tk_waitqueue == NULL) BUG() rpc_sleep_on spin_lock(&q->lock) __rpc_sleep_on task->tk_waitqueue = q Signed-off-by: Stanislav Kinsbursky Cc: stable@kernel.org Signed-off-by: Trond Myklebust --- net/sunrpc/clnt.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index edaf56e2ed29..e7a96e478f63 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -436,7 +436,9 @@ void rpc_killall_tasks(struct rpc_clnt *clnt) if (!(rovr->tk_flags & RPC_TASK_KILLED)) { rovr->tk_flags |= RPC_TASK_KILLED; rpc_exit(rovr, -EIO); - rpc_wake_up_queued_task(rovr->tk_waitqueue, rovr); + if (RPC_IS_QUEUED(rovr)) + rpc_wake_up_queued_task(rovr->tk_waitqueue, + rovr); } } spin_unlock(&clnt->cl_lock);