From dc6f55e9f8dac4b6479be67c5c9128ad37bb491f Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 25 Oct 2011 10:25:49 +1100 Subject: [PATCH 01/25] NFS/sunrpc: don't use a credential with extra groups. The sunrpc layer keeps a cache of recently used credentials and 'unx_match' is used to find the credential which matches the current process. However unx_match allows a match when the cached credential has extra groups at the end of uc_gids list which are not in the process group list. So if a process with a list of (say) 4 groups accesses a file and gains access because of the last group in the list, then another process with the same uid and gid, and a gid list being the first three of the gids of the original process tries to access the file, it will be granted access even though it shouldn't as the wrong rpc credential will be used. Signed-off-by: NeilBrown Signed-off-by: Trond Myklebust Cc: stable@vger.kernel.org --- net/sunrpc/auth_unix.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/sunrpc/auth_unix.c b/net/sunrpc/auth_unix.c index 4cb70dc6e7ad..e50502d8ceb7 100644 --- a/net/sunrpc/auth_unix.c +++ b/net/sunrpc/auth_unix.c @@ -129,6 +129,9 @@ unx_match(struct auth_cred *acred, struct rpc_cred *rcred, int flags) for (i = 0; i < groups ; i++) if (cred->uc_gids[i] != GROUP_AT(acred->group_info, i)) return 0; + if (groups < NFS_NGROUPS && + cred->uc_gids[groups] != NOGROUP) + return 0; return 1; } From 914edb1bb2abe2ae4775368f2ffb7f41010fb81e Mon Sep 17 00:00:00 2001 From: Stanislav Kinsbursky Date: Tue, 25 Oct 2011 14:16:36 +0300 Subject: [PATCH 02/25] SUNRPC: introduce helpers for reference counted rpcbind clients v6: 1) added write memory barrier to rpcb_set_local to make sure that rpcbind clients become valid before rpcb_users assignment 2) explicitly set rpcb_users to 1 instead of incrementing it (looks clearer from my pov).
v5: fixed races with rpcb_users in rpcb_get_local() These helpers will be used for dynamic creation and destruction of rpcbind clients. Variable rpcb_users is actually a counter of launched RPC services. If rpcbind clients have been created already, then we just increase rpcb_users. Signed-off-by: Stanislav Kinsbursky Signed-off-by: Trond Myklebust --- net/sunrpc/rpcb_clnt.c | 53 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 53 insertions(+) diff --git a/net/sunrpc/rpcb_clnt.c b/net/sunrpc/rpcb_clnt.c index f588b852d41c..78af56579fa1 100644 --- a/net/sunrpc/rpcb_clnt.c +++ b/net/sunrpc/rpcb_clnt.c @@ -114,6 +114,9 @@ static struct rpc_program rpcb_program; static struct rpc_clnt * rpcb_local_clnt; static struct rpc_clnt * rpcb_local_clnt4; +DEFINE_SPINLOCK(rpcb_clnt_lock); +unsigned int rpcb_users; + struct rpcbind_args { struct rpc_xprt * r_xprt; @@ -161,6 +164,56 @@ static void rpcb_map_release(void *data) kfree(map); } +static int rpcb_get_local(void) +{ + int cnt; + + spin_lock(&rpcb_clnt_lock); + if (rpcb_users) + rpcb_users++; + cnt = rpcb_users; + spin_unlock(&rpcb_clnt_lock); + + return cnt; +} + +void rpcb_put_local(void) +{ + struct rpc_clnt *clnt = rpcb_local_clnt; + struct rpc_clnt *clnt4 = rpcb_local_clnt4; + int shutdown; + + spin_lock(&rpcb_clnt_lock); + if (--rpcb_users == 0) { + rpcb_local_clnt = NULL; + rpcb_local_clnt4 = NULL; + } + shutdown = !rpcb_users; + spin_unlock(&rpcb_clnt_lock); + + if (shutdown) { + /* + * cleanup_rpcb_clnt - remove xprtsock's sysctls, unregister + */ + if (clnt4) + rpc_shutdown_client(clnt4); + if (clnt) + rpc_shutdown_client(clnt); + } +} + +static void rpcb_set_local(struct rpc_clnt *clnt, struct rpc_clnt *clnt4) +{ + /* Protected by rpcb_create_local_mutex */ + rpcb_local_clnt = clnt; + rpcb_local_clnt4 = clnt4; + smp_wmb(); + rpcb_users = 1; + dprintk("RPC: created new rpcb local clients (rpcb_local_clnt: " + "%p, rpcb_local_clnt4: %p)\n", rpcb_local_clnt, + rpcb_local_clnt4); +} + /* * Returns zero on
success, otherwise a negative errno value * is returned. From 253fb070e78db981740b000914b04b9203092925 Mon Sep 17 00:00:00 2001 From: Stanislav Kinsbursky Date: Tue, 25 Oct 2011 14:16:48 +0300 Subject: [PATCH 03/25] SUNRPC: use rpcbind reference counting helpers All is simple: we just increase users counter if rpcbind clients has been created already. Otherwise we create them and set users counter to 1. Signed-off-by: Stanislav Kinsbursky Signed-off-by: Trond Myklebust --- net/sunrpc/rpcb_clnt.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/net/sunrpc/rpcb_clnt.c b/net/sunrpc/rpcb_clnt.c index 78af56579fa1..f5309aba1a14 100644 --- a/net/sunrpc/rpcb_clnt.c +++ b/net/sunrpc/rpcb_clnt.c @@ -258,9 +258,7 @@ static int rpcb_create_local_unix(void) clnt4 = NULL; } - /* Protected by rpcb_create_local_mutex */ - rpcb_local_clnt = clnt; - rpcb_local_clnt4 = clnt4; + rpcb_set_local(clnt, clnt4); out: return result; @@ -312,9 +310,7 @@ static int rpcb_create_local_net(void) clnt4 = NULL; } - /* Protected by rpcb_create_local_mutex */ - rpcb_local_clnt = clnt; - rpcb_local_clnt4 = clnt4; + rpcb_set_local(clnt, clnt4); out: return result; @@ -329,11 +325,11 @@ static int rpcb_create_local(void) static DEFINE_MUTEX(rpcb_create_local_mutex); int result = 0; - if (rpcb_local_clnt) + if (rpcb_get_local()) return result; mutex_lock(&rpcb_create_local_mutex); - if (rpcb_local_clnt) + if (rpcb_get_local()) goto out; if (rpcb_create_local_unix() != 0) From d99085605cd245d8f24858e9d0b06013e13aa044 Mon Sep 17 00:00:00 2001 From: Stanislav Kinsbursky Date: Tue, 25 Oct 2011 14:16:58 +0300 Subject: [PATCH 04/25] SUNRPC: introduce svc helpers for prepairing rpcbind infrastructure This helpers will be used only for those services, that will send portmapper registration calls. 
Signed-off-by: Stanislav Kinsbursky Signed-off-by: Trond Myklebust --- include/linux/sunrpc/clnt.h | 2 ++ net/sunrpc/rpcb_clnt.c | 2 +- net/sunrpc/svc.c | 35 +++++++++++++++++++++++++++++++++++ 3 files changed, 38 insertions(+), 1 deletion(-) diff --git a/include/linux/sunrpc/clnt.h b/include/linux/sunrpc/clnt.h index d926fd1a5313..ad09bed239fc 100644 --- a/include/linux/sunrpc/clnt.h +++ b/include/linux/sunrpc/clnt.h @@ -136,6 +136,8 @@ void rpc_shutdown_client(struct rpc_clnt *); void rpc_release_client(struct rpc_clnt *); void rpc_task_release_client(struct rpc_task *); +int rpcb_create_local(void); +void rpcb_put_local(void); int rpcb_register(u32, u32, int, unsigned short); int rpcb_v4_register(const u32 program, const u32 version, const struct sockaddr *address, diff --git a/net/sunrpc/rpcb_clnt.c b/net/sunrpc/rpcb_clnt.c index f5309aba1a14..c24626537a7d 100644 --- a/net/sunrpc/rpcb_clnt.c +++ b/net/sunrpc/rpcb_clnt.c @@ -320,7 +320,7 @@ out: * Returns zero on success, otherwise a negative errno value * is returned. 
*/ -static int rpcb_create_local(void) +int rpcb_create_local(void) { static DEFINE_MUTEX(rpcb_create_local_mutex); int result = 0; diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index 6a69a1131fb7..d2d61bfa3306 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -354,6 +354,41 @@ svc_pool_for_cpu(struct svc_serv *serv, int cpu) return &serv->sv_pools[pidx % serv->sv_nrpools]; } +static int svc_rpcb_setup(struct svc_serv *serv) +{ + int err; + + err = rpcb_create_local(); + if (err) + return err; + + /* Remove any stale portmap registrations */ + svc_unregister(serv); + return 0; +} + +static void svc_rpcb_cleanup(struct svc_serv *serv) +{ + svc_unregister(serv); + rpcb_put_local(); +} + +static int svc_uses_rpcbind(struct svc_serv *serv) +{ + struct svc_program *progp; + unsigned int i; + + for (progp = serv->sv_program; progp; progp = progp->pg_next) { + for (i = 0; i < progp->pg_nvers; i++) { + if (progp->pg_vers[i] == NULL) + continue; + if (progp->pg_vers[i]->vs_hidden == 0) + return 1; + } + } + + return 0; +} /* * Create an RPC service From e40f5e29ef0909ecba1d759cc930efb7a9a7d935 Mon Sep 17 00:00:00 2001 From: Stanislav Kinsbursky Date: Tue, 25 Oct 2011 14:17:08 +0300 Subject: [PATCH 05/25] SUNRPC: setup rpcbind clients if service requires it New function ("svc_uses_rpcbind") will be used to detect, that new service will send portmapper register calls. For such services we will create rpcbind clients and remove all stale portmap registrations. Also, svc_rpcb_cleanup() will be set as sv_shutdown callback for such services in case of this field wasn't initialized earlier. This will allow to destroy rpcbind clients when no other users of them left. Note: Currently, any creating service will be detected as portmap user. Probably, this is wrong. But now it depends on program versions "vs_hidden" flag. 
Signed-off-by: Stanislav Kinsbursky Signed-off-by: Trond Myklebust --- net/sunrpc/svc.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index d2d61bfa3306..918edc310fd1 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -454,8 +454,15 @@ __svc_create(struct svc_program *prog, unsigned int bufsize, int npools, spin_lock_init(&pool->sp_lock); } - /* Remove any stale portmap registrations */ - svc_unregister(serv); + if (svc_uses_rpcbind(serv)) { + if (svc_rpcb_setup(serv) < 0) { + kfree(serv->sv_pools); + kfree(serv); + return NULL; + } + if (!serv->sv_shutdown) + serv->sv_shutdown = svc_rpcb_cleanup; + } return serv; } From 8e356b1e2a888c59d10a4842995a3273ca2d9086 Mon Sep 17 00:00:00 2001 From: Stanislav Kinsbursky Date: Tue, 25 Oct 2011 14:17:18 +0300 Subject: [PATCH 06/25] SUNRPC: cleanup service destruction svc_unregister() call has to be removed from svc_destroy() since it will be called in sv_shutdown callback. Signed-off-by: Stanislav Kinsbursky Signed-off-by: Trond Myklebust --- net/sunrpc/svc.c | 1 - 1 file changed, 1 deletion(-) diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index 918edc310fd1..407462ff4779 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -530,7 +530,6 @@ svc_destroy(struct svc_serv *serv) if (svc_serv_is_pooled(serv)) svc_pool_map_put(); - svc_unregister(serv); kfree(serv->sv_pools); kfree(serv); } From 16d0587090ab93206768f726f71d84ecf55e05c4 Mon Sep 17 00:00:00 2001 From: Stanislav Kinsbursky Date: Tue, 25 Oct 2011 14:17:28 +0300 Subject: [PATCH 07/25] NFSd: call svc rpcbind cleanup explicitly We have to call svc_rpcb_cleanup() explicitly from nfsd_last_thread() since this function is registered as service shutdown callback and thus nobody else will do it for us.
Signed-off-by: Stanislav Kinsbursky Signed-off-by: Trond Myklebust --- fs/nfsd/nfssvc.c | 2 ++ include/linux/sunrpc/svc.h | 1 + net/sunrpc/svc.c | 3 ++- 3 files changed, 5 insertions(+), 1 deletion(-) diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index dc5a1bf476b1..52cd976b6099 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -256,6 +256,8 @@ static void nfsd_last_thread(struct svc_serv *serv) nfsd_serv = NULL; nfsd_shutdown(); + svc_rpcb_cleanup(serv); + printk(KERN_WARNING "nfsd: last server has exited, flushing export " "cache\n"); nfsd_export_flush(); diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index 223588a976a0..5e71a306216f 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -401,6 +401,7 @@ struct svc_procedure { /* * Function prototypes. */ +void svc_rpcb_cleanup(struct svc_serv *serv); struct svc_serv *svc_create(struct svc_program *, unsigned int, void (*shutdown)(struct svc_serv *)); struct svc_rqst *svc_prepare_thread(struct svc_serv *serv, diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index 407462ff4779..252552a685dc 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -367,11 +367,12 @@ static int svc_rpcb_setup(struct svc_serv *serv) return 0; } -static void svc_rpcb_cleanup(struct svc_serv *serv) +void svc_rpcb_cleanup(struct svc_serv *serv) { svc_unregister(serv); rpcb_put_local(); } +EXPORT_SYMBOL_GPL(svc_rpcb_cleanup); static int svc_uses_rpcbind(struct svc_serv *serv) { From 0f0c01da444fbfd63556f301dde15915fd6cbb20 Mon Sep 17 00:00:00 2001 From: Stanislav Kinsbursky Date: Tue, 25 Oct 2011 14:17:38 +0300 Subject: [PATCH 08/25] SUNRPC: remove rpcbind clients creation during service registering We don't need this code since rpcbind clients are creating during RPC service creation. 
Signed-off-by: Stanislav Kinsbursky Signed-off-by: Trond Myklebust --- net/sunrpc/rpcb_clnt.c | 9 --------- 1 file changed, 9 deletions(-) diff --git a/net/sunrpc/rpcb_clnt.c b/net/sunrpc/rpcb_clnt.c index c24626537a7d..e913039133a9 100644 --- a/net/sunrpc/rpcb_clnt.c +++ b/net/sunrpc/rpcb_clnt.c @@ -431,11 +431,6 @@ int rpcb_register(u32 prog, u32 vers, int prot, unsigned short port) struct rpc_message msg = { .rpc_argp = &map, }; - int error; - - error = rpcb_create_local(); - if (error) - return error; dprintk("RPC: %sregistering (%u, %u, %d, %u) with local " "rpcbind\n", (port ? "" : "un"), @@ -571,11 +566,7 @@ int rpcb_v4_register(const u32 program, const u32 version, struct rpc_message msg = { .rpc_argp = &map, }; - int error; - error = rpcb_create_local(); - if (error) - return error; if (rpcb_local_clnt4 == NULL) return -EPROTONOSUPPORT; From e20de377578e9504f8467c05ab1db98b4935d4ed Mon Sep 17 00:00:00 2001 From: Stanislav Kinsbursky Date: Tue, 25 Oct 2011 14:17:48 +0300 Subject: [PATCH 09/25] SUNRPC: remove rpcbind clients destruction on module cleanup Rpcbind clients destruction during SUNRPC module removal is obsolete since now those clients are destroyed during last RPC service shutdown.
Signed-off-by: Stanislav Kinsbursky Signed-off-by: Trond Myklebust --- net/sunrpc/rpcb_clnt.c | 12 ------------ net/sunrpc/sunrpc_syms.c | 3 --- 2 files changed, 15 deletions(-) diff --git a/net/sunrpc/rpcb_clnt.c b/net/sunrpc/rpcb_clnt.c index e913039133a9..8761bf8e36fc 100644 --- a/net/sunrpc/rpcb_clnt.c +++ b/net/sunrpc/rpcb_clnt.c @@ -1100,15 +1100,3 @@ static struct rpc_program rpcb_program = { .version = rpcb_version, .stats = &rpcb_stats, }; - -/** - * cleanup_rpcb_clnt - remove xprtsock's sysctls, unregister - * - */ -void cleanup_rpcb_clnt(void) -{ - if (rpcb_local_clnt4) - rpc_shutdown_client(rpcb_local_clnt4); - if (rpcb_local_clnt) - rpc_shutdown_client(rpcb_local_clnt); -} diff --git a/net/sunrpc/sunrpc_syms.c b/net/sunrpc/sunrpc_syms.c index 9d0809160994..8ec9778c3f4a 100644 --- a/net/sunrpc/sunrpc_syms.c +++ b/net/sunrpc/sunrpc_syms.c @@ -61,8 +61,6 @@ static struct pernet_operations sunrpc_net_ops = { extern struct cache_detail unix_gid_cache; -extern void cleanup_rpcb_clnt(void); - static int __init init_sunrpc(void) { @@ -102,7 +100,6 @@ out: static void __exit cleanup_sunrpc(void) { - cleanup_rpcb_clnt(); rpcauth_remove_module(); cleanup_socket_xprt(); svc_cleanup_xprt_sock(); From 92407e75ce45b41c46944891711fd8faf0714d84 Mon Sep 17 00:00:00 2001 From: Peng Tao Date: Sun, 23 Oct 2011 20:21:17 -0700 Subject: [PATCH 10/25] nfs4: serialize layoutcommit Current pnfs_layoutcommit_inode cannot handle parallel layoutcommit. And as Trond suggested, there is no need for client to optimize for parallel layoutcommit. So add NFS_INO_LAYOUTCOMMITTING flag to mark inflight layoutcommit and serialize layoutcommit with it. Also mark_inode_dirty_sync if pnfs_layoutcommit_inode fails to issue layoutcommit.
Reported-by: Vitaliy Gusev Signed-off-by: Peng Tao Signed-off-by: Jim Rees Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 6 ++++++ fs/nfs/pnfs.c | 25 ++++++++++++++++++++++--- include/linux/nfs_fs.h | 1 + 3 files changed, 29 insertions(+), 3 deletions(-) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index d2ae413c986a..b60fddf606f7 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -5950,6 +5950,7 @@ static void nfs4_layoutcommit_release(void *calldata) { struct nfs4_layoutcommit_data *data = calldata; struct pnfs_layout_segment *lseg, *tmp; + unsigned long *bitlock = &NFS_I(data->args.inode)->flags; pnfs_cleanup_layoutcommit(data); /* Matched by references in pnfs_set_layoutcommit */ @@ -5959,6 +5960,11 @@ static void nfs4_layoutcommit_release(void *calldata) &lseg->pls_flags)) put_lseg(lseg); } + + clear_bit_unlock(NFS_INO_LAYOUTCOMMITTING, bitlock); + smp_mb__after_clear_bit(); + wake_up_bit(bitlock, NFS_INO_LAYOUTCOMMITTING); + put_rpccred(data->cred); kfree(data); } diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index ee73d9a4f700..a2478bc74442 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -1443,17 +1443,31 @@ pnfs_layoutcommit_inode(struct inode *inode, bool sync) /* Note kzalloc ensures data->res.seq_res.sr_slot == NULL */ data = kzalloc(sizeof(*data), GFP_NOFS); if (!data) { - mark_inode_dirty_sync(inode); status = -ENOMEM; goto out; } + if (!test_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) + goto out_free; + + if (test_and_set_bit(NFS_INO_LAYOUTCOMMITTING, &nfsi->flags)) { + if (!sync) { + status = -EAGAIN; + goto out_free; + } + status = wait_on_bit_lock(&nfsi->flags, NFS_INO_LAYOUTCOMMITTING, + nfs_wait_bit_killable, TASK_KILLABLE); + if (status) + goto out_free; + } + INIT_LIST_HEAD(&data->lseg_list); spin_lock(&inode->i_lock); if (!test_and_clear_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) { + clear_bit(NFS_INO_LAYOUTCOMMITTING, &nfsi->flags); spin_unlock(&inode->i_lock); - kfree(data); - goto out; + wake_up_bit(&nfsi->flags, 
NFS_INO_LAYOUTCOMMITTING); + goto out_free; } pnfs_list_write_lseg(inode, &data->lseg_list); @@ -1475,6 +1489,11 @@ pnfs_layoutcommit_inode(struct inode *inode, bool sync) status = nfs4_proc_layoutcommit(data, sync); out: + if (status) + mark_inode_dirty_sync(inode); dprintk("<-- %s status %d\n", __func__, status); return status; +out_free: + kfree(data); + goto out; } diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index 60a137b7f171..ab2c6343361a 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -229,6 +229,7 @@ struct nfs_inode { #define NFS_INO_COMMIT (7) /* inode is committing unstable writes */ #define NFS_INO_PNFS_COMMIT (8) /* use pnfs code for commit */ #define NFS_INO_LAYOUTCOMMIT (9) /* layoutcommit required */ +#define NFS_INO_LAYOUTCOMMITTING (10) /* layoutcommit inflight */ static inline struct nfs_inode *NFS_I(const struct inode *inode) { From d743c3c9c236cc61403a4f7d6283d59ddf68b2bd Mon Sep 17 00:00:00 2001 From: Peng Tao Date: Sun, 23 Oct 2011 20:22:38 -0700 Subject: [PATCH 11/25] NFS4: fix cb_recallany decode error craa_type_mask is bitmap4 per RFC5661. We need to expect a length before extracting bitmap value. 
Cc: Alexandros Batsakis Signed-off-by: Peng Tao Signed-off-by: Trond Myklebust --- fs/nfs/callback_xdr.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c index 918ad647afea..ee1a5b3cd48d 100644 --- a/fs/nfs/callback_xdr.c +++ b/fs/nfs/callback_xdr.c @@ -488,17 +488,18 @@ static __be32 decode_recallany_args(struct svc_rqst *rqstp, struct xdr_stream *xdr, struct cb_recallanyargs *args) { - __be32 *p; + uint32_t bitmap[2]; + __be32 *p, status; args->craa_addr = svc_addr(rqstp); p = read_buf(xdr, 4); if (unlikely(p == NULL)) return htonl(NFS4ERR_BADXDR); args->craa_objs_to_keep = ntohl(*p++); - p = read_buf(xdr, 4); - if (unlikely(p == NULL)) - return htonl(NFS4ERR_BADXDR); - args->craa_type_mask = ntohl(*p); + status = decode_bitmap(xdr, bitmap); + if (unlikely(status)) + return status; + args->craa_type_mask = bitmap[0]; return 0; } From c02f557dd0a026d7147da3b6f7daf52c6ff5580f Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 25 Oct 2011 12:17:43 -0400 Subject: [PATCH 12/25] NFS: Fix documenting comment for nfs_create_request() Clean up: the first parameter of nfs_create_request() has been incorrectly documented since time immemorial (OK, since before 2.6.12). Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- fs/nfs/pagelist.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index b60970cc7f1f..0a5ff5c19511 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -41,7 +41,7 @@ nfs_page_free(struct nfs_page *p) /** * nfs_create_request - Create an NFS read/write request. 
- * @file: file descriptor to use + * @ctx: open context to use * @inode: inode to which the request is attached * @page: page to write * @offset: starting offset within the page for the write From c6e696660213a89a5bfde8b49d539553904c808f Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 25 Oct 2011 12:17:53 -0400 Subject: [PATCH 13/25] NFS: Clean up nfs4_xdr_dec_secinfo() Clean up: Remove superfluous logic at the tail of nfs4_xdr_dec_secinfo() . Introduced by commit 5a5ea0d4 "NFS: Add secinfo procedure" (March 24, 2011). Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- fs/nfs/nfs4xdr.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 1dce12f41a4f..e6161b213ed1 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -6602,8 +6602,6 @@ static int nfs4_xdr_dec_secinfo(struct rpc_rqst *rqstp, if (status) goto out; status = decode_secinfo(xdr, res); - if (status) - goto out; out: return status; } From e414966b81a74745ac8d6bfeda0d95fb721e6d91 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 25 Oct 2011 12:18:03 -0400 Subject: [PATCH 14/25] NFS: Remove no-op less-than-zero checks on unsigned variables. Introduced by commit 16b374ca "NFSv4.1: pnfs: filelayout: add driver's LAYOUTGET and GETDEVICEINFO infrastructure" (October 20, 2010). 
Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- fs/nfs/nfs4filelayout.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c index 09119418402f..12185aadb349 100644 --- a/fs/nfs/nfs4filelayout.c +++ b/fs/nfs/nfs4filelayout.c @@ -449,9 +449,8 @@ filelayout_check_layout(struct pnfs_layout_hdr *lo, fl->dsaddr = dsaddr; - if (fl->first_stripe_index < 0 || - fl->first_stripe_index >= dsaddr->stripe_count) { - dprintk("%s Bad first_stripe_index %d\n", + if (fl->first_stripe_index >= dsaddr->stripe_count) { + dprintk("%s Bad first_stripe_index %u\n", __func__, fl->first_stripe_index); goto out_put; } @@ -552,7 +551,7 @@ filelayout_decode_layout(struct pnfs_layout_hdr *flo, /* Note that a zero value for num_fh is legal for STRIPE_SPARSE. * Futher checking is done in filelayout_check_layout */ - if (fl->num_fh < 0 || fl->num_fh > + if (fl->num_fh > max(NFS4_PNFS_MAX_STRIPE_CNT, NFS4_PNFS_MAX_MULTI_CNT)) goto out_err; From 6f276e49fd108362be3fd67154aaaacf872ea026 Mon Sep 17 00:00:00 2001 From: Rakib Mullick Date: Tue, 1 Nov 2011 12:16:15 +0600 Subject: [PATCH 15/25] nfs: Fix unused variable warning from file.c MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix the following unused variable warning. 
fs/nfs/file.c: In function ‘nfs_file_release’: fs/nfs/file.c:140:17: warning: unused variable ‘dentry’ fs/nfs/file.c: In function ‘nfs_file_read’: fs/nfs/file.c:237:9: warning: unused variable ‘count’ Signed-off-by: Rakib Mullick Signed-off-by: Trond Myklebust --- fs/nfs/file.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 28b8c3f3cda3..bd7dff001106 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -137,11 +137,9 @@ nfs_file_open(struct inode *inode, struct file *filp) static int nfs_file_release(struct inode *inode, struct file *filp) { - struct dentry *dentry = filp->f_path.dentry; - dprintk("NFS: release(%s/%s)\n", - dentry->d_parent->d_name.name, - dentry->d_name.name); + filp->f_path.dentry->d_parent->d_name.name, + filp->f_path.dentry->d_name.name); nfs_inc_stats(inode, NFSIOS_VFSRELEASE); return nfs_release(inode, filp); @@ -234,14 +232,13 @@ nfs_file_read(struct kiocb *iocb, const struct iovec *iov, struct dentry * dentry = iocb->ki_filp->f_path.dentry; struct inode * inode = dentry->d_inode; ssize_t result; - size_t count = iov_length(iov, nr_segs); if (iocb->ki_filp->f_flags & O_DIRECT) return nfs_file_direct_read(iocb, iov, nr_segs, pos); dprintk("NFS: read(%s/%s, %lu@%lu)\n", dentry->d_parent->d_name.name, dentry->d_name.name, - (unsigned long) count, (unsigned long) pos); + (unsigned long) iov_length(iov, nr_segs), (unsigned long) pos); result = nfs_revalidate_mapping(inode, iocb->ki_filp->f_mapping); if (!result) { From 2b72c9ccd22c4a3299e5a358dcd639fb253730f4 Mon Sep 17 00:00:00 2001 From: Rakib Mullick Date: Tue, 1 Nov 2011 12:23:42 +0600 Subject: [PATCH 16/25] nfs: Remove unused variable from write.c MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When CONFIG_NFS=y and CONFIG_NFS_V3_{,V4}=n we get the following warning. 
fs/nfs/write.c: In function ‘nfs_writeback_done’: fs/nfs/write.c:1246:21: warning: unused variable ‘server’ Remove the variable 'server' to fix the above warning. Signed-off-by: Rakib Mullick Signed-off-by: Trond Myklebust --- fs/nfs/write.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 2219c88d96b2..b016b8a36399 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -1243,7 +1243,6 @@ void nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data) { struct nfs_writeargs *argp = &data->args; struct nfs_writeres *resp = &data->res; - struct nfs_server *server = NFS_SERVER(data->inode); int status; dprintk("NFS: %5u nfs_writeback_done (status %d)\n", @@ -1277,7 +1276,7 @@ void nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data) if (time_before(complain, jiffies)) { dprintk("NFS: faulty NFS server %s:" " (committed = %d) != (stable = %d)\n", - server->nfs_client->cl_hostname, + NFS_SERVER(data->inode)->nfs_client->cl_hostname, resp->verf->committed, argp->stable); complain = jiffies + 300 * HZ; } From 4cdc685c7d06f659ef6c336d4242005cdd8df401 Mon Sep 17 00:00:00 2001 From: Boaz Harrosh Date: Mon, 31 Oct 2011 14:45:06 -0700 Subject: [PATCH 17/25] pnfs-obj: Remove redundant EOF from objlayout_io_state The EOF calculation was done on .read_pagelist(), cached in objlayout_io_state->eof, and set in objlayout_read_done() into nfs_read_data->res.eof. So set it directly into nfs_read_data->res.eof and avoid the extra member. 
Signed-off-by: Boaz Harrosh Signed-off-by: Trond Myklebust --- fs/nfs/objlayout/objlayout.c | 16 +++++++--------- fs/nfs/objlayout/objlayout.h | 1 - 2 files changed, 7 insertions(+), 10 deletions(-) diff --git a/fs/nfs/objlayout/objlayout.c b/fs/nfs/objlayout/objlayout.c index 1d06f8e2adea..1300736e0fb4 100644 --- a/fs/nfs/objlayout/objlayout.c +++ b/fs/nfs/objlayout/objlayout.c @@ -287,17 +287,14 @@ static void _rpc_read_complete(struct work_struct *work) void objlayout_read_done(struct objlayout_io_state *state, ssize_t status, bool sync) { - int eof = state->eof; - struct nfs_read_data *rdata; + struct nfs_read_data *rdata = state->rpcdata; state->status = status; - dprintk("%s: Begin status=%zd eof=%d\n", __func__, status, eof); - rdata = state->rpcdata; + dprintk("%s: Begin status=%zd eof=%d\n", __func__, + status, rdata->res.eof); rdata->task.tk_status = status; - if (status >= 0) { + if (status >= 0) rdata->res.count = status; - rdata->res.eof = eof; - } objlayout_iodone(state); /* must not use state after this point */ @@ -330,11 +327,14 @@ objlayout_read_pagelist(struct nfs_read_data *rdata) status = 0; rdata->res.count = 0; rdata->res.eof = 1; + /*FIXME: do we need to call pnfs_ld_read_done() */ goto out; } count = eof - offset; } + rdata->res.eof = (offset + count) >= eof; + state = objlayout_alloc_io_state(NFS_I(rdata->inode)->layout, rdata->args.pages, rdata->args.pgbase, offset, count, @@ -345,8 +345,6 @@ objlayout_read_pagelist(struct nfs_read_data *rdata) goto out; } - state->eof = state->offset + state->count >= eof; - status = objio_read_pagelist(state); out: dprintk("%s: Return status %Zd\n", __func__, status); diff --git a/fs/nfs/objlayout/objlayout.h b/fs/nfs/objlayout/objlayout.h index a8244c8e042d..ffb884c6fef0 100644 --- a/fs/nfs/objlayout/objlayout.h +++ b/fs/nfs/objlayout/objlayout.h @@ -86,7 +86,6 @@ struct objlayout_io_state { void *rpcdata; int status; /* res */ - int eof; /* res */ int committed; /* res */ /* Error reporting 
(layout_return) */ From e6c40fe3f4c4967f1cb486191ed4a5d5f55f3f7e Mon Sep 17 00:00:00 2001 From: Boaz Harrosh Date: Mon, 31 Oct 2011 14:45:46 -0700 Subject: [PATCH 18/25] pnfs-obj: Return PNFS_NOT_ATTEMPTED in case of read/write_pagelist objlayout driver was always returning PNFS_ATTEMPTED from it's read/write_pagelist operations. Even on error. Fix that. Start by establishing an error return API from io-engine, by not returning ssize_t (length-or-error) but returning "int" 0=OK, 0>Error. And clean up all return types in io-engine. Then if io-engine returned error return PNFS_NOT_ATTEMPTED to generic layer. (With a dprint) Signed-off-by: Boaz Harrosh Signed-off-by: Trond Myklebust --- fs/nfs/objlayout/objio_osd.c | 32 ++++++++++++++++---------------- fs/nfs/objlayout/objlayout.c | 36 +++++++++++++++++++----------------- fs/nfs/objlayout/objlayout.h | 4 ++-- 3 files changed, 37 insertions(+), 35 deletions(-) diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c index d0cda12fddc3..0c7c9ec24e67 100644 --- a/fs/nfs/objlayout/objio_osd.c +++ b/fs/nfs/objlayout/objio_osd.c @@ -142,7 +142,7 @@ OBJIO_LSEG(struct pnfs_layout_segment *lseg) } struct objio_state; -typedef ssize_t (*objio_done_fn)(struct objio_state *ios); +typedef int (*objio_done_fn)(struct objio_state *ios); struct objio_state { /* Generic layer */ @@ -720,7 +720,7 @@ out: return 0; } -static ssize_t _sync_done(struct objio_state *ios) +static int _sync_done(struct objio_state *ios) { struct completion *waiting = ios->private; @@ -742,10 +742,10 @@ static void _done_io(struct osd_request *or, void *p) kref_put(&ios->kref, _last_io); } -static ssize_t _io_exec(struct objio_state *ios) +static int _io_exec(struct objio_state *ios) { DECLARE_COMPLETION_ONSTACK(wait); - ssize_t status = 0; /* sync status */ + int ret = 0; unsigned i; objio_done_fn saved_done_fn = ios->done; bool sync = ios->ol_state.sync; @@ -771,16 +771,16 @@ static ssize_t _io_exec(struct objio_state *ios) if (sync) { 
wait_for_completion(&wait); - status = saved_done_fn(ios); + ret = saved_done_fn(ios); } - return status; + return ret; } /* * read */ -static ssize_t _read_done(struct objio_state *ios) +static int _read_done(struct objio_state *ios) { ssize_t status; int ret = _io_check(ios, false); @@ -793,7 +793,7 @@ static ssize_t _read_done(struct objio_state *ios) status = ret; objlayout_read_done(&ios->ol_state, status, ios->ol_state.sync); - return status; + return ret; } static int _read_mirrors(struct objio_state *ios, unsigned cur_comp) @@ -833,7 +833,7 @@ err: return ret; } -static ssize_t _read_exec(struct objio_state *ios) +static int _read_exec(struct objio_state *ios) { unsigned i; int ret; @@ -847,14 +847,14 @@ static ssize_t _read_exec(struct objio_state *ios) } ios->done = _read_done; - return _io_exec(ios); /* In sync mode exec returns the io status */ + return _io_exec(ios); err: _io_free(ios); return ret; } -ssize_t objio_read_pagelist(struct objlayout_io_state *ol_state) +int objio_read_pagelist(struct objlayout_io_state *ol_state) { struct objio_state *ios = container_of(ol_state, struct objio_state, ol_state); @@ -870,7 +870,7 @@ ssize_t objio_read_pagelist(struct objlayout_io_state *ol_state) /* * write */ -static ssize_t _write_done(struct objio_state *ios) +static int _write_done(struct objio_state *ios) { ssize_t status; int ret = _io_check(ios, true); @@ -887,7 +887,7 @@ static ssize_t _write_done(struct objio_state *ios) } objlayout_write_done(&ios->ol_state, status, ios->ol_state.sync); - return status; + return ret; } static int _write_mirrors(struct objio_state *ios, unsigned cur_comp) @@ -955,7 +955,7 @@ err: return ret; } -static ssize_t _write_exec(struct objio_state *ios) +static int _write_exec(struct objio_state *ios) { unsigned i; int ret; @@ -969,14 +969,14 @@ static ssize_t _write_exec(struct objio_state *ios) } ios->done = _write_done; - return _io_exec(ios); /* In sync mode exec returns the io->status */ + return _io_exec(ios); err: 
_io_free(ios); return ret; } -ssize_t objio_write_pagelist(struct objlayout_io_state *ol_state, bool stable) +int objio_write_pagelist(struct objlayout_io_state *ol_state, bool stable) { struct objio_state *ios = container_of(ol_state, struct objio_state, ol_state); diff --git a/fs/nfs/objlayout/objlayout.c b/fs/nfs/objlayout/objlayout.c index 1300736e0fb4..99c807df11dc 100644 --- a/fs/nfs/objlayout/objlayout.c +++ b/fs/nfs/objlayout/objlayout.c @@ -315,16 +315,13 @@ objlayout_read_pagelist(struct nfs_read_data *rdata) loff_t offset = rdata->args.offset; size_t count = rdata->args.count; struct objlayout_io_state *state; - ssize_t status = 0; + int err; loff_t eof; - dprintk("%s: Begin inode %p offset %llu count %d\n", - __func__, rdata->inode, offset, (int)count); - eof = i_size_read(rdata->inode); if (unlikely(offset + count > eof)) { if (offset >= eof) { - status = 0; + err = 0; rdata->res.count = 0; rdata->res.eof = 1; /*FIXME: do we need to call pnfs_ld_read_done() */ @@ -341,14 +338,19 @@ objlayout_read_pagelist(struct nfs_read_data *rdata) rdata->lseg, rdata, GFP_KERNEL); if (unlikely(!state)) { - status = -ENOMEM; + err = -ENOMEM; goto out; } + dprintk("%s: inode(%lx) offset 0x%llx count 0x%Zx eof=%d\n", + __func__, rdata->inode->i_ino, offset, count, rdata->res.eof); - status = objio_read_pagelist(state); + err = objio_read_pagelist(state); out: - dprintk("%s: Return status %Zd\n", __func__, status); - rdata->pnfs_error = status; + if (unlikely(err)) { + rdata->pnfs_error = err; + dprintk("%s: Returned Error %d\n", __func__, err); + return PNFS_NOT_ATTEMPTED; + } return PNFS_ATTEMPTED; } @@ -406,10 +408,7 @@ objlayout_write_pagelist(struct nfs_write_data *wdata, int how) { struct objlayout_io_state *state; - ssize_t status; - - dprintk("%s: Begin inode %p offset %llu count %u\n", - __func__, wdata->inode, wdata->args.offset, wdata->args.count); + int err; state = objlayout_alloc_io_state(NFS_I(wdata->inode)->layout, wdata->args.pages, @@ -419,16 +418,19 @@ 
objlayout_write_pagelist(struct nfs_write_data *wdata, wdata->lseg, wdata, GFP_NOFS); if (unlikely(!state)) { - status = -ENOMEM; + err = -ENOMEM; goto out; } state->sync = how & FLUSH_SYNC; - status = objio_write_pagelist(state, how & FLUSH_STABLE); + err = objio_write_pagelist(state, how & FLUSH_STABLE); out: - dprintk("%s: Return status %Zd\n", __func__, status); - wdata->pnfs_error = status; + if (unlikely(err)) { + wdata->pnfs_error = err; + dprintk("%s: Returned Error %d\n", __func__, err); + return PNFS_NOT_ATTEMPTED; + } return PNFS_ATTEMPTED; } diff --git a/fs/nfs/objlayout/objlayout.h b/fs/nfs/objlayout/objlayout.h index ffb884c6fef0..4edac9b6ac0c 100644 --- a/fs/nfs/objlayout/objlayout.h +++ b/fs/nfs/objlayout/objlayout.h @@ -115,8 +115,8 @@ extern int objio_alloc_io_state( gfp_t gfp_flags); extern void objio_free_io_state(struct objlayout_io_state *state); -extern ssize_t objio_read_pagelist(struct objlayout_io_state *ol_state); -extern ssize_t objio_write_pagelist(struct objlayout_io_state *ol_state, +extern int objio_read_pagelist(struct objlayout_io_state *ol_state); +extern int objio_write_pagelist(struct objlayout_io_state *ol_state, bool stable); /* From 96218556b03d3c6505e2880a097338bf277fd783 Mon Sep 17 00:00:00 2001 From: Boaz Harrosh Date: Mon, 31 Oct 2011 14:47:32 -0700 Subject: [PATCH 19/25] pnfs-obj: Get rid of objlayout_{alloc,free}_io_state This is part of moving objio_osd to use the ORE. objlayout_io_state had two functions: 1. It was used in the error reporting mechanism at layout_return. This function is kept intact. (Later patch will rename objlayout_io_state => objlayout_io_res) 2. Carrier of rw io members into the objio_read/write_pagelist API. This is removed in this patch. The {r,w}data received from NFS are passed directly to the objio_{read,write}_pagelist API. The io_engine is now allocating its own IO state as part of the read/write. The minimal functionality that was part of the generic allocation is passed to the io_engine.
So part of this patch is a rename of: ios->ol_state.foo => ios->foo At objlayout_{read,write}_done an objlayout_io_state is passed that denotes the result of the IO. (Hence the later name change). If the IO is successful objlayout calls an objio_free_result() API immediately (Which for objio_osd causes the release of the io_state). If the IO ended in an error it is held onto until reported in layout_return and is released later through the objio_free_result() API. (All this is not new, just renamed and cleaned) Signed-off-by: Boaz Harrosh Signed-off-by: Trond Myklebust --- fs/nfs/objlayout/objio_osd.c | 92 +++++++++++++++++-------- fs/nfs/objlayout/objlayout.c | 126 +++++++++-------------------------- fs/nfs/objlayout/objlayout.h | 36 +++++----- 3 files changed, 112 insertions(+), 142 deletions(-) diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c index 0c7c9ec24e67..48eb91aad554 100644 --- a/fs/nfs/objlayout/objio_osd.c +++ b/fs/nfs/objlayout/objio_osd.c @@ -148,6 +148,13 @@ struct objio_state { /* Generic layer */ struct objlayout_io_state ol_state; + struct page **pages; + unsigned pgbase; + unsigned nr_pages; + unsigned long count; + loff_t offset; + bool sync; + struct objio_segment *layout; struct kref kref; @@ -394,30 +401,43 @@ void objio_free_lseg(struct pnfs_layout_segment *lseg) kfree(objio_seg); } -int objio_alloc_io_state(struct pnfs_layout_segment *lseg, - struct objlayout_io_state **outp, - gfp_t gfp_flags) +static int +objio_alloc_io_state(struct pnfs_layout_hdr *pnfs_layout_type, + struct pnfs_layout_segment *lseg, struct page **pages, unsigned pgbase, + loff_t offset, size_t count, void *rpcdata, gfp_t gfp_flags, + struct objio_state **outp) { struct objio_segment *objio_seg = OBJIO_LSEG(lseg); struct objio_state *ios; - const unsigned first_size = sizeof(*ios) + - objio_seg->num_comps * sizeof(ios->per_dev[0]); - const unsigned sec_size = objio_seg->num_comps * - sizeof(ios->ol_state.ioerrs[0]); + struct __alloc_objio_state {
+ struct objio_state objios; + struct _objio_per_comp per_dev[objio_seg->num_comps]; + struct pnfs_osd_ioerr ioerrs[objio_seg->num_comps]; + } *aos; - ios = kzalloc(first_size + sec_size, gfp_flags); - if (unlikely(!ios)) + aos = kzalloc(sizeof(*aos), gfp_flags); + if (unlikely(!aos)) return -ENOMEM; - ios->layout = objio_seg; - ios->ol_state.ioerrs = ((void *)ios) + first_size; - ios->ol_state.num_comps = objio_seg->num_comps; + ios = &aos->objios; - *outp = &ios->ol_state; + ios->layout = objio_seg; + objlayout_init_ioerrs(&aos->objios.ol_state, objio_seg->num_comps, + aos->ioerrs, rpcdata, pnfs_layout_type); + + ios->pages = pages; + ios->pgbase = pgbase; + ios->nr_pages = (pgbase + count + PAGE_SIZE - 1) >> PAGE_SHIFT; + ios->offset = offset; + ios->count = count; + ios->sync = 0; + BUG_ON(ios->nr_pages > (pgbase + count + PAGE_SIZE - 1) >> PAGE_SHIFT); + + *outp = ios; return 0; } -void objio_free_io_state(struct objlayout_io_state *ol_state) +void objio_free_result(struct objlayout_io_state *ol_state) { struct objio_state *ios = container_of(ol_state, struct objio_state, ol_state); @@ -598,7 +618,7 @@ static int _add_stripe_unit(struct objio_state *ios, unsigned *cur_pg, if (per_dev->bio == NULL) { unsigned pages_in_stripe = ios->layout->group_width * (ios->layout->stripe_unit / PAGE_SIZE); - unsigned bio_size = (ios->ol_state.nr_pages + pages_in_stripe) / + unsigned bio_size = (ios->nr_pages + pages_in_stripe) / ios->layout->group_width; if (BIO_MAX_PAGES_KMALLOC < bio_size) @@ -615,11 +635,11 @@ static int _add_stripe_unit(struct objio_state *ios, unsigned *cur_pg, unsigned pglen = min_t(unsigned, PAGE_SIZE - pgbase, cur_len); unsigned added_len; - BUG_ON(ios->ol_state.nr_pages <= pg); + BUG_ON(ios->nr_pages <= pg); cur_len -= pglen; added_len = bio_add_pc_page(q, per_dev->bio, - ios->ol_state.pages[pg], pglen, pgbase); + ios->pages[pg], pglen, pgbase); if (unlikely(pglen != added_len)) return -ENOMEM; pgbase = 0; @@ -660,7 +680,7 @@ static int 
_prepare_one_group(struct objio_state *ios, u64 length, cur_len = stripe_unit - si->unit_off; page_off = si->unit_off & ~PAGE_MASK; BUG_ON(page_off && - (page_off != ios->ol_state.pgbase)); + (page_off != ios->pgbase)); } else { /* dev > si->dev */ per_dev->offset = si->obj_offset - si->unit_off; cur_len = stripe_unit; @@ -693,8 +713,8 @@ out: static int _io_rw_pagelist(struct objio_state *ios, gfp_t gfp_flags) { - u64 length = ios->ol_state.count; - u64 offset = ios->ol_state.offset; + u64 length = ios->count; + u64 offset = ios->offset; struct _striping_info si; unsigned last_pg = 0; int ret = 0; @@ -748,7 +768,7 @@ static int _io_exec(struct objio_state *ios) int ret = 0; unsigned i; objio_done_fn saved_done_fn = ios->done; - bool sync = ios->ol_state.sync; + bool sync = ios->sync; if (sync) { ios->done = _sync_done; @@ -792,7 +812,7 @@ static int _read_done(struct objio_state *ios) else status = ret; - objlayout_read_done(&ios->ol_state, status, ios->ol_state.sync); + objlayout_read_done(&ios->ol_state, status, ios->sync); return ret; } @@ -854,12 +874,18 @@ err: return ret; } -int objio_read_pagelist(struct objlayout_io_state *ol_state) +int objio_read_pagelist(struct nfs_read_data *rdata) { - struct objio_state *ios = container_of(ol_state, struct objio_state, - ol_state); + struct objio_state *ios; int ret; + ret = objio_alloc_io_state(NFS_I(rdata->inode)->layout, + rdata->lseg, rdata->args.pages, rdata->args.pgbase, + rdata->args.offset, rdata->args.count, rdata, + GFP_KERNEL, &ios); + if (unlikely(ret)) + return ret; + ret = _io_rw_pagelist(ios, GFP_KERNEL); if (unlikely(ret)) return ret; @@ -886,7 +912,7 @@ static int _write_done(struct objio_state *ios) status = ret; } - objlayout_write_done(&ios->ol_state, status, ios->ol_state.sync); + objlayout_write_done(&ios->ol_state, status, ios->sync); return ret; } @@ -976,12 +1002,20 @@ err: return ret; } -int objio_write_pagelist(struct objlayout_io_state *ol_state, bool stable) +int 
objio_write_pagelist(struct nfs_write_data *wdata, int how) { - struct objio_state *ios = container_of(ol_state, struct objio_state, - ol_state); + struct objio_state *ios; int ret; + ret = objio_alloc_io_state(NFS_I(wdata->inode)->layout, + wdata->lseg, wdata->args.pages, wdata->args.pgbase, + wdata->args.offset, wdata->args.count, wdata, GFP_NOFS, + &ios); + if (unlikely(ret)) + return ret; + + ios->sync = 0 != (how & FLUSH_SYNC); + /* TODO: ios->stable = stable; */ ret = _io_rw_pagelist(ios, GFP_NOFS); if (unlikely(ret)) diff --git a/fs/nfs/objlayout/objlayout.c b/fs/nfs/objlayout/objlayout.c index 99c807df11dc..a82053ae5595 100644 --- a/fs/nfs/objlayout/objlayout.c +++ b/fs/nfs/objlayout/objlayout.c @@ -156,59 +156,23 @@ last_byte_offset(u64 start, u64 len) return end > start ? end - 1 : NFS4_MAX_UINT64; } -static struct objlayout_io_state * -objlayout_alloc_io_state(struct pnfs_layout_hdr *pnfs_layout_type, - struct page **pages, - unsigned pgbase, - loff_t offset, - size_t count, - struct pnfs_layout_segment *lseg, - void *rpcdata, - gfp_t gfp_flags) +void _fix_verify_io_params(struct pnfs_layout_segment *lseg, + struct page ***p_pages, unsigned *p_pgbase, + u64 offset, unsigned long count) { - struct objlayout_io_state *state; u64 lseg_end_offset; - dprintk("%s: allocating io_state\n", __func__); - if (objio_alloc_io_state(lseg, &state, gfp_flags)) - return NULL; - BUG_ON(offset < lseg->pls_range.offset); lseg_end_offset = end_offset(lseg->pls_range.offset, lseg->pls_range.length); BUG_ON(offset >= lseg_end_offset); - if (offset + count > lseg_end_offset) { - count = lseg->pls_range.length - - (offset - lseg->pls_range.offset); - dprintk("%s: truncated count %Zd\n", __func__, count); + WARN_ON(offset + count > lseg_end_offset); + + if (*p_pgbase > PAGE_SIZE) { + dprintk("%s: pgbase(0x%x) > PAGE_SIZE\n", __func__, *p_pgbase); + *p_pages += *p_pgbase >> PAGE_SHIFT; + *p_pgbase &= ~PAGE_MASK; } - - if (pgbase > PAGE_SIZE) { - pages += pgbase >> PAGE_SHIFT; - 
pgbase &= ~PAGE_MASK; - } - - INIT_LIST_HEAD(&state->err_list); - state->lseg = lseg; - state->rpcdata = rpcdata; - state->pages = pages; - state->pgbase = pgbase; - state->nr_pages = (pgbase + count + PAGE_SIZE - 1) >> PAGE_SHIFT; - state->offset = offset; - state->count = count; - state->sync = 0; - - return state; -} - -static void -objlayout_free_io_state(struct objlayout_io_state *state) -{ - dprintk("%s: freeing io_state\n", __func__); - if (unlikely(!state)) - return; - - objio_free_io_state(state); } /* @@ -217,12 +181,10 @@ objlayout_free_io_state(struct objlayout_io_state *state) static void objlayout_iodone(struct objlayout_io_state *state) { - dprintk("%s: state %p status\n", __func__, state); - if (likely(state->status >= 0)) { - objlayout_free_io_state(state); + objio_free_result(state); } else { - struct objlayout *objlay = OBJLAYOUT(state->lseg->pls_layout); + struct objlayout *objlay = state->objlay; spin_lock(&objlay->lock); objlay->delta_space_valid = OBJ_DSU_INVALID; @@ -289,15 +251,15 @@ objlayout_read_done(struct objlayout_io_state *state, ssize_t status, bool sync) { struct nfs_read_data *rdata = state->rpcdata; - state->status = status; - dprintk("%s: Begin status=%zd eof=%d\n", __func__, - status, rdata->res.eof); - rdata->task.tk_status = status; + state->status = rdata->task.tk_status = status; if (status >= 0) rdata->res.count = status; objlayout_iodone(state); /* must not use state after this point */ + dprintk("%s: Return status=%zd eof=%d sync=%d\n", __func__, + status, rdata->res.eof, sync); + if (sync) pnfs_ld_read_done(rdata); else { @@ -314,7 +276,6 @@ objlayout_read_pagelist(struct nfs_read_data *rdata) { loff_t offset = rdata->args.offset; size_t count = rdata->args.count; - struct objlayout_io_state *state; int err; loff_t eof; @@ -331,20 +292,14 @@ objlayout_read_pagelist(struct nfs_read_data *rdata) } rdata->res.eof = (offset + count) >= eof; + _fix_verify_io_params(rdata->lseg, &rdata->args.pages, + &rdata->args.pgbase, + 
rdata->args.offset, rdata->args.count); - state = objlayout_alloc_io_state(NFS_I(rdata->inode)->layout, - rdata->args.pages, rdata->args.pgbase, - offset, count, - rdata->lseg, rdata, - GFP_KERNEL); - if (unlikely(!state)) { - err = -ENOMEM; - goto out; - } dprintk("%s: inode(%lx) offset 0x%llx count 0x%Zx eof=%d\n", __func__, rdata->inode->i_ino, offset, count, rdata->res.eof); - err = objio_read_pagelist(state); + err = objio_read_pagelist(rdata); out: if (unlikely(err)) { rdata->pnfs_error = err; @@ -374,23 +329,18 @@ void objlayout_write_done(struct objlayout_io_state *state, ssize_t status, bool sync) { - struct nfs_write_data *wdata; + struct nfs_write_data *wdata = state->rpcdata; - dprintk("%s: Begin\n", __func__); - wdata = state->rpcdata; - state->status = status; - wdata->task.tk_status = status; + state->status = wdata->task.tk_status = status; if (status >= 0) { wdata->res.count = status; wdata->verf.committed = state->committed; - dprintk("%s: Return status %d committed %d\n", - __func__, wdata->task.tk_status, - wdata->verf.committed); - } else - dprintk("%s: Return status %d\n", - __func__, wdata->task.tk_status); + } objlayout_iodone(state); - /* must not use state after this point */ + /* must not use oir after this point */ + + dprintk("%s: Return status %zd committed %d sync=%d\n", __func__, + status, wdata->verf.committed, sync); if (sync) pnfs_ld_write_done(wdata); @@ -407,25 +357,13 @@ enum pnfs_try_status objlayout_write_pagelist(struct nfs_write_data *wdata, int how) { - struct objlayout_io_state *state; int err; - state = objlayout_alloc_io_state(NFS_I(wdata->inode)->layout, - wdata->args.pages, - wdata->args.pgbase, - wdata->args.offset, - wdata->args.count, - wdata->lseg, wdata, - GFP_NOFS); - if (unlikely(!state)) { - err = -ENOMEM; - goto out; - } + _fix_verify_io_params(wdata->lseg, &wdata->args.pages, + &wdata->args.pgbase, + wdata->args.offset, wdata->args.count); - state->sync = how & FLUSH_SYNC; - - err = 
objio_write_pagelist(state, how & FLUSH_STABLE); - out: + err = objio_write_pagelist(wdata, how); if (unlikely(err)) { wdata->pnfs_error = err; dprintk("%s: Returned Error %d\n", __func__, err); @@ -564,7 +502,7 @@ encode_accumulated_error(struct objlayout *objlay, __be32 *p) merge_ioerr(&accumulated_err, ioerr); } list_del(&state->err_list); - objlayout_free_io_state(state); + objio_free_result(state); } pnfs_osd_xdr_encode_ioerr(p, &accumulated_err); @@ -632,7 +570,7 @@ objlayout_encode_layoutreturn(struct pnfs_layout_hdr *pnfslay, goto loop_done; } list_del(&state->err_list); - objlayout_free_io_state(state); + objio_free_result(state); } loop_done: spin_unlock(&objlay->lock); diff --git a/fs/nfs/objlayout/objlayout.h b/fs/nfs/objlayout/objlayout.h index 4edac9b6ac0c..d7b2ccfa2132 100644 --- a/fs/nfs/objlayout/objlayout.h +++ b/fs/nfs/objlayout/objlayout.h @@ -75,14 +75,7 @@ OBJLAYOUT(struct pnfs_layout_hdr *lo) * embedded in objects provider io_state data structure */ struct objlayout_io_state { - struct pnfs_layout_segment *lseg; - - struct page **pages; - unsigned pgbase; - unsigned nr_pages; - unsigned long count; - loff_t offset; - bool sync; + struct objlayout *objlay; void *rpcdata; int status; /* res */ @@ -99,6 +92,18 @@ struct objlayout_io_state { struct pnfs_osd_ioerr *ioerrs; }; +static inline +void objlayout_init_ioerrs(struct objlayout_io_state *oir, unsigned num_comps, + struct pnfs_osd_ioerr *ioerrs, void *rpcdata, + struct pnfs_layout_hdr *pnfs_layout_type) +{ + oir->objlay = OBJLAYOUT(pnfs_layout_type); + oir->rpcdata = rpcdata; + INIT_LIST_HEAD(&oir->err_list); + oir->num_comps = num_comps; + oir->ioerrs = ioerrs; +} + /* * Raid engine I/O API */ @@ -109,15 +114,10 @@ extern int objio_alloc_lseg(struct pnfs_layout_segment **outp, gfp_t gfp_flags); extern void objio_free_lseg(struct pnfs_layout_segment *lseg); -extern int objio_alloc_io_state( - struct pnfs_layout_segment *lseg, - struct objlayout_io_state **outp, - gfp_t gfp_flags); -extern 
void objio_free_io_state(struct objlayout_io_state *state); +extern void objio_free_result(struct objlayout_io_state *state); -extern int objio_read_pagelist(struct objlayout_io_state *ol_state); -extern int objio_write_pagelist(struct objlayout_io_state *ol_state, - bool stable); +extern int objio_read_pagelist(struct nfs_read_data *rdata); +extern int objio_write_pagelist(struct nfs_write_data *wdata, int how); /* * callback API @@ -127,10 +127,8 @@ extern void objlayout_io_set_result(struct objlayout_io_state *state, int osd_error, u64 offset, u64 length, bool is_write); static inline void -objlayout_add_delta_space_used(struct objlayout_io_state *state, s64 space_used) +objlayout_add_delta_space_used(struct objlayout *objlay, s64 space_used) { - struct objlayout *objlay = OBJLAYOUT(state->lseg->pls_layout); - /* If one of the I/Os errored out and the delta_space_used was * invalid we render the complete report as invalid. Protocol mandate * the DSU be accurate or not reported. From e2e04355d9647305c666462a49223f2942a635f0 Mon Sep 17 00:00:00 2001 From: Boaz Harrosh Date: Mon, 31 Oct 2011 15:03:35 -0700 Subject: [PATCH 20/25] pnfs-obj: Rename objlayout_io_state => objlayout_io_res * All instances of objlayout_io_state => objlayout_io_res * All instances of state => oir; * All instances of ol_state => oir; Big but nothing to it Signed-off-by: Boaz Harrosh Signed-off-by: Trond Myklebust --- fs/nfs/objlayout/objio_osd.c | 17 +++++----- fs/nfs/objlayout/objlayout.c | 63 ++++++++++++++++++------------------ fs/nfs/objlayout/objlayout.h | 15 +++++---- 3 files changed, 48 insertions(+), 47 deletions(-) diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c index 48eb91aad554..2347e0ac63e6 100644 --- a/fs/nfs/objlayout/objio_osd.c +++ b/fs/nfs/objlayout/objio_osd.c @@ -146,7 +146,7 @@ typedef int (*objio_done_fn)(struct objio_state *ios); struct objio_state { /* Generic layer */ - struct objlayout_io_state ol_state; + struct objlayout_io_res oir; 
struct page **pages; unsigned pgbase; @@ -422,7 +422,7 @@ objio_alloc_io_state(struct pnfs_layout_hdr *pnfs_layout_type, ios = &aos->objios; ios->layout = objio_seg; - objlayout_init_ioerrs(&aos->objios.ol_state, objio_seg->num_comps, + objlayout_init_ioerrs(&aos->objios.oir, objio_seg->num_comps, aos->ioerrs, rpcdata, pnfs_layout_type); ios->pages = pages; @@ -437,10 +437,9 @@ objio_alloc_io_state(struct pnfs_layout_hdr *pnfs_layout_type, return 0; } -void objio_free_result(struct objlayout_io_state *ol_state) +void objio_free_result(struct objlayout_io_res *oir) { - struct objio_state *ios = container_of(ol_state, struct objio_state, - ol_state); + struct objio_state *ios = container_of(oir, struct objio_state, oir); kfree(ios); } @@ -519,7 +518,7 @@ static int _io_check(struct objio_state *ios, bool is_write) continue; /* we recovered */ } - objlayout_io_set_result(&ios->ol_state, i, + objlayout_io_set_result(&ios->oir, i, &ios->layout->comps[i].oc_object_id, osd_pri_2_pnfs_err(osi.osd_err_pri), ios->per_dev[i].offset, @@ -812,7 +811,7 @@ static int _read_done(struct objio_state *ios) else status = ret; - objlayout_read_done(&ios->ol_state, status, ios->sync); + objlayout_read_done(&ios->oir, status, ios->sync); return ret; } @@ -906,13 +905,13 @@ static int _write_done(struct objio_state *ios) if (likely(!ret)) { /* FIXME: should be based on the OSD's persistence model * See OSD2r05 Section 4.13 Data persistence model */ - ios->ol_state.committed = NFS_FILE_SYNC; + ios->oir.committed = NFS_FILE_SYNC; status = ios->length; } else { status = ret; } - objlayout_write_done(&ios->ol_state, status, ios->sync); + objlayout_write_done(&ios->oir, status, ios->sync); return ret; } diff --git a/fs/nfs/objlayout/objlayout.c b/fs/nfs/objlayout/objlayout.c index a82053ae5595..72074e3a04f9 100644 --- a/fs/nfs/objlayout/objlayout.c +++ b/fs/nfs/objlayout/objlayout.c @@ -179,16 +179,16 @@ void _fix_verify_io_params(struct pnfs_layout_segment *lseg, * I/O done common code */ 
static void -objlayout_iodone(struct objlayout_io_state *state) +objlayout_iodone(struct objlayout_io_res *oir) { - if (likely(state->status >= 0)) { - objio_free_result(state); + if (likely(oir->status >= 0)) { + objio_free_result(oir); } else { - struct objlayout *objlay = state->objlay; + struct objlayout *objlay = oir->objlay; spin_lock(&objlay->lock); objlay->delta_space_valid = OBJ_DSU_INVALID; - list_add(&objlay->err_list, &state->err_list); + list_add(&objlay->err_list, &oir->err_list); spin_unlock(&objlay->lock); } } @@ -200,13 +200,13 @@ objlayout_iodone(struct objlayout_io_state *state) * the error for later reporting at layout-return. */ void -objlayout_io_set_result(struct objlayout_io_state *state, unsigned index, +objlayout_io_set_result(struct objlayout_io_res *oir, unsigned index, struct pnfs_osd_objid *pooid, int osd_error, u64 offset, u64 length, bool is_write) { - struct pnfs_osd_ioerr *ioerr = &state->ioerrs[index]; + struct pnfs_osd_ioerr *ioerr = &oir->ioerrs[index]; - BUG_ON(index >= state->num_comps); + BUG_ON(index >= oir->num_comps); if (osd_error) { ioerr->oer_component = *pooid; ioerr->oer_comp_offset = offset; @@ -247,15 +247,15 @@ static void _rpc_read_complete(struct work_struct *work) } void -objlayout_read_done(struct objlayout_io_state *state, ssize_t status, bool sync) +objlayout_read_done(struct objlayout_io_res *oir, ssize_t status, bool sync) { - struct nfs_read_data *rdata = state->rpcdata; + struct nfs_read_data *rdata = oir->rpcdata; - state->status = rdata->task.tk_status = status; + oir->status = rdata->task.tk_status = status; if (status >= 0) rdata->res.count = status; - objlayout_iodone(state); - /* must not use state after this point */ + objlayout_iodone(oir); + /* must not use oir after this point */ dprintk("%s: Return status=%zd eof=%d sync=%d\n", __func__, status, rdata->res.eof, sync); @@ -326,17 +326,16 @@ static void _rpc_write_complete(struct work_struct *work) } void -objlayout_write_done(struct 
objlayout_io_state *state, ssize_t status, - bool sync) +objlayout_write_done(struct objlayout_io_res *oir, ssize_t status, bool sync) { - struct nfs_write_data *wdata = state->rpcdata; + struct nfs_write_data *wdata = oir->rpcdata; - state->status = wdata->task.tk_status = status; + oir->status = wdata->task.tk_status = status; if (status >= 0) { wdata->res.count = status; - wdata->verf.committed = state->committed; + wdata->verf.committed = oir->committed; } - objlayout_iodone(state); + objlayout_iodone(oir); /* must not use oir after this point */ dprintk("%s: Return status %zd committed %d sync=%d\n", __func__, @@ -475,14 +474,14 @@ merge_ioerr(struct pnfs_osd_ioerr *dest_err, static void encode_accumulated_error(struct objlayout *objlay, __be32 *p) { - struct objlayout_io_state *state, *tmp; + struct objlayout_io_res *oir, *tmp; struct pnfs_osd_ioerr accumulated_err = {.oer_errno = 0}; - list_for_each_entry_safe(state, tmp, &objlay->err_list, err_list) { + list_for_each_entry_safe(oir, tmp, &objlay->err_list, err_list) { unsigned i; - for (i = 0; i < state->num_comps; i++) { - struct pnfs_osd_ioerr *ioerr = &state->ioerrs[i]; + for (i = 0; i < oir->num_comps; i++) { + struct pnfs_osd_ioerr *ioerr = &oir->ioerrs[i]; if (!ioerr->oer_errno) continue; @@ -501,8 +500,8 @@ encode_accumulated_error(struct objlayout *objlay, __be32 *p) merge_ioerr(&accumulated_err, ioerr); } - list_del(&state->err_list); - objio_free_result(state); + list_del(&oir->err_list); + objio_free_result(oir); } pnfs_osd_xdr_encode_ioerr(p, &accumulated_err); @@ -514,7 +513,7 @@ objlayout_encode_layoutreturn(struct pnfs_layout_hdr *pnfslay, const struct nfs4_layoutreturn_args *args) { struct objlayout *objlay = OBJLAYOUT(pnfslay); - struct objlayout_io_state *state, *tmp; + struct objlayout_io_res *oir, *tmp; __be32 *start; dprintk("%s: Begin\n", __func__); @@ -523,13 +522,13 @@ objlayout_encode_layoutreturn(struct pnfs_layout_hdr *pnfslay, spin_lock(&objlay->lock); - 
list_for_each_entry_safe(state, tmp, &objlay->err_list, err_list) { + list_for_each_entry_safe(oir, tmp, &objlay->err_list, err_list) { __be32 *last_xdr = NULL, *p; unsigned i; int res = 0; - for (i = 0; i < state->num_comps; i++) { - struct pnfs_osd_ioerr *ioerr = &state->ioerrs[i]; + for (i = 0; i < oir->num_comps; i++) { + struct pnfs_osd_ioerr *ioerr = &oir->ioerrs[i]; if (!ioerr->oer_errno) continue; @@ -553,7 +552,7 @@ objlayout_encode_layoutreturn(struct pnfs_layout_hdr *pnfslay, } last_xdr = p; - pnfs_osd_xdr_encode_ioerr(p, &state->ioerrs[i]); + pnfs_osd_xdr_encode_ioerr(p, &oir->ioerrs[i]); } /* TODO: use xdr_write_pages */ @@ -569,8 +568,8 @@ objlayout_encode_layoutreturn(struct pnfs_layout_hdr *pnfslay, encode_accumulated_error(objlay, last_xdr); goto loop_done; } - list_del(&state->err_list); - objio_free_result(state); + list_del(&oir->err_list); + objio_free_result(oir); } loop_done: spin_unlock(&objlay->lock); diff --git a/fs/nfs/objlayout/objlayout.h b/fs/nfs/objlayout/objlayout.h index d7b2ccfa2132..8ec34727ed21 100644 --- a/fs/nfs/objlayout/objlayout.h +++ b/fs/nfs/objlayout/objlayout.h @@ -74,7 +74,7 @@ OBJLAYOUT(struct pnfs_layout_hdr *lo) * per-I/O operation state * embedded in objects provider io_state data structure */ -struct objlayout_io_state { +struct objlayout_io_res { struct objlayout *objlay; void *rpcdata; @@ -93,7 +93,7 @@ struct objlayout_io_state { }; static inline -void objlayout_init_ioerrs(struct objlayout_io_state *oir, unsigned num_comps, +void objlayout_init_ioerrs(struct objlayout_io_res *oir, unsigned num_comps, struct pnfs_osd_ioerr *ioerrs, void *rpcdata, struct pnfs_layout_hdr *pnfs_layout_type) { @@ -114,7 +114,10 @@ extern int objio_alloc_lseg(struct pnfs_layout_segment **outp, gfp_t gfp_flags); extern void objio_free_lseg(struct pnfs_layout_segment *lseg); -extern void objio_free_result(struct objlayout_io_state *state); +/* objio_free_result will free these @oir structs recieved from + * objlayout_{read,write}_done 
+ */ +extern void objio_free_result(struct objlayout_io_res *oir); extern int objio_read_pagelist(struct nfs_read_data *rdata); extern int objio_write_pagelist(struct nfs_write_data *wdata, int how); @@ -122,7 +125,7 @@ extern int objio_write_pagelist(struct nfs_write_data *wdata, int how); /* * callback API */ -extern void objlayout_io_set_result(struct objlayout_io_state *state, +extern void objlayout_io_set_result(struct objlayout_io_res *oir, unsigned index, struct pnfs_osd_objid *pooid, int osd_error, u64 offset, u64 length, bool is_write); @@ -141,9 +144,9 @@ objlayout_add_delta_space_used(struct objlayout *objlay, s64 space_used) spin_unlock(&objlay->lock); } -extern void objlayout_read_done(struct objlayout_io_state *state, +extern void objlayout_read_done(struct objlayout_io_res *oir, ssize_t status, bool sync); -extern void objlayout_write_done(struct objlayout_io_state *state, +extern void objlayout_write_done(struct objlayout_io_res *oir, ssize_t status, bool sync); extern int objlayout_get_deviceinfo(struct pnfs_layout_hdr *pnfslay, From af4f5b54bcf0379089d01518e818f37258708fb7 Mon Sep 17 00:00:00 2001 From: Boaz Harrosh Date: Mon, 31 Oct 2011 15:04:19 -0700 Subject: [PATCH 21/25] pnfs-obj: move to ore 01: ore_layout & ore_components For Ease of reviewing I split the move to ore into 3 parts move to ore 01: ore_layout & ore_components move to ore 02: move to ORE move to ore 03: Remove old raid engine This patch modifies the objio_lseg, layout-segment level and devices and components arrays to use the ORE types. Though it will be removed soon, also the raid engine is modified to actually compile, possibly run, with the new types. So it is the same old raid engine but with some new ORE types. For Ease of reviewing, some of the old code is "#if 0" but is not removed so the diff command works better. The old code will be removed in the 3rd patch. 
Signed-off-by: Boaz Harrosh Signed-off-by: Trond Myklebust --- fs/nfs/objlayout/objio_osd.c | 272 +++++++++++++++++------------------ 1 file changed, 128 insertions(+), 144 deletions(-) diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c index 2347e0ac63e6..bd7ec26e2840 100644 --- a/fs/nfs/objlayout/objio_osd.c +++ b/fs/nfs/objlayout/objio_osd.c @@ -38,7 +38,7 @@ */ #include -#include +#include #include "objlayout.h" @@ -52,7 +52,7 @@ enum { BIO_MAX_PAGES_KMALLOC = struct objio_dev_ent { struct nfs4_deviceid_node id_node; - struct osd_dev *od; + struct ore_dev od; }; static void @@ -60,8 +60,8 @@ objio_free_deviceid_node(struct nfs4_deviceid_node *d) { struct objio_dev_ent *de = container_of(d, struct objio_dev_ent, id_node); - dprintk("%s: free od=%p\n", __func__, de->od); - osduld_put_device(de->od); + dprintk("%s: free od=%p\n", __func__, de->od.od); + osduld_put_device(de->od.od); kfree(de); } @@ -98,12 +98,12 @@ _dev_list_add(const struct nfs_server *nfss, nfss->pnfs_curr_ld, nfss->nfs_client, d_id); - de->od = od; + de->od.od = od; d = nfs4_insert_deviceid_node(&de->id_node); n = container_of(d, struct objio_dev_ent, id_node); if (n != de) { - dprintk("%s: Race with other n->od=%p\n", __func__, n->od); + dprintk("%s: Race with other n->od=%p\n", __func__, n->od.od); objio_free_deviceid_node(&de->id_node); de = n; } @@ -111,28 +111,11 @@ _dev_list_add(const struct nfs_server *nfss, return de; } -struct caps_buffers { - u8 caps_key[OSD_CRYPTO_KEYID_SIZE]; - u8 creds[OSD_CAP_LEN]; -}; - struct objio_segment { struct pnfs_layout_segment lseg; - struct pnfs_osd_object_cred *comps; - - unsigned mirrors_p1; - unsigned stripe_unit; - unsigned group_width; /* Data stripe_units without integrity comps */ - u64 group_depth; - unsigned group_count; - - unsigned max_io_size; - - unsigned comps_index; - unsigned num_comps; - /* variable length */ - struct objio_dev_ent *ods[]; + struct ore_layout layout; + struct ore_components oc; }; static inline 
struct objio_segment * @@ -155,7 +138,8 @@ struct objio_state { loff_t offset; bool sync; - struct objio_segment *layout; + struct ore_layout *layout; + struct ore_components *oc; struct kref kref; objio_done_fn done; @@ -175,32 +159,33 @@ struct objio_state { /* Send and wait for a get_device_info of devices in the layout, then look them up with the osd_initiator library */ -static struct objio_dev_ent *_device_lookup(struct pnfs_layout_hdr *pnfslay, - struct objio_segment *objio_seg, unsigned comp, - gfp_t gfp_flags) +static int objio_devices_lookup(struct pnfs_layout_hdr *pnfslay, + struct objio_segment *objio_seg, unsigned c, struct nfs4_deviceid *d_id, + gfp_t gfp_flags) { struct pnfs_osd_deviceaddr *deviceaddr; - struct nfs4_deviceid *d_id; struct objio_dev_ent *ode; struct osd_dev *od; struct osd_dev_info odi; int err; - d_id = &objio_seg->comps[comp].oc_object_id.oid_device_id; - ode = _dev_list_find(NFS_SERVER(pnfslay->plh_inode), d_id); - if (ode) - return ode; + if (ode) { + objio_seg->oc.ods[c] = &ode->od; /* must use container_of */ + return 0; + } err = objlayout_get_deviceinfo(pnfslay, d_id, &deviceaddr, gfp_flags); if (unlikely(err)) { dprintk("%s: objlayout_get_deviceinfo dev(%llx:%llx) =>%d\n", __func__, _DEVID_LO(d_id), _DEVID_HI(d_id), err); - return ERR_PTR(err); + return err; } odi.systemid_len = deviceaddr->oda_systemid.len; if (odi.systemid_len > sizeof(odi.systemid)) { + dprintk("%s: odi.systemid_len > sizeof(systemid=%zd)\n", + __func__, sizeof(odi.systemid)); err = -EINVAL; goto out; } else if (odi.systemid_len) @@ -225,38 +210,15 @@ static struct objio_dev_ent *_device_lookup(struct pnfs_layout_hdr *pnfslay, ode = _dev_list_add(NFS_SERVER(pnfslay->plh_inode), d_id, od, gfp_flags); - + objio_seg->oc.ods[c] = &ode->od; /* must use container_of */ + dprintk("Adding new dev_id(%llx:%llx)\n", + _DEVID_LO(d_id), _DEVID_HI(d_id)); out: - dprintk("%s: return=%d\n", __func__, err); objlayout_put_deviceinfo(deviceaddr); - return err ? 
ERR_PTR(err) : ode; -} - -static int objio_devices_lookup(struct pnfs_layout_hdr *pnfslay, - struct objio_segment *objio_seg, - gfp_t gfp_flags) -{ - unsigned i; - int err; - - /* lookup all devices */ - for (i = 0; i < objio_seg->num_comps; i++) { - struct objio_dev_ent *ode; - - ode = _device_lookup(pnfslay, objio_seg, i, gfp_flags); - if (unlikely(IS_ERR(ode))) { - err = PTR_ERR(ode); - goto out; - } - objio_seg->ods[i] = ode; - } - err = 0; - -out: - dprintk("%s: return=%d\n", __func__, err); return err; } +#if 0 static int _verify_data_map(struct pnfs_osd_layout *layout) { struct pnfs_osd_data_map *data_map = &layout->olo_map; @@ -296,23 +258,45 @@ static int _verify_data_map(struct pnfs_osd_layout *layout) return 0; } +#endif -static void copy_single_comp(struct pnfs_osd_object_cred *cur_comp, - struct pnfs_osd_object_cred *src_comp, - struct caps_buffers *caps_p) +static void copy_single_comp(struct ore_components *oc, unsigned c, + struct pnfs_osd_object_cred *src_comp) { - WARN_ON(src_comp->oc_cap_key.cred_len > sizeof(caps_p->caps_key)); - WARN_ON(src_comp->oc_cap.cred_len > sizeof(caps_p->creds)); + struct ore_comp *ocomp = &oc->comps[c]; - *cur_comp = *src_comp; + WARN_ON(src_comp->oc_cap_key.cred_len > 0); /* libosd is NO_SEC only */ + WARN_ON(src_comp->oc_cap.cred_len > sizeof(ocomp->cred)); - memcpy(caps_p->caps_key, src_comp->oc_cap_key.cred, - sizeof(caps_p->caps_key)); - cur_comp->oc_cap_key.cred = caps_p->caps_key; + ocomp->obj.partition = src_comp->oc_object_id.oid_partition_id; + ocomp->obj.id = src_comp->oc_object_id.oid_object_id; - memcpy(caps_p->creds, src_comp->oc_cap.cred, - sizeof(caps_p->creds)); - cur_comp->oc_cap.cred = caps_p->creds; + memcpy(ocomp->cred, src_comp->oc_cap.cred, sizeof(ocomp->cred)); +} + +int __alloc_objio_seg(unsigned numdevs, gfp_t gfp_flags, + struct objio_segment **pseg) +{ + struct __alloc_objio_segment { + struct objio_segment olseg; + struct ore_dev *ods[numdevs]; + struct ore_comp comps[numdevs]; + } *aolseg; 
+ + aolseg = kzalloc(sizeof(*aolseg), gfp_flags); + if (unlikely(!aolseg)) { + dprintk("%s: Faild allocation numdevs=%d size=%zd\n", __func__, + numdevs, sizeof(*aolseg)); + return -ENOMEM; + } + + aolseg->olseg.oc.numdevs = numdevs; + aolseg->olseg.oc.single_comp = EC_MULTPLE_COMPS; + aolseg->olseg.oc.comps = aolseg->comps; + aolseg->olseg.oc.ods = aolseg->ods; + + *pseg = &aolseg->olseg; + return 0; } int objio_alloc_lseg(struct pnfs_layout_segment **outp, @@ -324,59 +308,43 @@ int objio_alloc_lseg(struct pnfs_layout_segment **outp, struct objio_segment *objio_seg; struct pnfs_osd_xdr_decode_layout_iter iter; struct pnfs_osd_layout layout; - struct pnfs_osd_object_cred *cur_comp, src_comp; - struct caps_buffers *caps_p; + struct pnfs_osd_object_cred src_comp; + unsigned cur_comp; int err; err = pnfs_osd_xdr_decode_layout_map(&layout, &iter, xdr); if (unlikely(err)) return err; - err = _verify_data_map(&layout); + err = __alloc_objio_seg(layout.olo_num_comps, gfp_flags, &objio_seg); if (unlikely(err)) return err; - objio_seg = kzalloc(sizeof(*objio_seg) + - sizeof(objio_seg->ods[0]) * layout.olo_num_comps + - sizeof(*objio_seg->comps) * layout.olo_num_comps + - sizeof(struct caps_buffers) * layout.olo_num_comps, - gfp_flags); - if (!objio_seg) - return -ENOMEM; + objio_seg->layout.stripe_unit = layout.olo_map.odm_stripe_unit; + objio_seg->layout.group_width = layout.olo_map.odm_group_width; + objio_seg->layout.group_depth = layout.olo_map.odm_group_depth; + objio_seg->layout.mirrors_p1 = layout.olo_map.odm_mirror_cnt + 1; + objio_seg->layout.raid_algorithm = layout.olo_map.odm_raid_algorithm; - objio_seg->comps = (void *)(objio_seg->ods + layout.olo_num_comps); - cur_comp = objio_seg->comps; - caps_p = (void *)(cur_comp + layout.olo_num_comps); - while (pnfs_osd_xdr_decode_layout_comp(&src_comp, &iter, xdr, &err)) - copy_single_comp(cur_comp++, &src_comp, caps_p++); + err = ore_verify_layout(layout.olo_map.odm_num_comps, + &objio_seg->layout); if (unlikely(err)) 
goto err; - objio_seg->num_comps = layout.olo_num_comps; - objio_seg->comps_index = layout.olo_comps_index; - err = objio_devices_lookup(pnfslay, objio_seg, gfp_flags); - if (err) - goto err; - - objio_seg->mirrors_p1 = layout.olo_map.odm_mirror_cnt + 1; - objio_seg->stripe_unit = layout.olo_map.odm_stripe_unit; - if (layout.olo_map.odm_group_width) { - objio_seg->group_width = layout.olo_map.odm_group_width; - objio_seg->group_depth = layout.olo_map.odm_group_depth; - objio_seg->group_count = layout.olo_map.odm_num_comps / - objio_seg->mirrors_p1 / - objio_seg->group_width; - } else { - objio_seg->group_width = layout.olo_map.odm_num_comps / - objio_seg->mirrors_p1; - objio_seg->group_depth = -1; - objio_seg->group_count = 1; + objio_seg->oc.first_dev = layout.olo_comps_index; + cur_comp = 0; + while (pnfs_osd_xdr_decode_layout_comp(&src_comp, &iter, xdr, &err)) { + copy_single_comp(&objio_seg->oc, cur_comp, &src_comp); + err = objio_devices_lookup(pnfslay, objio_seg, cur_comp, + &src_comp.oc_object_id.oid_device_id, + gfp_flags); + if (err) + goto err; + ++cur_comp; } - - /* Cache this calculation it will hit for every page */ - objio_seg->max_io_size = (BIO_MAX_PAGES_KMALLOC * PAGE_SIZE - - objio_seg->stripe_unit) * - objio_seg->group_width; + /* pnfs_osd_xdr_decode_layout_comp returns false on error */ + if (unlikely(err)) + goto err; *outp = &objio_seg->lseg; return 0; @@ -393,10 +361,14 @@ void objio_free_lseg(struct pnfs_layout_segment *lseg) int i; struct objio_segment *objio_seg = OBJIO_LSEG(lseg); - for (i = 0; i < objio_seg->num_comps; i++) { - if (!objio_seg->ods[i]) + for (i = 0; i < objio_seg->oc.numdevs; i++) { + struct ore_dev *od = objio_seg->oc.ods[i]; + struct objio_dev_ent *ode; + + if (!od) break; - nfs4_put_deviceid_node(&objio_seg->ods[i]->id_node); + ode = container_of(od, typeof(*ode), od); + nfs4_put_deviceid_node(&ode->id_node); } kfree(objio_seg); } @@ -411,8 +383,8 @@ objio_alloc_io_state(struct pnfs_layout_hdr *pnfs_layout_type, struct 
objio_state *ios; struct __alloc_objio_state { struct objio_state objios; - struct _objio_per_comp per_dev[objio_seg->num_comps]; - struct pnfs_osd_ioerr ioerrs[objio_seg->num_comps]; + struct _objio_per_comp per_dev[objio_seg->oc.numdevs]; + struct pnfs_osd_ioerr ioerrs[objio_seg->oc.numdevs]; } *aos; aos = kzalloc(sizeof(*aos), gfp_flags); @@ -421,8 +393,9 @@ objio_alloc_io_state(struct pnfs_layout_hdr *pnfs_layout_type, ios = &aos->objios; - ios->layout = objio_seg; - objlayout_init_ioerrs(&aos->objios.oir, objio_seg->num_comps, + ios->layout = &objio_seg->layout; + ios->oc = &objio_seg->oc; + objlayout_init_ioerrs(&aos->objios.oir, objio_seg->oc.numdevs, aos->ioerrs, rpcdata, pnfs_layout_type); ios->pages = pages; @@ -474,6 +447,27 @@ enum pnfs_osd_errno osd_pri_2_pnfs_err(enum osd_err_priority oep) } } +static void __on_dev_error(struct objio_state *ios, bool is_write, + struct ore_dev *od, unsigned dev_index, enum osd_err_priority oep, + u64 dev_offset, u64 dev_len) +{ + struct objio_state *objios = ios->private; + struct pnfs_osd_objid pooid; + struct objio_dev_ent *ode = container_of(od, typeof(*ode), od); + /* FIXME: what to do with more-then-one-group layouts. 
We need to + * translate from ore_io_state index to oc->comps index + */ + unsigned comp = dev_index; + + pooid.oid_device_id = ode->id_node.deviceid; + pooid.oid_partition_id = ios->oc->comps[comp].obj.partition; + pooid.oid_object_id = ios->oc->comps[comp].obj.id; + + objlayout_io_set_result(&objios->oir, comp, + &pooid, osd_pri_2_pnfs_err(oep), + dev_offset, dev_len, is_write); +} + static void _clear_bio(struct bio *bio) { struct bio_vec *bv; @@ -518,12 +512,9 @@ static int _io_check(struct objio_state *ios, bool is_write) continue; /* we recovered */ } - objlayout_io_set_result(&ios->oir, i, - &ios->layout->comps[i].oc_object_id, - osd_pri_2_pnfs_err(osi.osd_err_pri), - ios->per_dev[i].offset, - ios->per_dev[i].length, - is_write); + __on_dev_error(ios, is_write, ios->oc->ods[i], + ios->per_dev[i].dev, osi.osd_err_pri, + ios->per_dev[i].offset, ios->per_dev[i].length); if (osi.osd_err_pri >= oep) { oep = osi.osd_err_pri; @@ -558,11 +549,11 @@ static void _io_free(struct objio_state *ios) struct osd_dev *_io_od(struct objio_state *ios, unsigned dev) { - unsigned min_dev = ios->layout->comps_index; - unsigned max_dev = min_dev + ios->layout->num_comps; + unsigned min_dev = ios->oc->first_dev; + unsigned max_dev = min_dev + ios->oc->numdevs; BUG_ON(dev < min_dev || max_dev <= dev); - return ios->layout->ods[dev - min_dev]->od; + return ios->oc->ods[dev - min_dev]->od; } struct _striping_info { @@ -820,12 +811,9 @@ static int _read_mirrors(struct objio_state *ios, unsigned cur_comp) struct osd_request *or = NULL; struct _objio_per_comp *per_dev = &ios->per_dev[cur_comp]; unsigned dev = per_dev->dev; - struct pnfs_osd_object_cred *cred = - &ios->layout->comps[cur_comp]; - struct osd_obj_id obj = { - .partition = cred->oc_object_id.oid_partition_id, - .id = cred->oc_object_id.oid_object_id, - }; + struct ore_comp *cred = + &ios->oc->comps[cur_comp]; + struct osd_obj_id obj = cred->obj; int ret; or = osd_start_request(_io_od(ios, dev), GFP_KERNEL); @@ -837,7 +825,7 
@@ static int _read_mirrors(struct objio_state *ios, unsigned cur_comp) osd_req_read(or, &obj, per_dev->offset, per_dev->bio, per_dev->length); - ret = osd_finalize_request(or, 0, cred->oc_cap.cred, NULL); + ret = osd_finalize_request(or, 0, cred->cred, NULL); if (ret) { dprintk("%s: Faild to osd_finalize_request() => %d\n", __func__, ret); @@ -924,12 +912,8 @@ static int _write_mirrors(struct objio_state *ios, unsigned cur_comp) for (; cur_comp < last_comp; ++cur_comp, ++dev) { struct osd_request *or = NULL; - struct pnfs_osd_object_cred *cred = - &ios->layout->comps[cur_comp]; - struct osd_obj_id obj = { - .partition = cred->oc_object_id.oid_partition_id, - .id = cred->oc_object_id.oid_object_id, - }; + struct ore_comp *cred = &ios->oc->comps[cur_comp]; + struct osd_obj_id obj = cred->obj; struct _objio_per_comp *per_dev = &ios->per_dev[cur_comp]; struct bio *bio; @@ -964,7 +948,7 @@ static int _write_mirrors(struct objio_state *ios, unsigned cur_comp) osd_req_write(or, &obj, per_dev->offset, bio, per_dev->length); - ret = osd_finalize_request(or, 0, cred->oc_cap.cred, NULL); + ret = osd_finalize_request(or, 0, cred->cred, NULL); if (ret) { dprintk("%s: Faild to osd_finalize_request() => %d\n", __func__, ret); @@ -1030,7 +1014,7 @@ static bool objio_pg_test(struct nfs_pageio_descriptor *pgio, return false; return pgio->pg_count + req->wb_bytes <= - OBJIO_LSEG(pgio->pg_lseg)->max_io_size; + OBJIO_LSEG(pgio->pg_lseg)->layout.max_io_length; } static const struct nfs_pageio_ops objio_pg_read_ops = { From eecfc6312a24e6d0d2883de0a9a6ccf8e993f472 Mon Sep 17 00:00:00 2001 From: Boaz Harrosh Date: Mon, 31 Oct 2011 15:15:38 -0700 Subject: [PATCH 22/25] pnfs-obj: move to ore 02: move to ORE In this patch we are actually moving to the ORE. (Object Raid Engine). objio_state holds a pointer to an ore_io_state. Once we have an ore_io_state at hand we can call the ore for reading/writing. We register on the done path to kick off the nfs io_done mechanism. 
Again for Ease of reviewing the old code is "#if 0" but is not removed so the diff command works better. The old code will be removed in the next patch. fs/exofs/Kconfig::ORE is modified to also be auto-included if PNFS_OBJLAYOUT is set. Since we now depend on ORE. (See comments in fs/exofs/Kconfig) Signed-off-by: Boaz Harrosh Signed-off-by: Trond Myklebust --- fs/exofs/Kconfig | 2 +- fs/nfs/objlayout/objio_osd.c | 133 ++++++++++++++++------------------- 2 files changed, 60 insertions(+), 75 deletions(-) diff --git a/fs/exofs/Kconfig b/fs/exofs/Kconfig index fa9a286c8771..da42f32c49be 100644 --- a/fs/exofs/Kconfig +++ b/fs/exofs/Kconfig @@ -5,7 +5,7 @@ # selected by any of the users. config ORE tristate - depends on EXOFS_FS + depends on EXOFS_FS || PNFS_OBJLAYOUT select ASYNC_XOR default SCSI_OSD_ULD diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c index bd7ec26e2840..00b384934c32 100644 --- a/fs/nfs/objlayout/objio_osd.c +++ b/fs/nfs/objlayout/objio_osd.c @@ -44,12 +44,6 @@ #define NFSDBG_FACILITY NFSDBG_PNFS_LD -#define _LLU(x) ((unsigned long long)x) - -enum { BIO_MAX_PAGES_KMALLOC = - (PAGE_SIZE - sizeof(struct bio)) / sizeof(struct bio_vec), -}; - struct objio_dev_ent { struct nfs4_deviceid_node id_node; struct ore_dev od; @@ -124,37 +118,13 @@ OBJIO_LSEG(struct pnfs_layout_segment *lseg) return container_of(lseg, struct objio_segment, lseg); } -struct objio_state; -typedef int (*objio_done_fn)(struct objio_state *ios); - struct objio_state { /* Generic layer */ struct objlayout_io_res oir; - struct page **pages; - unsigned pgbase; - unsigned nr_pages; - unsigned long count; - loff_t offset; bool sync; - - struct ore_layout *layout; - struct ore_components *oc; - - struct kref kref; - objio_done_fn done; - void *private; - - unsigned long length; - unsigned numdevs; /* Actually used devs in this IO */ - /* A per-device variable array of size numdevs */ - struct _objio_per_comp { - struct bio *bio; - struct osd_request *or; - unsigned 
long length; - u64 offset; - unsigned dev; - } per_dev[]; + /*FIXME: Support for extra_bytes at ore_get_rw_state() */ + struct ore_io_state *ios; }; /* Send and wait for a get_device_info of devices in the layout, @@ -374,16 +344,16 @@ void objio_free_lseg(struct pnfs_layout_segment *lseg) } static int -objio_alloc_io_state(struct pnfs_layout_hdr *pnfs_layout_type, +objio_alloc_io_state(struct pnfs_layout_hdr *pnfs_layout_type, bool is_reading, struct pnfs_layout_segment *lseg, struct page **pages, unsigned pgbase, loff_t offset, size_t count, void *rpcdata, gfp_t gfp_flags, struct objio_state **outp) { struct objio_segment *objio_seg = OBJIO_LSEG(lseg); - struct objio_state *ios; + struct ore_io_state *ios; + int ret; struct __alloc_objio_state { struct objio_state objios; - struct _objio_per_comp per_dev[objio_seg->oc.numdevs]; struct pnfs_osd_ioerr ioerrs[objio_seg->oc.numdevs]; } *aos; @@ -391,30 +361,33 @@ objio_alloc_io_state(struct pnfs_layout_hdr *pnfs_layout_type, if (unlikely(!aos)) return -ENOMEM; - ios = &aos->objios; - - ios->layout = &objio_seg->layout; - ios->oc = &objio_seg->oc; objlayout_init_ioerrs(&aos->objios.oir, objio_seg->oc.numdevs, aos->ioerrs, rpcdata, pnfs_layout_type); + ret = ore_get_rw_state(&objio_seg->layout, &objio_seg->oc, is_reading, + offset, count, &ios); + if (unlikely(ret)) { + kfree(aos); + return ret; + } + ios->pages = pages; ios->pgbase = pgbase; - ios->nr_pages = (pgbase + count + PAGE_SIZE - 1) >> PAGE_SHIFT; - ios->offset = offset; - ios->count = count; - ios->sync = 0; + ios->private = aos; BUG_ON(ios->nr_pages > (pgbase + count + PAGE_SIZE - 1) >> PAGE_SHIFT); - *outp = ios; + aos->objios.sync = 0; + aos->objios.ios = ios; + *outp = &aos->objios; return 0; } void objio_free_result(struct objlayout_io_res *oir) { - struct objio_state *ios = container_of(oir, struct objio_state, oir); + struct objio_state *objios = container_of(oir, struct objio_state, oir); - kfree(ios); + ore_put_io_state(objios->ios); + 
kfree(objios); } enum pnfs_osd_errno osd_pri_2_pnfs_err(enum osd_err_priority oep) @@ -447,7 +420,7 @@ enum pnfs_osd_errno osd_pri_2_pnfs_err(enum osd_err_priority oep) } } -static void __on_dev_error(struct objio_state *ios, bool is_write, +static void __on_dev_error(struct ore_io_state *ios, struct ore_dev *od, unsigned dev_index, enum osd_err_priority oep, u64 dev_offset, u64 dev_len) { @@ -465,9 +438,10 @@ static void __on_dev_error(struct objio_state *ios, bool is_write, objlayout_io_set_result(&objios->oir, comp, &pooid, osd_pri_2_pnfs_err(oep), - dev_offset, dev_len, is_write); + dev_offset, dev_len, !ios->reading); } +#if 0 static void _clear_bio(struct bio *bio) { struct bio_vec *bv; @@ -786,26 +760,28 @@ static int _io_exec(struct objio_state *ios) return ret; } +#endif /* * read */ -static int _read_done(struct objio_state *ios) +static void _read_done(struct ore_io_state *ios, void *private) { + struct objio_state *objios = private; ssize_t status; - int ret = _io_check(ios, false); + int ret = ore_check_io(ios, &__on_dev_error); - _io_free(ios); + /* FIXME: _io_free(ios) can we dealocate the libosd resources; */ if (likely(!ret)) status = ios->length; else status = ret; - objlayout_read_done(&ios->oir, status, ios->sync); - return ret; + objlayout_read_done(&objios->oir, status, objios->sync); } +#if 0 static int _read_mirrors(struct objio_state *ios, unsigned cur_comp) { struct osd_request *or = NULL; @@ -860,49 +836,50 @@ err: _io_free(ios); return ret; } +#endif int objio_read_pagelist(struct nfs_read_data *rdata) { - struct objio_state *ios; + struct objio_state *objios; int ret; - ret = objio_alloc_io_state(NFS_I(rdata->inode)->layout, + ret = objio_alloc_io_state(NFS_I(rdata->inode)->layout, true, rdata->lseg, rdata->args.pages, rdata->args.pgbase, rdata->args.offset, rdata->args.count, rdata, - GFP_KERNEL, &ios); + GFP_KERNEL, &objios); if (unlikely(ret)) return ret; - ret = _io_rw_pagelist(ios, GFP_KERNEL); - if (unlikely(ret)) - return ret; - 
- return _read_exec(ios); + objios->ios->done = _read_done; + dprintk("%s: offset=0x%llx length=0x%x\n", __func__, + rdata->args.offset, rdata->args.count); + return ore_read(objios->ios); } /* * write */ -static int _write_done(struct objio_state *ios) +static void _write_done(struct ore_io_state *ios, void *private) { + struct objio_state *objios = private; ssize_t status; - int ret = _io_check(ios, true); + int ret = ore_check_io(ios, &__on_dev_error); - _io_free(ios); + /* FIXME: _io_free(ios) can we dealocate the libosd resources; */ if (likely(!ret)) { /* FIXME: should be based on the OSD's persistence model * See OSD2r05 Section 4.13 Data persistence model */ - ios->oir.committed = NFS_FILE_SYNC; + objios->oir.committed = NFS_FILE_SYNC; status = ios->length; } else { status = ret; } - objlayout_write_done(&ios->oir, status, ios->sync); - return ret; + objlayout_write_done(&objios->oir, status, objios->sync); } +#if 0 static int _write_mirrors(struct objio_state *ios, unsigned cur_comp) { struct _objio_per_comp *master_dev = &ios->per_dev[cur_comp]; @@ -984,27 +961,35 @@ err: _io_free(ios); return ret; } +#endif int objio_write_pagelist(struct nfs_write_data *wdata, int how) { - struct objio_state *ios; + struct objio_state *objios; int ret; - ret = objio_alloc_io_state(NFS_I(wdata->inode)->layout, + ret = objio_alloc_io_state(NFS_I(wdata->inode)->layout, false, wdata->lseg, wdata->args.pages, wdata->args.pgbase, wdata->args.offset, wdata->args.count, wdata, GFP_NOFS, - &ios); + &objios); if (unlikely(ret)) return ret; - ios->sync = 0 != (how & FLUSH_SYNC); + objios->sync = 0 != (how & FLUSH_SYNC); - /* TODO: ios->stable = stable; */ - ret = _io_rw_pagelist(ios, GFP_NOFS); + if (!objios->sync) + objios->ios->done = _write_done; + + dprintk("%s: offset=0x%llx length=0x%x\n", __func__, + wdata->args.offset, wdata->args.count); + ret = ore_write(objios->ios); if (unlikely(ret)) return ret; - return _write_exec(ios); + if (objios->sync) + _write_done(objios->ios, 
objios); + + return 0; } static bool objio_pg_test(struct nfs_pageio_descriptor *pgio, From 04291b628c450ab6fdb606836585f16336662a4e Mon Sep 17 00:00:00 2001 From: Boaz Harrosh Date: Mon, 31 Oct 2011 15:16:15 -0700 Subject: [PATCH 23/25] pnfs-obj: move to ore 03: Remove old raid engine Finally remove all the old raid engine, which is by now dead code. Signed-off-by: Boaz Harrosh Signed-off-by: Trond Myklebust --- fs/nfs/objlayout/objio_osd.c | 504 ----------------------------------- 1 file changed, 504 deletions(-) diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c index 00b384934c32..3161da654a9b 100644 --- a/fs/nfs/objlayout/objio_osd.c +++ b/fs/nfs/objlayout/objio_osd.c @@ -188,48 +188,6 @@ out: return err; } -#if 0 -static int _verify_data_map(struct pnfs_osd_layout *layout) -{ - struct pnfs_osd_data_map *data_map = &layout->olo_map; - u64 stripe_length; - u32 group_width; - -/* FIXME: Only raid0 for now. if not go through MDS */ - if (data_map->odm_raid_algorithm != PNFS_OSD_RAID_0) { - printk(KERN_ERR "Only RAID_0 for now\n"); - return -ENOTSUPP; - } - if (0 != (data_map->odm_num_comps % (data_map->odm_mirror_cnt + 1))) { - printk(KERN_ERR "Data Map wrong, num_comps=%u mirrors=%u\n", - data_map->odm_num_comps, data_map->odm_mirror_cnt); - return -EINVAL; - } - - if (data_map->odm_group_width) - group_width = data_map->odm_group_width; - else - group_width = data_map->odm_num_comps / - (data_map->odm_mirror_cnt + 1); - - stripe_length = (u64)data_map->odm_stripe_unit * group_width; - if (stripe_length >= (1ULL << 32)) { - printk(KERN_ERR "Total Stripe length(0x%llx)" - " >= 32bit is not supported\n", _LLU(stripe_length)); - return -ENOTSUPP; - } - - if (0 != (data_map->odm_stripe_unit & ~PAGE_MASK)) { - printk(KERN_ERR "Stripe Unit(0x%llx)" - " must be Multples of PAGE_SIZE(0x%lx)\n", - _LLU(data_map->odm_stripe_unit), PAGE_SIZE); - return -ENOTSUPP; - } - - return 0; -} -#endif - static void copy_single_comp(struct ore_components *oc, 
unsigned c, struct pnfs_osd_object_cred *src_comp) { @@ -441,327 +399,6 @@ static void __on_dev_error(struct ore_io_state *ios, dev_offset, dev_len, !ios->reading); } -#if 0 -static void _clear_bio(struct bio *bio) -{ - struct bio_vec *bv; - unsigned i; - - __bio_for_each_segment(bv, bio, i, 0) { - unsigned this_count = bv->bv_len; - - if (likely(PAGE_SIZE == this_count)) - clear_highpage(bv->bv_page); - else - zero_user(bv->bv_page, bv->bv_offset, this_count); - } -} - -static int _io_check(struct objio_state *ios, bool is_write) -{ - enum osd_err_priority oep = OSD_ERR_PRI_NO_ERROR; - int lin_ret = 0; - int i; - - for (i = 0; i < ios->numdevs; i++) { - struct osd_sense_info osi; - struct osd_request *or = ios->per_dev[i].or; - int ret; - - if (!or) - continue; - - ret = osd_req_decode_sense(or, &osi); - if (likely(!ret)) - continue; - - if (OSD_ERR_PRI_CLEAR_PAGES == osi.osd_err_pri) { - /* start read offset passed endof file */ - BUG_ON(is_write); - _clear_bio(ios->per_dev[i].bio); - dprintk("%s: start read offset passed end of file " - "offset=0x%llx, length=0x%lx\n", __func__, - _LLU(ios->per_dev[i].offset), - ios->per_dev[i].length); - - continue; /* we recovered */ - } - __on_dev_error(ios, is_write, ios->oc->ods[i], - ios->per_dev[i].dev, osi.osd_err_pri, - ios->per_dev[i].offset, ios->per_dev[i].length); - - if (osi.osd_err_pri >= oep) { - oep = osi.osd_err_pri; - lin_ret = ret; - } - } - - return lin_ret; -} - -/* - * Common IO state helpers. 
- */ -static void _io_free(struct objio_state *ios) -{ - unsigned i; - - for (i = 0; i < ios->numdevs; i++) { - struct _objio_per_comp *per_dev = &ios->per_dev[i]; - - if (per_dev->or) { - osd_end_request(per_dev->or); - per_dev->or = NULL; - } - - if (per_dev->bio) { - bio_put(per_dev->bio); - per_dev->bio = NULL; - } - } -} - -struct osd_dev *_io_od(struct objio_state *ios, unsigned dev) -{ - unsigned min_dev = ios->oc->first_dev; - unsigned max_dev = min_dev + ios->oc->numdevs; - - BUG_ON(dev < min_dev || max_dev <= dev); - return ios->oc->ods[dev - min_dev]->od; -} - -struct _striping_info { - u64 obj_offset; - u64 group_length; - unsigned dev; - unsigned unit_off; -}; - -static void _calc_stripe_info(struct objio_state *ios, u64 file_offset, - struct _striping_info *si) -{ - u32 stripe_unit = ios->layout->stripe_unit; - u32 group_width = ios->layout->group_width; - u64 group_depth = ios->layout->group_depth; - u32 U = stripe_unit * group_width; - - u64 T = U * group_depth; - u64 S = T * ios->layout->group_count; - u64 M = div64_u64(file_offset, S); - - /* - G = (L - (M * S)) / T - H = (L - (M * S)) % T - */ - u64 LmodU = file_offset - M * S; - u32 G = div64_u64(LmodU, T); - u64 H = LmodU - G * T; - - u32 N = div_u64(H, U); - - div_u64_rem(file_offset, stripe_unit, &si->unit_off); - si->obj_offset = si->unit_off + (N * stripe_unit) + - (M * group_depth * stripe_unit); - - /* "H - (N * U)" is just "H % U" so it's bound to u32 */ - si->dev = (u32)(H - (N * U)) / stripe_unit + G * group_width; - si->dev *= ios->layout->mirrors_p1; - - si->group_length = T - H; -} - -static int _add_stripe_unit(struct objio_state *ios, unsigned *cur_pg, - unsigned pgbase, struct _objio_per_comp *per_dev, int len, - gfp_t gfp_flags) -{ - unsigned pg = *cur_pg; - int cur_len = len; - struct request_queue *q = - osd_request_queue(_io_od(ios, per_dev->dev)); - - if (per_dev->bio == NULL) { - unsigned pages_in_stripe = ios->layout->group_width * - (ios->layout->stripe_unit / PAGE_SIZE); 
- unsigned bio_size = (ios->nr_pages + pages_in_stripe) / - ios->layout->group_width; - - if (BIO_MAX_PAGES_KMALLOC < bio_size) - bio_size = BIO_MAX_PAGES_KMALLOC; - - per_dev->bio = bio_kmalloc(gfp_flags, bio_size); - if (unlikely(!per_dev->bio)) { - dprintk("Faild to allocate BIO size=%u\n", bio_size); - return -ENOMEM; - } - } - - while (cur_len > 0) { - unsigned pglen = min_t(unsigned, PAGE_SIZE - pgbase, cur_len); - unsigned added_len; - - BUG_ON(ios->nr_pages <= pg); - cur_len -= pglen; - - added_len = bio_add_pc_page(q, per_dev->bio, - ios->pages[pg], pglen, pgbase); - if (unlikely(pglen != added_len)) - return -ENOMEM; - pgbase = 0; - ++pg; - } - BUG_ON(cur_len); - - per_dev->length += len; - *cur_pg = pg; - return 0; -} - -static int _prepare_one_group(struct objio_state *ios, u64 length, - struct _striping_info *si, unsigned *last_pg, - gfp_t gfp_flags) -{ - unsigned stripe_unit = ios->layout->stripe_unit; - unsigned mirrors_p1 = ios->layout->mirrors_p1; - unsigned devs_in_group = ios->layout->group_width * mirrors_p1; - unsigned dev = si->dev; - unsigned first_dev = dev - (dev % devs_in_group); - unsigned max_comp = ios->numdevs ? 
ios->numdevs - mirrors_p1 : 0; - unsigned cur_pg = *last_pg; - int ret = 0; - - while (length) { - struct _objio_per_comp *per_dev = &ios->per_dev[dev - first_dev]; - unsigned cur_len, page_off = 0; - - if (!per_dev->length) { - per_dev->dev = dev; - if (dev < si->dev) { - per_dev->offset = si->obj_offset + stripe_unit - - si->unit_off; - cur_len = stripe_unit; - } else if (dev == si->dev) { - per_dev->offset = si->obj_offset; - cur_len = stripe_unit - si->unit_off; - page_off = si->unit_off & ~PAGE_MASK; - BUG_ON(page_off && - (page_off != ios->pgbase)); - } else { /* dev > si->dev */ - per_dev->offset = si->obj_offset - si->unit_off; - cur_len = stripe_unit; - } - - if (max_comp < dev - first_dev) - max_comp = dev - first_dev; - } else { - cur_len = stripe_unit; - } - if (cur_len >= length) - cur_len = length; - - ret = _add_stripe_unit(ios, &cur_pg, page_off , per_dev, - cur_len, gfp_flags); - if (unlikely(ret)) - goto out; - - dev += mirrors_p1; - dev = (dev % devs_in_group) + first_dev; - - length -= cur_len; - ios->length += cur_len; - } -out: - ios->numdevs = max_comp + mirrors_p1; - *last_pg = cur_pg; - return ret; -} - -static int _io_rw_pagelist(struct objio_state *ios, gfp_t gfp_flags) -{ - u64 length = ios->count; - u64 offset = ios->offset; - struct _striping_info si; - unsigned last_pg = 0; - int ret = 0; - - while (length) { - _calc_stripe_info(ios, offset, &si); - - if (length < si.group_length) - si.group_length = length; - - ret = _prepare_one_group(ios, si.group_length, &si, &last_pg, gfp_flags); - if (unlikely(ret)) - goto out; - - offset += si.group_length; - length -= si.group_length; - } - -out: - if (!ios->length) - return ret; - - return 0; -} - -static int _sync_done(struct objio_state *ios) -{ - struct completion *waiting = ios->private; - - complete(waiting); - return 0; -} - -static void _last_io(struct kref *kref) -{ - struct objio_state *ios = container_of(kref, struct objio_state, kref); - - ios->done(ios); -} - -static void 
_done_io(struct osd_request *or, void *p) -{ - struct objio_state *ios = p; - - kref_put(&ios->kref, _last_io); -} - -static int _io_exec(struct objio_state *ios) -{ - DECLARE_COMPLETION_ONSTACK(wait); - int ret = 0; - unsigned i; - objio_done_fn saved_done_fn = ios->done; - bool sync = ios->sync; - - if (sync) { - ios->done = _sync_done; - ios->private = &wait; - } - - kref_init(&ios->kref); - - for (i = 0; i < ios->numdevs; i++) { - struct osd_request *or = ios->per_dev[i].or; - - if (!or) - continue; - - kref_get(&ios->kref); - osd_execute_request_async(or, _done_io, ios); - } - - kref_put(&ios->kref, _last_io); - - if (sync) { - wait_for_completion(&wait); - ret = saved_done_fn(ios); - } - - return ret; -} -#endif - /* * read */ @@ -781,63 +418,6 @@ static void _read_done(struct ore_io_state *ios, void *private) objlayout_read_done(&objios->oir, status, objios->sync); } -#if 0 -static int _read_mirrors(struct objio_state *ios, unsigned cur_comp) -{ - struct osd_request *or = NULL; - struct _objio_per_comp *per_dev = &ios->per_dev[cur_comp]; - unsigned dev = per_dev->dev; - struct ore_comp *cred = - &ios->oc->comps[cur_comp]; - struct osd_obj_id obj = cred->obj; - int ret; - - or = osd_start_request(_io_od(ios, dev), GFP_KERNEL); - if (unlikely(!or)) { - ret = -ENOMEM; - goto err; - } - per_dev->or = or; - - osd_req_read(or, &obj, per_dev->offset, per_dev->bio, per_dev->length); - - ret = osd_finalize_request(or, 0, cred->cred, NULL); - if (ret) { - dprintk("%s: Faild to osd_finalize_request() => %d\n", - __func__, ret); - goto err; - } - - dprintk("%s:[%d] dev=%d obj=0x%llx start=0x%llx length=0x%lx\n", - __func__, cur_comp, dev, obj.id, _LLU(per_dev->offset), - per_dev->length); - -err: - return ret; -} - -static int _read_exec(struct objio_state *ios) -{ - unsigned i; - int ret; - - for (i = 0; i < ios->numdevs; i += ios->layout->mirrors_p1) { - if (!ios->per_dev[i].length) - continue; - ret = _read_mirrors(ios, i); - if (unlikely(ret)) - goto err; - } - - 
ios->done = _read_done; - return _io_exec(ios); - -err: - _io_free(ios); - return ret; -} -#endif - int objio_read_pagelist(struct nfs_read_data *rdata) { struct objio_state *objios; @@ -879,90 +459,6 @@ static void _write_done(struct ore_io_state *ios, void *private) objlayout_write_done(&objios->oir, status, objios->sync); } -#if 0 -static int _write_mirrors(struct objio_state *ios, unsigned cur_comp) -{ - struct _objio_per_comp *master_dev = &ios->per_dev[cur_comp]; - unsigned dev = ios->per_dev[cur_comp].dev; - unsigned last_comp = cur_comp + ios->layout->mirrors_p1; - int ret; - - for (; cur_comp < last_comp; ++cur_comp, ++dev) { - struct osd_request *or = NULL; - struct ore_comp *cred = &ios->oc->comps[cur_comp]; - struct osd_obj_id obj = cred->obj; - struct _objio_per_comp *per_dev = &ios->per_dev[cur_comp]; - struct bio *bio; - - or = osd_start_request(_io_od(ios, dev), GFP_NOFS); - if (unlikely(!or)) { - ret = -ENOMEM; - goto err; - } - per_dev->or = or; - - if (per_dev != master_dev) { - bio = bio_kmalloc(GFP_NOFS, - master_dev->bio->bi_max_vecs); - if (unlikely(!bio)) { - dprintk("Faild to allocate BIO size=%u\n", - master_dev->bio->bi_max_vecs); - ret = -ENOMEM; - goto err; - } - - __bio_clone(bio, master_dev->bio); - bio->bi_bdev = NULL; - bio->bi_next = NULL; - per_dev->bio = bio; - per_dev->dev = dev; - per_dev->length = master_dev->length; - per_dev->offset = master_dev->offset; - } else { - bio = master_dev->bio; - bio->bi_rw |= REQ_WRITE; - } - - osd_req_write(or, &obj, per_dev->offset, bio, per_dev->length); - - ret = osd_finalize_request(or, 0, cred->cred, NULL); - if (ret) { - dprintk("%s: Faild to osd_finalize_request() => %d\n", - __func__, ret); - goto err; - } - - dprintk("%s:[%d] dev=%d obj=0x%llx start=0x%llx length=0x%lx\n", - __func__, cur_comp, dev, obj.id, _LLU(per_dev->offset), - per_dev->length); - } - -err: - return ret; -} - -static int _write_exec(struct objio_state *ios) -{ - unsigned i; - int ret; - - for (i = 0; i < 
ios->numdevs; i += ios->layout->mirrors_p1) { - if (!ios->per_dev[i].length) - continue; - ret = _write_mirrors(ios, i); - if (unlikely(ret)) - goto err; - } - - ios->done = _write_done; - return _io_exec(ios); - -err: - _io_free(ios); - return ret; -} -#endif - int objio_write_pagelist(struct nfs_write_data *wdata, int how) { struct objio_state *objios; From 278c023a99b0d6b471d0f4a79835c703482e29ac Mon Sep 17 00:00:00 2001 From: Boaz Harrosh Date: Mon, 31 Oct 2011 15:16:54 -0700 Subject: [PATCH 24/25] pnfs-obj: Support for RAID5 read-4-write interface. The ore need suplied a r4w_get_page/r4w_put_page API from Filesystem so it can get cache pages to read-into when writing parial stripes. Signed-off-by: Boaz Harrosh Signed-off-by: Trond Myklebust --- fs/nfs/objlayout/objio_osd.c | 38 ++++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c index 3161da654a9b..c807ab93140e 100644 --- a/fs/nfs/objlayout/objio_osd.c +++ b/fs/nfs/objlayout/objio_osd.c @@ -459,6 +459,43 @@ static void _write_done(struct ore_io_state *ios, void *private) objlayout_write_done(&objios->oir, status, objios->sync); } +static struct page *__r4w_get_page(void *priv, u64 offset, bool *uptodate) +{ + struct objio_state *objios = priv; + struct nfs_write_data *wdata = objios->oir.rpcdata; + pgoff_t index = offset / PAGE_SIZE; + struct page *page = find_get_page(wdata->inode->i_mapping, index); + + if (!page) { + page = find_or_create_page(wdata->inode->i_mapping, + index, GFP_NOFS); + if (unlikely(!page)) { + dprintk("%s: grab_cache_page Failed index=0x%lx\n", + __func__, index); + return NULL; + } + unlock_page(page); + } + if (PageDirty(page) || PageWriteback(page)) + *uptodate = true; + else + *uptodate = PageUptodate(page); + dprintk("%s: index=0x%lx uptodate=%d\n", __func__, index, *uptodate); + return page; +} + +static void __r4w_put_page(void *priv, struct page *page) +{ + dprintk("%s: index=0x%lx\n", 
__func__, page->index); + page_cache_release(page); + return; +} + +static const struct _ore_r4w_op _r4w_op = { + .get_page = &__r4w_get_page, + .put_page = &__r4w_put_page, +}; + int objio_write_pagelist(struct nfs_write_data *wdata, int how) { struct objio_state *objios; @@ -472,6 +509,7 @@ int objio_write_pagelist(struct nfs_write_data *wdata, int how) return ret; objios->sync = 0 != (how & FLUSH_SYNC); + objios->ios->r4w = &_r4w_op; if (!objios->sync) objios->ios->done = _write_done; From 6070295efc90d1093b2031c43380bd7d9673c802 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Fri, 4 Nov 2011 07:04:10 -0400 Subject: [PATCH 25/25] nfs: set vs_hidden on nfs4_callback_version4 (try #2) This service should not be registered with or unregistered from rpcbind. Signed-off-by: Jeff Layton Signed-off-by: Trond Myklebust --- fs/nfs/callback_xdr.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c index ee1a5b3cd48d..726e59a9e50f 100644 --- a/fs/nfs/callback_xdr.c +++ b/fs/nfs/callback_xdr.c @@ -987,4 +987,5 @@ struct svc_version nfs4_callback_version4 = { .vs_proc = nfs4_callback_procedures1, .vs_xdrsize = NFS4_CALLBACK_XDRSIZE, .vs_dispatch = NULL, + .vs_hidden = 1, };