From ddeff520f02b92128132c282c350fa72afffb84a Mon Sep 17 00:00:00 2001 From: Nikita Danilov Date: Wed, 9 Aug 2006 13:53:47 -0400 Subject: [PATCH 01/15] NFS: Fix a potential deadlock in nfs_release_page nfs_wb_page() waits on request completion and, as a result, is not safe to be called from nfs_release_page() invoked by VM scanner as part of GFP_NOFS allocation. Fix possible deadlock by analyzing gfp mask and refusing to release page if __GFP_FS is not set. Signed-off-by: Nikita Danilov Signed-off-by: Trond Myklebust (cherry picked from 374d969debfb290bafcb41d28918dc6f7e43ce31 commit) --- fs/nfs/file.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/fs/nfs/file.c b/fs/nfs/file.c index cc2b874ad5a4..48e892880d5b 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -312,7 +312,13 @@ static void nfs_invalidate_page(struct page *page, unsigned long offset) static int nfs_release_page(struct page *page, gfp_t gfp) { - return !nfs_wb_page(page->mapping->host, page); + if (gfp & __GFP_FS) + return !nfs_wb_page(page->mapping->host, page); + else + /* + * Avoid deadlock on nfs_wait_on_request(). + */ + return 0; } const struct address_space_operations nfs_file_aops = { From a634904a7de0d3a0bc606f608007a34e8c05bfee Mon Sep 17 00:00:00 2001 From: ASANO Masahiro Date: Tue, 22 Aug 2006 20:06:02 -0400 Subject: [PATCH 02/15] VFS: add lookup hint for network file systems I'm trying to speeding up mkdir(2) for network file systems. A typical mkdir(2) calls two inode_operations: lookup and mkdir. The lookup operation would fail with ENOENT in common case. I think it is unnecessary because the subsequent mkdir operation can check it. In case of creat(2), lookup operation is called with the LOOKUP_CREATE flag, so individual filesystem can omit real lookup. e.g. nfs_lookup(). Here is a sample patch which uses LOOKUP_CREATE and O_EXCL on mkdir, symlink and mknod. This uses the gadget for creat(2). And here is the result of a benchmark on NFSv3. mkdir(2) 10,000 times: original 50.5 sec patched 29.0 sec Signed-off-by: ASANO Masahiro Signed-off-by: Andrew Morton Signed-off-by: Trond Myklebust (cherry picked from fab7bf44449b29f9d5572a5dd8adcf7c91d5bf0f commit) --- fs/namei.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/namei.c b/fs/namei.c index 55a131230f94..863166441bf3 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1767,6 +1767,8 @@ struct dentry *lookup_create(struct nameidata *nd, int is_dir) if (nd->last_type != LAST_NORM) goto fail; nd->flags &= ~LOOKUP_PARENT; + nd->flags |= LOOKUP_CREATE; + nd->intent.open.flags = O_EXCL; /* * Do the final lookup. From 5d67476fff2df6ff12f60b540fd0e74cf2a668f9 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 31 Jul 2006 14:11:48 -0700 Subject: [PATCH 03/15] SUNRPC: make rpc_unlink() take a dentry argument instead of a path Signe-off-by: Trond Myklebust (cherry picked from 88bf6d811b01a4be7fd507d18bf5f1c527989089 commit) --- fs/nfs/idmap.c | 3 +-- include/linux/sunrpc/rpc_pipe_fs.h | 2 +- net/sunrpc/auth_gss/auth_gss.c | 2 +- net/sunrpc/rpc_pipe.c | 20 ++++++-------------- 4 files changed, 9 insertions(+), 18 deletions(-) diff --git a/fs/nfs/idmap.c b/fs/nfs/idmap.c index b81e7ed3c902..df0be1214358 100644 --- a/fs/nfs/idmap.c +++ b/fs/nfs/idmap.c @@ -130,9 +130,8 @@ nfs_idmap_delete(struct nfs4_client *clp) if (!idmap) return; + rpc_unlink(idmap->idmap_dentry); dput(idmap->idmap_dentry); - idmap->idmap_dentry = NULL; - rpc_unlink(idmap->idmap_path); clp->cl_idmap = NULL; kfree(idmap); } diff --git a/include/linux/sunrpc/rpc_pipe_fs.h b/include/linux/sunrpc/rpc_pipe_fs.h index 2c2189cb30aa..04d2767d5ef7 100644 --- a/include/linux/sunrpc/rpc_pipe_fs.h +++ b/include/linux/sunrpc/rpc_pipe_fs.h @@ -44,7 +44,7 @@ extern int rpc_queue_upcall(struct inode *, struct rpc_pipe_msg *); extern struct dentry *rpc_mkdir(char *, struct rpc_clnt *); extern int rpc_rmdir(char *); extern struct dentry *rpc_mkpipe(char *, void *, struct rpc_pipe_ops *, int flags); -extern int rpc_unlink(char *); +extern int rpc_unlink(struct dentry *); extern struct vfsmount *rpc_get_mount(void); extern void rpc_put_mount(void); diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c index 4a9aa9393b97..beaa7b848246 100644 --- a/net/sunrpc/auth_gss/auth_gss.c +++ b/net/sunrpc/auth_gss/auth_gss.c @@ -718,7 +718,7 @@ gss_destroy(struct rpc_auth *auth) auth, auth->au_flavor); gss_auth = container_of(auth, struct gss_auth, rpc_auth); - rpc_unlink(gss_auth->path); + rpc_unlink(gss_auth->dentry); dput(gss_auth->dentry); gss_auth->dentry = NULL; gss_mech_put(gss_auth->mech); diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c index a3bd2db2e024..9144f2767b66 100644 --- a/net/sunrpc/rpc_pipe.c +++ b/net/sunrpc/rpc_pipe.c @@ -746,22 +746,15 @@ err_dput: } int -rpc_unlink(char *path) +rpc_unlink(struct dentry *dentry) { - struct nameidata nd; - struct dentry *dentry; + struct dentry *parent; struct inode *dir; - int error; + int error = 0; - if ((error = rpc_lookup_parent(path, &nd)) != 0) - return error; - dir = nd.dentry->d_inode; + parent = dget_parent(dentry); + dir = parent->d_inode; mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT); - dentry = lookup_one_len(nd.last.name, nd.dentry, nd.last.len); - if (IS_ERR(dentry)) { - error = PTR_ERR(dentry); - goto out_release; - } d_drop(dentry); if (dentry->d_inode) { rpc_close_pipes(dentry->d_inode); @@ -769,9 +762,8 @@ rpc_unlink(char *path) } dput(dentry); inode_dir_notify(dir, DN_DELETE); -out_release: mutex_unlock(&dir->i_mutex); - rpc_release_path(&nd); + dput(parent); return error; } From dff02cc1a34fcb60904a2c57cb351857cc11219e Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 31 Jul 2006 14:17:18 -0700 Subject: [PATCH 04/15] NFS: clean up rpc_rmdir Make it take a dentry argument instead of a path Signed-off-by: Trond Myklebust (cherry picked from 648d4116eb2509f010f7f34704a650150309b3e7 commit) --- include/linux/sunrpc/rpc_pipe_fs.h | 2 +- net/sunrpc/clnt.c | 6 +++--- net/sunrpc/rpc_pipe.c | 18 +++++------------- 3 files changed, 9 insertions(+), 17 deletions(-) diff --git a/include/linux/sunrpc/rpc_pipe_fs.h b/include/linux/sunrpc/rpc_pipe_fs.h index 04d2767d5ef7..a481472c9484 100644 --- a/include/linux/sunrpc/rpc_pipe_fs.h +++ b/include/linux/sunrpc/rpc_pipe_fs.h @@ -42,7 +42,7 @@ RPC_I(struct inode *inode) extern int rpc_queue_upcall(struct inode *, struct rpc_pipe_msg *); extern struct dentry *rpc_mkdir(char *, struct rpc_clnt *); -extern int rpc_rmdir(char *); +extern int rpc_rmdir(struct dentry *); extern struct dentry *rpc_mkpipe(char *, void *, struct rpc_pipe_ops *, int flags); extern int rpc_unlink(struct dentry *); extern struct vfsmount *rpc_get_mount(void); diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index d6409e757219..d307556872db 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -183,7 +183,7 @@ rpc_new_client(struct rpc_xprt *xprt, char *servname, out_no_auth: if (!IS_ERR(clnt->cl_dentry)) { - rpc_rmdir(clnt->cl_pathname); + rpc_rmdir(clnt->cl_dentry); dput(clnt->cl_dentry); rpc_put_mount(); } @@ -320,8 +320,8 @@ rpc_destroy_client(struct rpc_clnt *clnt) rpc_destroy_client(clnt->cl_parent); goto out_free; } - if (clnt->cl_pathname[0]) - rpc_rmdir(clnt->cl_pathname); + if (!IS_ERR(clnt->cl_dentry)) + rpc_rmdir(clnt->cl_dentry); if (clnt->cl_xprt) { xprt_destroy(clnt->cl_xprt); clnt->cl_xprt = NULL; diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c index 9144f2767b66..9c355e1ae61a 100644 --- a/net/sunrpc/rpc_pipe.c +++ b/net/sunrpc/rpc_pipe.c @@ -684,28 +684,20 @@ err_dput: } int -rpc_rmdir(char *path) +rpc_rmdir(struct dentry *dentry) { - struct nameidata nd; - struct dentry *dentry; + struct dentry *parent; struct inode *dir; int error; - if ((error = rpc_lookup_parent(path, &nd)) != 0) - return error; - dir = nd.dentry->d_inode; + parent = dget_parent(dentry); + dir = parent->d_inode; mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT); - dentry = lookup_one_len(nd.last.name, nd.dentry, nd.last.len); - if (IS_ERR(dentry)) { - error = PTR_ERR(dentry); - goto out_release; - } rpc_depopulate(dentry); error = __rpc_rmdir(dir, dentry); dput(dentry); -out_release: mutex_unlock(&dir->i_mutex); - rpc_release_path(&nd); + dput(parent); return error; } From 68adb0af51ebccb72ffb14d49cb8121b1afc4259 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 10 Aug 2006 17:51:46 -0400 Subject: [PATCH 05/15] SUNRPC: rpc_unlink() must check for unhashed dentries A prior call to rpc_depopulate() by rpc_rmdir() on the parent directory may have already called simple_unlink() on this entry. Add the same check to rpc_rmdir(). Also remove a redundant call to rpc_close_pipes() in rpc_rmdir. Signed-off-by: Trond Myklebust (cherry picked from 0bbfb9d20f6437c4031aa3bf9b4d311a053e58e3 commit) --- net/sunrpc/rpc_pipe.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c index 9c355e1ae61a..0b1a1ac8a4bc 100644 --- a/net/sunrpc/rpc_pipe.c +++ b/net/sunrpc/rpc_pipe.c @@ -539,6 +539,7 @@ repeat: rpc_close_pipes(dentry->d_inode); simple_unlink(dir, dentry); } + inode_dir_notify(dir, DN_DELETE); dput(dentry); } while (n); goto repeat; @@ -610,8 +611,8 @@ __rpc_rmdir(struct inode *dir, struct dentry *dentry) int error; shrink_dcache_parent(dentry); - if (dentry->d_inode) - rpc_close_pipes(dentry->d_inode); + if (d_unhashed(dentry)) + return 0; if ((error = simple_rmdir(dir, dentry)) != 0) return error; if (!error) { @@ -747,13 +748,15 @@ rpc_unlink(struct dentry *dentry) parent = dget_parent(dentry); dir = parent->d_inode; mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT); - d_drop(dentry); - if (dentry->d_inode) { - rpc_close_pipes(dentry->d_inode); - error = simple_unlink(dir, dentry); + if (!d_unhashed(dentry)) { + d_drop(dentry); + if (dentry->d_inode) { + rpc_close_pipes(dentry->d_inode); + error = simple_unlink(dir, dentry); + } + inode_dir_notify(dir, DN_DELETE); } dput(dentry); - inode_dir_notify(dir, DN_DELETE); mutex_unlock(&dir->i_mutex); dput(parent); return error; From 8f8e7a50f450fcb86a5b2ffb94543c57a14f8260 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 14 Aug 2006 13:11:15 -0400 Subject: [PATCH 06/15] SUNRPC: Fix dentry refcounting issues with users of rpc_pipefs rpc_unlink() and rpc_rmdir() will dput the dentry reference for you. Signed-off-by: Trond Myklebust (cherry picked from a05a57effa71a1f67ccbfc52335c10c8b85f3f6a commit) --- fs/nfs/idmap.c | 1 - net/sunrpc/auth_gss/auth_gss.c | 1 - net/sunrpc/clnt.c | 15 ++++++--------- 3 files changed, 6 insertions(+), 11 deletions(-) diff --git a/fs/nfs/idmap.c b/fs/nfs/idmap.c index df0be1214358..07a5dd57646e 100644 --- a/fs/nfs/idmap.c +++ b/fs/nfs/idmap.c @@ -131,7 +131,6 @@ nfs_idmap_delete(struct nfs4_client *clp) if (!idmap) return; rpc_unlink(idmap->idmap_dentry); - dput(idmap->idmap_dentry); clp->cl_idmap = NULL; kfree(idmap); } diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c index beaa7b848246..ef1cf5b476c8 100644 --- a/net/sunrpc/auth_gss/auth_gss.c +++ b/net/sunrpc/auth_gss/auth_gss.c @@ -719,7 +719,6 @@ gss_destroy(struct rpc_auth *auth) gss_auth = container_of(auth, struct gss_auth, rpc_auth); rpc_unlink(gss_auth->dentry); - dput(gss_auth->dentry); gss_auth->dentry = NULL; gss_mech_put(gss_auth->mech); diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index d307556872db..d9eac7069101 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -184,7 +184,6 @@ rpc_new_client(struct rpc_xprt *xprt, char *servname, out_no_auth: if (!IS_ERR(clnt->cl_dentry)) { rpc_rmdir(clnt->cl_dentry); - dput(clnt->cl_dentry); rpc_put_mount(); } out_no_path: @@ -251,10 +250,8 @@ rpc_clone_client(struct rpc_clnt *clnt) new->cl_autobind = 0; new->cl_oneshot = 0; new->cl_dead = 0; - if (!IS_ERR(new->cl_dentry)) { + if (!IS_ERR(new->cl_dentry)) dget(new->cl_dentry); - rpc_get_mount(); - } rpc_init_rtt(&new->cl_rtt_default, clnt->cl_xprt->timeout.to_initval); if (new->cl_auth) atomic_inc(&new->cl_auth->au_count); @@ -317,11 +314,15 @@ rpc_destroy_client(struct rpc_clnt *clnt) clnt->cl_auth = NULL; } if (clnt->cl_parent != clnt) { + if (!IS_ERR(clnt->cl_dentry)) + dput(clnt->cl_dentry); rpc_destroy_client(clnt->cl_parent); goto out_free; } - if (!IS_ERR(clnt->cl_dentry)) + if (!IS_ERR(clnt->cl_dentry)) { rpc_rmdir(clnt->cl_dentry); + rpc_put_mount(); + } if (clnt->cl_xprt) { xprt_destroy(clnt->cl_xprt); clnt->cl_xprt = NULL; @@ -331,10 +332,6 @@ rpc_destroy_client(struct rpc_clnt *clnt) out_free: rpc_free_iostats(clnt->cl_metrics); clnt->cl_metrics = NULL; - if (!IS_ERR(clnt->cl_dentry)) { - dput(clnt->cl_dentry); - rpc_put_mount(); - } kfree(clnt); return 0; } From 01df9c5e918ae5559f2d96da0143f8bfbb9e6171 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 10 Aug 2006 11:58:57 -0400 Subject: [PATCH 07/15] LOCKD: Fix a deadlock in nlm_traverse_files() nlm_traverse_files() is not allowed to hold the nlm_file_mutex while calling nlm_inspect file, since it may end up calling nlm_release_file() when releaseing the blocks. Signed-off-by: Trond Myklebust (cherry picked from e558d3cde986e04f68afe8c790ad68ef4b94587a commit) --- fs/lockd/svcsubs.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/fs/lockd/svcsubs.c b/fs/lockd/svcsubs.c index 2a4df9b3779a..01b4db9e5466 100644 --- a/fs/lockd/svcsubs.c +++ b/fs/lockd/svcsubs.c @@ -237,19 +237,22 @@ static int nlm_traverse_files(struct nlm_host *host, int action) { struct nlm_file *file, **fp; - int i; + int i, ret = 0; mutex_lock(&nlm_file_mutex); for (i = 0; i < FILE_NRHASH; i++) { fp = nlm_files + i; while ((file = *fp) != NULL) { + file->f_count++; + mutex_unlock(&nlm_file_mutex); + /* Traverse locks, blocks and shares of this file * and update file->f_locks count */ - if (nlm_inspect_file(host, file, action)) { - mutex_unlock(&nlm_file_mutex); - return 1; - } + if (nlm_inspect_file(host, file, action)) + ret = 1; + mutex_lock(&nlm_file_mutex); + file->f_count--; /* No more references to this file. Let go of it. */ if (!file->f_blocks && !file->f_locks && !file->f_shares && !file->f_count) { @@ -262,7 +265,7 @@ nlm_traverse_files(struct nlm_host *host, int action) } } mutex_unlock(&nlm_file_mutex); - return 0; + return ret; } /* From 79558f3610efd7928e8882b2eaca3093b283630e Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 22 Aug 2006 13:44:32 -0400 Subject: [PATCH 08/15] NFS: Fix issue with EIO on NFS read The problem is that we may be caching writes that would extend the file and create a hole in the region that we are reading. In this case, we need to detect the eof from the server, ensure that we zero out the pages that are part of the hole and mark them as up to date. Signed-off-by: Trond Myklebust (cherry picked from 856b603b01b99146918c093969b6cb1b1b0f1c01 commit) --- fs/nfs/read.c | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/fs/nfs/read.c b/fs/nfs/read.c index 65c0c5b32351..da9cf11c326f 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -116,10 +116,17 @@ static void nfs_readpage_truncate_uninitialised_page(struct nfs_read_data *data) pages = &data->args.pages[base >> PAGE_CACHE_SHIFT]; base &= ~PAGE_CACHE_MASK; pglen = PAGE_CACHE_SIZE - base; - if (pglen < remainder) + for (;;) { + if (remainder <= pglen) { + memclear_highpage_flush(*pages, base, remainder); + break; + } memclear_highpage_flush(*pages, base, pglen); - else - memclear_highpage_flush(*pages, base, remainder); + pages++; + remainder -= pglen; + pglen = PAGE_CACHE_SIZE; + base = 0; + } } /* @@ -476,6 +483,8 @@ static void nfs_readpage_set_pages_uptodate(struct nfs_read_data *data) unsigned int base = data->args.pgbase; struct page **pages; + if (data->res.eof) + count = data->args.count; if (unlikely(count == 0)) return; pages = &data->args.pages[base >> PAGE_CACHE_SHIFT]; @@ -483,11 +492,7 @@ static void nfs_readpage_set_pages_uptodate(struct nfs_read_data *data) count += base; for (;count >= PAGE_CACHE_SIZE; count -= PAGE_CACHE_SIZE, pages++) SetPageUptodate(*pages); - /* - * Was this an eof or a short read? If the latter, don't mark the page - * as uptodate yet. - */ - if (count > 0 && (data->res.eof || data->args.count == data->res.count)) + if (count != 0) SetPageUptodate(*pages); } @@ -502,6 +507,8 @@ static void nfs_readpage_set_pages_error(struct nfs_read_data *data) count += base; for (;count >= PAGE_CACHE_SIZE; count -= PAGE_CACHE_SIZE, pages++) SetPageError(*pages); + if (count != 0) + SetPageError(*pages); } /* From 8e037094c414172481c5ce903efdab50ce932343 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 22 Aug 2006 20:06:15 -0400 Subject: [PATCH 09/15] SUNRPC: avoid choosing an IPMI port for RPC traffic Some hardware uses port 664 for its hardware-based IPMI listener. Teach the RPC client to avoid using that port by raising the default minimum port number to 665. Test plan: Find a mainboard known to use port 664 for IPMI; enable IPMI; mount NFS servers in a tight loop. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust (cherry picked from 58e8cb3a035d22fc386e1c53a5d98c3f219530fb commit) --- include/linux/sunrpc/xprt.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h index 840e47a4ccc5..3a0cca255b76 100644 --- a/include/linux/sunrpc/xprt.h +++ b/include/linux/sunrpc/xprt.h @@ -37,7 +37,7 @@ extern unsigned int xprt_max_resvport; #define RPC_MIN_RESVPORT (1U) #define RPC_MAX_RESVPORT (65535U) -#define RPC_DEF_MIN_RESVPORT (650U) +#define RPC_DEF_MIN_RESVPORT (665U) #define RPC_DEF_MAX_RESVPORT (1023U) /* From 3cedf13af9f7e61aca0dbbd11b601ac93bf93a9f Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Thu, 24 Aug 2006 15:44:12 -0400 Subject: [PATCH 10/15] NFSv4: increase client-provided nfs4 clientid size Neil Brown observed that the current limit of 32 bytes isn't enough to hold two ip addresses and the rest of the stuff we're putting in it, so it's often truncated to the point where it's unlikely to be unique. This can cause spurious CLID_INUSE's from the server. Signed-off-by: J. Bruce Fields Signed-off-by: Trond Myklebust (cherry picked from fc8c17ec251e984ab3df9182ed097aa5b577c915 commit) --- include/linux/nfs_xdr.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 2d3fb6416d91..db9cbf68e12b 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -659,7 +659,7 @@ struct nfs4_rename_res { struct nfs4_setclientid { const nfs4_verifier * sc_verifier; /* request */ unsigned int sc_name_len; - char sc_name[32]; /* request */ + char sc_name[48]; /* request */ u32 sc_prog; /* request */ unsigned int sc_netid_len; char sc_netid[4]; /* request */ From e8896495bca8490a427409e0886d63d05419ec65 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 24 Aug 2006 15:44:19 -0400 Subject: [PATCH 11/15] NFS: Check lengths more thoroughly in NFS4 readdir XDR decode Check the bounds of length specifiers more thoroughly in the XDR decoding of NFS4 readdir reply data. Currently, if the server returns a bitmap or attr length that causes the current decode point pointer to wrap, this could go undetected (consider a small "negative" length on a 32-bit machine). Also add a check into the main XDR decode handler to make sure that the amount of data is a multiple of four bytes (as specified by RFC-1014). This makes sure that we can do u32* pointer subtraction in the NFS client without risking an undefined result (the result is undefined if the pointers are not correctly aligned with respect to one another). Signed-Off-By: David Howells Signed-off-by: Trond Myklebust (cherry picked from 5861fddd64a7eaf7e8b1a9997455a24e7f688092 commit) --- fs/nfs/nfs4xdr.c | 21 +++++++++++---------- net/sunrpc/clnt.c | 11 +++++++++++ 2 files changed, 22 insertions(+), 10 deletions(-) diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 1750d996f49f..730ec8fb31c6 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -3355,7 +3355,7 @@ static int decode_readdir(struct xdr_stream *xdr, struct rpc_rqst *req, struct n struct kvec *iov = rcvbuf->head; unsigned int nr, pglen = rcvbuf->page_len; uint32_t *end, *entry, *p, *kaddr; - uint32_t len, attrlen; + uint32_t len, attrlen, xlen; int hdrlen, recvd, status; status = decode_op_hdr(xdr, OP_READDIR); @@ -3377,10 +3377,10 @@ static int decode_readdir(struct xdr_stream *xdr, struct rpc_rqst *req, struct n BUG_ON(pglen + readdir->pgbase > PAGE_CACHE_SIZE); kaddr = p = (uint32_t *) kmap_atomic(page, KM_USER0); - end = (uint32_t *) ((char *)p + pglen + readdir->pgbase); + end = p + ((pglen + readdir->pgbase) >> 2); entry = p; for (nr = 0; *p++; nr++) { - if (p + 3 > end) + if (end - p < 3) goto short_pkt; dprintk("cookie = %Lu, ", *((unsigned long long *)p)); p += 2; /* cookie */ @@ -3389,18 +3389,19 @@ static int decode_readdir(struct xdr_stream *xdr, struct rpc_rqst *req, struct n printk(KERN_WARNING "NFS: giant filename in readdir (len 0x%x)\n", len); goto err_unmap; } + xlen = XDR_QUADLEN(len); + if (end - p < xlen + 1) + goto short_pkt; dprintk("filename = %*s\n", len, (char *)p); - p += XDR_QUADLEN(len); - if (p + 1 > end) - goto short_pkt; + p += xlen; len = ntohl(*p++); /* bitmap length */ + if (end - p < len + 1) + goto short_pkt; p += len; - if (p + 1 > end) - goto short_pkt; attrlen = XDR_QUADLEN(ntohl(*p++)); - p += attrlen; /* attributes */ - if (p + 2 > end) + if (end - p < attrlen + 2) goto short_pkt; + p += attrlen; /* attributes */ entry = p; } if (!nr && (entry[0] != 0 || entry[1] == 0)) diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index d9eac7069101..3e19d321067a 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -1181,6 +1181,17 @@ call_verify(struct rpc_task *task) u32 *p = iov->iov_base, n; int error = -EACCES; + if ((task->tk_rqstp->rq_rcv_buf.len & 3) != 0) { + /* RFC-1014 says that the representation of XDR data must be a + * multiple of four bytes + * - if it isn't pointer subtraction in the NFS client may give + * undefined results + */ + printk(KERN_WARNING + "call_verify: XDR representation not a multiple of" + " 4 bytes: 0x%x\n", task->tk_rqstp->rq_rcv_buf.len); + goto out_eio; + } if ((len -= 3) < 0) goto out_overflow; p += 1; /* skip XID */ From 16b4289c7460ba9c04af40c574949dcca9029658 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 24 Aug 2006 12:27:15 -0400 Subject: [PATCH 12/15] NFSv4: Add v4 exception handling for the ACL functions. This is needed in order to handle any NFS4ERR_DELAY errors that might be returned by the server. It also ensures that we map the NFSv4 errors before they are returned to userland. Signed-off-by: Trond Myklebust (cherry picked from 71c12b3f0abc7501f6ed231a6d17bc9c05a238dc commit) --- fs/nfs/nfs4proc.c | 29 +++++++++++++++++++++++++++-- 1 file changed, 27 insertions(+), 2 deletions(-) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index e6ee97f19d81..153898e1331f 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -2668,7 +2668,7 @@ out: nfs4_set_cached_acl(inode, acl); } -static inline ssize_t nfs4_get_acl_uncached(struct inode *inode, void *buf, size_t buflen) +static ssize_t __nfs4_get_acl_uncached(struct inode *inode, void *buf, size_t buflen) { struct page *pages[NFS4ACL_MAXPAGES]; struct nfs_getaclargs args = { @@ -2721,6 +2721,19 @@ out_free: return ret; } +static ssize_t nfs4_get_acl_uncached(struct inode *inode, void *buf, size_t buflen) +{ + struct nfs4_exception exception = { }; + ssize_t ret; + do { + ret = __nfs4_get_acl_uncached(inode, buf, buflen); + if (ret >= 0) + break; + ret = nfs4_handle_exception(NFS_SERVER(inode), ret, &exception); + } while (exception.retry); + return ret; +} + static ssize_t nfs4_proc_get_acl(struct inode *inode, void *buf, size_t buflen) { struct nfs_server *server = NFS_SERVER(inode); @@ -2737,7 +2750,7 @@ static ssize_t nfs4_proc_get_acl(struct inode *inode, void *buf, size_t buflen) return nfs4_get_acl_uncached(inode, buf, buflen); } -static int nfs4_proc_set_acl(struct inode *inode, const void *buf, size_t buflen) +static int __nfs4_proc_set_acl(struct inode *inode, const void *buf, size_t buflen) { struct nfs_server *server = NFS_SERVER(inode); struct page *pages[NFS4ACL_MAXPAGES]; @@ -2763,6 +2776,18 @@ static int nfs4_proc_set_acl(struct inode *inode, const void *buf, size_t buflen return ret; } +static int nfs4_proc_set_acl(struct inode *inode, const void *buf, size_t buflen) +{ + struct nfs4_exception exception = { }; + int err; + do { + err = nfs4_handle_exception(NFS_SERVER(inode), + __nfs4_proc_set_acl(inode, buf, buflen), + &exception); + } while (exception.retry); + return err; +} + static int nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server) { From a343bb7750e6a098909c34f5c5dfddbc4fa40053 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 22 Aug 2006 20:06:03 -0400 Subject: [PATCH 13/15] VFS: Fix access("file", X_OK) in the presence of ACLs Currently, the access() call will return incorrect information on NFS if there exists an ACL that grants execute access to the user on a regular file. The reason the information is incorrect is that the VFS overrides this execute access in open_exec() by checking (inode->i_mode & 0111). This patch propagates the VFS execute bit check back into the generic permission() call. Signed-off-by: Trond Myklebust (cherry picked from 64cbae98848c4c99851cb0a405f0b4982cd76c1e commit) --- fs/namei.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/fs/namei.c b/fs/namei.c index 863166441bf3..432d6bc6fab0 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -227,10 +227,10 @@ int generic_permission(struct inode *inode, int mask, int permission(struct inode *inode, int mask, struct nameidata *nd) { + umode_t mode = inode->i_mode; int retval, submask; if (mask & MAY_WRITE) { - umode_t mode = inode->i_mode; /* * Nobody gets write access to a read-only fs. @@ -247,6 +247,13 @@ int permission(struct inode *inode, int mask, struct nameidata *nd) } + /* + * MAY_EXEC on regular files requires special handling: We override + * filesystem execute permissions if the mode bits aren't set. + */ + if ((mask & MAY_EXEC) && S_ISREG(mode) && !(mode & S_IXUGO)) + return -EACCES; + /* Ordinary permission routines do not understand MAY_APPEND. */ submask = mask & ~MAY_APPEND; if (inode->i_op && inode->i_op->permission) From 9167b0b9a0ab7907191523f5a0528e3b9c288e21 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 22 Aug 2006 20:06:03 -0400 Subject: [PATCH 14/15] VFS: Remove redundant open-coded mode bit check in prepare_binfmt(). The check in prepare_binfmt() for inode->i_mode & 0111 is redundant, since open_exec() will already have done that. Signed-off-by: Trond Myklebust (cherry picked from 822dec482ced07af32c378cd936d77345786572b commit) --- fs/exec.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/fs/exec.c b/fs/exec.c index 8344ba73a2a6..a6f64a98ac50 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -922,12 +922,6 @@ int prepare_binprm(struct linux_binprm *bprm) int retval; mode = inode->i_mode; - /* - * Check execute perms again - if the caller has CAP_DAC_OVERRIDE, - * generic_permission lets a non-executable through - */ - if (!(mode & 0111)) /* with at least _one_ execute bit set */ - return -EACCES; if (bprm->file->f_op == NULL) return -EACCES; From a969fd5a4e162c4485ae8f3e49d674656a18fa36 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 22 Aug 2006 20:06:04 -0400 Subject: [PATCH 15/15] VFS: Remove redundant open-coded mode bit checks in open_exec(). The check in open_exec() for inode->i_mode & 0111 has been made redundant by the fix to permission(). Signed-off-by: Trond Myklebust (cherry picked from 1d3741c5d991686699f100b65b9956f7ee7ae0ae commit) --- fs/exec.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/exec.c b/fs/exec.c index a6f64a98ac50..f7aabfeca033 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -486,8 +486,6 @@ struct file *open_exec(const char *name) if (!(nd.mnt->mnt_flags & MNT_NOEXEC) && S_ISREG(inode->i_mode)) { int err = vfs_permission(&nd, MAY_EXEC); - if (!err && !(inode->i_mode & 0111)) - err = -EACCES; file = ERR_PTR(err); if (!err) { file = nameidata_to_filp(&nd, O_RDONLY);