From e77a7b4f01b4c7b02c1c15b5d5b4ce4bd147b043 Mon Sep 17 00:00:00 2001 From: Zach Brown Date: Mon, 6 Oct 2014 16:42:52 -0700 Subject: [PATCH 01/32] nfsd: fix inclusive vfs_fsync_range() end The vfs_fsync_range() call during write processing got the end of the range off by one. The range is inclusive, not exclusive. The error has nfsd sync more data than requested -- it's correct but unnecessary overhead. The call during commit processing is correct so I copied that pattern in write processing. Maybe a helper would be nice but I kept it trivial. This is untested. I found it while reviewing code for something else entirely. Signed-off-by: Zach Brown Signed-off-by: J. Bruce Fields --- fs/nfsd/vfs.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 989129e2d6ea..d16076bd9a7a 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -938,6 +938,7 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, int stable = *stablep; int use_wgather; loff_t pos = offset; + loff_t end = LLONG_MAX; unsigned int pflags = current->flags; if (rqstp->rq_local) @@ -969,10 +970,13 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, fsnotify_modify(file); if (stable) { - if (use_wgather) + if (use_wgather) { host_err = wait_for_concurrent_writes(file); - else - host_err = vfs_fsync_range(file, offset, offset+*cnt, 0); + } else { + if (*cnt) + end = offset + *cnt - 1; + host_err = vfs_fsync_range(file, offset, end, 0); + } } out_nfserr: From ed38c0699848508672793bbdcca98ee89aa6c71e Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Fri, 19 Sep 2014 17:21:35 -0400 Subject: [PATCH 02/32] RPC: remove unneeded checks from xdr_truncate_encode() Thanks to Andrea Arcangeli for pointing out these checks are obviously unnecessary given the preceding calculations. Reported-by: Andrea Arcangeli Signed-off-by: J. Bruce Fields --- net/sunrpc/xdr.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c index 290af97bf6f9..bcece520c4db 100644 --- a/net/sunrpc/xdr.c +++ b/net/sunrpc/xdr.c @@ -617,7 +617,7 @@ void xdr_truncate_encode(struct xdr_stream *xdr, size_t len) fraglen = min_t(int, buf->len - len, tail->iov_len); tail->iov_len -= fraglen; buf->len -= fraglen; - if (tail->iov_len && buf->len == len) { + if (tail->iov_len) { xdr->p = tail->iov_base + tail->iov_len; /* xdr->end, xdr->iov should be set already */ return; @@ -631,7 +631,7 @@ void xdr_truncate_encode(struct xdr_stream *xdr, size_t len) old = new + fraglen; xdr->page_ptr -= (old >> PAGE_SHIFT) - (new >> PAGE_SHIFT); - if (buf->page_len && buf->len == len) { + if (buf->page_len) { xdr->p = page_address(*xdr->page_ptr); xdr->end = (void *)xdr->p + PAGE_SIZE; xdr->p = (void *)xdr->p + (new % PAGE_SIZE); From 280caac078d3db075247915f6d2f72315232ed16 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Wed, 1 Oct 2014 11:36:31 -0400 Subject: [PATCH 03/32] rpc: change comments to assertions Reported-by: Andrea Arcangeli Signed-off-by: J. Bruce Fields --- net/sunrpc/xdr.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c index bcece520c4db..1cb61242e55e 100644 --- a/net/sunrpc/xdr.c +++ b/net/sunrpc/xdr.c @@ -619,7 +619,8 @@ void xdr_truncate_encode(struct xdr_stream *xdr, size_t len) buf->len -= fraglen; if (tail->iov_len) { xdr->p = tail->iov_base + tail->iov_len; - /* xdr->end, xdr->iov should be set already */ + WARN_ON_ONCE(!xdr->end); + WARN_ON_ONCE(!xdr->iov); return; } WARN_ON_ONCE(fraglen); @@ -635,7 +636,7 @@ void xdr_truncate_encode(struct xdr_stream *xdr, size_t len) xdr->p = page_address(*xdr->page_ptr); xdr->end = (void *)xdr->p + PAGE_SIZE; xdr->p = (void *)xdr->p + (new % PAGE_SIZE); - /* xdr->iov should already be NULL */ + WARN_ON_ONCE(xdr->iov); return; } if (fraglen) { From b0d2e42cce8fbf12998a24abf62a26c895dd2fd2 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 22 Aug 2014 15:10:59 -0400 Subject: [PATCH 04/32] NFSD: Always initialize cl_cb_addr A client may not want to use the back channel on a transport it sent CREATE_SESSION on, in which case it clears SESSION4_BACK_CHAN. However, cl_cb_addr should be populated anyway, to be used if the client binds other connections to this session. If cl_cb_addr is not initialized, rpc_create() fails when the server attempts to set up a back channel on such secondary transports. Signed-off-by: Chuck Lever Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4state.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index e9c3afe4b5d3..1afd7d4420bd 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -1440,7 +1440,7 @@ static void init_session(struct svc_rqst *rqstp, struct nfsd4_session *new, stru list_add(&new->se_perclnt, &clp->cl_sessions); spin_unlock(&clp->cl_lock); - if (cses->flags & SESSION4_BACK_CHAN) { + { struct sockaddr *sa = svc_addr(rqstp); /* * This is a little silly; with sessions there's no real From ccc6398ea5d58fdedc6caccba0216ab30739773b Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Thu, 16 Oct 2014 08:49:37 -0400 Subject: [PATCH 05/32] nfsd: clean up comments over nfs4_file definition They're a bit outdated wrt to some recent changes. Signed-off-by: Jeff Layton Reviewed-by: Christoph Hellwig Signed-off-by: J. Bruce Fields --- fs/nfsd/state.h | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index 2712042a66b1..8e85e07efce6 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -463,15 +463,19 @@ static inline struct nfs4_lockowner * lockowner(struct nfs4_stateowner *so) /* * nfs4_file: a file opened by some number of (open) nfs4_stateowners. * - * These objects are global. nfsd only keeps one instance of a nfs4_file per - * inode (though it may keep multiple file descriptors open per inode). These - * are tracked in the file_hashtbl which is protected by the state_lock - * spinlock. + * These objects are global. nfsd keeps one instance of a nfs4_file per + * filehandle (though it may keep multiple file descriptors for each). Each + * inode can have multiple filehandles associated with it, so there is + * (potentially) a many to one relationship between this struct and struct + * inode. + * + * These are hashed by filehandle in the file_hashtbl, which is protected by + * the global state_lock spinlock. */ struct nfs4_file { atomic_t fi_ref; spinlock_t fi_lock; - struct hlist_node fi_hash; /* hash by "struct inode *" */ + struct hlist_node fi_hash; /* hash on fi_fhandle */ struct list_head fi_stateids; struct list_head fi_delegations; /* One each for O_RDONLY, O_WRONLY, O_RDWR: */ From eb63192bb8cc0186265aad4f79fa4fd49c22b021 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 29 Oct 2014 11:44:16 +0300 Subject: [PATCH 06/32] SUNRPC: off by one in BUG_ON() The m->pool_to[] array has "maxpools" number of elements. It's allocated in svc_pool_map_alloc_arrays() which we called earlier in the function. This test should be >= instead of >. Signed-off-by: Dan Carpenter Signed-off-by: J. Bruce Fields --- net/sunrpc/svc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index ca8a7958f4e6..349c98f9818b 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -189,7 +189,7 @@ svc_pool_map_init_percpu(struct svc_pool_map *m) return err; for_each_online_cpu(cpu) { - BUG_ON(pidx > maxpools); + BUG_ON(pidx >= maxpools); m->to_pool[cpu] = pidx; m->pool_to[pidx] = cpu; pidx++; From 9af94fc4e470deab3427d07551725f0bf844ebc8 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Fri, 31 Oct 2014 08:28:29 -0400 Subject: [PATCH 07/32] lockd: ratelimit "lockd: cannot monitor" messages When lockd can't talk to a remote statd, it'll spew a warning message to the ring buffer. If the application is really hammering on locks however, it's possible for that message to spam the logs. Ratelimit it to minimize the potential for harm. Reported-by: Ian Collier Signed-off-by: Jeff Layton Signed-off-by: J. Bruce Fields --- fs/lockd/mon.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c index 9106f42c472c..1cc6ec51e6b1 100644 --- a/fs/lockd/mon.c +++ b/fs/lockd/mon.c @@ -214,7 +214,7 @@ int nsm_monitor(const struct nlm_host *host) if (unlikely(res.status != 0)) status = -EIO; if (unlikely(status < 0)) { - printk(KERN_NOTICE "lockd: cannot monitor %s\n", nsm->sm_name); + pr_notice_ratelimited("lockd: cannot monitor %s\n", nsm->sm_name); return status; } From 72c72bdf7bf53353d2d8e055194d27f0128be92b Mon Sep 17 00:00:00 2001 From: Anna Schumaker Date: Fri, 7 Nov 2014 14:44:25 -0500 Subject: [PATCH 08/32] VFS: Rename do_fallocate() to vfs_fallocate() This function needs to be exported so it can be used by the NFSD module when responding to the new ALLOCATE and DEALLOCATE operations in NFS v4.2. Christoph Hellwig suggested renaming the function to stay consistent with how other vfs functions are named. Signed-off-by: Anna Schumaker Signed-off-by: J. Bruce Fields --- drivers/staging/android/ashmem.c | 2 +- fs/ioctl.c | 2 +- fs/open.c | 5 +++-- include/linux/fs.h | 2 +- mm/madvise.c | 2 +- 5 files changed, 7 insertions(+), 6 deletions(-) diff --git a/drivers/staging/android/ashmem.c b/drivers/staging/android/ashmem.c index ad4f5790a76f..27eecfe1c410 100644 --- a/drivers/staging/android/ashmem.c +++ b/drivers/staging/android/ashmem.c @@ -446,7 +446,7 @@ ashmem_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) loff_t start = range->pgstart * PAGE_SIZE; loff_t end = (range->pgend + 1) * PAGE_SIZE; - do_fallocate(range->asma->file, + vfs_fallocate(range->asma->file, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, start, end - start); range->purged = ASHMEM_WAS_PURGED; diff --git a/fs/ioctl.c b/fs/ioctl.c index 8ac3fad36192..0bd6142183ee 100644 --- a/fs/ioctl.c +++ b/fs/ioctl.c @@ -443,7 +443,7 @@ int ioctl_preallocate(struct file *filp, void __user *argp) return -EINVAL; } - return do_fallocate(filp, FALLOC_FL_KEEP_SIZE, sr.l_start, sr.l_len); + return vfs_fallocate(filp, FALLOC_FL_KEEP_SIZE, sr.l_start, sr.l_len); } static int file_ioctl(struct file *filp, unsigned int cmd, diff --git a/fs/open.c b/fs/open.c index d6fd3acde134..c94449b2e582 100644 --- a/fs/open.c +++ b/fs/open.c @@ -222,7 +222,7 @@ SYSCALL_DEFINE2(ftruncate64, unsigned int, fd, loff_t, length) #endif /* BITS_PER_LONG == 32 */ -int do_fallocate(struct file *file, int mode, loff_t offset, loff_t len) +int vfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len) { struct inode *inode = file_inode(file); long ret; @@ -298,6 +298,7 @@ int do_fallocate(struct file *file, int mode, loff_t offset, loff_t len) sb_end_write(inode->i_sb); return ret; } +EXPORT_SYMBOL_GPL(vfs_fallocate); SYSCALL_DEFINE4(fallocate, int, fd, int, mode, loff_t, offset, loff_t, len) { @@ -305,7 +306,7 @@ SYSCALL_DEFINE4(fallocate, int, fd, int, mode, loff_t, offset, loff_t, len) int error = -EBADF; if (f.file) { - error = do_fallocate(f.file, mode, offset, len); + error = vfs_fallocate(f.file, mode, offset, len); fdput(f); } return error; diff --git a/include/linux/fs.h b/include/linux/fs.h index a957d4366c24..a88718677579 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2032,7 +2032,7 @@ struct filename { extern long vfs_truncate(struct path *, loff_t); extern int do_truncate(struct dentry *, loff_t start, unsigned int time_attrs, struct file *filp); -extern int do_fallocate(struct file *file, int mode, loff_t offset, +extern int vfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len); extern long do_sys_open(int dfd, const char __user *filename, int flags, umode_t mode); diff --git a/mm/madvise.c b/mm/madvise.c index 0938b30da4ab..a271adc93289 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -326,7 +326,7 @@ static long madvise_remove(struct vm_area_struct *vma, */ get_file(f); up_read(¤t->mm->mmap_sem); - error = do_fallocate(f, + error = vfs_fallocate(f, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, offset, end - start); fput(f); From 95d871f03cae6b49de040265cf88cbe2a16b9f05 Mon Sep 17 00:00:00 2001 From: Anna Schumaker Date: Fri, 7 Nov 2014 14:44:26 -0500 Subject: [PATCH 09/32] nfsd: Add ALLOCATE support The ALLOCATE operation is used to preallocate space in a file. I can do this by using vfs_fallocate() to do the actual preallocation. ALLOCATE only returns a status indicator, so we don't need to write a special encode() function. Signed-off-by: Anna Schumaker Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4proc.c | 36 ++++++++++++++++++++++++++++++++++++ fs/nfsd/nfs4xdr.c | 19 ++++++++++++++++++- fs/nfsd/vfs.c | 21 +++++++++++++++++++++ fs/nfsd/vfs.h | 2 ++ fs/nfsd/xdr4.h | 8 ++++++++ 5 files changed, 85 insertions(+), 1 deletion(-) diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 0beb023f25ac..a261f1800922 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -1013,6 +1013,36 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, return status; } +static __be32 +nfsd4_fallocate(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + struct nfsd4_fallocate *fallocate, int flags) +{ + __be32 status = nfserr_notsupp; + struct file *file; + + status = nfs4_preprocess_stateid_op(SVC_NET(rqstp), cstate, + &fallocate->falloc_stateid, + WR_STATE, &file); + if (status != nfs_ok) { + dprintk("NFSD: nfsd4_fallocate: couldn't process stateid!\n"); + return status; + } + + status = nfsd4_vfs_fallocate(rqstp, &cstate->current_fh, file, + fallocate->falloc_offset, + fallocate->falloc_length, + flags); + fput(file); + return status; +} + +static __be32 +nfsd4_allocate(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + struct nfsd4_fallocate *fallocate) +{ + return nfsd4_fallocate(rqstp, cstate, fallocate, 0); +} + static __be32 nfsd4_seek(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_seek *seek) @@ -1929,6 +1959,12 @@ static struct nfsd4_operation nfsd4_ops[] = { }, /* NFSv4.2 operations */ + [OP_ALLOCATE] = { + .op_func = (nfsd4op_func)nfsd4_allocate, + .op_flags = OP_MODIFIES_SOMETHING | OP_CACHEME, + .op_name = "OP_ALLOCATE", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_write_rsize, + }, [OP_SEEK] = { .op_func = (nfsd4op_func)nfsd4_seek, .op_name = "OP_SEEK", diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index eeea7a90eb87..a60cff86fc28 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -1513,6 +1513,23 @@ static __be32 nfsd4_decode_reclaim_complete(struct nfsd4_compoundargs *argp, str DECODE_TAIL; } +static __be32 +nfsd4_decode_fallocate(struct nfsd4_compoundargs *argp, + struct nfsd4_fallocate *fallocate) +{ + DECODE_HEAD; + + status = nfsd4_decode_stateid(argp, &fallocate->falloc_stateid); + if (status) + return status; + + READ_BUF(16); + p = xdr_decode_hyper(p, &fallocate->falloc_offset); + xdr_decode_hyper(p, &fallocate->falloc_length); + + DECODE_TAIL; +} + static __be32 nfsd4_decode_seek(struct nfsd4_compoundargs *argp, struct nfsd4_seek *seek) { @@ -1604,7 +1621,7 @@ static nfsd4_dec nfsd4_dec_ops[] = { [OP_RECLAIM_COMPLETE] = (nfsd4_dec)nfsd4_decode_reclaim_complete, /* new operations for NFSv4.2 */ - [OP_ALLOCATE] = (nfsd4_dec)nfsd4_decode_notsupp, + [OP_ALLOCATE] = (nfsd4_dec)nfsd4_decode_fallocate, [OP_COPY] = (nfsd4_dec)nfsd4_decode_notsupp, [OP_COPY_NOTIFY] = (nfsd4_dec)nfsd4_decode_notsupp, [OP_DEALLOCATE] = (nfsd4_dec)nfsd4_decode_notsupp, diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index d16076bd9a7a..f1999619d516 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include @@ -533,6 +534,26 @@ __be32 nfsd4_set_nfs4_label(struct svc_rqst *rqstp, struct svc_fh *fhp, } #endif +__be32 nfsd4_vfs_fallocate(struct svc_rqst *rqstp, struct svc_fh *fhp, + struct file *file, loff_t offset, loff_t len, + int flags) +{ + __be32 err; + int error; + + if (!S_ISREG(file_inode(file)->i_mode)) + return nfserr_inval; + + err = nfsd_permission(rqstp, fhp->fh_export, fhp->fh_dentry, NFSD_MAY_WRITE); + if (err) + return err; + + error = vfs_fallocate(file, flags, offset, len); + if (!error) + error = commit_metadata(fhp); + + return nfserrno(error); +} #endif /* defined(CONFIG_NFSD_V4) */ #ifdef CONFIG_NFSD_V3 diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h index c2ff3f14e5f6..7ffdb144e486 100644 --- a/fs/nfsd/vfs.h +++ b/fs/nfsd/vfs.h @@ -54,6 +54,8 @@ int nfsd_mountpoint(struct dentry *, struct svc_export *); #ifdef CONFIG_NFSD_V4 __be32 nfsd4_set_nfs4_label(struct svc_rqst *, struct svc_fh *, struct xdr_netobj *); +__be32 nfsd4_vfs_fallocate(struct svc_rqst *, struct svc_fh *, + struct file *, loff_t, loff_t, int); #endif /* CONFIG_NFSD_V4 */ __be32 nfsd_create(struct svc_rqst *, struct svc_fh *, char *name, int len, struct iattr *attrs, diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h index 5720e9457f33..eeaa0d0c4f79 100644 --- a/fs/nfsd/xdr4.h +++ b/fs/nfsd/xdr4.h @@ -428,6 +428,13 @@ struct nfsd4_reclaim_complete { u32 rca_one_fs; }; +struct nfsd4_fallocate { + /* request */ + stateid_t falloc_stateid; + loff_t falloc_offset; + u64 falloc_length; +}; + struct nfsd4_seek { /* request */ stateid_t seek_stateid; @@ -486,6 +493,7 @@ struct nfsd4_op { struct nfsd4_free_stateid free_stateid; /* NFSv4.2 */ + struct nfsd4_fallocate allocate; struct nfsd4_seek seek; } u; struct nfs4_replay * replay; From b0cb9085239a20b7482ddd4839dd1d5476801dfa Mon Sep 17 00:00:00 2001 From: Anna Schumaker Date: Fri, 7 Nov 2014 14:44:27 -0500 Subject: [PATCH 10/32] nfsd: Add DEALLOCATE support DEALLOCATE only returns a status value, meaning we can use the noop() xdr encoder to reply to the client. Signed-off-by: Anna Schumaker Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4proc.c | 15 +++++++++++++++ fs/nfsd/nfs4xdr.c | 2 +- fs/nfsd/xdr4.h | 1 + 3 files changed, 17 insertions(+), 1 deletion(-) diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index a261f1800922..74fb15eefc31 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -33,6 +33,7 @@ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include +#include #include #include "idmap.h" @@ -1043,6 +1044,14 @@ nfsd4_allocate(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, return nfsd4_fallocate(rqstp, cstate, fallocate, 0); } +static __be32 +nfsd4_deallocate(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + struct nfsd4_fallocate *fallocate) +{ + return nfsd4_fallocate(rqstp, cstate, fallocate, + FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE); +} + static __be32 nfsd4_seek(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_seek *seek) @@ -1965,6 +1974,12 @@ static struct nfsd4_operation nfsd4_ops[] = { .op_name = "OP_ALLOCATE", .op_rsize_bop = (nfsd4op_rsize)nfsd4_write_rsize, }, + [OP_DEALLOCATE] = { + .op_func = (nfsd4op_func)nfsd4_deallocate, + .op_flags = OP_MODIFIES_SOMETHING | OP_CACHEME, + .op_name = "OP_DEALLOCATE", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_write_rsize, + }, [OP_SEEK] = { .op_func = (nfsd4op_func)nfsd4_seek, .op_name = "OP_SEEK", diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index a60cff86fc28..0622d4f4a9a2 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -1624,7 +1624,7 @@ static nfsd4_dec nfsd4_dec_ops[] = { [OP_ALLOCATE] = (nfsd4_dec)nfsd4_decode_fallocate, [OP_COPY] = (nfsd4_dec)nfsd4_decode_notsupp, [OP_COPY_NOTIFY] = (nfsd4_dec)nfsd4_decode_notsupp, - [OP_DEALLOCATE] = (nfsd4_dec)nfsd4_decode_notsupp, + [OP_DEALLOCATE] = (nfsd4_dec)nfsd4_decode_fallocate, [OP_IO_ADVISE] = (nfsd4_dec)nfsd4_decode_notsupp, [OP_LAYOUTERROR] = (nfsd4_dec)nfsd4_decode_notsupp, [OP_LAYOUTSTATS] = (nfsd4_dec)nfsd4_decode_notsupp, diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h index eeaa0d0c4f79..90a5925bd6ab 100644 --- a/fs/nfsd/xdr4.h +++ b/fs/nfsd/xdr4.h @@ -494,6 +494,7 @@ struct nfsd4_op { /* NFSv4.2 */ struct nfsd4_fallocate allocate; + struct nfsd4_fallocate deallocate; struct nfsd4_seek seek; } u; struct nfs4_replay * replay; From 5b095e99928cc13332d364f7cca7a9ca684369b4 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Thu, 23 Oct 2014 08:01:02 -0400 Subject: [PATCH 11/32] nfsd: convert nfs4_file searches to use RCU The global state_lock protects the file_hashtbl, and that has the potential to be a scalability bottleneck. Address this by making the file_hashtbl use RCU. Add a rcu_head to the nfs4_file and use that when freeing ones that have been hashed. In order to conserve space, we union the fi_rcu field with the fi_delegations list_head which must be clear by the time the last reference to the file is dropped. Convert find_file_locked to use RCU lookup primitives and not to require that the state_lock be held, and convert find_file to do a lockless lookup. Convert find_or_add_file to attempt a lockless lookup first, and then fall back to doing a locked search and insert if that fails to find anything. Also, minimize the number of times we need to calculate the hash value by passing it in as an argument to the search and insert functions, and optimize the order of arguments in nfsd4_init_file. Reviewed-by: Christoph Hellwig Signed-off-by: Jeff Layton Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4state.c | 51 ++++++++++++++++++++++++++------------------- fs/nfsd/state.h | 5 ++++- 2 files changed, 33 insertions(+), 23 deletions(-) diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 1afd7d4420bd..1379d86f7b4f 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -275,9 +275,11 @@ opaque_hashval(const void *ptr, int nbytes) return x; } -static void nfsd4_free_file(struct nfs4_file *f) +static void nfsd4_free_file_rcu(struct rcu_head *rcu) { - kmem_cache_free(file_slab, f); + struct nfs4_file *fp = container_of(rcu, struct nfs4_file, fi_rcu); + + kmem_cache_free(file_slab, fp); } static inline void @@ -286,9 +288,10 @@ put_nfs4_file(struct nfs4_file *fi) might_lock(&state_lock); if (atomic_dec_and_lock(&fi->fi_ref, &state_lock)) { - hlist_del(&fi->fi_hash); + hlist_del_rcu(&fi->fi_hash); spin_unlock(&state_lock); - nfsd4_free_file(fi); + WARN_ON_ONCE(!list_empty(&fi->fi_delegations)); + call_rcu(&fi->fi_rcu, nfsd4_free_file_rcu); } } @@ -3057,10 +3060,9 @@ static struct nfs4_file *nfsd4_alloc_file(void) } /* OPEN Share state helper functions */ -static void nfsd4_init_file(struct nfs4_file *fp, struct knfsd_fh *fh) +static void nfsd4_init_file(struct knfsd_fh *fh, unsigned int hashval, + struct nfs4_file *fp) { - unsigned int hashval = file_hashval(fh); - lockdep_assert_held(&state_lock); atomic_set(&fp->fi_ref, 1); @@ -3073,7 +3075,7 @@ static void nfsd4_init_file(struct nfs4_file *fp, struct knfsd_fh *fh) fp->fi_share_deny = 0; memset(fp->fi_fds, 0, sizeof(fp->fi_fds)); memset(fp->fi_access, 0, sizeof(fp->fi_access)); - hlist_add_head(&fp->fi_hash, &file_hashtbl[hashval]); + hlist_add_head_rcu(&fp->fi_hash, &file_hashtbl[hashval]); } void @@ -3294,17 +3296,14 @@ move_to_close_lru(struct nfs4_ol_stateid *s, struct net *net) /* search file_hashtbl[] for file */ static struct nfs4_file * -find_file_locked(struct knfsd_fh *fh) +find_file_locked(struct knfsd_fh *fh, unsigned int hashval) { - unsigned int hashval = file_hashval(fh); struct nfs4_file *fp; - lockdep_assert_held(&state_lock); - - hlist_for_each_entry(fp, &file_hashtbl[hashval], fi_hash) { + hlist_for_each_entry_rcu(fp, &file_hashtbl[hashval], fi_hash) { if (nfsd_fh_match(&fp->fi_fhandle, fh)) { - get_nfs4_file(fp); - return fp; + if (atomic_inc_not_zero(&fp->fi_ref)) + return fp; } } return NULL; @@ -3314,10 +3313,11 @@ static struct nfs4_file * find_file(struct knfsd_fh *fh) { struct nfs4_file *fp; + unsigned int hashval = file_hashval(fh); - spin_lock(&state_lock); - fp = find_file_locked(fh); - spin_unlock(&state_lock); + rcu_read_lock(); + fp = find_file_locked(fh, hashval); + rcu_read_unlock(); return fp; } @@ -3325,11 +3325,18 @@ static struct nfs4_file * find_or_add_file(struct nfs4_file *new, struct knfsd_fh *fh) { struct nfs4_file *fp; + unsigned int hashval = file_hashval(fh); + + rcu_read_lock(); + fp = find_file_locked(fh, hashval); + rcu_read_unlock(); + if (fp) + return fp; spin_lock(&state_lock); - fp = find_file_locked(fh); - if (fp == NULL) { - nfsd4_init_file(new, fh); + fp = find_file_locked(fh, hashval); + if (likely(fp == NULL)) { + nfsd4_init_file(fh, hashval, new); fp = new; } spin_unlock(&state_lock); @@ -4127,7 +4134,7 @@ void nfsd4_cleanup_open_state(struct nfsd4_compound_state *cstate, nfs4_put_stateowner(so); } if (open->op_file) - nfsd4_free_file(open->op_file); + kmem_cache_free(file_slab, open->op_file); if (open->op_stp) nfs4_put_stid(&open->op_stp->st_stid); } diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index 8e85e07efce6..9d3be371240a 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -477,7 +477,10 @@ struct nfs4_file { spinlock_t fi_lock; struct hlist_node fi_hash; /* hash on fi_fhandle */ struct list_head fi_stateids; - struct list_head fi_delegations; + union { + struct list_head fi_delegations; + struct rcu_head fi_rcu; + }; /* One each for O_RDONLY, O_WRONLY, O_RDWR: */ struct file * fi_fds[3]; /* From 8d65ef760d50cc625c5364cba89be838b21c66a7 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 17 Nov 2014 17:02:57 -0500 Subject: [PATCH 12/32] sunrpc: eliminate the XPT_DETACHED flag All it does is indicate whether a xprt has already been deleted from a list or not, which is unnecessary since we use list_del_init and it's always set and checked under the sv_lock anyway. Signed-off-by: Jeff Layton Signed-off-by: J. Bruce Fields --- include/linux/sunrpc/svc_xprt.h | 7 +++---- net/sunrpc/svc_xprt.c | 4 +--- 2 files changed, 4 insertions(+), 7 deletions(-) diff --git a/include/linux/sunrpc/svc_xprt.h b/include/linux/sunrpc/svc_xprt.h index ce6e4182a5b2..79f6f8f3dc0a 100644 --- a/include/linux/sunrpc/svc_xprt.h +++ b/include/linux/sunrpc/svc_xprt.h @@ -63,10 +63,9 @@ struct svc_xprt { #define XPT_CHNGBUF 7 /* need to change snd/rcv buf sizes */ #define XPT_DEFERRED 8 /* deferred request pending */ #define XPT_OLD 9 /* used for xprt aging mark+sweep */ -#define XPT_DETACHED 10 /* detached from tempsocks list */ -#define XPT_LISTENER 11 /* listening endpoint */ -#define XPT_CACHE_AUTH 12 /* cache auth info */ -#define XPT_LOCAL 13 /* connection from loopback interface */ +#define XPT_LISTENER 10 /* listening endpoint */ +#define XPT_CACHE_AUTH 11 /* cache auth info */ +#define XPT_LOCAL 12 /* connection from loopback interface */ struct svc_serv *xpt_server; /* service for transport */ atomic_t xpt_reserved; /* space on outq that is rsvd */ diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index c179ca2a5aa4..97a75c17142b 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -884,7 +884,6 @@ static void svc_age_temp_xprts(unsigned long closure) continue; list_del_init(le); set_bit(XPT_CLOSE, &xprt->xpt_flags); - set_bit(XPT_DETACHED, &xprt->xpt_flags); dprintk("queuing xprt %p for closing\n", xprt); /* a thread will dequeue and close it soon */ @@ -924,8 +923,7 @@ static void svc_delete_xprt(struct svc_xprt *xprt) xprt->xpt_ops->xpo_detach(xprt); spin_lock_bh(&serv->sv_lock); - if (!test_and_set_bit(XPT_DETACHED, &xprt->xpt_flags)) - list_del_init(&xprt->xpt_list); + list_del_init(&xprt->xpt_list); WARN_ON_ONCE(!list_empty(&xprt->xpt_ready)); if (test_bit(XPT_TEMP, &xprt->xpt_flags)) serv->sv_tmpcnt--; From 067f96ef17455800bfbf87b743960e301e0b8e40 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 19 Nov 2014 07:51:13 -0500 Subject: [PATCH 13/32] sunrpc: release svc_pool_map reference when serv allocation fails Currently, it leaks when the allocation fails. Signed-off-by: Jeff Layton Signed-off-by: J. Bruce Fields --- net/sunrpc/svc.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index 349c98f9818b..537add5309cc 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -503,13 +503,15 @@ svc_create_pooled(struct svc_program *prog, unsigned int bufsize, unsigned int npools = svc_pool_map_get(); serv = __svc_create(prog, bufsize, npools, shutdown); + if (!serv) + goto out_err; - if (serv != NULL) { - serv->sv_function = func; - serv->sv_module = mod; - } - + serv->sv_function = func; + serv->sv_module = mod; return serv; +out_err: + svc_pool_map_put(); + return NULL; } EXPORT_SYMBOL_GPL(svc_create_pooled); From 818f2f57f20d0e9a9294180f304f34cd4e8f6066 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 27 Nov 2014 18:58:54 +0300 Subject: [PATCH 14/32] nfsd: minor off by one checks in __write_versions() My static checker complains that if "len == remaining" then it means we have truncated the last character off the version string. The intent of the code is that we print as many versions as we can without truncating a version. Then we put a newline at the end. If the newline can't fit we return -EINVAL. Signed-off-by: Dan Carpenter Reviewed-by: Jeff Layton Signed-off-by: J. Bruce Fields --- fs/nfsd/nfsctl.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index ca73ca79a0ee..0079b28270e9 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -606,7 +606,7 @@ static ssize_t __write_versions(struct file *file, char *buf, size_t size) num); sep = " "; - if (len > remaining) + if (len >= remaining) break; remaining -= len; buf += len; @@ -621,7 +621,7 @@ static ssize_t __write_versions(struct file *file, char *buf, size_t size) '+' : '-', minor); - if (len > remaining) + if (len >= remaining) break; remaining -= len; buf += len; @@ -629,7 +629,7 @@ static ssize_t __write_versions(struct file *file, char *buf, size_t size) } len = snprintf(buf, remaining, "\n"); - if (len > remaining) + if (len >= remaining) return -EINVAL; return tlen + len; } From 4d152e2c9a6a3e3556ce5da7782a9e2836edbe0f Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 19 Nov 2014 07:51:14 -0500 Subject: [PATCH 15/32] sunrpc: add a generic rq_flags field to svc_rqst and move rq_secure to it In a later patch, we're going to need some atomic bit flags. Since that field will need to be an unsigned long, we mitigate that space consumption by migrating some other bitflags to the new field. Start with the rq_secure flag. Signed-off-by: Jeff Layton Signed-off-by: J. Bruce Fields --- fs/nfsd/nfscache.c | 4 ++-- fs/nfsd/nfsfh.c | 2 +- include/linux/sunrpc/svc.h | 4 ++-- include/trace/events/sunrpc.h | 17 +++++++++++++---- net/sunrpc/svc_xprt.c | 5 ++++- 5 files changed, 22 insertions(+), 10 deletions(-) diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c index 122f69185ef5..83a9694ec485 100644 --- a/fs/nfsd/nfscache.c +++ b/fs/nfsd/nfscache.c @@ -490,7 +490,7 @@ found_entry: /* From the hall of fame of impractical attacks: * Is this a user who tries to snoop on the cache? */ rtn = RC_DOIT; - if (!rqstp->rq_secure && rp->c_secure) + if (!test_bit(RQ_SECURE, &rqstp->rq_flags) && rp->c_secure) goto out; /* Compose RPC reply header */ @@ -579,7 +579,7 @@ nfsd_cache_update(struct svc_rqst *rqstp, int cachetype, __be32 *statp) spin_lock(&b->cache_lock); drc_mem_usage += bufsize; lru_put_end(b, rp); - rp->c_secure = rqstp->rq_secure; + rp->c_secure = test_bit(RQ_SECURE, &rqstp->rq_flags); rp->c_type = cachetype; rp->c_state = RC_DONE; spin_unlock(&b->cache_lock); diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c index 88026fc6a981..965b478d50fc 100644 --- a/fs/nfsd/nfsfh.c +++ b/fs/nfsd/nfsfh.c @@ -86,7 +86,7 @@ static __be32 nfsd_setuser_and_check_port(struct svc_rqst *rqstp, int flags = nfsexp_flags(rqstp, exp); /* Check if the request originated from a secure port. */ - if (!rqstp->rq_secure && !(flags & NFSEXP_INSECURE_PORT)) { + if (!test_bit(RQ_SECURE, &rqstp->rq_flags) && !(flags & NFSEXP_INSECURE_PORT)) { RPC_IFDEBUG(char buf[RPC_MAX_ADDRBUFLEN]); dprintk("nfsd: request from insecure port %s!\n", svc_print_addr(rqstp, buf, sizeof(buf))); diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index 21678464883a..b60eb7c3f3f7 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -253,8 +253,8 @@ struct svc_rqst { u32 rq_vers; /* program version */ u32 rq_proc; /* procedure number */ u32 rq_prot; /* IP protocol */ - unsigned short - rq_secure : 1; /* secure port */ +#define RQ_SECURE (0) /* secure port */ + unsigned long rq_flags; /* flags field */ unsigned short rq_local : 1; /* local request */ void * rq_argp; /* decoded arguments */ diff --git a/include/trace/events/sunrpc.h b/include/trace/events/sunrpc.h index 171ca4ff6d99..5eb5f79d9794 100644 --- a/include/trace/events/sunrpc.h +++ b/include/trace/events/sunrpc.h @@ -412,6 +412,10 @@ TRACE_EVENT(xs_tcp_data_recv, __entry->copied, __entry->reclen, __entry->offset) ); +#define show_rqstp_flags(flags) \ + __print_flags(flags, "|", \ + { (1UL << RQ_SECURE), "RQ_SECURE"}) + TRACE_EVENT(svc_recv, TP_PROTO(struct svc_rqst *rqst, int status), @@ -421,16 +425,19 @@ TRACE_EVENT(svc_recv, __field(struct sockaddr *, addr) __field(__be32, xid) __field(int, status) + __field(unsigned long, flags) ), TP_fast_assign( __entry->addr = (struct sockaddr *)&rqst->rq_addr; __entry->xid = status > 0 ? rqst->rq_xid : 0; __entry->status = status; + __entry->flags = rqst->rq_flags; ), - TP_printk("addr=%pIScp xid=0x%x status=%d", __entry->addr, - be32_to_cpu(__entry->xid), __entry->status) + TP_printk("addr=%pIScp xid=0x%x status=%d flags=%s", __entry->addr, + be32_to_cpu(__entry->xid), __entry->status, + show_rqstp_flags(__entry->flags)) ); DECLARE_EVENT_CLASS(svc_rqst_status, @@ -444,6 +451,7 @@ DECLARE_EVENT_CLASS(svc_rqst_status, __field(__be32, xid) __field(int, dropme) __field(int, status) + __field(unsigned long, flags) ), TP_fast_assign( @@ -451,11 +459,12 @@ DECLARE_EVENT_CLASS(svc_rqst_status, __entry->xid = rqst->rq_xid; __entry->dropme = (int)rqst->rq_dropme; __entry->status = status; + __entry->flags = rqst->rq_flags; ), - TP_printk("addr=%pIScp rq_xid=0x%x dropme=%d status=%d", + TP_printk("addr=%pIScp rq_xid=0x%x dropme=%d status=%d flags=%s", __entry->addr, be32_to_cpu(__entry->xid), __entry->dropme, - __entry->status) + __entry->status, show_rqstp_flags(__entry->flags)) ); DEFINE_EVENT(svc_rqst_status, svc_process, diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index 5c71ccb9659d..eaa9263c2d28 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -797,7 +797,10 @@ int svc_recv(struct svc_rqst *rqstp, long timeout) clear_bit(XPT_OLD, &xprt->xpt_flags); - rqstp->rq_secure = xprt->xpt_ops->xpo_secure_port(rqstp); + if (xprt->xpt_ops->xpo_secure_port(rqstp)) + set_bit(RQ_SECURE, &rqstp->rq_flags); + else + clear_bit(RQ_SECURE, &rqstp->rq_flags); rqstp->rq_chandle.defer = svc_defer; rqstp->rq_xid = svc_getu32(&rqstp->rq_arg.head[0]); From 7501cc2bcf9a71cc1f19e38775c234815ee44578 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 19 Nov 2014 07:51:15 -0500 Subject: [PATCH 16/32] sunrpc: move rq_local field to rq_flags Signed-off-by: Jeff Layton Signed-off-by: J. Bruce Fields --- fs/nfsd/vfs.c | 4 ++-- include/linux/sunrpc/svc.h | 2 +- include/trace/events/sunrpc.h | 3 ++- net/sunrpc/svcsock.c | 5 ++++- 4 files changed, 9 insertions(+), 5 deletions(-) diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index f1999619d516..60c25850fb16 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -962,7 +962,7 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, loff_t end = LLONG_MAX; unsigned int pflags = current->flags; - if (rqstp->rq_local) + if (test_bit(RQ_LOCAL, &rqstp->rq_flags)) /* * We want less throttling in balance_dirty_pages() * and shrink_inactive_list() so that nfs to @@ -1006,7 +1006,7 @@ out_nfserr: err = 0; else err = nfserrno(host_err); - if (rqstp->rq_local) + if (test_bit(RQ_LOCAL, &rqstp->rq_flags)) tsk_restore_flags(current, pflags, PF_LESS_THROTTLE); return err; } diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index b60eb7c3f3f7..a91df9047f32 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -254,8 +254,8 @@ struct svc_rqst { u32 rq_proc; /* procedure number */ u32 rq_prot; /* IP protocol */ #define RQ_SECURE (0) /* secure port */ +#define RQ_LOCAL (1) /* local request */ unsigned long rq_flags; /* flags field */ - unsigned short rq_local : 1; /* local request */ void * rq_argp; /* decoded arguments */ void * rq_resp; /* xdr'd results */ diff --git a/include/trace/events/sunrpc.h b/include/trace/events/sunrpc.h index 5eb5f79d9794..98259f163cd8 100644 --- a/include/trace/events/sunrpc.h +++ b/include/trace/events/sunrpc.h @@ -414,7 +414,8 @@ TRACE_EVENT(xs_tcp_data_recv, #define show_rqstp_flags(flags) \ __print_flags(flags, "|", \ - { (1UL << RQ_SECURE), "RQ_SECURE"}) + { (1UL << RQ_SECURE), "RQ_SECURE"}, \ + { (1UL << RQ_LOCAL), "RQ_LOCAL"}) TRACE_EVENT(svc_recv, TP_PROTO(struct svc_rqst *rqst, int status), diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index f9c052d508f0..cc331b6cf573 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -1145,7 +1145,10 @@ static int svc_tcp_recvfrom(struct svc_rqst *rqstp) rqstp->rq_xprt_ctxt = NULL; rqstp->rq_prot = IPPROTO_TCP; - rqstp->rq_local = !!test_bit(XPT_LOCAL, &svsk->sk_xprt.xpt_flags); + if (test_bit(XPT_LOCAL, &svsk->sk_xprt.xpt_flags)) + set_bit(RQ_LOCAL, &rqstp->rq_flags); + else + clear_bit(RQ_LOCAL, &rqstp->rq_flags); p = (__be32 *)rqstp->rq_arg.head[0].iov_base; calldir = p[1]; From 30660e04b0d4bbbd15fd21098681f45a9f4080b9 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 19 Nov 2014 07:51:16 -0500 Subject: [PATCH 17/32] sunrpc: move rq_usedeferral flag to rq_flags Signed-off-by: Jeff Layton Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4proc.c | 4 ++-- include/linux/sunrpc/svc.h | 2 +- include/trace/events/sunrpc.h | 9 +++++---- net/sunrpc/svc.c | 2 +- net/sunrpc/svc_xprt.c | 2 +- 5 files changed, 10 insertions(+), 9 deletions(-) diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 74fb15eefc31..6f98393ad0d9 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -1370,7 +1370,7 @@ nfsd4_proc_compound(struct svc_rqst *rqstp, * Don't use the deferral mechanism for NFSv4; compounds make it * too hard to avoid non-idempotency problems. */ - rqstp->rq_usedeferral = false; + clear_bit(RQ_USEDEFERRAL, &rqstp->rq_flags); /* * According to RFC3010, this takes precedence over all other errors. @@ -1486,7 +1486,7 @@ encode_op: BUG_ON(cstate->replay_owner); out: /* Reset deferral mechanism for RPC deferrals */ - rqstp->rq_usedeferral = true; + set_bit(RQ_USEDEFERRAL, &rqstp->rq_flags); dprintk("nfsv4 compound returned %d\n", ntohl(status)); return status; } diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index a91df9047f32..6a3cf4c76dce 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -236,7 +236,6 @@ struct svc_rqst { struct svc_cred rq_cred; /* auth info */ void * rq_xprt_ctxt; /* transport specific context ptr */ struct svc_deferred_req*rq_deferred; /* deferred request we are replaying */ - bool rq_usedeferral; /* use deferral */ size_t rq_xprt_hlen; /* xprt header len */ struct xdr_buf rq_arg; @@ -255,6 +254,7 @@ struct svc_rqst { u32 rq_prot; /* IP protocol */ #define RQ_SECURE (0) /* secure port */ #define RQ_LOCAL (1) /* local request */ +#define RQ_USEDEFERRAL (2) /* use deferral */ unsigned long rq_flags; /* flags field */ void * rq_argp; /* decoded arguments */ diff --git a/include/trace/events/sunrpc.h b/include/trace/events/sunrpc.h index 98259f163cd8..6d1facdebc92 100644 --- a/include/trace/events/sunrpc.h +++ b/include/trace/events/sunrpc.h @@ -412,10 +412,11 @@ TRACE_EVENT(xs_tcp_data_recv, __entry->copied, __entry->reclen, __entry->offset) ); -#define show_rqstp_flags(flags) \ - __print_flags(flags, "|", \ - { (1UL << RQ_SECURE), "RQ_SECURE"}, \ - { (1UL << RQ_LOCAL), "RQ_LOCAL"}) +#define show_rqstp_flags(flags) \ + __print_flags(flags, "|", \ + { (1UL << RQ_SECURE), "RQ_SECURE"}, \ + { (1UL << RQ_LOCAL), "RQ_LOCAL"}, \ + { (1UL << RQ_USEDEFERRAL), "RQ_USEDEFERRAL"}) TRACE_EVENT(svc_recv, TP_PROTO(struct svc_rqst *rqst, int status), diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index 163df468d6d5..f6a8f2f7a253 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -1090,7 +1090,7 @@ svc_process_common(struct svc_rqst *rqstp, struct kvec *argv, struct kvec *resv) /* Will be turned off only in gss privacy case: */ rqstp->rq_splice_ok = true; /* Will be turned off only when NFSv4 Sessions are used */ - rqstp->rq_usedeferral = true; + set_bit(RQ_USEDEFERRAL, &rqstp->rq_flags); rqstp->rq_dropme = false; /* Setup reply header */ diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index eaa9263c2d28..a40f3755a33d 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -1081,7 +1081,7 @@ static struct cache_deferred_req *svc_defer(struct cache_req *req) struct svc_rqst *rqstp = container_of(req, struct svc_rqst, rq_chandle); struct svc_deferred_req *dr; - if (rqstp->rq_arg.page_len || !rqstp->rq_usedeferral) + if (rqstp->rq_arg.page_len || !test_bit(RQ_USEDEFERRAL, &rqstp->rq_flags)) return NULL; /* if more than a page, give up FIXME */ if (rqstp->rq_deferred) { dr = rqstp->rq_deferred; From 78b65eb3fda95c6d131c4bbb0536e21f0bd7a7d4 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 19 Nov 2014 07:51:17 -0500 Subject: [PATCH 18/32] sunrpc: move rq_dropme flag into rq_flags Signed-off-by: Jeff Layton Signed-off-by: J. Bruce Fields --- fs/nfsd/nfssvc.c | 2 +- include/linux/sunrpc/svc.h | 2 +- include/trace/events/sunrpc.h | 8 ++++---- net/sunrpc/svc.c | 4 ++-- net/sunrpc/svc_xprt.c | 2 +- 5 files changed, 9 insertions(+), 9 deletions(-) diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index 752d56bbe0ba..314f5c8f8f1a 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -692,7 +692,7 @@ nfsd_dispatch(struct svc_rqst *rqstp, __be32 *statp) /* Now call the procedure handler, and encode NFS status. */ nfserr = proc->pc_func(rqstp, rqstp->rq_argp, rqstp->rq_resp); nfserr = map_new_errors(rqstp->rq_vers, nfserr); - if (nfserr == nfserr_dropit || rqstp->rq_dropme) { + if (nfserr == nfserr_dropit || test_bit(RQ_DROPME, &rqstp->rq_flags)) { dprintk("nfsd: Dropping request; may be revisited later\n"); nfsd_cache_update(rqstp, RC_NOCACHE, NULL); return 0; diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index 6a3cf4c76dce..d4ea3e5246b0 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -255,6 +255,7 @@ struct svc_rqst { #define RQ_SECURE (0) /* secure port */ #define RQ_LOCAL (1) /* local request */ #define RQ_USEDEFERRAL (2) /* use deferral */ +#define RQ_DROPME (3) /* drop current reply */ unsigned long rq_flags; /* flags field */ void * rq_argp; /* decoded arguments */ @@ -271,7 +272,6 @@ struct svc_rqst { struct cache_req rq_chandle; /* handle passed to caches for * request delaying */ - bool rq_dropme; /* Catering to nfsd */ struct auth_domain * rq_client; /* RPC peer info */ struct auth_domain * rq_gssclient; /* "gss/"-style peer info */ diff --git a/include/trace/events/sunrpc.h b/include/trace/events/sunrpc.h index 6d1facdebc92..355671f19a9f 100644 --- a/include/trace/events/sunrpc.h +++ b/include/trace/events/sunrpc.h @@ -416,7 +416,8 @@ TRACE_EVENT(xs_tcp_data_recv, __print_flags(flags, "|", \ { (1UL << RQ_SECURE), "RQ_SECURE"}, \ { (1UL << RQ_LOCAL), "RQ_LOCAL"}, \ - { (1UL << RQ_USEDEFERRAL), "RQ_USEDEFERRAL"}) + { (1UL << RQ_USEDEFERRAL), "RQ_USEDEFERRAL"}, \ + { (1UL << RQ_DROPME), "RQ_DROPME"}) TRACE_EVENT(svc_recv, TP_PROTO(struct svc_rqst *rqst, int status), @@ -459,13 +460,12 @@ DECLARE_EVENT_CLASS(svc_rqst_status, TP_fast_assign( __entry->addr = (struct sockaddr *)&rqst->rq_addr; __entry->xid = rqst->rq_xid; - __entry->dropme = (int)rqst->rq_dropme; __entry->status = status; __entry->flags = rqst->rq_flags; ), - TP_printk("addr=%pIScp rq_xid=0x%x dropme=%d status=%d flags=%s", - __entry->addr, be32_to_cpu(__entry->xid), __entry->dropme, + TP_printk("addr=%pIScp rq_xid=0x%x status=%d flags=%s", + __entry->addr, be32_to_cpu(__entry->xid), __entry->status, show_rqstp_flags(__entry->flags)) ); diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index f6a8f2f7a253..d8a9d603b786 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -1091,7 +1091,7 @@ svc_process_common(struct svc_rqst *rqstp, struct kvec *argv, struct kvec *resv) rqstp->rq_splice_ok = true; /* Will be turned off only when NFSv4 Sessions are used */ set_bit(RQ_USEDEFERRAL, &rqstp->rq_flags); - rqstp->rq_dropme = false; + clear_bit(RQ_DROPME, &rqstp->rq_flags); /* Setup reply header */ rqstp->rq_xprt->xpt_ops->xpo_prep_reply_hdr(rqstp); @@ -1191,7 +1191,7 @@ svc_process_common(struct svc_rqst *rqstp, struct kvec *argv, struct kvec *resv) *statp = procp->pc_func(rqstp, rqstp->rq_argp, rqstp->rq_resp); /* Encode reply */ - if (rqstp->rq_dropme) { + if (test_bit(RQ_DROPME, &rqstp->rq_flags)) { if (procp->pc_release) procp->pc_release(rqstp, NULL, rqstp->rq_resp); goto dropit; diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index a40f3755a33d..143c4c8ea2f1 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -1110,7 +1110,7 @@ static struct cache_deferred_req *svc_defer(struct cache_req *req) } svc_xprt_get(rqstp->rq_xprt); dr->xprt = rqstp->rq_xprt; - rqstp->rq_dropme = true; + set_bit(RQ_DROPME, &rqstp->rq_flags); dr->handle.revisit = svc_revisit; return &dr->handle; From 779fb0f3af3089daa2e88cf8ef0ef0c5d2fecb40 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 19 Nov 2014 07:51:18 -0500 Subject: [PATCH 19/32] sunrpc: move rq_splice_ok flag into rq_flags Signed-off-by: Jeff Layton Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4proc.c | 2 +- fs/nfsd/nfs4xdr.c | 8 ++++---- fs/nfsd/vfs.c | 2 +- include/linux/sunrpc/svc.h | 6 +++--- include/trace/events/sunrpc.h | 3 ++- net/sunrpc/auth_gss/svcauth_gss.c | 2 +- net/sunrpc/svc.c | 2 +- 7 files changed, 13 insertions(+), 12 deletions(-) diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 6f98393ad0d9..ac71d13c69ef 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -773,7 +773,7 @@ nfsd4_read(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, * the client wants us to do more in this compound: */ if (!nfsd4_last_compound_op(rqstp)) - rqstp->rq_splice_ok = false; + clear_bit(RQ_SPLICE_OK, &rqstp->rq_flags); /* check stateid */ if ((status = nfs4_preprocess_stateid_op(SVC_NET(rqstp), diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 0622d4f4a9a2..8880ec8cfd11 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -1731,7 +1731,7 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp) argp->rqstp->rq_cachetype = cachethis ? RC_REPLBUFF : RC_NOCACHE; if (readcount > 1 || max_reply > PAGE_SIZE - auth_slack) - argp->rqstp->rq_splice_ok = false; + clear_bit(RQ_SPLICE_OK, &argp->rqstp->rq_flags); DECODE_TAIL; } @@ -3253,10 +3253,10 @@ nfsd4_encode_read(struct nfsd4_compoundres *resp, __be32 nfserr, p = xdr_reserve_space(xdr, 8); /* eof flag and byte count */ if (!p) { - WARN_ON_ONCE(resp->rqstp->rq_splice_ok); + WARN_ON_ONCE(test_bit(RQ_SPLICE_OK, &resp->rqstp->rq_flags)); return nfserr_resource; } - if (resp->xdr.buf->page_len && resp->rqstp->rq_splice_ok) { + if (resp->xdr.buf->page_len && test_bit(RQ_SPLICE_OK, &resp->rqstp->rq_flags)) { WARN_ON_ONCE(1); return nfserr_resource; } @@ -3273,7 +3273,7 @@ nfsd4_encode_read(struct nfsd4_compoundres *resp, __be32 nfserr, goto err_truncate; } - if (file->f_op->splice_read && resp->rqstp->rq_splice_ok) + if (file->f_op->splice_read && test_bit(RQ_SPLICE_OK, &resp->rqstp->rq_flags)) err = nfsd4_encode_splice_read(resp, read, file, maxcount); else err = nfsd4_encode_readv(resp, read, file, maxcount); diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 60c25850fb16..cb00e48bc2b0 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -902,7 +902,7 @@ static __be32 nfsd_vfs_read(struct svc_rqst *rqstp, struct file *file, loff_t offset, struct kvec *vec, int vlen, unsigned long *count) { - if (file->f_op->splice_read && rqstp->rq_splice_ok) + if (file->f_op->splice_read && test_bit(RQ_SPLICE_OK, &rqstp->rq_flags)) return nfsd_splice_read(rqstp, file, offset, count); else return nfsd_readv(file, offset, vec, vlen, count); diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index d4ea3e5246b0..2714287fc4f6 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -256,6 +256,9 @@ struct svc_rqst { #define RQ_LOCAL (1) /* local request */ #define RQ_USEDEFERRAL (2) /* use deferral */ #define RQ_DROPME (3) /* drop current reply */ +#define RQ_SPLICE_OK (4) /* turned off in gss privacy + * to prevent encrypting page + * cache pages */ unsigned long rq_flags; /* flags field */ void * rq_argp; /* decoded arguments */ @@ -277,9 +280,6 @@ struct svc_rqst { struct auth_domain * rq_gssclient; /* "gss/"-style peer info */ int rq_cachetype; struct svc_cacherep * rq_cacherep; /* cache info */ - bool rq_splice_ok; /* turned off in gss privacy - * to prevent encrypting page - * cache pages */ struct task_struct *rq_task; /* service thread */ }; diff --git a/include/trace/events/sunrpc.h b/include/trace/events/sunrpc.h index 355671f19a9f..5848fc235869 100644 --- a/include/trace/events/sunrpc.h +++ b/include/trace/events/sunrpc.h @@ -417,7 +417,8 @@ TRACE_EVENT(xs_tcp_data_recv, { (1UL << RQ_SECURE), "RQ_SECURE"}, \ { (1UL << RQ_LOCAL), "RQ_LOCAL"}, \ { (1UL << RQ_USEDEFERRAL), "RQ_USEDEFERRAL"}, \ - { (1UL << RQ_DROPME), "RQ_DROPME"}) + { (1UL << RQ_DROPME), "RQ_DROPME"}, \ + { (1UL << RQ_SPLICE_OK), "RQ_SPLICE_OK"}) TRACE_EVENT(svc_recv, TP_PROTO(struct svc_rqst *rqst, int status), diff --git a/net/sunrpc/auth_gss/svcauth_gss.c b/net/sunrpc/auth_gss/svcauth_gss.c index de856ddf5fed..224a82f24d3c 100644 --- a/net/sunrpc/auth_gss/svcauth_gss.c +++ b/net/sunrpc/auth_gss/svcauth_gss.c @@ -886,7 +886,7 @@ unwrap_priv_data(struct svc_rqst *rqstp, struct xdr_buf *buf, u32 seq, struct gs u32 priv_len, maj_stat; int pad, saved_len, remaining_len, offset; - rqstp->rq_splice_ok = false; + clear_bit(RQ_SPLICE_OK, &rqstp->rq_flags); priv_len = svc_getnl(&buf->head[0]); if (rqstp->rq_deferred) { diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index d8a9d603b786..2c1c49ebd84d 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -1088,7 +1088,7 @@ svc_process_common(struct svc_rqst *rqstp, struct kvec *argv, struct kvec *resv) goto err_short_len; /* Will be turned off only in gss privacy case: */ - rqstp->rq_splice_ok = true; + set_bit(RQ_SPLICE_OK, &rqstp->rq_flags); /* Will be turned off only when NFSv4 Sessions are used */ set_bit(RQ_USEDEFERRAL, &rqstp->rq_flags); clear_bit(RQ_DROPME, &rqstp->rq_flags); From 62978b3c619422d0ea17dbd39efdb2328295bcfb Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 19 Nov 2014 07:51:19 -0500 Subject: [PATCH 20/32] sunrpc: move rq_cachetype field to better optimize space There are a couple of holes in the svc_rqst field on x86_64. Move the rq_cachetype to a different location to eliminate both of them. Signed-off-by: Jeff Layton Signed-off-by: J. Bruce Fields --- include/linux/sunrpc/svc.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index 2714287fc4f6..8054a30c8a95 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -252,6 +252,7 @@ struct svc_rqst { u32 rq_vers; /* program version */ u32 rq_proc; /* procedure number */ u32 rq_prot; /* IP protocol */ + int rq_cachetype; /* catering to nfsd */ #define RQ_SECURE (0) /* secure port */ #define RQ_LOCAL (1) /* local request */ #define RQ_USEDEFERRAL (2) /* use deferral */ @@ -278,7 +279,6 @@ struct svc_rqst { /* Catering to nfsd */ struct auth_domain * rq_client; /* RPC peer info */ struct auth_domain * rq_gssclient; /* "gss/"-style peer info */ - int rq_cachetype; struct svc_cacherep * rq_cacherep; /* cache info */ struct task_struct *rq_task; /* service thread */ }; From 4d5db3f536ae3886ac86877742e6f8ce69a5de06 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 19 Nov 2014 07:51:20 -0500 Subject: [PATCH 21/32] sunrpc: convert sp_task_pending flag to use atomic bitops In a later patch, we'll want to be able to handle this flag without holding the sp_lock. Change this field to an unsigned long flags field, and declare a new flag in it that can be managed with atomic bitops. Signed-off-by: Jeff Layton Signed-off-by: J. Bruce Fields --- include/linux/sunrpc/svc.h | 4 +++- net/sunrpc/svc_xprt.c | 7 +++---- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index 8054a30c8a95..5f0ab39bf7c3 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -50,7 +50,9 @@ struct svc_pool { unsigned int sp_nrthreads; /* # of threads in pool */ struct list_head sp_all_threads; /* all server threads */ struct svc_pool_stats sp_stats; /* statistics on pool operation */ - int sp_task_pending;/* has pending task */ +#define SP_TASK_PENDING (0) /* still work to do even if no + * xprt is queued. */ + unsigned long sp_flags; } ____cacheline_aligned_in_smp; /* diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index 143c4c8ea2f1..37446046f4bf 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -509,7 +509,7 @@ void svc_wake_up(struct svc_serv *serv) */ wake_up_process(rqstp->rq_task); } else - pool->sp_task_pending = 1; + set_bit(SP_TASK_PENDING, &pool->sp_flags); spin_unlock_bh(&pool->sp_lock); } } @@ -644,10 +644,9 @@ static struct svc_xprt *svc_get_next_xprt(struct svc_rqst *rqstp, long timeout) * long for cache updates. */ rqstp->rq_chandle.thread_wait = 1*HZ; - pool->sp_task_pending = 0; + clear_bit(SP_TASK_PENDING, &pool->sp_flags); } else { - if (pool->sp_task_pending) { - pool->sp_task_pending = 0; + if (test_and_clear_bit(SP_TASK_PENDING, &pool->sp_flags)) { xprt = ERR_PTR(-EAGAIN); goto out; } From ceff739c53a1734d820d013d7d98f932994674d2 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 19 Nov 2014 07:51:21 -0500 Subject: [PATCH 22/32] sunrpc: have svc_wake_up only deal with pool 0 The way that svc_wake_up works is a bit inefficient. It walks all of the available pools for a service and either wakes up a task in each one or sets the SP_TASK_PENDING flag in each one. When svc_wake_up is called, there is no need to wake up more than one thread to do this work. In practice, only lockd currently uses this function and it's single threaded anyway. Thus, this just boils down to doing a wake up of a thread in pool 0 or setting a single flag. Eliminate the for loop in this function and change it to just operate on pool 0. Also update the comments that sit above it and get rid of some code that has been commented out for years now. Signed-off-by: Jeff Layton Signed-off-by: J. Bruce Fields --- net/sunrpc/svc_xprt.c | 37 ++++++++++++++++--------------------- 1 file changed, 16 insertions(+), 21 deletions(-) diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index 37446046f4bf..b2676e597fc4 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -484,34 +484,29 @@ static void svc_xprt_release(struct svc_rqst *rqstp) } /* - * External function to wake up a server waiting for data - * This really only makes sense for services like lockd - * which have exactly one thread anyway. + * Some svc_serv's will have occasional work to do, even when a xprt is not + * waiting to be serviced. This function is there to "kick" a task in one of + * those services so that it can wake up and do that work. Note that we only + * bother with pool 0 as we don't need to wake up more than one thread for + * this purpose. */ void svc_wake_up(struct svc_serv *serv) { struct svc_rqst *rqstp; - unsigned int i; struct svc_pool *pool; - for (i = 0; i < serv->sv_nrpools; i++) { - pool = &serv->sv_pools[i]; + pool = &serv->sv_pools[0]; - spin_lock_bh(&pool->sp_lock); - if (!list_empty(&pool->sp_threads)) { - rqstp = list_entry(pool->sp_threads.next, - struct svc_rqst, - rq_list); - dprintk("svc: daemon %p woken up.\n", rqstp); - /* - svc_thread_dequeue(pool, rqstp); - rqstp->rq_xprt = NULL; - */ - wake_up_process(rqstp->rq_task); - } else - set_bit(SP_TASK_PENDING, &pool->sp_flags); - spin_unlock_bh(&pool->sp_lock); - } + spin_lock_bh(&pool->sp_lock); + if (!list_empty(&pool->sp_threads)) { + rqstp = list_entry(pool->sp_threads.next, + struct svc_rqst, + rq_list); + dprintk("svc: daemon %p woken up.\n", rqstp); + wake_up_process(rqstp->rq_task); + } else + set_bit(SP_TASK_PENDING, &pool->sp_flags); + spin_unlock_bh(&pool->sp_lock); } EXPORT_SYMBOL_GPL(svc_wake_up); From 0b5707e4524eb817b7b02863887820d27b56910a Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 19 Nov 2014 07:51:22 -0500 Subject: [PATCH 23/32] sunrpc: require svc_create callers to pass in meaningful shutdown routine Currently all svc_create callers pass in NULL for the shutdown parm, which then gets fixed up to be svc_rpcb_cleanup if the service uses rpcbind. Simplify this by instead having the the only caller that requires it (lockd) pass in svc_rpcb_cleanup and get rid of the special casing. Signed-off-by: Jeff Layton Signed-off-by: J. Bruce Fields --- fs/lockd/svc.c | 2 +- net/sunrpc/svc.c | 3 --- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c index d1bb7ecfd201..e94c887da2d7 100644 --- a/fs/lockd/svc.c +++ b/fs/lockd/svc.c @@ -350,7 +350,7 @@ static struct svc_serv *lockd_create_svc(void) printk(KERN_WARNING "lockd_up: no pid, %d users??\n", nlmsvc_users); - serv = svc_create(&nlmsvc_program, LOCKD_BUFSIZE, NULL); + serv = svc_create(&nlmsvc_program, LOCKD_BUFSIZE, svc_rpcb_cleanup); if (!serv) { printk(KERN_WARNING "lockd_up: create service failed\n"); return ERR_PTR(-ENOMEM); diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index 2c1c49ebd84d..a06a891012e5 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -482,9 +482,6 @@ __svc_create(struct svc_program *prog, unsigned int bufsize, int npools, spin_lock_init(&pool->sp_lock); } - if (svc_uses_rpcbind(serv) && (!serv->sv_shutdown)) - serv->sv_shutdown = svc_rpcb_cleanup; - return serv; } From 812443865c5fc255363d4a684a62c086af1addca Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Fri, 21 Nov 2014 14:19:28 -0500 Subject: [PATCH 24/32] sunrpc: add a rcu_head to svc_rqst and use kfree_rcu to free it ...also make the manipulation of sp_all_threads list use RCU-friendly functions. Signed-off-by: Jeff Layton Tested-by: Chris Worley Signed-off-by: J. Bruce Fields --- include/linux/sunrpc/svc.h | 2 ++ include/trace/events/sunrpc.h | 3 ++- net/sunrpc/svc.c | 10 ++++++---- 3 files changed, 10 insertions(+), 5 deletions(-) diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index 5f0ab39bf7c3..7f80a99c59e4 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -223,6 +223,7 @@ static inline void svc_putu32(struct kvec *iov, __be32 val) struct svc_rqst { struct list_head rq_list; /* idle list */ struct list_head rq_all; /* all threads list */ + struct rcu_head rq_rcu_head; /* for RCU deferred kfree */ struct svc_xprt * rq_xprt; /* transport ptr */ struct sockaddr_storage rq_addr; /* peer address */ @@ -262,6 +263,7 @@ struct svc_rqst { #define RQ_SPLICE_OK (4) /* turned off in gss privacy * to prevent encrypting page * cache pages */ +#define RQ_VICTIM (5) /* about to be shut down */ unsigned long rq_flags; /* flags field */ void * rq_argp; /* decoded arguments */ diff --git a/include/trace/events/sunrpc.h b/include/trace/events/sunrpc.h index 5848fc235869..08a5fed50f34 100644 --- a/include/trace/events/sunrpc.h +++ b/include/trace/events/sunrpc.h @@ -418,7 +418,8 @@ TRACE_EVENT(xs_tcp_data_recv, { (1UL << RQ_LOCAL), "RQ_LOCAL"}, \ { (1UL << RQ_USEDEFERRAL), "RQ_USEDEFERRAL"}, \ { (1UL << RQ_DROPME), "RQ_DROPME"}, \ - { (1UL << RQ_SPLICE_OK), "RQ_SPLICE_OK"}) + { (1UL << RQ_SPLICE_OK), "RQ_SPLICE_OK"}, \ + { (1UL << RQ_VICTIM), "RQ_VICTIM"}) TRACE_EVENT(svc_recv, TP_PROTO(struct svc_rqst *rqst, int status), diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index a06a891012e5..b90d1bca4349 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -616,7 +616,7 @@ svc_prepare_thread(struct svc_serv *serv, struct svc_pool *pool, int node) serv->sv_nrthreads++; spin_lock_bh(&pool->sp_lock); pool->sp_nrthreads++; - list_add(&rqstp->rq_all, &pool->sp_all_threads); + list_add_rcu(&rqstp->rq_all, &pool->sp_all_threads); spin_unlock_bh(&pool->sp_lock); rqstp->rq_server = serv; rqstp->rq_pool = pool; @@ -684,7 +684,8 @@ found_pool: * so we don't try to kill it again. */ rqstp = list_entry(pool->sp_all_threads.next, struct svc_rqst, rq_all); - list_del_init(&rqstp->rq_all); + set_bit(RQ_VICTIM, &rqstp->rq_flags); + list_del_rcu(&rqstp->rq_all); task = rqstp->rq_task; } spin_unlock_bh(&pool->sp_lock); @@ -782,10 +783,11 @@ svc_exit_thread(struct svc_rqst *rqstp) spin_lock_bh(&pool->sp_lock); pool->sp_nrthreads--; - list_del(&rqstp->rq_all); + if (!test_and_set_bit(RQ_VICTIM, &rqstp->rq_flags)) + list_del_rcu(&rqstp->rq_all); spin_unlock_bh(&pool->sp_lock); - kfree(rqstp); + kfree_rcu(rqstp, rq_rcu_head); /* Release the server */ if (serv) From 403c7b44441d60aba7f8a134c31279ffa60ea769 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Fri, 21 Nov 2014 14:19:29 -0500 Subject: [PATCH 25/32] sunrpc: fix potential races in pool_stats collection In a later patch, we'll be removing some spinlocking around the socket and thread queueing code in order to fix some contention problems. At that point, the stats counters will no longer be protected by the sp_lock. Change the counters to atomic_long_t fields, except for the "sockets_queued" counter which will still be manipulated under a spinlock. Signed-off-by: Jeff Layton Tested-by: Chris Worley Signed-off-by: J. Bruce Fields --- include/linux/sunrpc/svc.h | 6 +++--- net/sunrpc/svc_xprt.c | 12 ++++++------ 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index 7f80a99c59e4..513957eba0a5 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -26,10 +26,10 @@ typedef int (*svc_thread_fn)(void *); /* statistics for svc_pool structures */ struct svc_pool_stats { - unsigned long packets; + atomic_long_t packets; unsigned long sockets_queued; - unsigned long threads_woken; - unsigned long threads_timedout; + atomic_long_t threads_woken; + atomic_long_t threads_timedout; }; /* diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index b2676e597fc4..579ff2249562 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -362,7 +362,7 @@ static void svc_xprt_do_enqueue(struct svc_xprt *xprt) pool = svc_pool_for_cpu(xprt->xpt_server, cpu); spin_lock_bh(&pool->sp_lock); - pool->sp_stats.packets++; + atomic_long_inc(&pool->sp_stats.packets); if (!list_empty(&pool->sp_threads)) { rqstp = list_entry(pool->sp_threads.next, @@ -383,7 +383,7 @@ static void svc_xprt_do_enqueue(struct svc_xprt *xprt) svc_xprt_get(xprt); wake_up_process(rqstp->rq_task); rqstp->rq_xprt = xprt; - pool->sp_stats.threads_woken++; + atomic_long_inc(&pool->sp_stats.threads_woken); } else { dprintk("svc: transport %p put into queue\n", xprt); list_add_tail(&xprt->xpt_ready, &pool->sp_sockets); @@ -669,7 +669,7 @@ static struct svc_xprt *svc_get_next_xprt(struct svc_rqst *rqstp, long timeout) spin_lock_bh(&pool->sp_lock); if (!time_left) - pool->sp_stats.threads_timedout++; + atomic_long_inc(&pool->sp_stats.threads_timedout); xprt = rqstp->rq_xprt; if (!xprt) { @@ -1306,10 +1306,10 @@ static int svc_pool_stats_show(struct seq_file *m, void *p) seq_printf(m, "%u %lu %lu %lu %lu\n", pool->sp_id, - pool->sp_stats.packets, + (unsigned long)atomic_long_read(&pool->sp_stats.packets), pool->sp_stats.sockets_queued, - pool->sp_stats.threads_woken, - pool->sp_stats.threads_timedout); + (unsigned long)atomic_long_read(&pool->sp_stats.threads_woken), + (unsigned long)atomic_long_read(&pool->sp_stats.threads_timedout)); return 0; } From b1691bc03d4eddb959234409167bef9be9e62d74 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Fri, 21 Nov 2014 14:19:30 -0500 Subject: [PATCH 26/32] sunrpc: convert to lockless lookup of queued server threads Testing has shown that the pool->sp_lock can be a bottleneck on a busy server. Every time data is received on a socket, the server must take that lock in order to dequeue a thread from the sp_threads list. Address this problem by eliminating the sp_threads list (which contains threads that are currently idle) and replacing it with a RQ_BUSY flag in svc_rqst. This allows us to walk the sp_all_threads list under the rcu_read_lock and find a suitable thread for the xprt by doing a test_and_set_bit. Note that we do still have a potential atomicity problem however with this approach. We don't want svc_xprt_do_enqueue to set the rqst->rq_xprt pointer unless a test_and_set_bit of RQ_BUSY returned zero (which indicates that the thread was idle). But, by the time we check that, the bit could be flipped by a waking thread. To address this, we acquire a new per-rqst spinlock (rq_lock) and take that before doing the test_and_set_bit. If that returns false, then we can set rq_xprt and drop the spinlock. Then, when the thread wakes up, it must set the bit under the same spinlock and can trust that if it was already set then the rq_xprt is also properly set. With this scheme, the case where we have an idle thread no longer needs to take the highly contended pool->sp_lock at all, and that removes the bottleneck. That still leaves one issue: What of the case where we walk the whole sp_all_threads list and don't find an idle thread? Because the search is lockess, it's possible for the queueing to race with a thread that is going to sleep. To address that, we queue the xprt and then search again. If we find an idle thread at that point, we can't attach the xprt to it directly since that might race with a different thread waking up and finding it. All we can do is wake the idle thread back up and let it attempt to find the now-queued xprt. Signed-off-by: Jeff Layton Tested-by: Chris Worley Signed-off-by: J. Bruce Fields --- include/linux/sunrpc/svc.h | 4 +- include/trace/events/sunrpc.h | 3 +- net/sunrpc/svc.c | 7 +- net/sunrpc/svc_xprt.c | 231 +++++++++++++++++++--------------- 4 files changed, 137 insertions(+), 108 deletions(-) diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index 513957eba0a5..6f22cfeef5e3 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -45,7 +45,6 @@ struct svc_pool_stats { struct svc_pool { unsigned int sp_id; /* pool id; also node id on NUMA */ spinlock_t sp_lock; /* protects all fields */ - struct list_head sp_threads; /* idle server threads */ struct list_head sp_sockets; /* pending sockets */ unsigned int sp_nrthreads; /* # of threads in pool */ struct list_head sp_all_threads; /* all server threads */ @@ -221,7 +220,6 @@ static inline void svc_putu32(struct kvec *iov, __be32 val) * processed. */ struct svc_rqst { - struct list_head rq_list; /* idle list */ struct list_head rq_all; /* all threads list */ struct rcu_head rq_rcu_head; /* for RCU deferred kfree */ struct svc_xprt * rq_xprt; /* transport ptr */ @@ -264,6 +262,7 @@ struct svc_rqst { * to prevent encrypting page * cache pages */ #define RQ_VICTIM (5) /* about to be shut down */ +#define RQ_BUSY (6) /* request is busy */ unsigned long rq_flags; /* flags field */ void * rq_argp; /* decoded arguments */ @@ -285,6 +284,7 @@ struct svc_rqst { struct auth_domain * rq_gssclient; /* "gss/"-style peer info */ struct svc_cacherep * rq_cacherep; /* cache info */ struct task_struct *rq_task; /* service thread */ + spinlock_t rq_lock; /* per-request lock */ }; #define SVC_NET(svc_rqst) (svc_rqst->rq_xprt->xpt_net) diff --git a/include/trace/events/sunrpc.h b/include/trace/events/sunrpc.h index 08a5fed50f34..ee4438a63a48 100644 --- a/include/trace/events/sunrpc.h +++ b/include/trace/events/sunrpc.h @@ -419,7 +419,8 @@ TRACE_EVENT(xs_tcp_data_recv, { (1UL << RQ_USEDEFERRAL), "RQ_USEDEFERRAL"}, \ { (1UL << RQ_DROPME), "RQ_DROPME"}, \ { (1UL << RQ_SPLICE_OK), "RQ_SPLICE_OK"}, \ - { (1UL << RQ_VICTIM), "RQ_VICTIM"}) + { (1UL << RQ_VICTIM), "RQ_VICTIM"}, \ + { (1UL << RQ_BUSY), "RQ_BUSY"}) TRACE_EVENT(svc_recv, TP_PROTO(struct svc_rqst *rqst, int status), diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index b90d1bca4349..91eaef1844c8 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -476,7 +476,6 @@ __svc_create(struct svc_program *prog, unsigned int bufsize, int npools, i, serv->sv_name); pool->sp_id = i; - INIT_LIST_HEAD(&pool->sp_threads); INIT_LIST_HEAD(&pool->sp_sockets); INIT_LIST_HEAD(&pool->sp_all_threads); spin_lock_init(&pool->sp_lock); @@ -614,12 +613,14 @@ svc_prepare_thread(struct svc_serv *serv, struct svc_pool *pool, int node) goto out_enomem; serv->sv_nrthreads++; + __set_bit(RQ_BUSY, &rqstp->rq_flags); + spin_lock_init(&rqstp->rq_lock); + rqstp->rq_server = serv; + rqstp->rq_pool = pool; spin_lock_bh(&pool->sp_lock); pool->sp_nrthreads++; list_add_rcu(&rqstp->rq_all, &pool->sp_all_threads); spin_unlock_bh(&pool->sp_lock); - rqstp->rq_server = serv; - rqstp->rq_pool = pool; rqstp->rq_argp = kmalloc_node(serv->sv_xdrsize, GFP_KERNEL, node); if (!rqstp->rq_argp) diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index 579ff2249562..ed90d955f733 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -310,25 +310,6 @@ char *svc_print_addr(struct svc_rqst *rqstp, char *buf, size_t len) } EXPORT_SYMBOL_GPL(svc_print_addr); -/* - * Queue up an idle server thread. Must have pool->sp_lock held. - * Note: this is really a stack rather than a queue, so that we only - * use as many different threads as we need, and the rest don't pollute - * the cache. - */ -static void svc_thread_enqueue(struct svc_pool *pool, struct svc_rqst *rqstp) -{ - list_add(&rqstp->rq_list, &pool->sp_threads); -} - -/* - * Dequeue an nfsd thread. Must have pool->sp_lock held. - */ -static void svc_thread_dequeue(struct svc_pool *pool, struct svc_rqst *rqstp) -{ - list_del(&rqstp->rq_list); -} - static bool svc_xprt_has_something_to_do(struct svc_xprt *xprt) { if (xprt->xpt_flags & ((1<xpt_server, cpu); - spin_lock_bh(&pool->sp_lock); atomic_long_inc(&pool->sp_stats.packets); - if (!list_empty(&pool->sp_threads)) { - rqstp = list_entry(pool->sp_threads.next, - struct svc_rqst, - rq_list); - dprintk("svc: transport %p served by daemon %p\n", - xprt, rqstp); - svc_thread_dequeue(pool, rqstp); - if (rqstp->rq_xprt) - printk(KERN_ERR - "svc_xprt_enqueue: server %p, rq_xprt=%p!\n", - rqstp, rqstp->rq_xprt); - /* Note the order of the following 3 lines: - * We want to assign xprt to rqstp->rq_xprt only _after_ - * we've woken up the process, so that we don't race with - * the lockless check in svc_get_next_xprt(). +redo_search: + /* find a thread for this xprt */ + rcu_read_lock(); + list_for_each_entry_rcu(rqstp, &pool->sp_all_threads, rq_all) { + /* Do a lockless check first */ + if (test_bit(RQ_BUSY, &rqstp->rq_flags)) + continue; + + /* + * Once the xprt has been queued, it can only be dequeued by + * the task that intends to service it. All we can do at that + * point is to try to wake this thread back up so that it can + * do so. */ - svc_xprt_get(xprt); - wake_up_process(rqstp->rq_task); - rqstp->rq_xprt = xprt; + if (!queued) { + spin_lock_bh(&rqstp->rq_lock); + if (test_and_set_bit(RQ_BUSY, &rqstp->rq_flags)) { + /* already busy, move on... */ + spin_unlock_bh(&rqstp->rq_lock); + continue; + } + + /* this one will do */ + rqstp->rq_xprt = xprt; + svc_xprt_get(xprt); + spin_unlock_bh(&rqstp->rq_lock); + } + rcu_read_unlock(); + atomic_long_inc(&pool->sp_stats.threads_woken); - } else { + wake_up_process(rqstp->rq_task); + put_cpu(); + return; + } + rcu_read_unlock(); + + /* + * We didn't find an idle thread to use, so we need to queue the xprt. + * Do so and then search again. If we find one, we can't hook this one + * up to it directly but we can wake the thread up in the hopes that it + * will pick it up once it searches for a xprt to service. + */ + if (!queued) { + queued = true; dprintk("svc: transport %p put into queue\n", xprt); + spin_lock_bh(&pool->sp_lock); list_add_tail(&xprt->xpt_ready, &pool->sp_sockets); pool->sp_stats.sockets_queued++; + spin_unlock_bh(&pool->sp_lock); + goto redo_search; } - - spin_unlock_bh(&pool->sp_lock); put_cpu(); } @@ -408,21 +413,26 @@ void svc_xprt_enqueue(struct svc_xprt *xprt) EXPORT_SYMBOL_GPL(svc_xprt_enqueue); /* - * Dequeue the first transport. Must be called with the pool->sp_lock held. + * Dequeue the first transport, if there is one. */ static struct svc_xprt *svc_xprt_dequeue(struct svc_pool *pool) { - struct svc_xprt *xprt; + struct svc_xprt *xprt = NULL; if (list_empty(&pool->sp_sockets)) return NULL; - xprt = list_entry(pool->sp_sockets.next, - struct svc_xprt, xpt_ready); - list_del_init(&xprt->xpt_ready); + spin_lock_bh(&pool->sp_lock); + if (likely(!list_empty(&pool->sp_sockets))) { + xprt = list_first_entry(&pool->sp_sockets, + struct svc_xprt, xpt_ready); + list_del_init(&xprt->xpt_ready); + svc_xprt_get(xprt); - dprintk("svc: transport %p dequeued, inuse=%d\n", - xprt, atomic_read(&xprt->xpt_ref.refcount)); + dprintk("svc: transport %p dequeued, inuse=%d\n", + xprt, atomic_read(&xprt->xpt_ref.refcount)); + } + spin_unlock_bh(&pool->sp_lock); return xprt; } @@ -497,16 +507,21 @@ void svc_wake_up(struct svc_serv *serv) pool = &serv->sv_pools[0]; - spin_lock_bh(&pool->sp_lock); - if (!list_empty(&pool->sp_threads)) { - rqstp = list_entry(pool->sp_threads.next, - struct svc_rqst, - rq_list); + rcu_read_lock(); + list_for_each_entry_rcu(rqstp, &pool->sp_all_threads, rq_all) { + /* skip any that aren't queued */ + if (test_bit(RQ_BUSY, &rqstp->rq_flags)) + continue; + rcu_read_unlock(); dprintk("svc: daemon %p woken up.\n", rqstp); wake_up_process(rqstp->rq_task); - } else - set_bit(SP_TASK_PENDING, &pool->sp_flags); - spin_unlock_bh(&pool->sp_lock); + return; + } + rcu_read_unlock(); + + /* No free entries available */ + set_bit(SP_TASK_PENDING, &pool->sp_flags); + smp_wmb(); } EXPORT_SYMBOL_GPL(svc_wake_up); @@ -617,22 +632,47 @@ static int svc_alloc_arg(struct svc_rqst *rqstp) return 0; } +static bool +rqst_should_sleep(struct svc_rqst *rqstp) +{ + struct svc_pool *pool = rqstp->rq_pool; + + /* did someone call svc_wake_up? */ + if (test_and_clear_bit(SP_TASK_PENDING, &pool->sp_flags)) + return false; + + /* was a socket queued? */ + if (!list_empty(&pool->sp_sockets)) + return false; + + /* are we shutting down? */ + if (signalled() || kthread_should_stop()) + return false; + + /* are we freezing? */ + if (freezing(current)) + return false; + + return true; +} + static struct svc_xprt *svc_get_next_xprt(struct svc_rqst *rqstp, long timeout) { struct svc_xprt *xprt; struct svc_pool *pool = rqstp->rq_pool; long time_left = 0; + /* rq_xprt should be clear on entry */ + WARN_ON_ONCE(rqstp->rq_xprt); + /* Normally we will wait up to 5 seconds for any required * cache information to be provided. */ rqstp->rq_chandle.thread_wait = 5*HZ; - spin_lock_bh(&pool->sp_lock); xprt = svc_xprt_dequeue(pool); if (xprt) { rqstp->rq_xprt = xprt; - svc_xprt_get(xprt); /* As there is a shortage of threads and this request * had to be queued, don't allow the thread to wait so @@ -640,51 +680,38 @@ static struct svc_xprt *svc_get_next_xprt(struct svc_rqst *rqstp, long timeout) */ rqstp->rq_chandle.thread_wait = 1*HZ; clear_bit(SP_TASK_PENDING, &pool->sp_flags); - } else { - if (test_and_clear_bit(SP_TASK_PENDING, &pool->sp_flags)) { - xprt = ERR_PTR(-EAGAIN); - goto out; - } - /* - * We have to be able to interrupt this wait - * to bring down the daemons ... - */ - set_current_state(TASK_INTERRUPTIBLE); - - /* No data pending. Go to sleep */ - svc_thread_enqueue(pool, rqstp); - spin_unlock_bh(&pool->sp_lock); - - if (!(signalled() || kthread_should_stop())) { - time_left = schedule_timeout(timeout); - __set_current_state(TASK_RUNNING); - - try_to_freeze(); - - xprt = rqstp->rq_xprt; - if (xprt != NULL) - return xprt; - } else - __set_current_state(TASK_RUNNING); - - spin_lock_bh(&pool->sp_lock); - if (!time_left) - atomic_long_inc(&pool->sp_stats.threads_timedout); - - xprt = rqstp->rq_xprt; - if (!xprt) { - svc_thread_dequeue(pool, rqstp); - spin_unlock_bh(&pool->sp_lock); - dprintk("svc: server %p, no data yet\n", rqstp); - if (signalled() || kthread_should_stop()) - return ERR_PTR(-EINTR); - else - return ERR_PTR(-EAGAIN); - } + return xprt; } -out: - spin_unlock_bh(&pool->sp_lock); - return xprt; + + /* + * We have to be able to interrupt this wait + * to bring down the daemons ... + */ + set_current_state(TASK_INTERRUPTIBLE); + clear_bit(RQ_BUSY, &rqstp->rq_flags); + smp_mb(); + + if (likely(rqst_should_sleep(rqstp))) + time_left = schedule_timeout(timeout); + else + __set_current_state(TASK_RUNNING); + + try_to_freeze(); + + spin_lock_bh(&rqstp->rq_lock); + set_bit(RQ_BUSY, &rqstp->rq_flags); + spin_unlock_bh(&rqstp->rq_lock); + + xprt = rqstp->rq_xprt; + if (xprt != NULL) + return xprt; + + if (!time_left) + atomic_long_inc(&pool->sp_stats.threads_timedout); + + if (signalled() || kthread_should_stop()) + return ERR_PTR(-EINTR); + return ERR_PTR(-EAGAIN); } static void svc_add_new_temp_xprt(struct svc_serv *serv, struct svc_xprt *newxpt) From 83a712e0afefaf68555f816ea78ecd2862c6cf30 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Fri, 21 Nov 2014 14:19:31 -0500 Subject: [PATCH 27/32] sunrpc: add some tracepoints around enqueue and dequeue of svc_xprt These were useful when I was tracking down a race condition between svc_xprt_do_enqueue and svc_get_next_xprt. Signed-off-by: Jeff Layton Signed-off-by: J. Bruce Fields --- include/trace/events/sunrpc.h | 94 +++++++++++++++++++++++++++++++++++ net/sunrpc/svc_xprt.c | 22 +++++--- 2 files changed, 109 insertions(+), 7 deletions(-) diff --git a/include/trace/events/sunrpc.h b/include/trace/events/sunrpc.h index ee4438a63a48..b9c1dc6c825a 100644 --- a/include/trace/events/sunrpc.h +++ b/include/trace/events/sunrpc.h @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include @@ -480,6 +481,99 @@ DEFINE_EVENT(svc_rqst_status, svc_send, TP_PROTO(struct svc_rqst *rqst, int status), TP_ARGS(rqst, status)); +#define show_svc_xprt_flags(flags) \ + __print_flags(flags, "|", \ + { (1UL << XPT_BUSY), "XPT_BUSY"}, \ + { (1UL << XPT_CONN), "XPT_CONN"}, \ + { (1UL << XPT_CLOSE), "XPT_CLOSE"}, \ + { (1UL << XPT_DATA), "XPT_DATA"}, \ + { (1UL << XPT_TEMP), "XPT_TEMP"}, \ + { (1UL << XPT_DEAD), "XPT_DEAD"}, \ + { (1UL << XPT_CHNGBUF), "XPT_CHNGBUF"}, \ + { (1UL << XPT_DEFERRED), "XPT_DEFERRED"}, \ + { (1UL << XPT_OLD), "XPT_OLD"}, \ + { (1UL << XPT_LISTENER), "XPT_LISTENER"}, \ + { (1UL << XPT_CACHE_AUTH), "XPT_CACHE_AUTH"}, \ + { (1UL << XPT_LOCAL), "XPT_LOCAL"}) + +TRACE_EVENT(svc_xprt_do_enqueue, + TP_PROTO(struct svc_xprt *xprt, struct svc_rqst *rqst), + + TP_ARGS(xprt, rqst), + + TP_STRUCT__entry( + __field(struct svc_xprt *, xprt) + __field(struct svc_rqst *, rqst) + ), + + TP_fast_assign( + __entry->xprt = xprt; + __entry->rqst = rqst; + ), + + TP_printk("xprt=0x%p addr=%pIScp pid=%d flags=%s", __entry->xprt, + (struct sockaddr *)&__entry->xprt->xpt_remote, + __entry->rqst ? __entry->rqst->rq_task->pid : 0, + show_svc_xprt_flags(__entry->xprt->xpt_flags)) +); + +TRACE_EVENT(svc_xprt_dequeue, + TP_PROTO(struct svc_xprt *xprt), + + TP_ARGS(xprt), + + TP_STRUCT__entry( + __field(struct svc_xprt *, xprt) + __field_struct(struct sockaddr_storage, ss) + __field(unsigned long, flags) + ), + + TP_fast_assign( + __entry->xprt = xprt, + xprt ? memcpy(&__entry->ss, &xprt->xpt_remote, sizeof(__entry->ss)) : memset(&__entry->ss, 0, sizeof(__entry->ss)); + __entry->flags = xprt ? xprt->xpt_flags : 0; + ), + + TP_printk("xprt=0x%p addr=%pIScp flags=%s", __entry->xprt, + (struct sockaddr *)&__entry->ss, + show_svc_xprt_flags(__entry->flags)) +); + +TRACE_EVENT(svc_wake_up, + TP_PROTO(int pid), + + TP_ARGS(pid), + + TP_STRUCT__entry( + __field(int, pid) + ), + + TP_fast_assign( + __entry->pid = pid; + ), + + TP_printk("pid=%d", __entry->pid) +); + +TRACE_EVENT(svc_handle_xprt, + TP_PROTO(struct svc_xprt *xprt, int len), + + TP_ARGS(xprt, len), + + TP_STRUCT__entry( + __field(struct svc_xprt *, xprt) + __field(int, len) + ), + + TP_fast_assign( + __entry->xprt = xprt; + __entry->len = len; + ), + + TP_printk("xprt=0x%p addr=%pIScp len=%d flags=%s", __entry->xprt, + (struct sockaddr *)&__entry->xprt->xpt_remote, __entry->len, + show_svc_xprt_flags(__entry->xprt->xpt_flags)) +); #endif /* _TRACE_SUNRPC_H */ #include diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index ed90d955f733..73d40bd1839a 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -322,12 +322,12 @@ static bool svc_xprt_has_something_to_do(struct svc_xprt *xprt) static void svc_xprt_do_enqueue(struct svc_xprt *xprt) { struct svc_pool *pool; - struct svc_rqst *rqstp; + struct svc_rqst *rqstp = NULL; int cpu; bool queued = false; if (!svc_xprt_has_something_to_do(xprt)) - return; + goto out; /* Mark transport as busy. It will remain in this state until * the provider calls svc_xprt_received. We update XPT_BUSY @@ -337,7 +337,7 @@ static void svc_xprt_do_enqueue(struct svc_xprt *xprt) if (test_and_set_bit(XPT_BUSY, &xprt->xpt_flags)) { /* Don't enqueue transport while already enqueued */ dprintk("svc: transport %p busy, not enqueued\n", xprt); - return; + goto out; } cpu = get_cpu(); @@ -377,7 +377,7 @@ redo_search: atomic_long_inc(&pool->sp_stats.threads_woken); wake_up_process(rqstp->rq_task); put_cpu(); - return; + goto out; } rcu_read_unlock(); @@ -396,7 +396,10 @@ redo_search: spin_unlock_bh(&pool->sp_lock); goto redo_search; } + rqstp = NULL; put_cpu(); +out: + trace_svc_xprt_do_enqueue(xprt, rqstp); } /* @@ -420,7 +423,7 @@ static struct svc_xprt *svc_xprt_dequeue(struct svc_pool *pool) struct svc_xprt *xprt = NULL; if (list_empty(&pool->sp_sockets)) - return NULL; + goto out; spin_lock_bh(&pool->sp_lock); if (likely(!list_empty(&pool->sp_sockets))) { @@ -433,7 +436,8 @@ static struct svc_xprt *svc_xprt_dequeue(struct svc_pool *pool) xprt, atomic_read(&xprt->xpt_ref.refcount)); } spin_unlock_bh(&pool->sp_lock); - +out: + trace_svc_xprt_dequeue(xprt); return xprt; } @@ -515,6 +519,7 @@ void svc_wake_up(struct svc_serv *serv) rcu_read_unlock(); dprintk("svc: daemon %p woken up.\n", rqstp); wake_up_process(rqstp->rq_task); + trace_svc_wake_up(rqstp->rq_task->pid); return; } rcu_read_unlock(); @@ -522,6 +527,7 @@ void svc_wake_up(struct svc_serv *serv) /* No free entries available */ set_bit(SP_TASK_PENDING, &pool->sp_flags); smp_wmb(); + trace_svc_wake_up(0); } EXPORT_SYMBOL_GPL(svc_wake_up); @@ -740,7 +746,7 @@ static int svc_handle_xprt(struct svc_rqst *rqstp, struct svc_xprt *xprt) dprintk("svc_recv: found XPT_CLOSE\n"); svc_delete_xprt(xprt); /* Leave XPT_BUSY set on the dead xprt: */ - return 0; + goto out; } if (test_bit(XPT_LISTENER, &xprt->xpt_flags)) { struct svc_xprt *newxpt; @@ -771,6 +777,8 @@ static int svc_handle_xprt(struct svc_rqst *rqstp, struct svc_xprt *xprt) } /* clear XPT_BUSY: */ svc_xprt_received(xprt); +out: + trace_svc_handle_xprt(xprt, len); return len; } From ef17af2a817db97d42dd2ec0a425231748e23dbc Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Fri, 5 Dec 2014 16:40:07 +0100 Subject: [PATCH 28/32] fs: nfsd: Fix signedness bug in compare_blob Bugs similar to the one in acbbe6fbb240 (kcmp: fix standard comparison bug) are in rich supply. In this variant, the problem is that struct xdr_netobj::len has type unsigned int, so the expression o1->len - o2->len _also_ has type unsigned int; it has completely well-defined semantics, and the result is some non-negative integer, which is always representable in a long long. But this means that if the conditional triggers, we are guaranteed to return a positive value from compare_blob. In this case it could be fixed by - res = o1->len - o2->len; + res = (long long)o1->len - (long long)o2->len; but I'd rather eliminate the usually broken 'return a - b;' idiom. Reviewed-by: Jeff Layton Cc: Signed-off-by: Rasmus Villemoes Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4state.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 1379d86f7b4f..8770ba75b7b4 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -1714,15 +1714,14 @@ static int copy_cred(struct svc_cred *target, struct svc_cred *source) return 0; } -static long long +static int compare_blob(const struct xdr_netobj *o1, const struct xdr_netobj *o2) { - long long res; - - res = o1->len - o2->len; - if (res) - return res; - return (long long)memcmp(o1->data, o2->data, o1->len); + if (o1->len < o2->len) + return -1; + if (o1->len > o2->len) + return 1; + return memcmp(o1->data, o2->data, o1->len); } static int same_name(const char *n1, const char *n2) @@ -1910,7 +1909,7 @@ add_clp_to_name_tree(struct nfs4_client *new_clp, struct rb_root *root) static struct nfs4_client * find_clp_in_name_tree(struct xdr_netobj *name, struct rb_root *root) { - long long cmp; + int cmp; struct rb_node *node = root->rb_node; struct nfs4_client *clp; From acf06a7fa12070abb3eab24fc4bc30e361a7c416 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 1 Dec 2014 13:45:24 -0500 Subject: [PATCH 29/32] sunrpc: only call test_bit once in svc_xprt_received ...move the WARN_ON_ONCE inside the following if block since they use the same condition. Signed-off-by: Jeff Layton Signed-off-by: J. Bruce Fields --- net/sunrpc/svc_xprt.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index 73d40bd1839a..c69358b3cf7f 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -220,9 +220,11 @@ static struct svc_xprt *__svc_xpo_create(struct svc_xprt_class *xcl, */ static void svc_xprt_received(struct svc_xprt *xprt) { - WARN_ON_ONCE(!test_bit(XPT_BUSY, &xprt->xpt_flags)); - if (!test_bit(XPT_BUSY, &xprt->xpt_flags)) + if (!test_bit(XPT_BUSY, &xprt->xpt_flags)) { + WARN_ONCE(1, "xprt=0x%p already busy!", xprt); return; + } + /* As soon as we clear busy, the xprt could be closed and * 'put', so we need a reference to call svc_xprt_do_enqueue with: */ From 1b2e122d167d8983775eb57d55349c331e6aa6c7 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 28 Nov 2014 17:50:28 +0200 Subject: [PATCH 30/32] sunrpc/cache: convert to use string_escape_str() There is nice kernel helper to escape a given strings by provided rules. Let's use it instead of custom approach. Signed-off-by: Andy Shevchenko [bfields@redhat.com: fix length calculation] Signed-off-by: J. Bruce Fields --- net/sunrpc/cache.c | 26 ++++++-------------------- 1 file changed, 6 insertions(+), 20 deletions(-) diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c index 066362141133..33fb105d4352 100644 --- a/net/sunrpc/cache.c +++ b/net/sunrpc/cache.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -1067,30 +1068,15 @@ void qword_add(char **bpp, int *lp, char *str) { char *bp = *bpp; int len = *lp; - char c; + int ret; if (len < 0) return; - while ((c=*str++) && len) - switch(c) { - case ' ': - case '\t': - case '\n': - case '\\': - if (len >= 4) { - *bp++ = '\\'; - *bp++ = '0' + ((c & 0300)>>6); - *bp++ = '0' + ((c & 0070)>>3); - *bp++ = '0' + ((c & 0007)>>0); - } - len -= 4; - break; - default: - *bp++ = c; - len--; - } - if (c || len <1) len = -1; + ret = string_escape_str(str, &bp, len, ESCAPE_OCTAL, "\\ \n\t"); + if (ret < 0 || ret == len) + len = -1; else { + len -= ret; *bp++ = ' '; len--; } From 5a64e56976f1ba98743e1678c0029a98e9034c81 Mon Sep 17 00:00:00 2001 From: Benjamin Coddington Date: Sun, 7 Dec 2014 16:05:47 -0500 Subject: [PATCH 31/32] nfsd4: fix xdr4 inclusion of escaped char Fix a bug where nfsd4_encode_components_esc() includes the esc_end char as an additional string encoding. Signed-off-by: Benjamin Coddington Cc: stable@vger.kernel.org Fixes: e7a0444aef4a "nfsd: add IPv6 addr escaping to fs_location hosts" Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4xdr.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 8880ec8cfd11..a8549f8fef57 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -1812,6 +1812,9 @@ static __be32 nfsd4_encode_components_esc(struct xdr_stream *xdr, char sep, } else end++; + if (found_esc) + end = next; + str = end; } pathlen = htonl(xdr->buf->len - pathlen_offset); From bf7491f1be5e125eece2ec67e0f79d513caa6c7e Mon Sep 17 00:00:00 2001 From: Benjamin Coddington Date: Sun, 7 Dec 2014 16:05:48 -0500 Subject: [PATCH 32/32] nfsd4: fix xdr4 count of server in fs_location4 Fix a bug where nfsd4_encode_components_esc() incorrectly calculates the length of server array in fs_location4--note that it is a count of the number of array elements, not a length in bytes. Signed-off-by: Benjamin Coddington Fixes: 082d4bd72a45 (nfsd4: "backfill" using write_bytes_to_xdr_buf) Cc: stable@vger.kernel.org Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4xdr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index a8549f8fef57..e578c87d5527 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -1817,7 +1817,7 @@ static __be32 nfsd4_encode_components_esc(struct xdr_stream *xdr, char sep, str = end; } - pathlen = htonl(xdr->buf->len - pathlen_offset); + pathlen = htonl(count); write_bytes_to_xdr_buf(xdr->buf, pathlen_offset, &pathlen, 4); return 0; }