From aede517207b2bf4e5176492ce3be0e45feb581bd Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 19 Jul 2021 10:48:04 -0400 Subject: [PATCH 01/39] SUNRPC: Refactor rpc_ping() Make it use the rpc_null_call_helper() so that it can share the new rpc_call_ops structure to be introduced in the next patch. Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/clnt.c | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 8b4de70e8ead..ca2000d8cf64 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -2694,17 +2694,6 @@ static const struct rpc_procinfo rpcproc_null = { .p_decode = rpcproc_decode_null, }; -static int rpc_ping(struct rpc_clnt *clnt) -{ - struct rpc_message msg = { - .rpc_proc = &rpcproc_null, - }; - int err; - err = rpc_call_sync(clnt, &msg, RPC_TASK_SOFT | RPC_TASK_SOFTCONN | - RPC_TASK_NULLCREDS); - return err; -} - static struct rpc_task *rpc_call_null_helper(struct rpc_clnt *clnt, struct rpc_xprt *xprt, struct rpc_cred *cred, int flags, @@ -2733,6 +2722,19 @@ struct rpc_task *rpc_call_null(struct rpc_clnt *clnt, struct rpc_cred *cred, int } EXPORT_SYMBOL_GPL(rpc_call_null); +static int rpc_ping(struct rpc_clnt *clnt) +{ + struct rpc_task *task; + int status; + + task = rpc_call_null_helper(clnt, NULL, NULL, 0, NULL, NULL); + if (IS_ERR(task)) + return PTR_ERR(task); + status = task->tk_status; + rpc_put_task(task); + return status; +} + struct rpc_cb_add_xprt_calldata { struct rpc_xprt_switch *xps; struct rpc_xprt *xprt; From 823c73d0c539a0c1740c7929a7aad3f844e69d70 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 19 Jul 2021 10:48:10 -0400 Subject: [PATCH 02/39] SUNRPC: Unset RPC_TASK_NO_RETRANS_TIMEOUT for NULL RPCs In some rare failure modes, the server is actually reading the transport, but then just dropping the requests on the floor. TCP_USER_TIMEOUT cannot detect that case. Prevent such a stuck server from pinning client resources indefinitely by ensuring that certain idempotent requests (such as NULL) can time out even if the connection is still operational. Otherwise rpc_bind_new_program(), gss_destroy_cred(), or rpc_clnt_test_and_add_xprt() can wait forever. Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/clnt.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index ca2000d8cf64..d34737a8a68a 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -2694,6 +2694,18 @@ static const struct rpc_procinfo rpcproc_null = { .p_decode = rpcproc_decode_null, }; +static void +rpc_null_call_prepare(struct rpc_task *task, void *data) +{ + task->tk_flags &= ~RPC_TASK_NO_RETRANS_TIMEOUT; + rpc_call_start(task); +} + +static const struct rpc_call_ops rpc_null_ops = { + .rpc_call_prepare = rpc_null_call_prepare, + .rpc_call_done = rpc_default_callback, +}; + static struct rpc_task *rpc_call_null_helper(struct rpc_clnt *clnt, struct rpc_xprt *xprt, struct rpc_cred *cred, int flags, @@ -2707,7 +2719,7 @@ struct rpc_task *rpc_call_null_helper(struct rpc_clnt *clnt, .rpc_xprt = xprt, .rpc_message = &msg, .rpc_op_cred = cred, - .callback_ops = (ops != NULL) ? ops : &rpc_default_ops, + .callback_ops = ops ?: &rpc_null_ops, .callback_data = data, .flags = flags | RPC_TASK_SOFT | RPC_TASK_SOFTCONN | RPC_TASK_NULLCREDS, @@ -2758,6 +2770,7 @@ static void rpc_cb_add_xprt_release(void *calldata) } static const struct rpc_call_ops rpc_cb_add_xprt_call_ops = { + .rpc_call_prepare = rpc_null_call_prepare, .rpc_call_done = rpc_cb_add_xprt_done, .rpc_release = rpc_cb_add_xprt_release, }; From d480696dc68943538b81a26b0f4f39eb50c41380 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 19 Jul 2021 10:48:16 -0400 Subject: [PATCH 03/39] SUNRPC: Remove unneeded TRACE_DEFINE_ENUMs Clean up: TRACE_DEFINE_ENUM is needed only for enums, not for C macros. Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- include/trace/events/sunrpc.h | 34 ---------------------------------- 1 file changed, 34 deletions(-) diff --git a/include/trace/events/sunrpc.h b/include/trace/events/sunrpc.h index 861f199896c6..ea6340129b1b 100644 --- a/include/trace/events/sunrpc.h +++ b/include/trace/events/sunrpc.h @@ -295,21 +295,6 @@ TRACE_EVENT(rpc_request, ) ); -TRACE_DEFINE_ENUM(RPC_TASK_ASYNC); -TRACE_DEFINE_ENUM(RPC_TASK_SWAPPER); -TRACE_DEFINE_ENUM(RPC_TASK_NULLCREDS); -TRACE_DEFINE_ENUM(RPC_CALL_MAJORSEEN); -TRACE_DEFINE_ENUM(RPC_TASK_ROOTCREDS); -TRACE_DEFINE_ENUM(RPC_TASK_DYNAMIC); -TRACE_DEFINE_ENUM(RPC_TASK_NO_ROUND_ROBIN); -TRACE_DEFINE_ENUM(RPC_TASK_SOFT); -TRACE_DEFINE_ENUM(RPC_TASK_SOFTCONN); -TRACE_DEFINE_ENUM(RPC_TASK_SENT); -TRACE_DEFINE_ENUM(RPC_TASK_TIMEOUT); -TRACE_DEFINE_ENUM(RPC_TASK_NOCONNECT); -TRACE_DEFINE_ENUM(RPC_TASK_NO_RETRANS_TIMEOUT); -TRACE_DEFINE_ENUM(RPC_TASK_CRED_NOREF); - #define rpc_show_task_flags(flags) \ __print_flags(flags, "|", \ { RPC_TASK_ASYNC, "ASYNC" }, \ @@ -327,14 +312,6 @@ TRACE_DEFINE_ENUM(RPC_TASK_CRED_NOREF); { RPC_TASK_NO_RETRANS_TIMEOUT, "NORTO" }, \ { RPC_TASK_CRED_NOREF, "CRED_NOREF" }) -TRACE_DEFINE_ENUM(RPC_TASK_RUNNING); -TRACE_DEFINE_ENUM(RPC_TASK_QUEUED); -TRACE_DEFINE_ENUM(RPC_TASK_ACTIVE); -TRACE_DEFINE_ENUM(RPC_TASK_NEED_XMIT); -TRACE_DEFINE_ENUM(RPC_TASK_NEED_RECV); -TRACE_DEFINE_ENUM(RPC_TASK_MSG_PIN_WAIT); -TRACE_DEFINE_ENUM(RPC_TASK_SIGNALLED); - #define rpc_show_runstate(flags) \ __print_flags(flags, "|", \ { (1UL << RPC_TASK_RUNNING), "RUNNING" }, \ @@ -945,17 +922,6 @@ TRACE_EVENT(rpc_socket_nospace, ) ); -TRACE_DEFINE_ENUM(XPRT_LOCKED); -TRACE_DEFINE_ENUM(XPRT_CONNECTED); -TRACE_DEFINE_ENUM(XPRT_CONNECTING); -TRACE_DEFINE_ENUM(XPRT_CLOSE_WAIT); -TRACE_DEFINE_ENUM(XPRT_BOUND); -TRACE_DEFINE_ENUM(XPRT_BINDING); -TRACE_DEFINE_ENUM(XPRT_CLOSING); -TRACE_DEFINE_ENUM(XPRT_CONGESTED); -TRACE_DEFINE_ENUM(XPRT_CWND_WAIT); -TRACE_DEFINE_ENUM(XPRT_WRITE_SPACE); - #define rpc_show_xprt_state(x) \ __print_flags(x, "|", \ { (1UL << XPRT_LOCKED), "LOCKED"}, \ From f9d091cff80d303dde6182296e0f4d7b8a7880ac Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 19 Jul 2021 10:48:22 -0400 Subject: [PATCH 04/39] SUNRPC: Update trace flags Recent patches added RPC_TASK_MOVEABLE, XPRT_OFFLINE, and XPRT_REMOVE. Update the tracepoint display macros to display these flags properly. Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- include/trace/events/sunrpc.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/include/trace/events/sunrpc.h b/include/trace/events/sunrpc.h index ea6340129b1b..b13130903a50 100644 --- a/include/trace/events/sunrpc.h +++ b/include/trace/events/sunrpc.h @@ -299,6 +299,7 @@ TRACE_EVENT(rpc_request, __print_flags(flags, "|", \ { RPC_TASK_ASYNC, "ASYNC" }, \ { RPC_TASK_SWAPPER, "SWAPPER" }, \ + { RPC_TASK_MOVEABLE, "MOVEABLE" }, \ { RPC_TASK_NULLCREDS, "NULLCREDS" }, \ { RPC_CALL_MAJORSEEN, "MAJORSEEN" }, \ { RPC_TASK_ROOTCREDS, "ROOTCREDS" }, \ @@ -931,6 +932,8 @@ TRACE_EVENT(rpc_socket_nospace, { (1UL << XPRT_BOUND), "BOUND"}, \ { (1UL << XPRT_BINDING), "BINDING"}, \ { (1UL << XPRT_CLOSING), "CLOSING"}, \ + { (1UL << XPRT_OFFLINE), "OFFLINE"}, \ + { (1UL << XPRT_REMOVE), "REMOVE"}, \ { (1UL << XPRT_CONGESTED), "CONGESTED"}, \ { (1UL << XPRT_CWND_WAIT), "CWND_WAIT"}, \ { (1UL << XPRT_WRITE_SPACE), "WRITE_SPACE"}) From be630b9150b0321e21dbd951d715cff72c73b0c6 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 19 Jul 2021 10:48:28 -0400 Subject: [PATCH 05/39] SUNRPC: xprt_retransmit() displays the the NULL procedure incorrectly Currently: xprt_retransmit: task:11@1 xid=0x55a7ffac nfsv4 (null) ntrans=2 should be: xprt_retransmit: task:11@1 xid=0x55a7ffac nfsv4 NULL ntrans=2 Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- include/trace/events/sunrpc.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/include/trace/events/sunrpc.h b/include/trace/events/sunrpc.h index b13130903a50..59ad1718496b 100644 --- a/include/trace/events/sunrpc.h +++ b/include/trace/events/sunrpc.h @@ -1063,8 +1063,7 @@ TRACE_EVENT(xprt_retransmit, __field(int, version) __string(progname, rqst->rq_task->tk_client->cl_program->name) - __string(procedure, - rqst->rq_task->tk_msg.rpc_proc->p_name) + __string(procname, rpc_proc_name(rqst->rq_task)) ), TP_fast_assign( @@ -1078,14 +1077,15 @@ TRACE_EVENT(xprt_retransmit, __assign_str(progname, task->tk_client->cl_program->name); __entry->version = task->tk_client->cl_vers; - __assign_str(procedure, task->tk_msg.rpc_proc->p_name); + __assign_str(procname, rpc_proc_name(task)); ), TP_printk( "task:%u@%u xid=0x%08x %sv%d %s ntrans=%d", __entry->task_id, __entry->client_id, __entry->xid, - __get_str(progname), __entry->version, __get_str(procedure), - __entry->ntrans) + __get_str(progname), __entry->version, __get_str(procname), + __entry->ntrans + ) ); TRACE_EVENT(xprt_ping, From be17b8caf3a3a20c4d910265a6287b07ab444795 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 19 Jul 2021 10:48:34 -0400 Subject: [PATCH 06/39] SUNRPC: Record timeout value in xprt_retransmit tracepoint The client can alter the timeout value after each retransmit. Record the updated timeout value in the trace log. Suggested-by: Dai Ngo Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- include/trace/events/sunrpc.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/include/trace/events/sunrpc.h b/include/trace/events/sunrpc.h index 59ad1718496b..18d552a17c19 100644 --- a/include/trace/events/sunrpc.h +++ b/include/trace/events/sunrpc.h @@ -1061,6 +1061,7 @@ TRACE_EVENT(xprt_retransmit, __field(u32, xid) __field(int, ntrans) __field(int, version) + __field(unsigned long, timeout) __string(progname, rqst->rq_task->tk_client->cl_program->name) __string(procname, rpc_proc_name(rqst->rq_task)) @@ -1074,6 +1075,7 @@ TRACE_EVENT(xprt_retransmit, task->tk_client->cl_clid : -1; __entry->xid = be32_to_cpu(rqst->rq_xid); __entry->ntrans = rqst->rq_ntrans; + __entry->timeout = task->tk_timeout; __assign_str(progname, task->tk_client->cl_program->name); __entry->version = task->tk_client->cl_vers; @@ -1081,10 +1083,10 @@ TRACE_EVENT(xprt_retransmit, ), TP_printk( - "task:%u@%u xid=0x%08x %sv%d %s ntrans=%d", + "task:%u@%u xid=0x%08x %sv%d %s ntrans=%d timeout=%lu", __entry->task_id, __entry->client_id, __entry->xid, __get_str(progname), __entry->version, __get_str(procname), - __entry->ntrans + __entry->ntrans, __entry->timeout ) ); From 1143129e4d0d27740ce680d2fb0161ad4f27aa7e Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 2 Aug 2021 14:44:17 -0400 Subject: [PATCH 07/39] xprtrdma: Disconnect after an ib_post_send() immediate error ib_post_send() does not disconnect the QP when it returns an immediate error. Thus, the code that posts LocalInv has to explicitly disconnect after an immediate error. This is just like the frwr_send() callers handle it. If a disconnect isn't done here, the transport deadlocks. Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/frwr_ops.c | 8 ++++++++ net/sunrpc/xprtrdma/verbs.c | 2 +- net/sunrpc/xprtrdma/xprt_rdma.h | 1 + 3 files changed, 10 insertions(+), 1 deletion(-) diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c index 229fcc9a9064..754c5dffe127 100644 --- a/net/sunrpc/xprtrdma/frwr_ops.c +++ b/net/sunrpc/xprtrdma/frwr_ops.c @@ -557,6 +557,10 @@ void frwr_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) /* On error, the MRs get destroyed once the QP has drained. */ trace_xprtrdma_post_linv_err(req, rc); + + /* Force a connection loss to ensure complete recovery. + */ + rpcrdma_force_disconnect(ep); } /** @@ -653,4 +657,8 @@ void frwr_unmap_async(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) * retransmission. */ rpcrdma_unpin_rqst(req->rl_reply); + + /* Force a connection loss to ensure complete recovery. + */ + rpcrdma_force_disconnect(ep); } diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 649c23518ec0..c1797ea19418 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -124,7 +124,7 @@ static void rpcrdma_xprt_drain(struct rpcrdma_xprt *r_xprt) * connection is closed or lost. (The important thing is it needs * to be invoked "at least" once). */ -static void rpcrdma_force_disconnect(struct rpcrdma_ep *ep) +void rpcrdma_force_disconnect(struct rpcrdma_ep *ep) { if (atomic_add_unless(&ep->re_force_disconnect, 1, 1)) xprt_force_disconnect(ep->re_xprt); diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index 5d231d94e944..927e20a2c04e 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -454,6 +454,7 @@ extern unsigned int xprt_rdma_memreg_strategy; /* * Endpoint calls - xprtrdma/verbs.c */ +void rpcrdma_force_disconnect(struct rpcrdma_ep *ep); void rpcrdma_flush_disconnect(struct rpcrdma_xprt *r_xprt, struct ib_wc *wc); int rpcrdma_xprt_connect(struct rpcrdma_xprt *r_xprt); void rpcrdma_xprt_disconnect(struct rpcrdma_xprt *r_xprt); From 97480cae13ca3a9c1de3eb6fd66cf9650a60db42 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 2 Aug 2021 14:44:24 -0400 Subject: [PATCH 08/39] xprtrdma: Put rpcrdma_reps before waking the tear-down completion Ensure the tear-down completion is awoken only /after/ we've stopped fiddling with rpcrdma_rep objects in rpcrdma_post_recvs(). Fixes: 15788d1d1077 ("xprtrdma: Do not refresh Receive Queue while it is draining") Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/verbs.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index c1797ea19418..016f10a781b4 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -1416,11 +1416,6 @@ void rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, int needed, bool temp) rc = ib_post_recv(ep->re_id->qp, wr, (const struct ib_recv_wr **)&bad_wr); - if (atomic_dec_return(&ep->re_receiving) > 0) - complete(&ep->re_done); - -out: - trace_xprtrdma_post_recvs(r_xprt, count, rc); if (rc) { for (wr = bad_wr; wr;) { struct rpcrdma_rep *rep; @@ -1431,6 +1426,11 @@ out: --count; } } + if (atomic_dec_return(&ep->re_receiving) > 0) + complete(&ep->re_done); + +out: + trace_xprtrdma_post_recvs(r_xprt, count, rc); ep->re_receive_count += count; return; } From 683f31c3ab2e60b323b9b88c0ac389ff9cacec1a Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 2 Aug 2021 14:44:30 -0400 Subject: [PATCH 09/39] xprtrdma: Add xprtrdma_post_recvs_err() tracepoint In the vast majority of cases, rc=0. Don't record that in the post_recvs tracepoint. Instead, add a separate tracepoint that can be left enabled all the time to capture the very rare immediate errors returned by ib_post_recv(). Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- include/trace/events/rpcrdma.h | 41 ++++++++++++++++++++++++++++------ net/sunrpc/xprtrdma/verbs.c | 3 ++- 2 files changed, 36 insertions(+), 8 deletions(-) diff --git a/include/trace/events/rpcrdma.h b/include/trace/events/rpcrdma.h index bd55908c1bef..d65a84bd040c 100644 --- a/include/trace/events/rpcrdma.h +++ b/include/trace/events/rpcrdma.h @@ -818,16 +818,14 @@ TRACE_EVENT(xprtrdma_post_recv, TRACE_EVENT(xprtrdma_post_recvs, TP_PROTO( const struct rpcrdma_xprt *r_xprt, - unsigned int count, - int status + unsigned int count ), - TP_ARGS(r_xprt, count, status), + TP_ARGS(r_xprt, count), TP_STRUCT__entry( __field(u32, cq_id) __field(unsigned int, count) - __field(int, status) __field(int, posted) __string(addr, rpcrdma_addrstr(r_xprt)) __string(port, rpcrdma_portstr(r_xprt)) @@ -838,15 +836,44 @@ TRACE_EVENT(xprtrdma_post_recvs, __entry->cq_id = ep->re_attr.recv_cq->res.id; __entry->count = count; - __entry->status = status; __entry->posted = ep->re_receive_count; __assign_str(addr, rpcrdma_addrstr(r_xprt)); __assign_str(port, rpcrdma_portstr(r_xprt)); ), - TP_printk("peer=[%s]:%s cq.id=%d %u new recvs, %d active (rc %d)", + TP_printk("peer=[%s]:%s cq.id=%d %u new recvs, %d active", __get_str(addr), __get_str(port), __entry->cq_id, - __entry->count, __entry->posted, __entry->status + __entry->count, __entry->posted + ) +); + +TRACE_EVENT(xprtrdma_post_recvs_err, + TP_PROTO( + const struct rpcrdma_xprt *r_xprt, + int status + ), + + TP_ARGS(r_xprt, status), + + TP_STRUCT__entry( + __field(u32, cq_id) + __field(int, status) + __string(addr, rpcrdma_addrstr(r_xprt)) + __string(port, rpcrdma_portstr(r_xprt)) + ), + + TP_fast_assign( + const struct rpcrdma_ep *ep = r_xprt->rx_ep; + + __entry->cq_id = ep->re_attr.recv_cq->res.id; + __entry->status = status; + __assign_str(addr, rpcrdma_addrstr(r_xprt)); + __assign_str(port, rpcrdma_portstr(r_xprt)); + ), + + TP_printk("peer=[%s]:%s cq.id=%d rc=%d", + __get_str(addr), __get_str(port), __entry->cq_id, + __entry->status ) ); diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 016f10a781b4..1e9041c022b6 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -1417,6 +1417,7 @@ void rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, int needed, bool temp) rc = ib_post_recv(ep->re_id->qp, wr, (const struct ib_recv_wr **)&bad_wr); if (rc) { + trace_xprtrdma_post_recvs_err(r_xprt, rc); for (wr = bad_wr; wr;) { struct rpcrdma_rep *rep; @@ -1430,7 +1431,7 @@ void rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, int needed, bool temp) complete(&ep->re_done); out: - trace_xprtrdma_post_recvs(r_xprt, count, rc); + trace_xprtrdma_post_recvs(r_xprt, count); ep->re_receive_count += count; return; } From d9ae8134f253e8d0e15b1f0127af3b8b5552b90c Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 2 Aug 2021 14:44:36 -0400 Subject: [PATCH 10/39] xprtrdma: Add an xprtrdma_post_send_err tracepoint Unlike xprtrdma_post_send(), this one can be left enabled all the time, and should almost never fire. But we do want to know about immediate errors when they happen. Note that there is already a similar post_linv_err tracepoint. Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- include/trace/events/rpcrdma.h | 33 +++++++++++++++++++++++++++++++++ net/sunrpc/xprtrdma/frwr_ops.c | 6 +++++- 2 files changed, 38 insertions(+), 1 deletion(-) diff --git a/include/trace/events/rpcrdma.h b/include/trace/events/rpcrdma.h index d65a84bd040c..de4195499592 100644 --- a/include/trace/events/rpcrdma.h +++ b/include/trace/events/rpcrdma.h @@ -793,6 +793,39 @@ TRACE_EVENT(xprtrdma_post_send, ) ); +TRACE_EVENT(xprtrdma_post_send_err, + TP_PROTO( + const struct rpcrdma_xprt *r_xprt, + const struct rpcrdma_req *req, + int rc + ), + + TP_ARGS(r_xprt, req, rc), + + TP_STRUCT__entry( + __field(u32, cq_id) + __field(unsigned int, task_id) + __field(unsigned int, client_id) + __field(int, rc) + ), + + TP_fast_assign( + const struct rpc_rqst *rqst = &req->rl_slot; + const struct rpcrdma_ep *ep = r_xprt->rx_ep; + + __entry->cq_id = ep ? ep->re_attr.recv_cq->res.id : 0; + __entry->task_id = rqst->rq_task->tk_pid; + __entry->client_id = rqst->rq_task->tk_client ? + rqst->rq_task->tk_client->cl_clid : -1; + __entry->rc = rc; + ), + + TP_printk("task:%u@%u cq.id=%u rc=%d", + __entry->task_id, __entry->client_id, + __entry->cq_id, __entry->rc + ) +); + TRACE_EVENT(xprtrdma_post_recv, TP_PROTO( const struct rpcrdma_rep *rep diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c index 754c5dffe127..f700b34a5bfd 100644 --- a/net/sunrpc/xprtrdma/frwr_ops.c +++ b/net/sunrpc/xprtrdma/frwr_ops.c @@ -394,6 +394,7 @@ int frwr_send(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) struct rpcrdma_ep *ep = r_xprt->rx_ep; struct rpcrdma_mr *mr; unsigned int num_wrs; + int ret; num_wrs = 1; post_wr = send_wr; @@ -420,7 +421,10 @@ int frwr_send(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) } trace_xprtrdma_post_send(req); - return ib_post_send(ep->re_id->qp, post_wr, NULL); + ret = ib_post_send(ep->re_id->qp, post_wr, NULL); + if (ret) + trace_xprtrdma_post_send_err(r_xprt, req, ret); + return ret; } /** From 8d863b1f0541ea540cb47d1454eb060963a79d5f Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 2 Aug 2021 14:44:42 -0400 Subject: [PATCH 11/39] xprtrdma: Eliminate rpcrdma_post_sends() Clean up. Now that there is only one registration mode, there is only one target "post_send" method: frwr_send(). rpcrdma_post_sends() no longer adds much value, especially since all of its call sites ignore the return code value except to check if it's non-zero. Just have them call frwr_send() directly instead. Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/backchannel.c | 2 +- net/sunrpc/xprtrdma/transport.c | 2 +- net/sunrpc/xprtrdma/verbs.c | 15 --------------- net/sunrpc/xprtrdma/xprt_rdma.h | 1 - 4 files changed, 2 insertions(+), 18 deletions(-) diff --git a/net/sunrpc/xprtrdma/backchannel.c b/net/sunrpc/xprtrdma/backchannel.c index 1151efd09b27..17f174d6ea3b 100644 --- a/net/sunrpc/xprtrdma/backchannel.c +++ b/net/sunrpc/xprtrdma/backchannel.c @@ -115,7 +115,7 @@ int xprt_rdma_bc_send_reply(struct rpc_rqst *rqst) if (rc < 0) goto failed_marshal; - if (rpcrdma_post_sends(r_xprt, req)) + if (frwr_send(r_xprt, req)) goto drop_connection; return 0; diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index 9c2ffc67c0fd..a463400ed5a3 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -661,7 +661,7 @@ xprt_rdma_send_request(struct rpc_rqst *rqst) goto drop_connection; rqst->rq_xtime = ktime_get(); - if (rpcrdma_post_sends(r_xprt, req)) + if (frwr_send(r_xprt, req)) goto drop_connection; rqst->rq_xmit_bytes_sent += rqst->rq_snd_buf.len; diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 1e9041c022b6..aaec3c9be8db 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -1349,21 +1349,6 @@ static void rpcrdma_regbuf_free(struct rpcrdma_regbuf *rb) kfree(rb); } -/** - * rpcrdma_post_sends - Post WRs to a transport's Send Queue - * @r_xprt: controlling transport instance - * @req: rpcrdma_req containing the Send WR to post - * - * Returns 0 if the post was successful, otherwise -ENOTCONN - * is returned. - */ -int rpcrdma_post_sends(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) -{ - if (frwr_send(r_xprt, req)) - return -ENOTCONN; - return 0; -} - /** * rpcrdma_post_recvs - Refill the Receive Queue * @r_xprt: controlling transport instance diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index 927e20a2c04e..d91f54eae00b 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -459,7 +459,6 @@ void rpcrdma_flush_disconnect(struct rpcrdma_xprt *r_xprt, struct ib_wc *wc); int rpcrdma_xprt_connect(struct rpcrdma_xprt *r_xprt); void rpcrdma_xprt_disconnect(struct rpcrdma_xprt *r_xprt); -int rpcrdma_post_sends(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req); void rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, int needed, bool temp); /* From 71d3d0ebc894294ef9454e45a3ac2e9ba60b3351 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 26 Jul 2021 08:01:27 -0400 Subject: [PATCH 12/39] SUNRPC: Convert rpc_client refcount to use refcount_t There are now tools in the refcount library that allow us to convert the client shutdown code. Reported-by: Xiyu Yang Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker --- include/linux/sunrpc/clnt.h | 3 ++- net/sunrpc/auth_gss/gss_rpc_upcall.c | 2 +- net/sunrpc/clnt.c | 22 ++++++++++------------ net/sunrpc/debugfs.c | 2 +- net/sunrpc/rpc_pipe.c | 2 +- 5 files changed, 15 insertions(+), 16 deletions(-) diff --git a/include/linux/sunrpc/clnt.h b/include/linux/sunrpc/clnt.h index 8b5d5c97553e..b2edd5fc2f0c 100644 --- a/include/linux/sunrpc/clnt.h +++ b/include/linux/sunrpc/clnt.h @@ -14,6 +14,7 @@ #include #include #include +#include #include #include @@ -35,7 +36,7 @@ struct rpc_sysfs_client; * The high-level client handle */ struct rpc_clnt { - atomic_t cl_count; /* Number of references */ + refcount_t cl_count; /* Number of references */ unsigned int cl_clid; /* client id */ struct list_head cl_clients; /* Global list of clients */ struct list_head cl_tasks; /* List of tasks */ diff --git a/net/sunrpc/auth_gss/gss_rpc_upcall.c b/net/sunrpc/auth_gss/gss_rpc_upcall.c index d1c003a25b0f..61c276bddaf2 100644 --- a/net/sunrpc/auth_gss/gss_rpc_upcall.c +++ b/net/sunrpc/auth_gss/gss_rpc_upcall.c @@ -160,7 +160,7 @@ static struct rpc_clnt *get_gssp_clnt(struct sunrpc_net *sn) mutex_lock(&sn->gssp_lock); clnt = sn->gssp_clnt; if (clnt) - atomic_inc(&clnt->cl_count); + refcount_inc(&clnt->cl_count); mutex_unlock(&sn->gssp_lock); return clnt; } diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index d34737a8a68a..a5b7f6e34d15 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -167,7 +167,7 @@ static int rpc_clnt_skip_event(struct rpc_clnt *clnt, unsigned long event) case RPC_PIPEFS_MOUNT: if (clnt->cl_pipedir_objects.pdh_dentry != NULL) return 1; - if (atomic_read(&clnt->cl_count) == 0) + if (refcount_read(&clnt->cl_count) == 0) return 1; break; case RPC_PIPEFS_UMOUNT: @@ -419,7 +419,7 @@ static struct rpc_clnt * rpc_new_client(const struct rpc_create_args *args, clnt->cl_rtt = &clnt->cl_rtt_default; rpc_init_rtt(&clnt->cl_rtt_default, clnt->cl_timeout->to_initval); - atomic_set(&clnt->cl_count, 1); + refcount_set(&clnt->cl_count, 1); if (nodename == NULL) nodename = utsname()->nodename; @@ -431,7 +431,7 @@ static struct rpc_clnt * rpc_new_client(const struct rpc_create_args *args, if (err) goto out_no_path; if (parent) - atomic_inc(&parent->cl_count); + refcount_inc(&parent->cl_count); trace_rpc_clnt_new(clnt, xprt, program->name, args->servername); return clnt; @@ -918,18 +918,16 @@ rpc_free_client(struct rpc_clnt *clnt) static struct rpc_clnt * rpc_free_auth(struct rpc_clnt *clnt) { - if (clnt->cl_auth == NULL) - return rpc_free_client(clnt); - /* * Note: RPCSEC_GSS may need to send NULL RPC calls in order to * release remaining GSS contexts. This mechanism ensures * that it can do so safely. */ - atomic_inc(&clnt->cl_count); - rpcauth_release(clnt->cl_auth); - clnt->cl_auth = NULL; - if (atomic_dec_and_test(&clnt->cl_count)) + if (clnt->cl_auth != NULL) { + rpcauth_release(clnt->cl_auth); + clnt->cl_auth = NULL; + } + if (refcount_dec_and_test(&clnt->cl_count)) return rpc_free_client(clnt); return NULL; } @@ -943,7 +941,7 @@ rpc_release_client(struct rpc_clnt *clnt) do { if (list_empty(&clnt->cl_tasks)) wake_up(&destroy_wait); - if (!atomic_dec_and_test(&clnt->cl_count)) + if (refcount_dec_not_one(&clnt->cl_count)) break; clnt = rpc_free_auth(clnt); } while (clnt != NULL); @@ -1082,7 +1080,7 @@ void rpc_task_set_client(struct rpc_task *task, struct rpc_clnt *clnt) if (clnt != NULL) { rpc_task_set_transport(task, clnt); task->tk_client = clnt; - atomic_inc(&clnt->cl_count); + refcount_inc(&clnt->cl_count); if (clnt->cl_softrtry) task->tk_flags |= RPC_TASK_SOFT; if (clnt->cl_softerr) diff --git a/net/sunrpc/debugfs.c b/net/sunrpc/debugfs.c index 56029e3af6ff..79995eb95927 100644 --- a/net/sunrpc/debugfs.c +++ b/net/sunrpc/debugfs.c @@ -90,7 +90,7 @@ static int tasks_open(struct inode *inode, struct file *filp) struct seq_file *seq = filp->private_data; struct rpc_clnt *clnt = seq->private = inode->i_private; - if (!atomic_inc_not_zero(&clnt->cl_count)) { + if (!refcount_inc_not_zero(&clnt->cl_count)) { seq_release(inode, filp); ret = -EINVAL; } diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c index 09c000d490a1..ee5336d73fdd 100644 --- a/net/sunrpc/rpc_pipe.c +++ b/net/sunrpc/rpc_pipe.c @@ -423,7 +423,7 @@ rpc_info_open(struct inode *inode, struct file *file) spin_lock(&file->f_path.dentry->d_lock); if (!d_unhashed(file->f_path.dentry)) clnt = RPC_I(inode)->private; - if (clnt != NULL && atomic_inc_not_zero(&clnt->cl_count)) { + if (clnt != NULL && refcount_inc_not_zero(&clnt->cl_count)) { spin_unlock(&file->f_path.dentry->d_lock); m->private = clnt; } else { From e20772cbdf463c12088837e5a08bde1b876bfd25 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 26 Jul 2021 07:58:49 -0400 Subject: [PATCH 13/39] NFSv4/pNFS: Fix a layoutget livelock loop If NFS_LAYOUT_RETURN_REQUESTED is set, but there is no value set for the layout plh_return_seq, we can end up in a livelock loop in which every layout segment retrieved by a new call to layoutget is immediately invalidated by pnfs_layout_need_return(). To get around this, we should just set plh_return_seq to the current value of the layout stateid's seqid. Fixes: d474f96104bd ("NFS: Don't return layout segments that are in use") Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker --- fs/nfs/pnfs.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index ef14ea0b6ab8..da5cacad6979 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -347,11 +347,15 @@ pnfs_set_plh_return_info(struct pnfs_layout_hdr *lo, enum pnfs_iomode iomode, iomode = IOMODE_ANY; lo->plh_return_iomode = iomode; set_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags); - if (seq != 0) { - WARN_ON_ONCE(lo->plh_return_seq != 0 && lo->plh_return_seq != seq); + /* + * We must set lo->plh_return_seq to avoid livelocks with + * pnfs_layout_need_return() + */ + if (seq == 0) + seq = be32_to_cpu(lo->plh_stateid.seqid); + if (!lo->plh_return_seq || pnfs_seqid_is_newer(seq, lo->plh_return_seq)) lo->plh_return_seq = seq; - pnfs_barrier_update(lo, seq); - } + pnfs_barrier_update(lo, seq); } static void From 7c0bbf2d3dcd13c906244e0c2e4a0ba3a33dab6a Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 26 Jul 2021 07:59:20 -0400 Subject: [PATCH 14/39] NFSv4/pNFS: Remove dead code Since commit 2b28a7bee453 ("fs, nfs: convert pnfs_layout_hdr.plh_refcount from atomic_t to refcount_t") it has not been legal to bump a zero refcount, so the code that tries to allow it if the NFS_LSEG_VALID flag is still set would cause trouble. Luckily, NFS_LSEG_VALID has its own refcount so we can never hit this bad code snippet in practice. Remove it to avoid confusion. Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker --- fs/nfs/pnfs.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index da5cacad6979..856706180d33 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -596,10 +596,6 @@ pnfs_put_lseg(struct pnfs_layout_segment *lseg) inode = lo->plh_inode; if (refcount_dec_and_lock(&lseg->pls_refcount, &inode->i_lock)) { - if (test_bit(NFS_LSEG_VALID, &lseg->pls_flags)) { - spin_unlock(&inode->i_lock); - return; - } pnfs_get_layout_hdr(lo); pnfs_layout_remove_lseg(lo, lseg); if (pnfs_cache_lseg_for_layoutreturn(lo, lseg)) From 45baadaad7bf9183651fb74f4ed1200da48505a5 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 23 Jul 2021 08:57:20 -0400 Subject: [PATCH 15/39] NFSv4/pNFS: Always allow update of a zero valued layout barrier A zero value for the layout barrier indicates that it has been cleared (since seqid '0' is an illegal value), so we should always allow it to be updated. Fixes: d29b468da4f9 ("pNFS/NFSv4: Improve rejection of out-of-order layouts") Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker --- fs/nfs/pnfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 856706180d33..7775f6b5a53a 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -335,7 +335,7 @@ static bool pnfs_seqid_is_newer(u32 s1, u32 s2) static void pnfs_barrier_update(struct pnfs_layout_hdr *lo, u32 newseq) { - if (pnfs_seqid_is_newer(newseq, lo->plh_barrier)) + if (pnfs_seqid_is_newer(newseq, lo->plh_barrier) || !lo->plh_barrier) lo->plh_barrier = newseq; } From d6236a98b3bab07c0a1455fd1ab46f79c3978cdc Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 23 Jul 2021 08:57:21 -0400 Subject: [PATCH 16/39] NFSv4/pnfs: The layout barrier indicate a minimal value for the seqid The intention of the layout barrier is to ensure that we do not update the layout to match an older value than the current expectation. Fix the test in pnfs_layout_stateid_blocked() to reflect that it is legal for the seqid of the stateid to match that of the barrier. Fixes: aa95edf309ef ("NFSv4/pnfs: Fix the layout barrier update") Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker --- fs/nfs/pnfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 7775f6b5a53a..7c9090a28e5c 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -1000,7 +1000,7 @@ pnfs_layout_stateid_blocked(const struct pnfs_layout_hdr *lo, { u32 seqid = be32_to_cpu(stateid->seqid); - return !pnfs_seqid_is_newer(seqid, lo->plh_barrier) && lo->plh_barrier; + return lo->plh_barrier && pnfs_seqid_is_newer(lo->plh_barrier, seqid); } /* lget is set to 1 if called from inside send_layoutget call chain */ From c2dc3e5fad13aca5d7bdf4bcb52b1a1d707c8555 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 26 Jul 2021 07:59:23 -0400 Subject: [PATCH 17/39] SUNRPC: Fix potential memory corruption We really should not call rpc_wake_up_queued_task_set_status() with xprt->snd_task as an argument unless we are certain that is actually an rpc_task. Fixes: 0445f92c5d53 ("SUNRPC: Fix disconnection races") Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker --- include/linux/sunrpc/xprt.h | 1 + net/sunrpc/xprt.c | 6 ++++-- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h index c8c39f22d3b1..59cd97da895b 100644 --- a/include/linux/sunrpc/xprt.h +++ b/include/linux/sunrpc/xprt.h @@ -432,6 +432,7 @@ void xprt_release_write(struct rpc_xprt *, struct rpc_task *); #define XPRT_CONGESTED (9) #define XPRT_CWND_WAIT (10) #define XPRT_WRITE_SPACE (11) +#define XPRT_SND_IS_COOKIE (12) static inline void xprt_set_connected(struct rpc_xprt *xprt) { diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index fb6db09725c7..bddd354a0076 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -775,9 +775,9 @@ void xprt_force_disconnect(struct rpc_xprt *xprt) /* Try to schedule an autoclose RPC call */ if (test_and_set_bit(XPRT_LOCKED, &xprt->state) == 0) queue_work(xprtiod_workqueue, &xprt->task_cleanup); - else if (xprt->snd_task) + else if (xprt->snd_task && !test_bit(XPRT_SND_IS_COOKIE, &xprt->state)) rpc_wake_up_queued_task_set_status(&xprt->pending, - xprt->snd_task, -ENOTCONN); + xprt->snd_task, -ENOTCONN); spin_unlock(&xprt->transport_lock); } EXPORT_SYMBOL_GPL(xprt_force_disconnect); @@ -866,6 +866,7 @@ bool xprt_lock_connect(struct rpc_xprt *xprt, goto out; if (xprt->snd_task != task) goto out; + set_bit(XPRT_SND_IS_COOKIE, &xprt->state); xprt->snd_task = cookie; ret = true; out: @@ -881,6 +882,7 @@ void xprt_unlock_connect(struct rpc_xprt *xprt, void *cookie) if (!test_bit(XPRT_LOCKED, &xprt->state)) goto out; xprt->snd_task =NULL; + clear_bit(XPRT_SND_IS_COOKIE, &xprt->state); xprt->ops->release_xprt(xprt, NULL); xprt_schedule_autodisconnect(xprt); out: From e26d9972720e2484f44cdd94ca4e31cc372ed2ed Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 26 Jul 2021 07:59:24 -0400 Subject: [PATCH 18/39] SUNRPC: Clean up scheduling of autoclose Consolidate duplicated code in xprt_force_disconnect() and xprt_conditional_disconnect(). Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker --- net/sunrpc/xprt.c | 28 ++++++++++++++++------------ 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index bddd354a0076..aae5a328b15b 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -760,6 +760,20 @@ void xprt_disconnect_done(struct rpc_xprt *xprt) } EXPORT_SYMBOL_GPL(xprt_disconnect_done); +/** + * xprt_schedule_autoclose_locked - Try to schedule an autoclose RPC call + * @xprt: transport to disconnect + */ +static void xprt_schedule_autoclose_locked(struct rpc_xprt *xprt) +{ + set_bit(XPRT_CLOSE_WAIT, &xprt->state); + if (test_and_set_bit(XPRT_LOCKED, &xprt->state) == 0) + queue_work(xprtiod_workqueue, &xprt->task_cleanup); + else if (xprt->snd_task && !test_bit(XPRT_SND_IS_COOKIE, &xprt->state)) + rpc_wake_up_queued_task_set_status(&xprt->pending, + xprt->snd_task, -ENOTCONN); +} + /** * xprt_force_disconnect - force a transport to disconnect * @xprt: transport to disconnect @@ -771,13 +785,7 @@ void xprt_force_disconnect(struct rpc_xprt *xprt) /* Don't race with the test_bit() in xprt_clear_locked() */ spin_lock(&xprt->transport_lock); - set_bit(XPRT_CLOSE_WAIT, &xprt->state); - /* Try to schedule an autoclose RPC call */ - if (test_and_set_bit(XPRT_LOCKED, &xprt->state) == 0) - queue_work(xprtiod_workqueue, &xprt->task_cleanup); - else if (xprt->snd_task && !test_bit(XPRT_SND_IS_COOKIE, &xprt->state)) - rpc_wake_up_queued_task_set_status(&xprt->pending, - xprt->snd_task, -ENOTCONN); + xprt_schedule_autoclose_locked(xprt); spin_unlock(&xprt->transport_lock); } EXPORT_SYMBOL_GPL(xprt_force_disconnect); @@ -817,11 +825,7 @@ void xprt_conditional_disconnect(struct rpc_xprt *xprt, unsigned int cookie) goto out; if (test_bit(XPRT_CLOSING, &xprt->state)) goto out; - set_bit(XPRT_CLOSE_WAIT, &xprt->state); - /* Try to schedule an autoclose RPC call */ - if (test_and_set_bit(XPRT_LOCKED, &xprt->state) == 0) - queue_work(xprtiod_workqueue, &xprt->task_cleanup); - xprt_wake_pending_tasks(xprt, -EAGAIN); + xprt_schedule_autoclose_locked(xprt); out: spin_unlock(&xprt->transport_lock); } From f99fa50880f5300fbbb3c0754ddc7f8738d24fe7 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 26 Jul 2021 08:03:12 -0400 Subject: [PATCH 19/39] SUNRPC/xprtrdma: Fix reconnection locking The xprtrdma client code currently relies on the task that initiated the connect to hold the XPRT_LOCK for the duration of the connection attempt. If the task is woken early, due to some other event, then that lock could get released early. Avoid races by using the same mechanism that the socket code uses of transferring lock ownership to the RDMA connect worker itself. That frees us to call rpcrdma_xprt_disconnect() directly since we're now guaranteed exclusion w.r.t. other callers. Fixes: 4cf44be6f1e8 ("xprtrdma: Fix recursion into rpcrdma_xprt_disconnect()") Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker --- net/sunrpc/xprt.c | 2 ++ net/sunrpc/xprtrdma/transport.c | 11 +++++------ 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index aae5a328b15b..b88ac8132054 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -877,6 +877,7 @@ out: spin_unlock(&xprt->transport_lock); return ret; } +EXPORT_SYMBOL_GPL(xprt_lock_connect); void xprt_unlock_connect(struct rpc_xprt *xprt, void *cookie) { @@ -893,6 +894,7 @@ out: spin_unlock(&xprt->transport_lock); wake_up_bit(&xprt->state, XPRT_LOCKED); } +EXPORT_SYMBOL_GPL(xprt_unlock_connect); /** * xprt_connect - schedule a transport connect operation diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index a463400ed5a3..16e5696314a4 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -250,12 +250,9 @@ xprt_rdma_connect_worker(struct work_struct *work) xprt->stat.connect_start; xprt_set_connected(xprt); rc = -EAGAIN; - } else { - /* Force a call to xprt_rdma_close to clean up */ - spin_lock(&xprt->transport_lock); - set_bit(XPRT_CLOSE_WAIT, &xprt->state); - spin_unlock(&xprt->transport_lock); - } + } else + rpcrdma_xprt_disconnect(r_xprt); + xprt_unlock_connect(xprt, r_xprt); xprt_wake_pending_tasks(xprt, rc); } @@ -489,6 +486,8 @@ xprt_rdma_connect(struct rpc_xprt *xprt, struct rpc_task *task) struct rpcrdma_ep *ep = r_xprt->rx_ep; unsigned long delay; + WARN_ON_ONCE(!xprt_lock_connect(xprt, task, r_xprt)); + delay = 0; if (ep && ep->re_connect_status != 0) { delay = xprt_reconnect_delay(xprt); From 5d46dd04cb68771f77ba66dbf6fd323a4a2ce00d Mon Sep 17 00:00:00 2001 From: Anna Schumaker Date: Tue, 20 Jul 2021 16:04:42 -0400 Subject: [PATCH 20/39] sunrpc: Fix return value of get_srcport() Since bc1c56e9bbe9 transport->srcport may by unset, causing get_srcport() to return 0 when called. Fix this by querying the port from the underlying socket instead of the transport. Fixes: bc1c56e9bbe9 (SUNRPC: prevent port reuse on transports which don't request it) Signed-off-by: Anna Schumaker --- net/sunrpc/xprtsock.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index e573dcecdd66..02b071dbdd22 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -1656,7 +1656,7 @@ static int xs_get_srcport(struct sock_xprt *transport) unsigned short get_srcport(struct rpc_xprt *xprt) { struct sock_xprt *sock = container_of(xprt, struct sock_xprt, xprt); - return sock->srcport; + return xs_sock_getport(sock->sock); } EXPORT_SYMBOL(get_srcport); From e44773daf851dc2755144355723c1c305e7246a1 Mon Sep 17 00:00:00 2001 From: Anna Schumaker Date: Thu, 29 Jul 2021 16:45:23 -0400 Subject: [PATCH 21/39] SUNRPC: Add srcaddr as a file in sysfs I don't support changing it right now, but it could be useful information for clients with multiple network cards. Signed-off-by: Anna Schumaker --- net/sunrpc/sysfs.c | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/net/sunrpc/sysfs.c b/net/sunrpc/sysfs.c index 64da3bfd28e6..2e7a53504974 100644 --- a/net/sunrpc/sysfs.c +++ b/net/sunrpc/sysfs.c @@ -100,6 +100,28 @@ static ssize_t rpc_sysfs_xprt_dstaddr_show(struct kobject *kobj, return ret + 1; } +static ssize_t rpc_sysfs_xprt_srcaddr_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + struct rpc_xprt *xprt = rpc_sysfs_xprt_kobj_get_xprt(kobj); + struct sockaddr_storage saddr; + struct sock_xprt *sock; + ssize_t ret = -1; + + if (!xprt) + return 0; + + sock = container_of(xprt, struct sock_xprt, xprt); + if (kernel_getsockname(sock->sock, (struct sockaddr *)&saddr) < 0) + goto out; + + ret = sprintf(buf, "%pISc\n", &saddr); +out: + xprt_put(xprt); + return ret + 1; +} + static ssize_t rpc_sysfs_xprt_info_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) @@ -376,6 +398,9 @@ static const void *rpc_sysfs_xprt_namespace(struct kobject *kobj) static struct kobj_attribute rpc_sysfs_xprt_dstaddr = __ATTR(dstaddr, 0644, rpc_sysfs_xprt_dstaddr_show, rpc_sysfs_xprt_dstaddr_store); +static struct kobj_attribute rpc_sysfs_xprt_srcaddr = __ATTR(srcaddr, + 0644, rpc_sysfs_xprt_srcaddr_show, NULL); + static struct kobj_attribute rpc_sysfs_xprt_info = __ATTR(xprt_info, 0444, rpc_sysfs_xprt_info_show, NULL); @@ -384,6 +409,7 @@ static struct kobj_attribute rpc_sysfs_xprt_change_state = __ATTR(xprt_state, static struct attribute *rpc_sysfs_xprt_attrs[] = { &rpc_sysfs_xprt_dstaddr.attr, + &rpc_sysfs_xprt_srcaddr.attr, &rpc_sysfs_xprt_info.attr, &rpc_sysfs_xprt_change_state.attr, NULL, From 69f2cd6df3ee07ae88befafc038d4dd9154e2799 Mon Sep 17 00:00:00 2001 From: Anna Schumaker Date: Thu, 29 Jul 2021 16:46:05 -0400 Subject: [PATCH 22/39] SUNRPC: Add dst_port to the sysfs xprt info file This is most likely going to be 2049 for NFS, but some servers might be configured to export on a non-standard port. Let's show this information just in case somebody needs it. Signed-off-by: Anna Schumaker --- net/sunrpc/sysfs.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/net/sunrpc/sysfs.c b/net/sunrpc/sysfs.c index 2e7a53504974..414c664a3199 100644 --- a/net/sunrpc/sysfs.c +++ b/net/sunrpc/sysfs.c @@ -136,14 +136,16 @@ static ssize_t rpc_sysfs_xprt_info_show(struct kobject *kobj, "max_num_slots=%u\nmin_num_slots=%u\nnum_reqs=%u\n" "binding_q_len=%u\nsending_q_len=%u\npending_q_len=%u\n" "backlog_q_len=%u\nmain_xprt=%d\nsrc_port=%u\n" - "tasks_queuelen=%ld\n", + "tasks_queuelen=%ld\ndst_port=%s\n", xprt->last_used, xprt->cong, xprt->cwnd, xprt->max_reqs, xprt->min_reqs, xprt->num_reqs, xprt->binding.qlen, xprt->sending.qlen, xprt->pending.qlen, xprt->backlog.qlen, xprt->main, (xprt->xprt_class->ident == XPRT_TRANSPORT_TCP) ? get_srcport(xprt) : 0, - atomic_long_read(&xprt->queuelen)); + atomic_long_read(&xprt->queuelen), + (xprt->xprt_class->ident == XPRT_TRANSPORT_TCP) ? + xprt->address_strings[RPC_DISPLAY_PORT] : "0"); xprt_put(xprt); return ret + 1; } From 438623a06bacd69c40c4af633bb09a3bbb9dfc78 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 15 Jul 2021 15:52:06 -0400 Subject: [PATCH 23/39] SUNRPC: Add svc_rqst::rq_auth_stat I'd like to take commit 4532608d71c8 ("SUNRPC: Clean up generic dispatcher code") even further by using only private local SVC dispatchers for all kernel RPC services. This change would enable the removal of the logic that switches between svc_generic_dispatch() and a service's private dispatcher, and simplify the invocation of the service's pc_release method so that humans can visually verify that it is always invoked properly. All that will come later. First, let's provide a better way to return authentication errors from SVC dispatcher functions. Instead of overloading the dispatch method's *statp argument, add a field to struct svc_rqst that can hold an error value. Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- include/linux/sunrpc/svc.h | 1 + include/linux/sunrpc/svcauth.h | 4 +-- include/trace/events/sunrpc.h | 6 ++--- net/sunrpc/auth_gss/svcauth_gss.c | 43 +++++++++++++++---------------- net/sunrpc/svc.c | 17 ++++++------ net/sunrpc/svcauth.c | 8 +++--- net/sunrpc/svcauth_unix.c | 12 ++++----- 7 files changed, 46 insertions(+), 45 deletions(-) diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index e91d51ea028b..35f12963e1ff 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -282,6 +282,7 @@ struct svc_rqst { void * rq_argp; /* decoded arguments */ void * rq_resp; /* xdr'd results */ void * rq_auth_data; /* flavor-specific data */ + __be32 rq_auth_stat; /* authentication status */ int rq_auth_slack; /* extra space xdr code * should leave in head * for krb5i, krb5p. diff --git a/include/linux/sunrpc/svcauth.h b/include/linux/sunrpc/svcauth.h index b0003866a249..6d9cc9080aca 100644 --- a/include/linux/sunrpc/svcauth.h +++ b/include/linux/sunrpc/svcauth.h @@ -127,7 +127,7 @@ struct auth_ops { char * name; struct module *owner; int flavour; - int (*accept)(struct svc_rqst *rq, __be32 *authp); + int (*accept)(struct svc_rqst *rq); int (*release)(struct svc_rqst *rq); void (*domain_release)(struct auth_domain *); int (*set_client)(struct svc_rqst *rq); @@ -149,7 +149,7 @@ struct auth_ops { struct svc_xprt; -extern int svc_authenticate(struct svc_rqst *rqstp, __be32 *authp); +extern int svc_authenticate(struct svc_rqst *rqstp); extern int svc_authorise(struct svc_rqst *rqstp); extern int svc_set_client(struct svc_rqst *rqstp); extern int svc_auth_register(rpc_authflavor_t flavor, struct auth_ops *aops); diff --git a/include/trace/events/sunrpc.h b/include/trace/events/sunrpc.h index 18d552a17c19..c7d9e6c7a979 100644 --- a/include/trace/events/sunrpc.h +++ b/include/trace/events/sunrpc.h @@ -1582,9 +1582,9 @@ TRACE_DEFINE_ENUM(SVC_COMPLETE); { SVC_COMPLETE, "SVC_COMPLETE" }) TRACE_EVENT(svc_authenticate, - TP_PROTO(const struct svc_rqst *rqst, int auth_res, __be32 auth_stat), + TP_PROTO(const struct svc_rqst *rqst, int auth_res), - TP_ARGS(rqst, auth_res, auth_stat), + TP_ARGS(rqst, auth_res), TP_STRUCT__entry( __field(u32, xid) @@ -1595,7 +1595,7 @@ TRACE_EVENT(svc_authenticate, TP_fast_assign( __entry->xid = be32_to_cpu(rqst->rq_xid); __entry->svc_status = auth_res; - __entry->auth_stat = be32_to_cpu(auth_stat); + __entry->auth_stat = be32_to_cpu(rqst->rq_auth_stat); ), TP_printk("xid=0x%08x auth_res=%s auth_stat=%s", diff --git a/net/sunrpc/auth_gss/svcauth_gss.c b/net/sunrpc/auth_gss/svcauth_gss.c index a81be45f40d9..635449ed7af6 100644 --- a/net/sunrpc/auth_gss/svcauth_gss.c +++ b/net/sunrpc/auth_gss/svcauth_gss.c @@ -707,11 +707,11 @@ svc_safe_putnetobj(struct kvec *resv, struct xdr_netobj *o) /* * Verify the checksum on the header and return SVC_OK on success. * Otherwise, return SVC_DROP (in the case of a bad sequence number) - * or return SVC_DENIED and indicate error in authp. + * or return SVC_DENIED and indicate error in rqstp->rq_auth_stat. */ static int gss_verify_header(struct svc_rqst *rqstp, struct rsc *rsci, - __be32 *rpcstart, struct rpc_gss_wire_cred *gc, __be32 *authp) + __be32 *rpcstart, struct rpc_gss_wire_cred *gc) { struct gss_ctx *ctx_id = rsci->mechctx; struct xdr_buf rpchdr; @@ -725,7 +725,7 @@ gss_verify_header(struct svc_rqst *rqstp, struct rsc *rsci, iov.iov_len = (u8 *)argv->iov_base - (u8 *)rpcstart; xdr_buf_from_iov(&iov, &rpchdr); - *authp = rpc_autherr_badverf; + rqstp->rq_auth_stat = rpc_autherr_badverf; if (argv->iov_len < 4) return SVC_DENIED; flavor = svc_getnl(argv); @@ -737,13 +737,13 @@ gss_verify_header(struct svc_rqst *rqstp, struct rsc *rsci, if (rqstp->rq_deferred) /* skip verification of revisited request */ return SVC_OK; if (gss_verify_mic(ctx_id, &rpchdr, &checksum) != GSS_S_COMPLETE) { - *authp = rpcsec_gsserr_credproblem; + rqstp->rq_auth_stat = rpcsec_gsserr_credproblem; return SVC_DENIED; } if (gc->gc_seq > MAXSEQ) { trace_rpcgss_svc_seqno_large(rqstp, gc->gc_seq); - *authp = rpcsec_gsserr_ctxproblem; + rqstp->rq_auth_stat = rpcsec_gsserr_ctxproblem; return SVC_DENIED; } if (!gss_check_seq_num(rqstp, rsci, gc->gc_seq)) @@ -1142,7 +1142,7 @@ static void gss_free_in_token_pages(struct gssp_in_token *in_token) } static int gss_read_proxy_verf(struct svc_rqst *rqstp, - struct rpc_gss_wire_cred *gc, __be32 *authp, + struct rpc_gss_wire_cred *gc, struct xdr_netobj *in_handle, struct gssp_in_token *in_token) { @@ -1151,7 +1151,7 @@ static int gss_read_proxy_verf(struct svc_rqst *rqstp, int pages, i, res, pgto, pgfrom; size_t inlen, to_offs, from_offs; - res = gss_read_common_verf(gc, argv, authp, in_handle); + res = gss_read_common_verf(gc, argv, &rqstp->rq_auth_stat, in_handle); if (res) return res; @@ -1227,7 +1227,7 @@ gss_write_resv(struct kvec *resv, size_t size_limit, * Otherwise, drop the request pending an answer to the upcall. */ static int svcauth_gss_legacy_init(struct svc_rqst *rqstp, - struct rpc_gss_wire_cred *gc, __be32 *authp) + struct rpc_gss_wire_cred *gc) { struct kvec *argv = &rqstp->rq_arg.head[0]; struct kvec *resv = &rqstp->rq_res.head[0]; @@ -1236,7 +1236,7 @@ static int svcauth_gss_legacy_init(struct svc_rqst *rqstp, struct sunrpc_net *sn = net_generic(SVC_NET(rqstp), sunrpc_net_id); memset(&rsikey, 0, sizeof(rsikey)); - ret = gss_read_verf(gc, argv, authp, + ret = gss_read_verf(gc, argv, &rqstp->rq_auth_stat, &rsikey.in_handle, &rsikey.in_token); if (ret) return ret; @@ -1339,7 +1339,7 @@ out: } static int svcauth_gss_proxy_init(struct svc_rqst *rqstp, - struct rpc_gss_wire_cred *gc, __be32 *authp) + struct rpc_gss_wire_cred *gc) { struct kvec *resv = &rqstp->rq_res.head[0]; struct xdr_netobj cli_handle; @@ -1351,8 +1351,7 @@ static int svcauth_gss_proxy_init(struct svc_rqst *rqstp, struct sunrpc_net *sn = net_generic(net, sunrpc_net_id); memset(&ud, 0, sizeof(ud)); - ret = gss_read_proxy_verf(rqstp, gc, authp, - &ud.in_handle, &ud.in_token); + ret = gss_read_proxy_verf(rqstp, gc, &ud.in_handle, &ud.in_token); if (ret) return ret; @@ -1525,7 +1524,7 @@ static void destroy_use_gss_proxy_proc_entry(struct net *net) {} * response here and return SVC_COMPLETE. */ static int -svcauth_gss_accept(struct svc_rqst *rqstp, __be32 *authp) +svcauth_gss_accept(struct svc_rqst *rqstp) { struct kvec *argv = &rqstp->rq_arg.head[0]; struct kvec *resv = &rqstp->rq_res.head[0]; @@ -1538,7 +1537,7 @@ svcauth_gss_accept(struct svc_rqst *rqstp, __be32 *authp) int ret; struct sunrpc_net *sn = net_generic(SVC_NET(rqstp), sunrpc_net_id); - *authp = rpc_autherr_badcred; + rqstp->rq_auth_stat = rpc_autherr_badcred; if (!svcdata) svcdata = kmalloc(sizeof(*svcdata), GFP_KERNEL); if (!svcdata) @@ -1575,22 +1574,22 @@ svcauth_gss_accept(struct svc_rqst *rqstp, __be32 *authp) if ((gc->gc_proc != RPC_GSS_PROC_DATA) && (rqstp->rq_proc != 0)) goto auth_err; - *authp = rpc_autherr_badverf; + rqstp->rq_auth_stat = rpc_autherr_badverf; switch (gc->gc_proc) { case RPC_GSS_PROC_INIT: case RPC_GSS_PROC_CONTINUE_INIT: if (use_gss_proxy(SVC_NET(rqstp))) - return svcauth_gss_proxy_init(rqstp, gc, authp); + return svcauth_gss_proxy_init(rqstp, gc); else - return svcauth_gss_legacy_init(rqstp, gc, authp); + return svcauth_gss_legacy_init(rqstp, gc); case RPC_GSS_PROC_DATA: case RPC_GSS_PROC_DESTROY: /* Look up the context, and check the verifier: */ - *authp = rpcsec_gsserr_credproblem; + rqstp->rq_auth_stat = rpcsec_gsserr_credproblem; rsci = gss_svc_searchbyctx(sn->rsc_cache, &gc->gc_ctx); if (!rsci) goto auth_err; - switch (gss_verify_header(rqstp, rsci, rpcstart, gc, authp)) { + switch (gss_verify_header(rqstp, rsci, rpcstart, gc)) { case SVC_OK: break; case SVC_DENIED: @@ -1600,7 +1599,7 @@ svcauth_gss_accept(struct svc_rqst *rqstp, __be32 *authp) } break; default: - *authp = rpc_autherr_rejectedcred; + rqstp->rq_auth_stat = rpc_autherr_rejectedcred; goto auth_err; } @@ -1616,13 +1615,13 @@ svcauth_gss_accept(struct svc_rqst *rqstp, __be32 *authp) svc_putnl(resv, RPC_SUCCESS); goto complete; case RPC_GSS_PROC_DATA: - *authp = rpcsec_gsserr_ctxproblem; + rqstp->rq_auth_stat = rpcsec_gsserr_ctxproblem; svcdata->verf_start = resv->iov_base + resv->iov_len; if (gss_write_verf(rqstp, rsci->mechctx, gc->gc_seq)) goto auth_err; rqstp->rq_cred = rsci->cred; get_group_info(rsci->cred.cr_group_info); - *authp = rpc_autherr_badcred; + rqstp->rq_auth_stat = rpc_autherr_badcred; switch (gc->gc_svc) { case RPC_GSS_SVC_NONE: break; diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index 0de918cb3d90..360dab62b6b4 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -1283,7 +1283,7 @@ svc_process_common(struct svc_rqst *rqstp, struct kvec *argv, struct kvec *resv) struct svc_process_info process; __be32 *statp; u32 prog, vers; - __be32 auth_stat, rpc_stat; + __be32 rpc_stat; int auth_res; __be32 *reply_statp; @@ -1326,14 +1326,14 @@ svc_process_common(struct svc_rqst *rqstp, struct kvec *argv, struct kvec *resv) * We do this before anything else in order to get a decent * auth verifier. */ - auth_res = svc_authenticate(rqstp, &auth_stat); + auth_res = svc_authenticate(rqstp); /* Also give the program a chance to reject this call: */ if (auth_res == SVC_OK && progp) { - auth_stat = rpc_autherr_badcred; + rqstp->rq_auth_stat = rpc_autherr_badcred; auth_res = progp->pg_authenticate(rqstp); } if (auth_res != SVC_OK) - trace_svc_authenticate(rqstp, auth_res, auth_stat); + trace_svc_authenticate(rqstp, auth_res); switch (auth_res) { case SVC_OK: break; @@ -1392,8 +1392,8 @@ svc_process_common(struct svc_rqst *rqstp, struct kvec *argv, struct kvec *resv) goto release_dropit; if (*statp == rpc_garbage_args) goto err_garbage; - auth_stat = svc_get_autherr(rqstp, statp); - if (auth_stat != rpc_auth_ok) + rqstp->rq_auth_stat = svc_get_autherr(rqstp, statp); + if (rqstp->rq_auth_stat != rpc_auth_ok) goto err_release_bad_auth; } else { dprintk("svc: calling dispatcher\n"); @@ -1450,13 +1450,14 @@ err_release_bad_auth: if (procp->pc_release) procp->pc_release(rqstp); err_bad_auth: - dprintk("svc: authentication failed (%d)\n", ntohl(auth_stat)); + dprintk("svc: authentication failed (%d)\n", + be32_to_cpu(rqstp->rq_auth_stat)); serv->sv_stats->rpcbadauth++; /* Restore write pointer to location of accept status: */ xdr_ressize_check(rqstp, reply_statp); svc_putnl(resv, 1); /* REJECT */ svc_putnl(resv, 1); /* AUTH_ERROR */ - svc_putnl(resv, ntohl(auth_stat)); /* status */ + svc_putu32(resv, rqstp->rq_auth_stat); /* status */ goto sendit; err_bad_prog: diff --git a/net/sunrpc/svcauth.c b/net/sunrpc/svcauth.c index 998b196b6176..5a8b8e03fdd4 100644 --- a/net/sunrpc/svcauth.c +++ b/net/sunrpc/svcauth.c @@ -59,12 +59,12 @@ svc_put_auth_ops(struct auth_ops *aops) } int -svc_authenticate(struct svc_rqst *rqstp, __be32 *authp) +svc_authenticate(struct svc_rqst *rqstp) { rpc_authflavor_t flavor; struct auth_ops *aops; - *authp = rpc_auth_ok; + rqstp->rq_auth_stat = rpc_auth_ok; flavor = svc_getnl(&rqstp->rq_arg.head[0]); @@ -72,7 +72,7 @@ svc_authenticate(struct svc_rqst *rqstp, __be32 *authp) aops = svc_get_auth_ops(flavor); if (aops == NULL) { - *authp = rpc_autherr_badcred; + rqstp->rq_auth_stat = rpc_autherr_badcred; return SVC_DENIED; } @@ -80,7 +80,7 @@ svc_authenticate(struct svc_rqst *rqstp, __be32 *authp) init_svc_cred(&rqstp->rq_cred); rqstp->rq_authop = aops; - return aops->accept(rqstp, authp); + return aops->accept(rqstp); } EXPORT_SYMBOL_GPL(svc_authenticate); diff --git a/net/sunrpc/svcauth_unix.c b/net/sunrpc/svcauth_unix.c index 35b7966ac3b3..eacfebf326dd 100644 --- a/net/sunrpc/svcauth_unix.c +++ b/net/sunrpc/svcauth_unix.c @@ -725,7 +725,7 @@ svcauth_unix_set_client(struct svc_rqst *rqstp) EXPORT_SYMBOL_GPL(svcauth_unix_set_client); static int -svcauth_null_accept(struct svc_rqst *rqstp, __be32 *authp) +svcauth_null_accept(struct svc_rqst *rqstp) { struct kvec *argv = &rqstp->rq_arg.head[0]; struct kvec *resv = &rqstp->rq_res.head[0]; @@ -736,12 +736,12 @@ svcauth_null_accept(struct svc_rqst *rqstp, __be32 *authp) if (svc_getu32(argv) != 0) { dprintk("svc: bad null cred\n"); - *authp = rpc_autherr_badcred; + rqstp->rq_auth_stat = rpc_autherr_badcred; return SVC_DENIED; } if (svc_getu32(argv) != htonl(RPC_AUTH_NULL) || svc_getu32(argv) != 0) { dprintk("svc: bad null verf\n"); - *authp = rpc_autherr_badverf; + rqstp->rq_auth_stat = rpc_autherr_badverf; return SVC_DENIED; } @@ -785,7 +785,7 @@ struct auth_ops svcauth_null = { static int -svcauth_unix_accept(struct svc_rqst *rqstp, __be32 *authp) +svcauth_unix_accept(struct svc_rqst *rqstp) { struct kvec *argv = &rqstp->rq_arg.head[0]; struct kvec *resv = &rqstp->rq_res.head[0]; @@ -827,7 +827,7 @@ svcauth_unix_accept(struct svc_rqst *rqstp, __be32 *authp) } groups_sort(cred->cr_group_info); if (svc_getu32(argv) != htonl(RPC_AUTH_NULL) || svc_getu32(argv) != 0) { - *authp = rpc_autherr_badverf; + rqstp->rq_auth_stat = rpc_autherr_badverf; return SVC_DENIED; } @@ -839,7 +839,7 @@ svcauth_unix_accept(struct svc_rqst *rqstp, __be32 *authp) return SVC_OK; badcred: - *authp = rpc_autherr_badcred; + rqstp->rq_auth_stat = rpc_autherr_badcred; return SVC_DENIED; } From 5c2465dfd457f3015eebcc3ace50570e1d896aeb Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 15 Jul 2021 15:52:12 -0400 Subject: [PATCH 24/39] SUNRPC: Set rq_auth_stat in the pg_authenticate() callout In a few moments, rq_auth_stat will need to be explicitly set to rpc_auth_ok before execution gets to the dispatcher. svc_authenticate() already sets it, but it often gets reset to rpc_autherr_badcred right after that call, even when authentication is successful. Let's ensure that the pg_authenticate callout and svc_set_client() set it properly in every case. Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- fs/lockd/svc.c | 2 ++ fs/nfs/callback.c | 4 ++++ net/sunrpc/auth_gss/svcauth_gss.c | 4 ++++ net/sunrpc/svc.c | 4 +--- net/sunrpc/svcauth_unix.c | 6 +++++- 5 files changed, 16 insertions(+), 4 deletions(-) diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c index 2de048f80eb8..8e936999216c 100644 --- a/fs/lockd/svc.c +++ b/fs/lockd/svc.c @@ -649,6 +649,7 @@ static int lockd_authenticate(struct svc_rqst *rqstp) switch (rqstp->rq_authop->flavour) { case RPC_AUTH_NULL: case RPC_AUTH_UNIX: + rqstp->rq_auth_stat = rpc_auth_ok; if (rqstp->rq_proc == 0) return SVC_OK; if (is_callback(rqstp->rq_proc)) { @@ -659,6 +660,7 @@ static int lockd_authenticate(struct svc_rqst *rqstp) } return svc_set_client(rqstp); } + rqstp->rq_auth_stat = rpc_autherr_badcred; return SVC_DENIED; } diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c index 7817ad94a6ba..86d856de1389 100644 --- a/fs/nfs/callback.c +++ b/fs/nfs/callback.c @@ -429,6 +429,8 @@ check_gss_callback_principal(struct nfs_client *clp, struct svc_rqst *rqstp) */ static int nfs_callback_authenticate(struct svc_rqst *rqstp) { + rqstp->rq_auth_stat = rpc_autherr_badcred; + switch (rqstp->rq_authop->flavour) { case RPC_AUTH_NULL: if (rqstp->rq_proc != CB_NULL) @@ -439,6 +441,8 @@ static int nfs_callback_authenticate(struct svc_rqst *rqstp) if (svc_is_backchannel(rqstp)) return SVC_DENIED; } + + rqstp->rq_auth_stat = rpc_auth_ok; return SVC_OK; } diff --git a/net/sunrpc/auth_gss/svcauth_gss.c b/net/sunrpc/auth_gss/svcauth_gss.c index 635449ed7af6..f89075070fb0 100644 --- a/net/sunrpc/auth_gss/svcauth_gss.c +++ b/net/sunrpc/auth_gss/svcauth_gss.c @@ -1038,6 +1038,8 @@ svcauth_gss_set_client(struct svc_rqst *rqstp) struct rpc_gss_wire_cred *gc = &svcdata->clcred; int stat; + rqstp->rq_auth_stat = rpc_autherr_badcred; + /* * A gss export can be specified either by: * export *(sec=krb5,rw) @@ -1053,6 +1055,8 @@ svcauth_gss_set_client(struct svc_rqst *rqstp) stat = svcauth_unix_set_client(rqstp); if (stat == SVC_DROP || stat == SVC_CLOSE) return stat; + + rqstp->rq_auth_stat = rpc_auth_ok; return SVC_OK; } diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index 360dab62b6b4..2019d1203641 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -1328,10 +1328,8 @@ svc_process_common(struct svc_rqst *rqstp, struct kvec *argv, struct kvec *resv) */ auth_res = svc_authenticate(rqstp); /* Also give the program a chance to reject this call: */ - if (auth_res == SVC_OK && progp) { - rqstp->rq_auth_stat = rpc_autherr_badcred; + if (auth_res == SVC_OK && progp) auth_res = progp->pg_authenticate(rqstp); - } if (auth_res != SVC_OK) trace_svc_authenticate(rqstp, auth_res); switch (auth_res) { diff --git a/net/sunrpc/svcauth_unix.c b/net/sunrpc/svcauth_unix.c index eacfebf326dd..d7ed7d49115a 100644 --- a/net/sunrpc/svcauth_unix.c +++ b/net/sunrpc/svcauth_unix.c @@ -681,8 +681,9 @@ svcauth_unix_set_client(struct svc_rqst *rqstp) rqstp->rq_client = NULL; if (rqstp->rq_proc == 0) - return SVC_OK; + goto out; + rqstp->rq_auth_stat = rpc_autherr_badcred; ipm = ip_map_cached_get(xprt); if (ipm == NULL) ipm = __ip_map_lookup(sn->ip_map_cache, rqstp->rq_server->sv_program->pg_class, @@ -719,6 +720,9 @@ svcauth_unix_set_client(struct svc_rqst *rqstp) put_group_info(cred->cr_group_info); cred->cr_group_info = gi; } + +out: + rqstp->rq_auth_stat = rpc_auth_ok; return SVC_OK; } From 9082e1d914f8b27114352b1940bbcc7522f682e7 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 15 Jul 2021 15:52:19 -0400 Subject: [PATCH 25/39] SUNRPC: Eliminate the RQ_AUTHERR flag Now that there is an alternate method for returning an auth_stat value, replace the RQ_AUTHERR flag with use of that new method. Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- fs/nfs/callback_xdr.c | 3 ++- include/linux/sunrpc/svc.h | 2 -- include/trace/events/sunrpc.h | 3 +-- net/sunrpc/svc.c | 24 ++++-------------------- 4 files changed, 7 insertions(+), 25 deletions(-) diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c index c5348ba81129..7ff99155b023 100644 --- a/fs/nfs/callback_xdr.c +++ b/fs/nfs/callback_xdr.c @@ -988,7 +988,8 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp) out_invalidcred: pr_warn_ratelimited("NFS: NFSv4 callback contains invalid cred\n"); - return svc_return_autherr(rqstp, rpc_autherr_badcred); + rqstp->rq_auth_stat = rpc_autherr_badcred; + return rpc_success; } /* diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index 35f12963e1ff..63c9210cae06 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -275,7 +275,6 @@ struct svc_rqst { #define RQ_VICTIM (5) /* about to be shut down */ #define RQ_BUSY (6) /* request is busy */ #define RQ_DATA (7) /* request has data */ -#define RQ_AUTHERR (8) /* Request status is auth error */ unsigned long rq_flags; /* flags field */ ktime_t rq_qtime; /* enqueue time */ @@ -533,7 +532,6 @@ unsigned int svc_fill_write_vector(struct svc_rqst *rqstp, char *svc_fill_symlink_pathname(struct svc_rqst *rqstp, struct kvec *first, void *p, size_t total); -__be32 svc_return_autherr(struct svc_rqst *rqstp, __be32 auth_err); __be32 svc_generic_init_request(struct svc_rqst *rqstp, const struct svc_program *progp, struct svc_process_info *procinfo); diff --git a/include/trace/events/sunrpc.h b/include/trace/events/sunrpc.h index c7d9e6c7a979..169b93e4dbc1 100644 --- a/include/trace/events/sunrpc.h +++ b/include/trace/events/sunrpc.h @@ -1539,8 +1539,7 @@ DEFINE_SVCXDRBUF_EVENT(sendto); svc_rqst_flag(SPLICE_OK) \ svc_rqst_flag(VICTIM) \ svc_rqst_flag(BUSY) \ - svc_rqst_flag(DATA) \ - svc_rqst_flag_end(AUTHERR) + svc_rqst_flag_end(DATA) #undef svc_rqst_flag #undef svc_rqst_flag_end diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index 2019d1203641..95836bf514b5 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -1163,22 +1163,6 @@ void svc_printk(struct svc_rqst *rqstp, const char *fmt, ...) static __printf(2,3) void svc_printk(struct svc_rqst *rqstp, const char *fmt, ...) {} #endif -__be32 -svc_return_autherr(struct svc_rqst *rqstp, __be32 auth_err) -{ - set_bit(RQ_AUTHERR, &rqstp->rq_flags); - return auth_err; -} -EXPORT_SYMBOL_GPL(svc_return_autherr); - -static __be32 -svc_get_autherr(struct svc_rqst *rqstp, __be32 *statp) -{ - if (test_and_clear_bit(RQ_AUTHERR, &rqstp->rq_flags)) - return *statp; - return rpc_auth_ok; -} - static int svc_generic_dispatch(struct svc_rqst *rqstp, __be32 *statp) { @@ -1202,7 +1186,7 @@ svc_generic_dispatch(struct svc_rqst *rqstp, __be32 *statp) test_bit(RQ_DROPME, &rqstp->rq_flags)) return 0; - if (test_bit(RQ_AUTHERR, &rqstp->rq_flags)) + if (rqstp->rq_auth_stat != rpc_auth_ok) return 1; if (*statp != rpc_success) @@ -1390,15 +1374,15 @@ svc_process_common(struct svc_rqst *rqstp, struct kvec *argv, struct kvec *resv) goto release_dropit; if (*statp == rpc_garbage_args) goto err_garbage; - rqstp->rq_auth_stat = svc_get_autherr(rqstp, statp); - if (rqstp->rq_auth_stat != rpc_auth_ok) - goto err_release_bad_auth; } else { dprintk("svc: calling dispatcher\n"); if (!process.dispatch(rqstp, statp)) goto release_dropit; /* Release reply info */ } + if (rqstp->rq_auth_stat != rpc_auth_ok) + goto err_release_bad_auth; + /* Check RPC status result */ if (*statp != rpc_success) resv->iov_len = ((void*)statp) - resv->iov_base + 4; From 7d34c96217cf3c2d37ca0a56ca0bc3c3bef1e189 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 15 Jul 2021 15:52:25 -0400 Subject: [PATCH 26/39] NFS: Add a private local dispatcher for NFSv4 callback operations The client's NFSv4 callback service is the only remaining user of svc_generic_dispatch(). Note that the NFSv4 callback service doesn't use the .pc_encode and .pc_decode callouts in any substantial way, so they are removed. Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- fs/nfs/callback_xdr.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c index 7ff99155b023..e30374e363a6 100644 --- a/fs/nfs/callback_xdr.c +++ b/fs/nfs/callback_xdr.c @@ -992,6 +992,15 @@ out_invalidcred: return rpc_success; } +static int +nfs_callback_dispatch(struct svc_rqst *rqstp, __be32 *statp) +{ + const struct svc_procedure *procp = rqstp->rq_procinfo; + + *statp = procp->pc_func(rqstp); + return 1; +} + /* * Define NFS4 callback COMPOUND ops. */ @@ -1080,7 +1089,7 @@ const struct svc_version nfs4_callback_version1 = { .vs_proc = nfs4_callback_procedures1, .vs_count = nfs4_callback_count1, .vs_xdrsize = NFS4_CALLBACK_XDRSIZE, - .vs_dispatch = NULL, + .vs_dispatch = nfs_callback_dispatch, .vs_hidden = true, .vs_need_cong_ctrl = true, }; @@ -1092,7 +1101,7 @@ const struct svc_version nfs4_callback_version4 = { .vs_proc = nfs4_callback_procedures1, .vs_count = nfs4_callback_count4, .vs_xdrsize = NFS4_CALLBACK_XDRSIZE, - .vs_dispatch = NULL, + .vs_dispatch = nfs_callback_dispatch, .vs_hidden = true, .vs_need_cong_ctrl = true, }; From c35a810ce59524971c4a3b45faed4d0121e5a305 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 15 Jul 2021 15:52:31 -0400 Subject: [PATCH 27/39] NFS: Remove unused callback void decoder Clean up: The callback RPC dispatcher no longer invokes these call outs, although svc_process_common() relies on seeing a .pc_encode function. Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- fs/nfs/callback_xdr.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c index e30374e363a6..c1d08ab1fe22 100644 --- a/fs/nfs/callback_xdr.c +++ b/fs/nfs/callback_xdr.c @@ -63,11 +63,10 @@ static __be32 nfs4_callback_null(struct svc_rqst *rqstp) return htonl(NFS4_OK); } -static int nfs4_decode_void(struct svc_rqst *rqstp, __be32 *p) -{ - return xdr_argsize_check(rqstp, p); -} - +/* + * svc_process_common() looks for an XDR encoder to know when + * not to drop a Reply. + */ static int nfs4_encode_void(struct svc_rqst *rqstp, __be32 *p) { return xdr_ressize_check(rqstp, p); @@ -1067,7 +1066,6 @@ static struct callback_op callback_ops[] = { static const struct svc_procedure nfs4_callback_procedures1[] = { [CB_NULL] = { .pc_func = nfs4_callback_null, - .pc_decode = nfs4_decode_void, .pc_encode = nfs4_encode_void, .pc_xdrressize = 1, .pc_name = "NULL", From 89ef17b6636f2ae3e4e4041f53be7c0118b4b6c1 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 15 Jul 2021 15:52:37 -0400 Subject: [PATCH 28/39] NFS: Extract the xdr_init_encode/decode() calls from decode_compound Clean up: Move the xdr_init_encode() and xdr_init_decode() calls into the dispatcher, just like the NFSD and lockd dispatchers. Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- fs/nfs/callback_xdr.c | 22 +++++++++------------- 1 file changed, 9 insertions(+), 13 deletions(-) diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c index c1d08ab1fe22..bf0efec93da8 100644 --- a/fs/nfs/callback_xdr.c +++ b/fs/nfs/callback_xdr.c @@ -925,22 +925,15 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp) { struct cb_compound_hdr_arg hdr_arg = { 0 }; struct cb_compound_hdr_res hdr_res = { NULL }; - struct xdr_stream xdr_in, xdr_out; - __be32 *p, status; struct cb_process_state cps = { .drc_status = 0, .clp = NULL, .net = SVC_NET(rqstp), }; unsigned int nops = 0; + __be32 status; - xdr_init_decode(&xdr_in, &rqstp->rq_arg, - rqstp->rq_arg.head[0].iov_base, NULL); - - p = (__be32*)((char *)rqstp->rq_res.head[0].iov_base + rqstp->rq_res.head[0].iov_len); - xdr_init_encode(&xdr_out, &rqstp->rq_res, p, NULL); - - status = decode_compound_hdr_arg(&xdr_in, &hdr_arg); + status = decode_compound_hdr_arg(&rqstp->rq_arg_stream, &hdr_arg); if (status == htonl(NFS4ERR_RESOURCE)) return rpc_garbage_args; @@ -960,15 +953,15 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp) cps.minorversion = hdr_arg.minorversion; hdr_res.taglen = hdr_arg.taglen; hdr_res.tag = hdr_arg.tag; - if (encode_compound_hdr_res(&xdr_out, &hdr_res) != 0) { + if (encode_compound_hdr_res(&rqstp->rq_res_stream, &hdr_res) != 0) { if (cps.clp) nfs_put_client(cps.clp); return rpc_system_err; } while (status == 0 && nops != hdr_arg.nops) { - status = process_op(nops, rqstp, &xdr_in, - rqstp->rq_argp, &xdr_out, rqstp->rq_resp, - &cps); + status = process_op(nops, rqstp, &rqstp->rq_arg_stream, + rqstp->rq_argp, &rqstp->rq_res_stream, + rqstp->rq_resp, &cps); nops++; } @@ -996,6 +989,9 @@ nfs_callback_dispatch(struct svc_rqst *rqstp, __be32 *statp) { const struct svc_procedure *procp = rqstp->rq_procinfo; + svcxdr_init_decode(rqstp); + svcxdr_init_encode(rqstp); + *statp = procp->pc_func(rqstp); return 1; } From 9eff97abef057c02a13bb3aa0e4821cd60fd80df Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 15 Jul 2021 15:52:43 -0400 Subject: [PATCH 29/39] NFS: Clean up the synopsis of callback process_op() The xdr_stream and rq_arg and rq_res are already accessible via the @rqstp parameter. Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- fs/nfs/callback_xdr.c | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c index bf0efec93da8..4c48d85f6517 100644 --- a/fs/nfs/callback_xdr.c +++ b/fs/nfs/callback_xdr.c @@ -863,17 +863,16 @@ preprocess_nfs4_op(unsigned int op_nr, struct callback_op **op) } static __be32 process_op(int nop, struct svc_rqst *rqstp, - struct xdr_stream *xdr_in, void *argp, - struct xdr_stream *xdr_out, void *resp, - struct cb_process_state *cps) + struct cb_process_state *cps) { + struct xdr_stream *xdr_out = &rqstp->rq_res_stream; struct callback_op *op = &callback_ops[0]; unsigned int op_nr; __be32 status; long maxlen; __be32 res; - status = decode_op_hdr(xdr_in, &op_nr); + status = decode_op_hdr(&rqstp->rq_arg_stream, &op_nr); if (unlikely(status)) return status; @@ -903,9 +902,11 @@ static __be32 process_op(int nop, struct svc_rqst *rqstp, maxlen = xdr_out->end - xdr_out->p; if (maxlen > 0 && maxlen < PAGE_SIZE) { - status = op->decode_args(rqstp, xdr_in, argp); + status = op->decode_args(rqstp, &rqstp->rq_arg_stream, + rqstp->rq_argp); if (likely(status == 0)) - status = op->process_op(argp, resp, cps); + status = op->process_op(rqstp->rq_argp, rqstp->rq_resp, + cps); } else status = htonl(NFS4ERR_RESOURCE); @@ -914,7 +915,7 @@ encode_hdr: if (unlikely(res)) return res; if (op->encode_res != NULL && status == 0) - status = op->encode_res(rqstp, xdr_out, resp); + status = op->encode_res(rqstp, xdr_out, rqstp->rq_resp); return status; } @@ -959,9 +960,7 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp) return rpc_system_err; } while (status == 0 && nops != hdr_arg.nops) { - status = process_op(nops, rqstp, &rqstp->rq_arg_stream, - rqstp->rq_argp, &rqstp->rq_res_stream, - rqstp->rq_resp, &cps); + status = process_op(nops, rqstp, &cps); nops++; } From ca7d1d1a0b975d3d8aaaeab008a07bb3d3c5ec7e Mon Sep 17 00:00:00 2001 From: Dai Ngo Date: Fri, 21 May 2021 15:09:38 -0400 Subject: [PATCH 30/39] NFSv4.2: remove restriction of copy size for inter-server copy. Currently inter-server copy is allowed only if the copy size is larger than (rsize*14) which is the over-head of the mount operation of the source export. This patch, relying on the delayed unmount feature, removes this restriction since the mount and unmount overhead is now not applicable for every inter-server copy. Signed-off-by: Dai Ngo Signed-off-by: Anna Schumaker --- fs/nfs/nfs4file.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c index c820de58a661..c91565227ea2 100644 --- a/fs/nfs/nfs4file.c +++ b/fs/nfs/nfs4file.c @@ -158,13 +158,11 @@ static ssize_t __nfs4_copy_file_range(struct file *file_in, loff_t pos_in, sync = true; retry: if (!nfs42_files_from_same_server(file_in, file_out)) { - /* for inter copy, if copy size if smaller than 12 RPC - * payloads, fallback to traditional copy. There are - * 14 RPCs during an NFSv4.x mount between source/dest - * servers. + /* + * for inter copy, if copy size is too small + * then fallback to generic copy. */ - if (sync || - count <= 14 * NFS_SERVER(file_inode(file_in))->rsize) + if (sync) return -EOPNOTSUPP; cn_resp = kzalloc(sizeof(struct nfs42_copy_notify_res), GFP_NOFS); From 0a6ff58edbfb26469a095ab964095506352fc960 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 24 Aug 2021 11:38:17 -0400 Subject: [PATCH 31/39] SUNRPC: Simplify socket shutdown when not reusing TCP ports If we're not required to reuse the TCP port, then we can just immediately close the socket, and leave the cleanup details to the TCP layer. Fixes: e6237b6feb37 ("NFSv4.1: Don't rebind to the same source port when reconnecting to the server") Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker --- net/sunrpc/xprtsock.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 02b071dbdd22..5fb969f8a5ad 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -2099,6 +2099,10 @@ static void xs_tcp_shutdown(struct rpc_xprt *xprt) if (sock == NULL) return; + if (!xprt->reuseport) { + xs_close(xprt); + return; + } switch (skst) { default: kernel_sock_shutdown(sock, SHUT_RDWR); From 7c81e6a9d75bd2c094005b64b442cec729dbdf66 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 24 Aug 2021 11:38:18 -0400 Subject: [PATCH 32/39] SUNRPC: Tweak TCP socket shutdown in the RPC client We only really need to call shutdown() if we're in the ESTABLISHED TCP state, since that is the only case where the client is initiating a close of an established connection. If the socket is in FIN_WAIT1 or FIN_WAIT2, then we've already initiated socket shutdown and are waiting for the server's reply, so do nothing. In all other cases where we've already received a FIN from the server, we should be able to just close the socket. Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker --- net/sunrpc/xprtsock.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 5fb969f8a5ad..d80793a1b75c 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -2104,12 +2104,15 @@ static void xs_tcp_shutdown(struct rpc_xprt *xprt) return; } switch (skst) { - default: + case TCP_FIN_WAIT1: + case TCP_FIN_WAIT2: + break; + case TCP_ESTABLISHED: + case TCP_CLOSE_WAIT: kernel_sock_shutdown(sock, SHUT_RDWR); trace_rpc_socket_shutdown(xprt, sock); break; - case TCP_CLOSE: - case TCP_TIME_WAIT: + default: xs_reset_transport(transport); } } From 79d534f8cbf9714e2ba735f5b6c97678d266b2de Mon Sep 17 00:00:00 2001 From: Ye Bin Date: Wed, 18 Aug 2021 10:02:52 +0800 Subject: [PATCH 33/39] NFSv3: Delete duplicate judgement in nfs3_async_handle_jukebox As eb96d5c97b08 ("SUNRPC handle EKEYEXPIRED in call_refreshresult") commit handle EKEYEXPIRED in call_refreshresult, so there is only handle when "task->tk_status" is equal "-EJUKEBOX" in nfs3_async_handle_jukebox. Signed-off-by: Ye Bin Signed-off-by: Anna Schumaker --- fs/nfs/nfs3proc.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c index 2299446b3b89..f7524310ddf4 100644 --- a/fs/nfs/nfs3proc.c +++ b/fs/nfs/nfs3proc.c @@ -49,8 +49,7 @@ nfs3_async_handle_jukebox(struct rpc_task *task, struct inode *inode) { if (task->tk_status != -EJUKEBOX) return 0; - if (task->tk_status == -EJUKEBOX) - nfs_inc_stats(inode, NFSIOS_DELAY); + nfs_inc_stats(inode, NFSIOS_DELAY); task->tk_status = 0; rpc_restart_call(task); rpc_delay(task, NFS_JUKEBOX_RETRY_TIME); From 3a3f976639f267823e443fdd8bffa03848fa1c3f Mon Sep 17 00:00:00 2001 From: Olga Kornievskaia Date: Fri, 27 Aug 2021 14:37:14 -0400 Subject: [PATCH 34/39] SUNRPC keep track of number of transports to unique addresses Currently, xprt_switch keeps a number of all xprts (xps_nxprts) that were added to the switch regardless of whethere it's an nconnect transport or a transport to a trunkable address. Introduce a new counter to keep track of transports to unique destination addresses per xprt_switch. Signed-off-by: Olga Kornievskaia Signed-off-by: Anna Schumaker --- include/linux/sunrpc/xprtmultipath.h | 1 + net/sunrpc/clnt.c | 2 +- net/sunrpc/xprtmultipath.c | 1 + 3 files changed, 3 insertions(+), 1 deletion(-) diff --git a/include/linux/sunrpc/xprtmultipath.h b/include/linux/sunrpc/xprtmultipath.h index b19addc8b715..bbb8a5fa0816 100644 --- a/include/linux/sunrpc/xprtmultipath.h +++ b/include/linux/sunrpc/xprtmultipath.h @@ -18,6 +18,7 @@ struct rpc_xprt_switch { unsigned int xps_id; unsigned int xps_nxprts; unsigned int xps_nactive; + unsigned int xps_nunique_destaddr_xprts; atomic_long_t xps_queuelen; struct list_head xps_xprt_list; diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index a5b7f6e34d15..451ac7d031db 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -2799,7 +2799,7 @@ int rpc_clnt_test_and_add_xprt(struct rpc_clnt *clnt, task = rpc_call_null_helper(clnt, xprt, NULL, RPC_TASK_ASYNC, &rpc_cb_add_xprt_call_ops, data); - + data->xps->xps_nunique_destaddr_xprts++; rpc_put_task(task); success: return 1; diff --git a/net/sunrpc/xprtmultipath.c b/net/sunrpc/xprtmultipath.c index c60820e45082..1693f81aae37 100644 --- a/net/sunrpc/xprtmultipath.c +++ b/net/sunrpc/xprtmultipath.c @@ -139,6 +139,7 @@ struct rpc_xprt_switch *xprt_switch_alloc(struct rpc_xprt *xprt, xps->xps_iter_ops = &rpc_xprt_iter_singular; rpc_sysfs_xprt_switch_setup(xps, xprt, gfp_flags); xprt_switch_add_xprt_locked(xps, xprt); + xps->xps_nunique_destaddr_xprts = 1; rpc_sysfs_xprt_setup(xps, xprt, gfp_flags); } From df205d0a8ea1b873a29f1333f232f884e9728acc Mon Sep 17 00:00:00 2001 From: Olga Kornievskaia Date: Fri, 27 Aug 2021 14:37:16 -0400 Subject: [PATCH 35/39] SUNRPC add xps_nunique_destaddr_xprts to xprt_switch_info in sysfs In sysfs's xprt_switch_info attribute also display the value of number of transports with unique destination addresses for this xprt_switch. Signed-off-by: Olga Kornievskaia Signed-off-by: Anna Schumaker --- net/sunrpc/sysfs.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/sunrpc/sysfs.c b/net/sunrpc/sysfs.c index 414c664a3199..9a6f17e18f73 100644 --- a/net/sunrpc/sysfs.c +++ b/net/sunrpc/sysfs.c @@ -207,8 +207,10 @@ static ssize_t rpc_sysfs_xprt_switch_info_show(struct kobject *kobj, if (!xprt_switch) return 0; - ret = sprintf(buf, "num_xprts=%u\nnum_active=%u\nqueue_len=%ld\n", + ret = sprintf(buf, "num_xprts=%u\nnum_active=%u\n" + "num_unique_destaddr=%u\nqueue_len=%ld\n", xprt_switch->xps_nxprts, xprt_switch->xps_nactive, + xprt_switch->xps_nunique_destaddr_xprts, atomic_long_read(&xprt_switch->xps_queuelen)); xprt_switch_put(xprt_switch); return ret + 1; From 7e134205f62955369619021a695cd78fefd32451 Mon Sep 17 00:00:00 2001 From: Olga Kornievskaia Date: Fri, 27 Aug 2021 14:37:17 -0400 Subject: [PATCH 36/39] NFSv4 introduce max_connect mount options This option will control up to how many xprts can the client establish to the server with a distinct address (that means nconnect connections are not counted towards this new limit). This patch is setting up nfs structures to keeep track of the max_connect limit (does not enforce it). The default value is kept at 1 so that no current mounts that don't want any additional connections would be effected. The maximum value is set at 16. Mounts to DS are not limited to default value of 1 but instead set to the maximum default value of 16 (NFS_MAX_TRANSPORTS). Signed-off-by: Olga Kornievskaia Signed-off-by: Anna Schumaker --- fs/nfs/client.c | 1 + fs/nfs/fs_context.c | 7 +++++++ fs/nfs/internal.h | 2 ++ fs/nfs/nfs4client.c | 12 ++++++++++-- fs/nfs/super.c | 2 ++ include/linux/nfs_fs.h | 5 +++++ include/linux/nfs_fs_sb.h | 1 + 7 files changed, 28 insertions(+), 2 deletions(-) diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 330f65727c45..486dec59972b 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -179,6 +179,7 @@ struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_init) clp->cl_proto = cl_init->proto; clp->cl_nconnect = cl_init->nconnect; + clp->cl_max_connect = cl_init->max_connect ? cl_init->max_connect : 1; clp->cl_net = get_net(cl_init->net); clp->cl_principal = "*"; diff --git a/fs/nfs/fs_context.c b/fs/nfs/fs_context.c index d95c9a39bc70..0d444a90f513 100644 --- a/fs/nfs/fs_context.c +++ b/fs/nfs/fs_context.c @@ -60,6 +60,7 @@ enum nfs_param { Opt_mountvers, Opt_namelen, Opt_nconnect, + Opt_max_connect, Opt_port, Opt_posix, Opt_proto, @@ -158,6 +159,7 @@ static const struct fs_parameter_spec nfs_fs_parameters[] = { fsparam_u32 ("mountvers", Opt_mountvers), fsparam_u32 ("namlen", Opt_namelen), fsparam_u32 ("nconnect", Opt_nconnect), + fsparam_u32 ("max_connect", Opt_max_connect), fsparam_string("nfsvers", Opt_vers), fsparam_u32 ("port", Opt_port), fsparam_flag_no("posix", Opt_posix), @@ -770,6 +772,11 @@ static int nfs_fs_context_parse_param(struct fs_context *fc, goto out_of_bounds; ctx->nfs_server.nconnect = result.uint_32; break; + case Opt_max_connect: + if (result.uint_32 < 1 || result.uint_32 > NFS_MAX_TRANSPORTS) + goto out_of_bounds; + ctx->nfs_server.max_connect = result.uint_32; + break; case Opt_lookupcache: switch (result.uint_32) { case Opt_lookupcache_all: diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index a36af04188c2..66fc936834f2 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -67,6 +67,7 @@ struct nfs_client_initdata { int proto; u32 minorversion; unsigned int nconnect; + unsigned int max_connect; struct net *net; const struct rpc_timeout *timeparms; const struct cred *cred; @@ -121,6 +122,7 @@ struct nfs_fs_context { int port; unsigned short protocol; unsigned short nconnect; + unsigned short max_connect; unsigned short export_path_len; } nfs_server; diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c index 28431acd1230..270caa1805a2 100644 --- a/fs/nfs/nfs4client.c +++ b/fs/nfs/nfs4client.c @@ -865,6 +865,7 @@ static int nfs4_set_client(struct nfs_server *server, const char *ip_addr, int proto, const struct rpc_timeout *timeparms, u32 minorversion, unsigned int nconnect, + unsigned int max_connect, struct net *net) { struct nfs_client_initdata cl_init = { @@ -883,6 +884,8 @@ static int nfs4_set_client(struct nfs_server *server, if (minorversion == 0) __set_bit(NFS_CS_REUSEPORT, &cl_init.init_flags); + else + cl_init.max_connect = max_connect; if (proto == XPRT_TRANSPORT_TCP) cl_init.nconnect = nconnect; @@ -952,8 +955,10 @@ struct nfs_client *nfs4_set_ds_client(struct nfs_server *mds_srv, return ERR_PTR(-EINVAL); cl_init.hostname = buf; - if (mds_clp->cl_nconnect > 1 && ds_proto == XPRT_TRANSPORT_TCP) + if (mds_clp->cl_nconnect > 1 && ds_proto == XPRT_TRANSPORT_TCP) { cl_init.nconnect = mds_clp->cl_nconnect; + cl_init.max_connect = NFS_MAX_TRANSPORTS; + } if (mds_srv->flags & NFS_MOUNT_NORESVPORT) __set_bit(NFS_CS_NORESVPORT, &cl_init.init_flags); @@ -1122,6 +1127,7 @@ static int nfs4_init_server(struct nfs_server *server, struct fs_context *fc) &timeparms, ctx->minorversion, ctx->nfs_server.nconnect, + ctx->nfs_server.max_connect, fc->net_ns); if (error < 0) return error; @@ -1211,6 +1217,7 @@ struct nfs_server *nfs4_create_referral_server(struct fs_context *fc) parent_server->client->cl_timeout, parent_client->cl_mvops->minor_version, parent_client->cl_nconnect, + parent_client->cl_max_connect, parent_client->cl_net); if (!error) goto init_server; @@ -1226,6 +1233,7 @@ struct nfs_server *nfs4_create_referral_server(struct fs_context *fc) parent_server->client->cl_timeout, parent_client->cl_mvops->minor_version, parent_client->cl_nconnect, + parent_client->cl_max_connect, parent_client->cl_net); if (error < 0) goto error; @@ -1323,7 +1331,7 @@ int nfs4_update_server(struct nfs_server *server, const char *hostname, error = nfs4_set_client(server, hostname, sap, salen, buf, clp->cl_proto, clnt->cl_timeout, clp->cl_minorversion, - clp->cl_nconnect, net); + clp->cl_nconnect, clp->cl_max_connect, net); clear_bit(NFS_MIG_TSM_POSSIBLE, &server->mig_status); if (error != 0) { nfs_server_insert_lists(server); diff --git a/fs/nfs/super.c b/fs/nfs/super.c index fe58525cfed4..e65c83494c05 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -480,6 +480,8 @@ static void nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss, if (clp->cl_nconnect > 0) seq_printf(m, ",nconnect=%u", clp->cl_nconnect); if (version == 4) { + if (clp->cl_max_connect > 1) + seq_printf(m, ",max_connect=%u", clp->cl_max_connect); if (nfss->port != NFS_PORT) seq_printf(m, ",port=%u", nfss->port); } else diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index ce6474594872..b9a8b925db43 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -40,6 +40,11 @@ #include +/* + * These are the default for number of transports to different server IPs + */ +#define NFS_MAX_TRANSPORTS 16 + /* * These are the default flags for swap requests */ diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h index d71a0e90faeb..2a9acbfe00f0 100644 --- a/include/linux/nfs_fs_sb.h +++ b/include/linux/nfs_fs_sb.h @@ -62,6 +62,7 @@ struct nfs_client { u32 cl_minorversion;/* NFSv4 minorversion */ unsigned int cl_nconnect; /* Number of connections */ + unsigned int cl_max_connect; /* max number of xprts allowed */ const char * cl_principal; /* used for machine cred */ #if IS_ENABLED(CONFIG_NFS_V4) From dc48e0abee245e2f0361bd8d4e3b00f70450fab2 Mon Sep 17 00:00:00 2001 From: Olga Kornievskaia Date: Fri, 27 Aug 2021 14:37:18 -0400 Subject: [PATCH 37/39] SUNRPC enforce creation of no more than max_connect xprts If we are adding new transports via rpc_clnt_test_and_add_xprt() then check if we've reached the limit. Currently only pnfs path adds transports via that function but this is done in preparation when the client would add new transports when session trunking is detected. A warning is logged if the limit is reached. Signed-off-by: Olga Kornievskaia Signed-off-by: Anna Schumaker --- fs/nfs/client.c | 1 + include/linux/sunrpc/clnt.h | 2 ++ net/sunrpc/clnt.c | 9 +++++++++ 3 files changed, 12 insertions(+) diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 486dec59972b..23e165d5ec9c 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -541,6 +541,7 @@ int nfs_create_rpc_client(struct nfs_client *clp, clnt->cl_principal = clp->cl_principal; clp->cl_rpcclient = clnt; + clnt->cl_max_connect = clp->cl_max_connect; return 0; } EXPORT_SYMBOL_GPL(nfs_create_rpc_client); diff --git a/include/linux/sunrpc/clnt.h b/include/linux/sunrpc/clnt.h index b2edd5fc2f0c..a4661646adc9 100644 --- a/include/linux/sunrpc/clnt.h +++ b/include/linux/sunrpc/clnt.h @@ -82,6 +82,7 @@ struct rpc_clnt { struct work_struct cl_work; }; const struct cred *cl_cred; + unsigned int cl_max_connect; /* max number of transports not to the same IP */ }; /* @@ -136,6 +137,7 @@ struct rpc_create_args { char *client_name; struct svc_xprt *bc_xprt; /* NFSv4.1 backchannel */ const struct cred *cred; + unsigned int max_connect; }; struct rpc_add_xprt_test { diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 451ac7d031db..f056ff931444 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -2787,6 +2787,15 @@ int rpc_clnt_test_and_add_xprt(struct rpc_clnt *clnt, struct rpc_cb_add_xprt_calldata *data; struct rpc_task *task; + if (xps->xps_nunique_destaddr_xprts + 1 > clnt->cl_max_connect) { + rcu_read_lock(); + pr_warn("SUNRPC: reached max allowed number (%d) did not add " + "transport to server: %s\n", clnt->cl_max_connect, + rpc_peeraddr2str(clnt, RPC_DISPLAY_ADDR)); + rcu_read_unlock(); + return -EINVAL; + } + data = kmalloc(sizeof(*data), GFP_NOFS); if (!data) return -ENOMEM; From 2a7a451a9084877a0b9d335c77d57e4cda1e5882 Mon Sep 17 00:00:00 2001 From: Olga Kornievskaia Date: Fri, 27 Aug 2021 14:37:19 -0400 Subject: [PATCH 38/39] NFSv4.1 add network transport when session trunking is detected After trunking is discovered in nfs4_discover_server_trunking(), add the transport to the old client structure if the allowed limit of transports has not been reached. An example: there exists a multi-homed server and client mounts one server address and some volume and then doest another mount to a different address of the same server and perhaps a different volume. Previously, the client checks that this is a session trunkable servers (same server), and removes the newly created client structure along with its transport. Now, the client adds the connection from the 2nd mount into the xprt switch of the existing client (it leads to having 2 available connections). Signed-off-by: Olga Kornievskaia Signed-off-by: Anna Schumaker --- fs/nfs/nfs4client.c | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c index 270caa1805a2..af57332503be 100644 --- a/fs/nfs/nfs4client.c +++ b/fs/nfs/nfs4client.c @@ -402,6 +402,33 @@ static int nfs4_init_client_minor_version(struct nfs_client *clp) return nfs4_init_callback(clp); } +static void nfs4_add_trunk(struct nfs_client *clp, struct nfs_client *old) +{ + struct sockaddr_storage clp_addr, old_addr; + struct sockaddr *clp_sap = (struct sockaddr *)&clp_addr; + struct sockaddr *old_sap = (struct sockaddr *)&old_addr; + size_t clp_salen; + struct xprt_create xprt_args = { + .ident = old->cl_proto, + .net = old->cl_net, + .servername = old->cl_hostname, + }; + + if (clp->cl_proto != old->cl_proto) + return; + clp_salen = rpc_peeraddr(clp->cl_rpcclient, clp_sap, sizeof(clp_addr)); + rpc_peeraddr(old->cl_rpcclient, old_sap, sizeof(old_addr)); + + if (clp_addr.ss_family != old_addr.ss_family) + return; + + xprt_args.dstaddr = clp_sap; + xprt_args.addrlen = clp_salen; + + rpc_clnt_add_xprt(old->cl_rpcclient, &xprt_args, + rpc_clnt_test_and_add_xprt, NULL); +} + /** * nfs4_init_client - Initialise an NFS4 client record * @@ -436,6 +463,8 @@ struct nfs_client *nfs4_init_client(struct nfs_client *clp, * won't try to use it. */ nfs_mark_client_ready(clp, -EPERM); + if (old->cl_mvops->session_trunk) + nfs4_add_trunk(clp, old); } clear_bit(NFS_CS_TSM_POSSIBLE, &clp->cl_flags); nfs_put_client(clp); From 8cfb9015280d49f9d92d5b0f88cedf5f0856c0fd Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 27 Aug 2021 14:00:56 -0400 Subject: [PATCH 39/39] NFS: Always provide aligned buffers to the RPC read layers Instead of messing around with XDR padding in the RDMA layer, we should just give the RPC layer an aligned buffer. Try to avoid creating extra RPC calls by aligning to the smaller value of ALIGN(len, rsize) and PAGE_SIZE. Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker --- fs/nfs/read.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/fs/nfs/read.c b/fs/nfs/read.c index 9f39e0a1a38b..08d6cc57cbc3 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -293,15 +293,19 @@ static int readpage_async_filler(void *data, struct page *page) { struct nfs_readdesc *desc = data; + struct inode *inode = page_file_mapping(page)->host; + unsigned int rsize = NFS_SERVER(inode)->rsize; struct nfs_page *new; - unsigned int len; + unsigned int len, aligned_len; int error; len = nfs_page_length(page); if (len == 0) return nfs_return_empty_page(page); - new = nfs_create_request(desc->ctx, page, 0, len); + aligned_len = min_t(unsigned int, ALIGN(len, rsize), PAGE_SIZE); + + new = nfs_create_request(desc->ctx, page, 0, aligned_len); if (IS_ERR(new)) goto out_error;