SUNRPC: Handle TCP socket sends with kernel_sendpage() again
Daire Byrne reports a ~50% aggregrate throughput regression on his Linux NFS server after commitda1661b93b("SUNRPC: Teach server to use xprt_sock_sendmsg for socket sends"), which replaced kernel_send_page() calls in NFSD's socket send path with calls to sock_sendmsg() using iov_iter. Investigation showed that tcp_sendmsg() was not using zero-copy to send the xdr_buf's bvec pages, but instead was relying on memcpy. This means copying every byte of a large NFS READ payload. It looks like TLS sockets do indeed support a ->sendpage method, so it's really not necessary to use xprt_sock_sendmsg() to support TLS fully on the server. A mechanical reversion ofda1661b93bis not possible at this point, but we can re-implement the server's TCP socket sendmsg path using kernel_sendpage(). Reported-by: Daire Byrne <daire@dneg.com> BugLink: https://bugzilla.kernel.org/show_bug.cgi?id=209439 Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
This commit is contained in:
		
							parent
							
								
									d6c9e4368c
								
							
						
					
					
						commit
						4a85a6a332
					
				| @ -1062,6 +1062,90 @@ err_noclose: | |||||||
| 	return 0;	/* record not complete */ | 	return 0;	/* record not complete */ | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
|  | static int svc_tcp_send_kvec(struct socket *sock, const struct kvec *vec, | ||||||
|  | 			      int flags) | ||||||
|  | { | ||||||
|  | 	return kernel_sendpage(sock, virt_to_page(vec->iov_base), | ||||||
|  | 			       offset_in_page(vec->iov_base), | ||||||
|  | 			       vec->iov_len, flags); | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | /*
 | ||||||
|  |  * kernel_sendpage() is used exclusively to reduce the number of | ||||||
|  |  * copy operations in this path. Therefore the caller must ensure | ||||||
|  |  * that the pages backing @xdr are unchanging. | ||||||
|  |  * | ||||||
|  |  * In addition, the logic assumes that * .bv_len is never larger | ||||||
|  |  * than PAGE_SIZE. | ||||||
|  |  */ | ||||||
|  | static int svc_tcp_sendmsg(struct socket *sock, struct msghdr *msg, | ||||||
|  | 			   struct xdr_buf *xdr, rpc_fraghdr marker, | ||||||
|  | 			   unsigned int *sentp) | ||||||
|  | { | ||||||
|  | 	const struct kvec *head = xdr->head; | ||||||
|  | 	const struct kvec *tail = xdr->tail; | ||||||
|  | 	struct kvec rm = { | ||||||
|  | 		.iov_base	= &marker, | ||||||
|  | 		.iov_len	= sizeof(marker), | ||||||
|  | 	}; | ||||||
|  | 	int flags, ret; | ||||||
|  | 
 | ||||||
|  | 	*sentp = 0; | ||||||
|  | 	xdr_alloc_bvec(xdr, GFP_KERNEL); | ||||||
|  | 
 | ||||||
|  | 	msg->msg_flags = MSG_MORE; | ||||||
|  | 	ret = kernel_sendmsg(sock, msg, &rm, 1, rm.iov_len); | ||||||
|  | 	if (ret < 0) | ||||||
|  | 		return ret; | ||||||
|  | 	*sentp += ret; | ||||||
|  | 	if (ret != rm.iov_len) | ||||||
|  | 		return -EAGAIN; | ||||||
|  | 
 | ||||||
|  | 	flags = head->iov_len < xdr->len ? MSG_MORE | MSG_SENDPAGE_NOTLAST : 0; | ||||||
|  | 	ret = svc_tcp_send_kvec(sock, head, flags); | ||||||
|  | 	if (ret < 0) | ||||||
|  | 		return ret; | ||||||
|  | 	*sentp += ret; | ||||||
|  | 	if (ret != head->iov_len) | ||||||
|  | 		goto out; | ||||||
|  | 
 | ||||||
|  | 	if (xdr->page_len) { | ||||||
|  | 		unsigned int offset, len, remaining; | ||||||
|  | 		struct bio_vec *bvec; | ||||||
|  | 
 | ||||||
|  | 		bvec = xdr->bvec; | ||||||
|  | 		offset = xdr->page_base; | ||||||
|  | 		remaining = xdr->page_len; | ||||||
|  | 		flags = MSG_MORE | MSG_SENDPAGE_NOTLAST; | ||||||
|  | 		while (remaining > 0) { | ||||||
|  | 			if (remaining <= PAGE_SIZE && tail->iov_len == 0) | ||||||
|  | 				flags = 0; | ||||||
|  | 			len = min(remaining, bvec->bv_len); | ||||||
|  | 			ret = kernel_sendpage(sock, bvec->bv_page, | ||||||
|  | 					      bvec->bv_offset + offset, | ||||||
|  | 					      len, flags); | ||||||
|  | 			if (ret < 0) | ||||||
|  | 				return ret; | ||||||
|  | 			*sentp += ret; | ||||||
|  | 			if (ret != len) | ||||||
|  | 				goto out; | ||||||
|  | 			remaining -= len; | ||||||
|  | 			offset = 0; | ||||||
|  | 			bvec++; | ||||||
|  | 		} | ||||||
|  | 	} | ||||||
|  | 
 | ||||||
|  | 	if (tail->iov_len) { | ||||||
|  | 		ret = svc_tcp_send_kvec(sock, tail, 0); | ||||||
|  | 		if (ret < 0) | ||||||
|  | 			return ret; | ||||||
|  | 		*sentp += ret; | ||||||
|  | 	} | ||||||
|  | 
 | ||||||
|  | out: | ||||||
|  | 	return 0; | ||||||
|  | } | ||||||
|  | 
 | ||||||
| /**
 | /**
 | ||||||
|  * svc_tcp_sendto - Send out a reply on a TCP socket |  * svc_tcp_sendto - Send out a reply on a TCP socket | ||||||
|  * @rqstp: completed svc_rqst |  * @rqstp: completed svc_rqst | ||||||
| @ -1089,7 +1173,7 @@ static int svc_tcp_sendto(struct svc_rqst *rqstp) | |||||||
| 	mutex_lock(&xprt->xpt_mutex); | 	mutex_lock(&xprt->xpt_mutex); | ||||||
| 	if (svc_xprt_is_dead(xprt)) | 	if (svc_xprt_is_dead(xprt)) | ||||||
| 		goto out_notconn; | 		goto out_notconn; | ||||||
| 	err = xprt_sock_sendmsg(svsk->sk_sock, &msg, xdr, 0, marker, &sent); | 	err = svc_tcp_sendmsg(svsk->sk_sock, &msg, xdr, marker, &sent); | ||||||
| 	xdr_free_bvec(xdr); | 	xdr_free_bvec(xdr); | ||||||
| 	trace_svcsock_tcp_send(xprt, err < 0 ? err : sent); | 	trace_svcsock_tcp_send(xprt, err < 0 ? err : sent); | ||||||
| 	if (err < 0 || sent != (xdr->len + sizeof(marker))) | 	if (err < 0 || sent != (xdr->len + sizeof(marker))) | ||||||
|  | |||||||
		Loading…
	
		Reference in New Issue
	
	Block a user