Skip to content

Commit a6b31d1

Browse files
Trond MyklebustTrond Myklebust
Trond Myklebust
authored and
Trond Myklebust
committed
SUNRPC: Fix a data corruption issue when retransmitting RPC calls
The following scenario can cause silent data corruption when doing NFS writes. It has mainly been observed when doing database writes using O_DIRECT. 1) The RPC client uses sendpage() to do zero-copy of the page data. 2) Due to networking issues, the reply from the server is delayed, and so the RPC client times out. 3) The client issues a second sendpage of the page data as part of an RPC call retransmission. 4) The reply to the first transmission arrives from the server _before_ the client hardware has emptied the TCP socket send buffer. 5) After processing the reply, the RPC state machine rules that the call to be done, and triggers the completion callbacks. 6) The application notices the RPC call is done, and reuses the pages to store something else (e.g. a new write). 7) The client NIC drains the TCP socket send buffer. Since the page data has now changed, it reads a corrupted version of the initial RPC call, and puts it on the wire. This patch fixes the problem in the following manner: The ordering guarantees of TCP ensure that when the server sends a reply, then we know that the _first_ transmission has completed. Using zero-copy in that situation is therefore safe. If a time out occurs, we then send the retransmission using sendmsg() (i.e. no zero-copy), We then know that the socket contains a full copy of the data, and so it will retransmit a faithful reproduction even if the RPC call completes, and the application reuses the O_DIRECT buffer in the meantime. Signed-off-by: Trond Myklebust <[email protected]> Cc: [email protected]
1 parent fab99eb commit a6b31d1

File tree

1 file changed

+21
-7
lines changed

1 file changed

+21
-7
lines changed

net/sunrpc/xprtsock.c

Lines changed: 21 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -393,8 +393,10 @@ static int xs_send_kvec(struct socket *sock, struct sockaddr *addr, int addrlen,
393393
return kernel_sendmsg(sock, &msg, NULL, 0, 0);
394394
}
395395

396-
static int xs_send_pagedata(struct socket *sock, struct xdr_buf *xdr, unsigned int base, int more)
396+
static int xs_send_pagedata(struct socket *sock, struct xdr_buf *xdr, unsigned int base, int more, bool zerocopy)
397397
{
398+
ssize_t (*do_sendpage)(struct socket *sock, struct page *page,
399+
int offset, size_t size, int flags);
398400
struct page **ppage;
399401
unsigned int remainder;
400402
int err, sent = 0;
@@ -403,14 +405,17 @@ static int xs_send_pagedata(struct socket *sock, struct xdr_buf *xdr, unsigned i
403405
base += xdr->page_base;
404406
ppage = xdr->pages + (base >> PAGE_SHIFT);
405407
base &= ~PAGE_MASK;
408+
do_sendpage = sock->ops->sendpage;
409+
if (!zerocopy)
410+
do_sendpage = sock_no_sendpage;
406411
for(;;) {
407412
unsigned int len = min_t(unsigned int, PAGE_SIZE - base, remainder);
408413
int flags = XS_SENDMSG_FLAGS;
409414

410415
remainder -= len;
411416
if (remainder != 0 || more)
412417
flags |= MSG_MORE;
413-
err = sock->ops->sendpage(sock, *ppage, base, len, flags);
418+
err = do_sendpage(sock, *ppage, base, len, flags);
414419
if (remainder == 0 || err != len)
415420
break;
416421
sent += err;
@@ -431,9 +436,10 @@ static int xs_send_pagedata(struct socket *sock, struct xdr_buf *xdr, unsigned i
431436
* @addrlen: UDP only -- length of destination address
432437
* @xdr: buffer containing this request
433438
* @base: starting position in the buffer
439+
* @zerocopy: true if it is safe to use sendpage()
434440
*
435441
*/
436-
static int xs_sendpages(struct socket *sock, struct sockaddr *addr, int addrlen, struct xdr_buf *xdr, unsigned int base)
442+
static int xs_sendpages(struct socket *sock, struct sockaddr *addr, int addrlen, struct xdr_buf *xdr, unsigned int base, bool zerocopy)
437443
{
438444
unsigned int remainder = xdr->len - base;
439445
int err, sent = 0;
@@ -461,7 +467,7 @@ static int xs_sendpages(struct socket *sock, struct sockaddr *addr, int addrlen,
461467
if (base < xdr->page_len) {
462468
unsigned int len = xdr->page_len - base;
463469
remainder -= len;
464-
err = xs_send_pagedata(sock, xdr, base, remainder != 0);
470+
err = xs_send_pagedata(sock, xdr, base, remainder != 0, zerocopy);
465471
if (remainder == 0 || err != len)
466472
goto out;
467473
sent += err;
@@ -564,7 +570,7 @@ static int xs_local_send_request(struct rpc_task *task)
564570
req->rq_svec->iov_base, req->rq_svec->iov_len);
565571

566572
status = xs_sendpages(transport->sock, NULL, 0,
567-
xdr, req->rq_bytes_sent);
573+
xdr, req->rq_bytes_sent, true);
568574
dprintk("RPC: %s(%u) = %d\n",
569575
__func__, xdr->len - req->rq_bytes_sent, status);
570576
if (likely(status >= 0)) {
@@ -620,7 +626,7 @@ static int xs_udp_send_request(struct rpc_task *task)
620626
status = xs_sendpages(transport->sock,
621627
xs_addr(xprt),
622628
xprt->addrlen, xdr,
623-
req->rq_bytes_sent);
629+
req->rq_bytes_sent, true);
624630

625631
dprintk("RPC: xs_udp_send_request(%u) = %d\n",
626632
xdr->len - req->rq_bytes_sent, status);
@@ -693,20 +699,28 @@ static int xs_tcp_send_request(struct rpc_task *task)
693699
struct rpc_xprt *xprt = req->rq_xprt;
694700
struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt);
695701
struct xdr_buf *xdr = &req->rq_snd_buf;
702+
bool zerocopy = true;
696703
int status;
697704

698705
xs_encode_stream_record_marker(&req->rq_snd_buf);
699706

700707
xs_pktdump("packet data:",
701708
req->rq_svec->iov_base,
702709
req->rq_svec->iov_len);
710+
/* Don't use zero copy if this is a resend. If the RPC call
711+
* completes while the socket holds a reference to the pages,
712+
* then we may end up resending corrupted data.
713+
*/
714+
if (task->tk_flags & RPC_TASK_SENT)
715+
zerocopy = false;
703716

704717
/* Continue transmitting the packet/record. We must be careful
705718
* to cope with writespace callbacks arriving _after_ we have
706719
* called sendmsg(). */
707720
while (1) {
708721
status = xs_sendpages(transport->sock,
709-
NULL, 0, xdr, req->rq_bytes_sent);
722+
NULL, 0, xdr, req->rq_bytes_sent,
723+
zerocopy);
710724

711725
dprintk("RPC: xs_tcp_send_request(%u) = %d\n",
712726
xdr->len - req->rq_bytes_sent, status);

0 commit comments

Comments
 (0)