From 33849792cbcdae2b04819cfb09fe3dca0a84a11e Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 11 Apr 2017 13:22:46 -0400 Subject: [PATCH] xprtrdma: Detect unreachable NFS/RDMA servers more reliably Current NFS clients rely on connection loss to determine when to retransmit. In particular, for protocols like NFSv4, clients no longer rely on RPC timeouts to drive retransmission: NFSv4 servers are required to terminate a connection when they need a client to retransmit pending RPCs. When a server is no longer reachable, either because it has crashed or because the network path has broken, the server cannot actively terminate a connection. Thus NFS clients depend on transport-level keepalive to determine when a connection must be replaced and pending RPCs retransmitted. However, RDMA RC connections do not have a native keepalive mechanism. If an NFS/RDMA server crashes after a client has sent RPCs successfully (an RC ACK has been received for all OTW RDMA requests), there is no way for the client to know the connection is moribund. In addition, new RDMA requests are subject to the RPC-over-RDMA credit limit. If the client has consumed all granted credits with NFS traffic, it is not allowed to send another RDMA request until the server replies. Thus it has no way to send a true keepalive when the workload has already consumed all credits with pending RPCs. To address this, forcibly disconnect a transport when an RPC times out. This prevents moribund connections from stopping the detection of failover or other configuration changes on the server. Note that even if the connection is still good, retransmitting any RPC will trigger a disconnect thanks to this logic in xprt_rdma_send_request: /* Must suppress retransmit to maintain credits */ if (req->rl_connect_cookie == xprt->connect_cookie) goto drop_connection; req->rl_connect_cookie = xprt->connect_cookie; Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/transport.c | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index c717f5410776..acf5d81f4d6e 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -484,6 +484,27 @@ xprt_rdma_set_port(struct rpc_xprt *xprt, u16 port) dprintk("RPC: %s: %u\n", __func__, port); } +/** + * xprt_rdma_timer - invoked when an RPC times out + * @xprt: controlling RPC transport + * @task: RPC task that timed out + * + * Invoked when the transport is still connected, but an RPC + * retransmit timeout occurs. + * + * Since RDMA connections don't have a keep-alive, forcibly + * disconnect and retry to connect. This drives full + * detection of the network path, and retransmissions of + * all pending RPCs. + */ +static void +xprt_rdma_timer(struct rpc_xprt *xprt, struct rpc_task *task) +{ + dprintk("RPC: %5u %s: xprt = %p\n", task->tk_pid, __func__, xprt); + + xprt_force_disconnect(xprt); +} + static void xprt_rdma_connect(struct rpc_xprt *xprt, struct rpc_task *task) { @@ -776,6 +797,7 @@ static struct rpc_xprt_ops xprt_rdma_procs = { .alloc_slot = xprt_alloc_slot, .release_request = xprt_release_rqst_cong, /* ditto */ .set_retrans_timeout = xprt_set_retrans_timeout_def, /* ditto */ + .timer = xprt_rdma_timer, .rpcbind = rpcb_getport_async, /* sunrpc/rpcb_clnt.c */ .set_port = xprt_rdma_set_port, .connect = xprt_rdma_connect, -- 2.30.2