RDMA/cma: Add an ID_REUSEADDR option
authorHefty, Sean <sean.hefty@intel.com>
Tue, 10 May 2011 05:06:10 +0000 (22:06 -0700)
committerRoland Dreier <roland@purestorage.com>
Tue, 10 May 2011 05:06:10 +0000 (22:06 -0700)
Lustre requires that clients bind to a privileged port number before
connecting to a remote server.  On larger clusters (typically more
than about 1000 nodes), the number of privileged ports is exhausted,
resulting in lustre being unusable.

To handle this, we add support for reusable addresses to the rdma_cm.
This mimics the behavior of the socket option SO_REUSEADDR.  A user
may set an rdma_cm_id to reuse an address before calling
rdma_bind_addr() (explicitly or implicitly).  If set, other
rdma_cm_id's may be bound to the same address, provided that they all
have reuse enabled, and there are no active listens.

If rdma_listen() is called on an rdma_cm_id that has reuse enabled, it
will only succeed if there are no other id's bound to that same
address.  The reuse option is exported to user space.  The behavior of
the kernel reuse implementation was verified against that given by
sockets.

This patch is derived from a path by Ira Weiny <weiny2@llnl.gov>

Signed-off-by: Sean Hefty <sean.hefty@intel.com>
Signed-off-by: Roland Dreier <roland@purestorage.com>
drivers/infiniband/core/cma.c
drivers/infiniband/core/ucma.c
include/rdma/rdma_cm.h
include/rdma/rdma_user_cm.h

index eff5e46f005cbba483dfe2055a70fe9e92a7e1ab..99dde874fbbdf53e93675643ac1b1e1bba372bd9 100644 (file)
@@ -148,6 +148,7 @@ struct rdma_id_private {
        u32                     qp_num;
        u8                      srq;
        u8                      tos;
+       u8                      reuseaddr;
 };
 
 struct cma_multicast {
@@ -1579,50 +1580,6 @@ static void cma_listen_on_all(struct rdma_id_private *id_priv)
        mutex_unlock(&lock);
 }
 
-int rdma_listen(struct rdma_cm_id *id, int backlog)
-{
-       struct rdma_id_private *id_priv;
-       int ret;
-
-       id_priv = container_of(id, struct rdma_id_private, id);
-       if (id_priv->state == CMA_IDLE) {
-               ((struct sockaddr *) &id->route.addr.src_addr)->sa_family = AF_INET;
-               ret = rdma_bind_addr(id, (struct sockaddr *) &id->route.addr.src_addr);
-               if (ret)
-                       return ret;
-       }
-
-       if (!cma_comp_exch(id_priv, CMA_ADDR_BOUND, CMA_LISTEN))
-               return -EINVAL;
-
-       id_priv->backlog = backlog;
-       if (id->device) {
-               switch (rdma_node_get_transport(id->device->node_type)) {
-               case RDMA_TRANSPORT_IB:
-                       ret = cma_ib_listen(id_priv);
-                       if (ret)
-                               goto err;
-                       break;
-               case RDMA_TRANSPORT_IWARP:
-                       ret = cma_iw_listen(id_priv, backlog);
-                       if (ret)
-                               goto err;
-                       break;
-               default:
-                       ret = -ENOSYS;
-                       goto err;
-               }
-       } else
-               cma_listen_on_all(id_priv);
-
-       return 0;
-err:
-       id_priv->backlog = 0;
-       cma_comp_exch(id_priv, CMA_LISTEN, CMA_ADDR_BOUND);
-       return ret;
-}
-EXPORT_SYMBOL(rdma_listen);
-
 void rdma_set_service_type(struct rdma_cm_id *id, int tos)
 {
        struct rdma_id_private *id_priv;
@@ -2105,6 +2062,25 @@ err:
 }
 EXPORT_SYMBOL(rdma_resolve_addr);
 
+int rdma_set_reuseaddr(struct rdma_cm_id *id, int reuse)
+{
+       struct rdma_id_private *id_priv;
+       unsigned long flags;
+       int ret;
+
+       id_priv = container_of(id, struct rdma_id_private, id);
+       spin_lock_irqsave(&id_priv->lock, flags);
+       if (id_priv->state == CMA_IDLE) {
+               id_priv->reuseaddr = reuse;
+               ret = 0;
+       } else {
+               ret = -EINVAL;
+       }
+       spin_unlock_irqrestore(&id_priv->lock, flags);
+       return ret;
+}
+EXPORT_SYMBOL(rdma_set_reuseaddr);
+
 static void cma_bind_port(struct rdma_bind_list *bind_list,
                          struct rdma_id_private *id_priv)
 {
@@ -2180,43 +2156,73 @@ retry:
        return -EADDRNOTAVAIL;
 }
 
-static int cma_use_port(struct idr *ps, struct rdma_id_private *id_priv)
+/*
+ * Check that the requested port is available.  This is called when trying to
+ * bind to a specific port, or when trying to listen on a bound port.  In
+ * the latter case, the provided id_priv may already be on the bind_list, but
+ * we still need to check that it's okay to start listening.
+ */
+static int cma_check_port(struct rdma_bind_list *bind_list,
+                         struct rdma_id_private *id_priv, uint8_t reuseaddr)
 {
        struct rdma_id_private *cur_id;
        struct sockaddr *addr, *cur_addr;
-       struct rdma_bind_list *bind_list;
        struct hlist_node *node;
-       unsigned short snum;
 
        addr = (struct sockaddr *) &id_priv->id.route.addr.src_addr;
-       snum = ntohs(cma_port(addr));
-       if (snum < PROT_SOCK && !capable(CAP_NET_BIND_SERVICE))
-               return -EACCES;
-
-       bind_list = idr_find(ps, snum);
-       if (!bind_list)
-               return cma_alloc_port(ps, id_priv, snum);
-
-       /*
-        * We don't support binding to any address if anyone is bound to
-        * a specific address on the same port.
-        */
-       if (cma_any_addr(addr))
+       if (cma_any_addr(addr) && !reuseaddr)
                return -EADDRNOTAVAIL;
 
        hlist_for_each_entry(cur_id, node, &bind_list->owners, node) {
-               cur_addr = (struct sockaddr *) &cur_id->id.route.addr.src_addr;
-               if (cma_any_addr(cur_addr))
-                       return -EADDRNOTAVAIL;
+               if (id_priv == cur_id)
+                       continue;
 
-               if (!cma_addr_cmp(addr, cur_addr))
-                       return -EADDRINUSE;
-       }
+               if ((cur_id->state == CMA_LISTEN) ||
+                   !reuseaddr || !cur_id->reuseaddr) {
+                       cur_addr = (struct sockaddr *) &cur_id->id.route.addr.src_addr;
+                       if (cma_any_addr(cur_addr))
+                               return -EADDRNOTAVAIL;
 
-       cma_bind_port(bind_list, id_priv);
+                       if (!cma_addr_cmp(addr, cur_addr))
+                               return -EADDRINUSE;
+               }
+       }
        return 0;
 }
 
+static int cma_use_port(struct idr *ps, struct rdma_id_private *id_priv)
+{
+       struct rdma_bind_list *bind_list;
+       unsigned short snum;
+       int ret;
+
+       snum = ntohs(cma_port((struct sockaddr *) &id_priv->id.route.addr.src_addr));
+       if (snum < PROT_SOCK && !capable(CAP_NET_BIND_SERVICE))
+               return -EACCES;
+
+       bind_list = idr_find(ps, snum);
+       if (!bind_list) {
+               ret = cma_alloc_port(ps, id_priv, snum);
+       } else {
+               ret = cma_check_port(bind_list, id_priv, id_priv->reuseaddr);
+               if (!ret)
+                       cma_bind_port(bind_list, id_priv);
+       }
+       return ret;
+}
+
+static int cma_bind_listen(struct rdma_id_private *id_priv)
+{
+       struct rdma_bind_list *bind_list = id_priv->bind_list;
+       int ret = 0;
+
+       mutex_lock(&lock);
+       if (bind_list->owners.first->next)
+               ret = cma_check_port(bind_list, id_priv, 0);
+       mutex_unlock(&lock);
+       return ret;
+}
+
 static int cma_get_port(struct rdma_id_private *id_priv)
 {
        struct idr *ps;
@@ -2268,6 +2274,56 @@ static int cma_check_linklocal(struct rdma_dev_addr *dev_addr,
        return 0;
 }
 
+int rdma_listen(struct rdma_cm_id *id, int backlog)
+{
+       struct rdma_id_private *id_priv;
+       int ret;
+
+       id_priv = container_of(id, struct rdma_id_private, id);
+       if (id_priv->state == CMA_IDLE) {
+               ((struct sockaddr *) &id->route.addr.src_addr)->sa_family = AF_INET;
+               ret = rdma_bind_addr(id, (struct sockaddr *) &id->route.addr.src_addr);
+               if (ret)
+                       return ret;
+       }
+
+       if (!cma_comp_exch(id_priv, CMA_ADDR_BOUND, CMA_LISTEN))
+               return -EINVAL;
+
+       if (id_priv->reuseaddr) {
+               ret = cma_bind_listen(id_priv);
+               if (ret)
+                       goto err;
+       }
+
+       id_priv->backlog = backlog;
+       if (id->device) {
+               switch (rdma_node_get_transport(id->device->node_type)) {
+               case RDMA_TRANSPORT_IB:
+                       ret = cma_ib_listen(id_priv);
+                       if (ret)
+                               goto err;
+                       break;
+               case RDMA_TRANSPORT_IWARP:
+                       ret = cma_iw_listen(id_priv, backlog);
+                       if (ret)
+                               goto err;
+                       break;
+               default:
+                       ret = -ENOSYS;
+                       goto err;
+               }
+       } else
+               cma_listen_on_all(id_priv);
+
+       return 0;
+err:
+       id_priv->backlog = 0;
+       cma_comp_exch(id_priv, CMA_LISTEN, CMA_ADDR_BOUND);
+       return ret;
+}
+EXPORT_SYMBOL(rdma_listen);
+
 int rdma_bind_addr(struct rdma_cm_id *id, struct sockaddr *addr)
 {
        struct rdma_id_private *id_priv;
index ec1e9da1488b321df67b7d77fe2921e364276115..b3fa798525b2a1d8d3df793f41fe255eacf676dd 100644 (file)
@@ -883,6 +883,13 @@ static int ucma_set_option_id(struct ucma_context *ctx, int optname,
                }
                rdma_set_service_type(ctx->cm_id, *((u8 *) optval));
                break;
+       case RDMA_OPTION_ID_REUSEADDR:
+               if (optlen != sizeof(int)) {
+                       ret = -EINVAL;
+                       break;
+               }
+               ret = rdma_set_reuseaddr(ctx->cm_id, *((int *) optval) ? 1 : 0);
+               break;
        default:
                ret = -ENOSYS;
        }
index 4fae903046483f7882cebfeea6777ee43dd37038..169f7a53fb0c696cf7cbb9f441a4e5fe021641bd 100644 (file)
@@ -329,4 +329,14 @@ void rdma_leave_multicast(struct rdma_cm_id *id, struct sockaddr *addr);
  */
 void rdma_set_service_type(struct rdma_cm_id *id, int tos);
 
+/**
+ * rdma_set_reuseaddr - Allow the reuse of local addresses when binding
+ *    the rdma_cm_id.
+ * @id: Communication identifier to configure.
+ * @reuse: Value indicating if the bound address is reusable.
+ *
+ * Reuse must be set before an address is bound to the id.
+ */
+int rdma_set_reuseaddr(struct rdma_cm_id *id, int reuse);
+
 #endif /* RDMA_CM_H */
index 1d165022c02d55880b16b42111ba590983141bab..fc82c1896f751a0e9f1c94c6761433f7ac7a8ad6 100644 (file)
@@ -221,8 +221,9 @@ enum {
 
 /* Option details */
 enum {
-       RDMA_OPTION_ID_TOS      = 0,
-       RDMA_OPTION_IB_PATH     = 1
+       RDMA_OPTION_ID_TOS       = 0,
+       RDMA_OPTION_ID_REUSEADDR = 1,
+       RDMA_OPTION_IB_PATH      = 1
 };
 
 struct rdma_ucm_set_option {