Whamcloud - gitweb
LU-165: Support privileged ports in the o2iblnd driver.
[fs/lustre-release.git] / lnet / klnds / o2iblnd / o2iblnd_cb.c
index 577db3e..8127bd2 100644 (file)
@@ -328,6 +328,11 @@ kiblnd_handle_rx (kib_rx_t *rx)
 
                 conn->ibc_credits += credits;
 
+                /* This ensures the credit taken by NOOP can be returned */
+                if (msg->ibm_type == IBLND_MSG_NOOP &&
+                    !IBLND_OOB_CAPABLE(conn->ibc_version)) /* v1 only */
+                        conn->ibc_outstanding_credits++;
+
                 cfs_spin_unlock(&conn->ibc_lock);
                 kiblnd_check_sends(conn);
         }
@@ -341,9 +346,14 @@ kiblnd_handle_rx (kib_rx_t *rx)
                 break;
 
         case IBLND_MSG_NOOP:
-                if (IBLND_OOB_CAPABLE(conn->ibc_version))
+                if (IBLND_OOB_CAPABLE(conn->ibc_version)) {
                         post_credit = IBLND_POSTRX_NO_CREDIT;
-                else
+                        break;
+                }
+
+                if (credits != 0) /* credit already posted */
+                        post_credit = IBLND_POSTRX_NO_CREDIT;
+                else              /* a keepalive NOOP */
                         post_credit = IBLND_POSTRX_PEER_CREDIT;
                 break;
 
@@ -791,8 +801,8 @@ kiblnd_post_tx_locked (kib_conn_t *conn, kib_tx_t *tx, int credit)
         }
 
         if (credit != 0 && !IBLND_OOB_CAPABLE(ver) &&
-            conn->ibc_credits == 1 &&   /* last credit reserved for */
-            conn->ibc_outstanding_credits == 0) { /* giving back credits */
+            conn->ibc_credits == 1 &&   /* last credit reserved */
+            msg->ibm_type != IBLND_MSG_NOOP) {      /* for NOOP */
                 CDEBUG(D_NET, "%s: not using last credit\n",
                        libcfs_nid2str(peer->ibp_nid));
                 return -EAGAIN;
@@ -939,6 +949,11 @@ kiblnd_check_sends (kib_conn_t *conn)
                         credit = 0;
                         tx = cfs_list_entry(conn->ibc_tx_queue_nocred.next,
                                             kib_tx_t, tx_list);
+                } else if (!cfs_list_empty(&conn->ibc_tx_noops)) {
+                        LASSERT (!IBLND_OOB_CAPABLE(ver));
+                        credit = 1;
+                        tx = cfs_list_entry(conn->ibc_tx_noops.next,
+                                        kib_tx_t, tx_list);
                 } else if (!cfs_list_empty(&conn->ibc_tx_queue)) {
                         credit = 1;
                         tx = cfs_list_entry(conn->ibc_tx_queue.next,
@@ -1171,7 +1186,7 @@ kiblnd_queue_tx_locked (kib_tx_t *tx, kib_conn_t *conn)
                 if (IBLND_OOB_CAPABLE(conn->ibc_version))
                         q = &conn->ibc_tx_queue_nocred;
                 else
-                        q = &conn->ibc_tx_queue;
+                        q = &conn->ibc_tx_noops;
                 break;
 
         case IBLND_MSG_IMMEDIATE:
@@ -1192,6 +1207,48 @@ kiblnd_queue_tx (kib_tx_t *tx, kib_conn_t *conn)
         kiblnd_check_sends(conn);
 }
 
+static int kiblnd_resolve_addr(struct rdma_cm_id *cmid,
+                               struct sockaddr_in *srcaddr,
+                               struct sockaddr_in *dstaddr,
+                               int timeout_ms)
+{
+        unsigned short port;   /* candidate source port, host byte order */
+        int rc;
+
+#ifdef HAVE_OFED_RDMA_SET_REUSEADDR
+        /* allow the port to be reused; give up if that cannot be set */
+        rc = rdma_set_reuseaddr(cmid, 1);
+        if (rc != 0) {
+                CERROR("Unable to set reuse on cmid: %d\n", rc);
+                return rc;
+        }
+#endif
+
+        /* look for a free privileged port, from PROT_SOCK-1 down to 1 */
+        for (port = PROT_SOCK-1; port > 0; port--) {
+                srcaddr->sin_port = htons(port); /* overwrites caller's port */
+                rc = rdma_resolve_addr(cmid,
+                                       (struct sockaddr *)srcaddr,
+                                       (struct sockaddr *)dstaddr,
+                                       timeout_ms);
+                if (rc == 0) {
+                        CDEBUG(D_NET, "bound to port %hu\n", port);
+                        return 0;
+                } else if (rc == -EADDRINUSE || rc == -EADDRNOTAVAIL) {
+                        CDEBUG(D_NET, "bind to port %hu failed: %d\n",
+                               port, rc); /* try the next lower port */
+                } else {
+                        return rc; /* any other error ends the search */
+                }
+        }
+
+        CERROR("Failed to bind to a free privileged port\n");
+#ifndef HAVE_OFED_RDMA_SET_REUSEADDR
+        CERROR("You may need IB verbs that supports rdma_set_reuseaddr()\n");
+#endif
+        return rc; /* every port failed: error from the last attempt */
+}
+
 void
 kiblnd_connect_peer (kib_peer_t *peer)
 {
@@ -1225,22 +1282,30 @@ kiblnd_connect_peer (kib_peer_t *peer)
 
         kiblnd_peer_addref(peer);               /* cmid's ref */
 
-        rc = rdma_resolve_addr(cmid,
-                               (struct sockaddr *)&srcaddr,
-                               (struct sockaddr *)&dstaddr,
-                               *kiblnd_tunables.kib_timeout * 1000);
-        if (rc == 0) {
-                LASSERT (cmid->device != NULL);
-                CDEBUG(D_NET, "%s: connection bound to %s:%u.%u.%u.%u:%s\n",
-                       libcfs_nid2str(peer->ibp_nid), dev->ibd_ifname,
-                       HIPQUAD(dev->ibd_ifip), cmid->device->name);
-                return;
+        if (*kiblnd_tunables.kib_use_priv_port) {
+                rc = kiblnd_resolve_addr(cmid, &srcaddr, &dstaddr,
+                                         *kiblnd_tunables.kib_timeout * 1000);
+        } else {
+                rc = rdma_resolve_addr(cmid,
+                                       (struct sockaddr *)&srcaddr,
+                                       (struct sockaddr *)&dstaddr,
+                                       *kiblnd_tunables.kib_timeout * 1000);
+        }
+        if (rc != 0) {
+                /* Can't initiate address resolution:  */
+                CERROR("Can't resolve addr for %s: %d\n",
+                       libcfs_nid2str(peer->ibp_nid), rc);
+                goto failed2;
         }
 
-        /* Can't initiate address resolution:  */
-        CERROR("Can't resolve addr for %s: %d\n",
-               libcfs_nid2str(peer->ibp_nid), rc);
+        LASSERT (cmid->device != NULL);
+        CDEBUG(D_NET, "%s: connection bound to %s:%u.%u.%u.%u:%s\n",
+               libcfs_nid2str(peer->ibp_nid), dev->ibd_ifname,
+               HIPQUAD(dev->ibd_ifip), cmid->device->name);
+
+        return;
 
+ failed2:
         kiblnd_peer_decref(peer);               /* cmid's ref */
         rdma_destroy_id(cmid);
  failed:
@@ -1788,6 +1853,7 @@ kiblnd_close_conn_locked (kib_conn_t *conn, int error)
                 return; /* already being handled  */
 
         if (error == 0 &&
+            cfs_list_empty(&conn->ibc_tx_noops) &&
             cfs_list_empty(&conn->ibc_tx_queue) &&
             cfs_list_empty(&conn->ibc_tx_queue_rsrvd) &&
             cfs_list_empty(&conn->ibc_tx_queue_nocred) &&
@@ -1795,9 +1861,10 @@ kiblnd_close_conn_locked (kib_conn_t *conn, int error)
                 CDEBUG(D_NET, "closing conn to %s\n", 
                        libcfs_nid2str(peer->ibp_nid));
         } else {
-                CNETERR("Closing conn to %s: error %d%s%s%s%s\n",
+                CNETERR("Closing conn to %s: error %d%s%s%s%s%s\n",
                        libcfs_nid2str(peer->ibp_nid), error,
                        cfs_list_empty(&conn->ibc_tx_queue) ? "" : "(sending)",
+                       cfs_list_empty(&conn->ibc_tx_noops) ? "" : "(sending_noops)",
                        cfs_list_empty(&conn->ibc_tx_queue_rsrvd) ? "" : "(sending_rsrvd)",
                        cfs_list_empty(&conn->ibc_tx_queue_nocred) ? "" : "(sending_nocred)",
                        cfs_list_empty(&conn->ibc_active_txs) ? "" : "(waiting)");
@@ -1921,6 +1988,7 @@ kiblnd_finalise_conn (kib_conn_t *conn)
         /* Complete all tx descs not waiting for sends to complete.
          * NB we should be safe from RDMA now that the QP has changed state */
 
+        kiblnd_abort_txs(conn, &conn->ibc_tx_noops);
         kiblnd_abort_txs(conn, &conn->ibc_tx_queue);
         kiblnd_abort_txs(conn, &conn->ibc_tx_queue_rsrvd);
         kiblnd_abort_txs(conn, &conn->ibc_tx_queue_nocred);
@@ -2108,7 +2176,7 @@ kiblnd_passive_connect (struct rdma_cm_id *cmid, void *priv, int priv_nob)
         int                    version = IBLND_MSG_VERSION;
         unsigned long          flags;
         int                    rc;
-
+        struct sockaddr_in    *peer_addr;
         LASSERT (!cfs_in_interrupt());
 
         /* cmid inherits 'context' from the corresponding listener id */
@@ -2120,6 +2188,15 @@ kiblnd_passive_connect (struct rdma_cm_id *cmid, void *priv, int priv_nob)
         rej.ibr_why                  = IBLND_REJECT_FATAL;
         rej.ibr_cp.ibcp_max_msg_size = IBLND_MSG_SIZE;
 
+        peer_addr = (struct sockaddr_in *)&(cmid->route.addr.dst_addr);
+        if (*kiblnd_tunables.kib_require_priv_port &&
+            ntohs(peer_addr->sin_port) >= PROT_SOCK) {
+                __u32 ip = ntohl(peer_addr->sin_addr.s_addr);
+                CERROR("Peer's port (%u.%u.%u.%u:%hu) is not privileged\n",
+                       HIPQUAD(ip), ntohs(peer_addr->sin_port));
+                goto failed;
+        }
+
         if (priv_nob < offsetof(kib_msg_t, ibm_type)) {
                 CERROR("Short connection request\n");
                 goto failed;
@@ -2926,6 +3003,7 @@ int
 kiblnd_conn_timed_out (kib_conn_t *conn)
 {
         return  kiblnd_check_txs(conn, &conn->ibc_tx_queue) ||
+                kiblnd_check_txs(conn, &conn->ibc_tx_noops) ||
                 kiblnd_check_txs(conn, &conn->ibc_tx_queue_rsrvd) ||
                 kiblnd_check_txs(conn, &conn->ibc_tx_queue_nocred) ||
                 kiblnd_check_txs(conn, &conn->ibc_active_txs);