Whamcloud - gitweb
LU-7734 lnet: handle N NIs to 1 LND peer
[fs/lustre-release.git] / lnet / klnds / o2iblnd / o2iblnd_cb.c
index d0823c8..41b9cdb 100644 (file)
@@ -23,7 +23,7 @@
  * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
  * Use is subject to license terms.
  *
- * Copyright (c) 2012, 2015, Intel Corporation.
+ * Copyright (c) 2012, 2016, Intel Corporation.
  */
 /*
  * This file is part of Lustre, http://www.lustre.org/
@@ -116,7 +116,7 @@ kiblnd_get_idle_tx(lnet_ni_t *ni, lnet_nid_t target)
        kib_tx_t                *tx;
        kib_tx_poolset_t        *tps;
 
-       tps = net->ibn_tx_ps[lnet_cpt_of_nid(target)];
+       tps = net->ibn_tx_ps[lnet_cpt_of_nid(target, ni)];
        node = kiblnd_pool_alloc_node(&tps->tps_poolset);
         if (node == NULL)
                 return NULL;
@@ -763,6 +763,7 @@ __must_hold(&conn->ibc_lock)
        LASSERT(tx->tx_queued);
        /* We rely on this for QP sizing */
        LASSERT(tx->tx_nwrq > 0);
+       LASSERT(tx->tx_nwrq <= 1 + conn->ibc_max_frags);
 
        LASSERT(credit == 0 || credit == 1);
        LASSERT(conn->ibc_outstanding_credits >= 0);
@@ -1067,14 +1068,6 @@ kiblnd_init_rdma(kib_conn_t *conn, kib_tx_t *tx, int type,
        LASSERT (type == IBLND_MSG_GET_DONE ||
                 type == IBLND_MSG_PUT_DONE);
 
-       if (kiblnd_rd_size(srcrd) > conn->ibc_max_frags << PAGE_SHIFT) {
-               CERROR("RDMA is too large for peer %s (%d), src size: %d dst size: %d\n",
-                      libcfs_nid2str(conn->ibc_peer->ibp_nid),
-                      conn->ibc_max_frags << PAGE_SHIFT,
-                      kiblnd_rd_size(srcrd), kiblnd_rd_size(dstrd));
-               GOTO(too_big, rc = -EMSGSIZE);
-       }
-
        srcidx = dstidx = 0;
 
         while (resid > 0) {
@@ -1090,11 +1083,11 @@ kiblnd_init_rdma(kib_conn_t *conn, kib_tx_t *tx, int type,
                         break;
                 }
 
-               if (tx->tx_nwrq >= IBLND_MAX_RDMA_FRAGS) {
+               if (tx->tx_nwrq >= conn->ibc_max_frags) {
                        CERROR("RDMA has too many fragments for peer %s (%d), "
                               "src idx/frags: %d/%d dst idx/frags: %d/%d\n",
                               libcfs_nid2str(conn->ibc_peer->ibp_nid),
-                              IBLND_MAX_RDMA_FRAGS,
+                              conn->ibc_max_frags,
                               srcidx, srcrd->rd_nfrags,
                               dstidx, dstrd->rd_nfrags);
                        rc = -EMSGSIZE;
@@ -1135,7 +1128,7 @@ kiblnd_init_rdma(kib_conn_t *conn, kib_tx_t *tx, int type,
                 wrq++;
                 sge++;
         }
-too_big:
+
         if (rc < 0)                             /* no RDMA if completing with failure */
                 tx->tx_nwrq = 0;
 
@@ -1386,7 +1379,7 @@ kiblnd_launch_tx (lnet_ni_t *ni, kib_tx_t *tx, lnet_nid_t nid)
          * connected */
        read_lock_irqsave(g_lock, flags);
 
-        peer = kiblnd_find_peer_locked(nid);
+        peer = kiblnd_find_peer_locked(ni, nid);
        if (peer != NULL && !list_empty(&peer->ibp_conns)) {
                 /* Found a peer with an established connection */
                 conn = kiblnd_get_conn_locked(peer);
@@ -1404,7 +1397,7 @@ kiblnd_launch_tx (lnet_ni_t *ni, kib_tx_t *tx, lnet_nid_t nid)
        /* Re-try with a write lock */
        write_lock(g_lock);
 
-        peer = kiblnd_find_peer_locked(nid);
+        peer = kiblnd_find_peer_locked(ni, nid);
         if (peer != NULL) {
                if (list_empty(&peer->ibp_conns)) {
                         /* found a peer, but it's still connecting... */
@@ -1442,7 +1435,7 @@ kiblnd_launch_tx (lnet_ni_t *ni, kib_tx_t *tx, lnet_nid_t nid)
 
        write_lock_irqsave(g_lock, flags);
 
-        peer2 = kiblnd_find_peer_locked(nid);
+        peer2 = kiblnd_find_peer_locked(ni, nid);
         if (peer2 != NULL) {
                if (list_empty(&peer2->ibp_conns)) {
                         /* found a peer, but it's still connecting... */
@@ -2231,7 +2224,6 @@ kiblnd_passive_connect(struct rdma_cm_id *cmid, void *priv, int priv_nob)
         kib_rej_t              rej;
        int                    version = IBLND_MSG_VERSION;
        unsigned long          flags;
-       int max_frags;
        int                    rc;
        struct sockaddr_in    *peer_addr;
        LASSERT (!in_interrupt());
@@ -2251,75 +2243,75 @@ kiblnd_passive_connect(struct rdma_cm_id *cmid, void *priv, int priv_nob)
                __u32 ip = ntohl(peer_addr->sin_addr.s_addr);
                CERROR("Peer's port (%pI4h:%hu) is not privileged\n",
                       &ip, ntohs(peer_addr->sin_port));
-                goto failed;
-        }
+               goto failed;
+       }
 
-        if (priv_nob < offsetof(kib_msg_t, ibm_type)) {
-                CERROR("Short connection request\n");
-                goto failed;
-        }
+       if (priv_nob < offsetof(kib_msg_t, ibm_type)) {
+               CERROR("Short connection request\n");
+               goto failed;
+       }
 
-        /* Future protocol version compatibility support!  If the
-         * o2iblnd-specific protocol changes, or when LNET unifies
-         * protocols over all LNDs, the initial connection will
-         * negotiate a protocol version.  I trap this here to avoid
-         * console errors; the reject tells the peer which protocol I
-         * speak. */
-        if (reqmsg->ibm_magic == LNET_PROTO_MAGIC ||
-            reqmsg->ibm_magic == __swab32(LNET_PROTO_MAGIC))
-                goto failed;
-        if (reqmsg->ibm_magic == IBLND_MSG_MAGIC &&
-            reqmsg->ibm_version != IBLND_MSG_VERSION &&
-            reqmsg->ibm_version != IBLND_MSG_VERSION_1)
-                goto failed;
-        if (reqmsg->ibm_magic == __swab32(IBLND_MSG_MAGIC) &&
-            reqmsg->ibm_version != __swab16(IBLND_MSG_VERSION) &&
-            reqmsg->ibm_version != __swab16(IBLND_MSG_VERSION_1))
-                goto failed;
+       /* Future protocol version compatibility support!  If the
+        * o2iblnd-specific protocol changes, or when LNET unifies
+        * protocols over all LNDs, the initial connection will
+        * negotiate a protocol version.  I trap this here to avoid
+        * console errors; the reject tells the peer which protocol I
+        * speak. */
+       if (reqmsg->ibm_magic == LNET_PROTO_MAGIC ||
+           reqmsg->ibm_magic == __swab32(LNET_PROTO_MAGIC))
+               goto failed;
+       if (reqmsg->ibm_magic == IBLND_MSG_MAGIC &&
+           reqmsg->ibm_version != IBLND_MSG_VERSION &&
+           reqmsg->ibm_version != IBLND_MSG_VERSION_1)
+               goto failed;
+       if (reqmsg->ibm_magic == __swab32(IBLND_MSG_MAGIC) &&
+           reqmsg->ibm_version != __swab16(IBLND_MSG_VERSION) &&
+           reqmsg->ibm_version != __swab16(IBLND_MSG_VERSION_1))
+               goto failed;
 
-        rc = kiblnd_unpack_msg(reqmsg, priv_nob);
-        if (rc != 0) {
-                CERROR("Can't parse connection request: %d\n", rc);
-                goto failed;
-        }
+       rc = kiblnd_unpack_msg(reqmsg, priv_nob);
+       if (rc != 0) {
+               CERROR("Can't parse connection request: %d\n", rc);
+               goto failed;
+       }
 
-        nid = reqmsg->ibm_srcnid;
-        ni  = lnet_net2ni(LNET_NIDNET(reqmsg->ibm_dstnid));
+       nid = reqmsg->ibm_srcnid;
+       ni  = lnet_nid2ni_addref(reqmsg->ibm_dstnid);
 
-        if (ni != NULL) {
-                net = (kib_net_t *)ni->ni_data;
-                rej.ibr_incarnation = net->ibn_incarnation;
-        }
+       if (ni != NULL) {
+               net = (kib_net_t *)ni->ni_data;
+               rej.ibr_incarnation = net->ibn_incarnation;
+       }
 
-        if (ni == NULL ||                         /* no matching net */
-            ni->ni_nid != reqmsg->ibm_dstnid ||   /* right NET, wrong NID! */
-            net->ibn_dev != ibdev) {              /* wrong device */
+       if (ni == NULL ||                         /* no matching net */
+           ni->ni_nid != reqmsg->ibm_dstnid ||   /* right NET, wrong NID! */
+           net->ibn_dev != ibdev) {              /* wrong device */
                CERROR("Can't accept conn from %s on %s (%s:%d:%pI4h): "
-                       "bad dst nid %s\n", libcfs_nid2str(nid),
-                       ni == NULL ? "NA" : libcfs_nid2str(ni->ni_nid),
-                       ibdev->ibd_ifname, ibdev->ibd_nnets,
+                      "bad dst nid %s\n", libcfs_nid2str(nid),
+                      ni == NULL ? "NA" : libcfs_nid2str(ni->ni_nid),
+                      ibdev->ibd_ifname, ibdev->ibd_nnets,
                        &ibdev->ibd_ifip,
-                       libcfs_nid2str(reqmsg->ibm_dstnid));
+                      libcfs_nid2str(reqmsg->ibm_dstnid));
 
-                goto failed;
-        }
+               goto failed;
+       }
 
        /* check time stamp as soon as possible */
-        if (reqmsg->ibm_dststamp != 0 &&
-            reqmsg->ibm_dststamp != net->ibn_incarnation) {
-                CWARN("Stale connection request\n");
-                rej.ibr_why = IBLND_REJECT_CONN_STALE;
-                goto failed;
-        }
+       if (reqmsg->ibm_dststamp != 0 &&
+           reqmsg->ibm_dststamp != net->ibn_incarnation) {
+               CWARN("Stale connection request\n");
+               rej.ibr_why = IBLND_REJECT_CONN_STALE;
+               goto failed;
+       }
 
-        /* I can accept peer's version */
-        version = reqmsg->ibm_version;
+       /* I can accept peer's version */
+       version = reqmsg->ibm_version;
 
-        if (reqmsg->ibm_type != IBLND_MSG_CONNREQ) {
-                CERROR("Unexpected connreq msg type: %x from %s\n",
-                       reqmsg->ibm_type, libcfs_nid2str(nid));
-                goto failed;
-        }
+       if (reqmsg->ibm_type != IBLND_MSG_CONNREQ) {
+               CERROR("Unexpected connreq msg type: %x from %s\n",
+                      reqmsg->ibm_type, libcfs_nid2str(nid));
+               goto failed;
+       }
 
        if (reqmsg->ibm_u.connparams.ibcp_queue_depth >
            kiblnd_msg_queue_size(version, ni)) {
@@ -2335,23 +2327,26 @@ kiblnd_passive_connect(struct rdma_cm_id *cmid, void *priv, int priv_nob)
                goto failed;
        }
 
-       max_frags = reqmsg->ibm_u.connparams.ibcp_max_frags >> IBLND_FRAG_SHIFT;
-       if (max_frags > kiblnd_rdma_frags(version, ni)) {
+       if (reqmsg->ibm_u.connparams.ibcp_max_frags >
+           kiblnd_rdma_frags(version, ni)) {
                CWARN("Can't accept conn from %s (version %x): "
-                     "max message size %d is too large (%d wanted)\n",
-                     libcfs_nid2str(nid), version, max_frags,
+                     "max_frags %d too large (%d wanted)\n",
+                     libcfs_nid2str(nid), version,
+                     reqmsg->ibm_u.connparams.ibcp_max_frags,
                      kiblnd_rdma_frags(version, ni));
 
                if (version >= IBLND_MSG_VERSION)
                        rej.ibr_why = IBLND_REJECT_RDMA_FRAGS;
 
                goto failed;
-       } else if ((max_frags < kiblnd_rdma_frags(version, ni)) &&
+       } else if (reqmsg->ibm_u.connparams.ibcp_max_frags <
+                  kiblnd_rdma_frags(version, ni) &&
                   net->ibn_fmr_ps == NULL) {
                CWARN("Can't accept conn from %s (version %x): "
-                     "max message size %d incompatible without FMR pool "
+                     "max_frags %d incompatible without FMR pool "
                      "(%d wanted)\n",
-                     libcfs_nid2str(nid), version, max_frags,
+                     libcfs_nid2str(nid), version,
+                     reqmsg->ibm_u.connparams.ibcp_max_frags,
                      kiblnd_rdma_frags(version, ni));
 
                if (version == IBLND_MSG_VERSION)
@@ -2377,12 +2372,12 @@ kiblnd_passive_connect(struct rdma_cm_id *cmid, void *priv, int priv_nob)
        }
 
        /* We have validated the peer's parameters so use those */
-       peer->ibp_max_frags = max_frags;
+       peer->ibp_max_frags = reqmsg->ibm_u.connparams.ibcp_max_frags;
        peer->ibp_queue_depth = reqmsg->ibm_u.connparams.ibcp_queue_depth;
 
        write_lock_irqsave(g_lock, flags);
 
-        peer2 = kiblnd_find_peer_locked(nid);
+        peer2 = kiblnd_find_peer_locked(ni, nid);
         if (peer2 != NULL) {
                 if (peer2->ibp_version == 0) {
                         peer2->ibp_version     = version;
@@ -2492,7 +2487,7 @@ kiblnd_passive_connect(struct rdma_cm_id *cmid, void *priv, int priv_nob)
         kiblnd_init_msg(ackmsg, IBLND_MSG_CONNACK,
                         sizeof(ackmsg->ibm_u.connparams));
        ackmsg->ibm_u.connparams.ibcp_queue_depth  = conn->ibc_queue_depth;
-       ackmsg->ibm_u.connparams.ibcp_max_frags = conn->ibc_max_frags << IBLND_FRAG_SHIFT;
+       ackmsg->ibm_u.connparams.ibcp_max_frags    = conn->ibc_max_frags;
        ackmsg->ibm_u.connparams.ibcp_max_msg_size = IBLND_MSG_SIZE;
 
         kiblnd_pack_msg(ni, ackmsg, version, 0, nid, reqmsg->ibm_srcstamp);
@@ -2555,7 +2550,7 @@ kiblnd_check_reconnect(kib_conn_t *conn, int version,
 
        if (cp) {
                msg_size        = cp->ibcp_max_msg_size;
-               frag_num        = cp->ibcp_max_frags << IBLND_FRAG_SHIFT;
+               frag_num        = cp->ibcp_max_frags;
                queue_dep       = cp->ibcp_queue_depth;
        }
 
@@ -2580,14 +2575,14 @@ kiblnd_check_reconnect(kib_conn_t *conn, int version,
                 break;
 
        case IBLND_REJECT_RDMA_FRAGS: {
-               struct lnet_ioctl_config_lnd_tunables *tunables;
+               struct lnet_ioctl_config_o2iblnd_tunables *tunables;
 
                if (!cp) {
                        reason = "can't negotiate max frags";
                        goto out;
                }
-               tunables = peer->ibp_ni->ni_lnd_tunables;
-               if (!tunables->lt_tun_u.lt_o2ib.lnd_map_on_demand) {
+               tunables = &peer->ibp_ni->ni_lnd_tunables.lnd_tun_u.lnd_o2ib;
+               if (!tunables->lnd_map_on_demand) {
                        reason = "map_on_demand must be enabled";
                        goto out;
                }
@@ -2816,11 +2811,11 @@ kiblnd_check_connreply (kib_conn_t *conn, void *priv, int priv_nob)
                goto failed;
        }
 
-       if ((msg->ibm_u.connparams.ibcp_max_frags >> IBLND_FRAG_SHIFT) >
+       if (msg->ibm_u.connparams.ibcp_max_frags >
            conn->ibc_max_frags) {
                CERROR("%s has incompatible max_frags %d (<=%d wanted)\n",
                       libcfs_nid2str(peer->ibp_nid),
-                      msg->ibm_u.connparams.ibcp_max_frags >> IBLND_FRAG_SHIFT,
+                      msg->ibm_u.connparams.ibcp_max_frags,
                       conn->ibc_max_frags);
                rc = -EPROTO;
                goto failed;
@@ -2855,7 +2850,7 @@ kiblnd_check_connreply (kib_conn_t *conn, void *priv, int priv_nob)
        conn->ibc_credits          = msg->ibm_u.connparams.ibcp_queue_depth;
        conn->ibc_reserved_credits = msg->ibm_u.connparams.ibcp_queue_depth;
        conn->ibc_queue_depth      = msg->ibm_u.connparams.ibcp_queue_depth;
-       conn->ibc_max_frags = msg->ibm_u.connparams.ibcp_max_frags >> IBLND_FRAG_SHIFT;
+       conn->ibc_max_frags        = msg->ibm_u.connparams.ibcp_max_frags;
        LASSERT(conn->ibc_credits + conn->ibc_reserved_credits +
                IBLND_OOB_MSGS(ver) <= IBLND_RX_MSGS(conn));
 
@@ -2910,7 +2905,7 @@ kiblnd_active_connect (struct rdma_cm_id *cmid)
        memset(msg, 0, sizeof(*msg));
        kiblnd_init_msg(msg, IBLND_MSG_CONNREQ, sizeof(msg->ibm_u.connparams));
        msg->ibm_u.connparams.ibcp_queue_depth  = conn->ibc_queue_depth;
-       msg->ibm_u.connparams.ibcp_max_frags = conn->ibc_max_frags << IBLND_FRAG_SHIFT;
+       msg->ibm_u.connparams.ibcp_max_frags    = conn->ibc_max_frags;
        msg->ibm_u.connparams.ibcp_max_msg_size = IBLND_MSG_SIZE;
 
         kiblnd_pack_msg(peer->ibp_ni, msg, version,
@@ -3325,8 +3320,9 @@ kiblnd_connd (void *arg)
                 }
 
                while (reconn < KIB_RECONN_BREAK) {
-                       if (kiblnd_data.kib_reconn_sec != get_seconds()) {
-                               kiblnd_data.kib_reconn_sec = get_seconds();
+                       if (kiblnd_data.kib_reconn_sec !=
+                           ktime_get_real_seconds()) {
+                               kiblnd_data.kib_reconn_sec = ktime_get_real_seconds();
                                list_splice_init(&kiblnd_data.kib_reconn_wait,
                                                 &kiblnd_data.kib_reconn_list);
                        }
@@ -3522,10 +3518,10 @@ kiblnd_scheduler(void *arg)
 
        rc = cfs_cpt_bind(lnet_cpt_table(), sched->ibs_cpt);
        if (rc != 0) {
-               CWARN("Failed to bind on CPT %d, please verify whether "
-                     "all CPUs are healthy and reload modules if necessary, "
-                     "otherwise your system might under risk of low "
-                     "performance\n", sched->ibs_cpt);
+               CWARN("Unable to bind on CPU partition %d, please verify "
+                     "whether all CPUs are healthy and reload modules if "
+                     "necessary, otherwise your system might under risk of "
+                     "low performance\n", sched->ibs_cpt);
        }
 
        spin_lock_irqsave(&sched->ibs_lock, flags);