Whamcloud - gitweb
LU-7734 lnet: proper cpt locking
[fs/lustre-release.git] / lnet / lnet / lib-move.c
index 769856c..ce62ff9 100644 (file)
@@ -674,18 +674,26 @@ lnet_ni_eager_recv(lnet_ni_t *ni, lnet_msg_t *msg)
        return rc;
 }
 
-/* NB: caller shall hold a ref on 'lp' as I'd drop lnet_net_lock */
+/*
+ * This function can be called from two paths:
+ *     1. when sending a message
+ *     2. when decommiting a message (lnet_msg_decommit_tx())
+ * In both these cases the peer_ni should have it's reference count
+ * acquired by the caller and therefore it is safe to drop the spin
+ * lock before calling lnd_query()
+ */
 static void
 lnet_ni_query_locked(lnet_ni_t *ni, struct lnet_peer_ni *lp)
 {
        cfs_time_t last_alive = 0;
+       int cpt = lnet_cpt_of_nid_locked(lp->lpni_nid, ni);
 
        LASSERT(lnet_peer_aliveness_enabled(lp));
        LASSERT(ni->ni_net->net_lnd->lnd_query != NULL);
 
-       lnet_net_unlock(lp->lpni_cpt);
+       lnet_net_unlock(cpt);
        (ni->ni_net->net_lnd->lnd_query)(ni, lp->lpni_nid, &last_alive);
-       lnet_net_lock(lp->lpni_cpt);
+       lnet_net_lock(cpt);
 
        lp->lpni_last_query = cfs_time_current();
 
@@ -706,9 +714,12 @@ lnet_peer_is_alive (struct lnet_peer_ni *lp, cfs_time_t now)
         * Trust lnet_notify() if it has more recent aliveness news, but
         * ignore the initial assumed death (see lnet_peers_start_down()).
         */
+       spin_lock(&lp->lpni_lock);
        if (!lp->lpni_alive && lp->lpni_alive_count > 0 &&
-           cfs_time_aftereq(lp->lpni_timestamp, lp->lpni_last_alive))
+           cfs_time_aftereq(lp->lpni_timestamp, lp->lpni_last_alive)) {
+               spin_unlock(&lp->lpni_lock);
                return 0;
+       }
 
        deadline =
          cfs_time_add(lp->lpni_last_alive,
@@ -722,8 +733,12 @@ lnet_peer_is_alive (struct lnet_peer_ni *lp, cfs_time_t now)
         * case, and moreover lpni_last_alive at peer creation is assumed.
         */
        if (alive && !lp->lpni_alive &&
-           !(lnet_isrouter(lp) && lp->lpni_alive_count == 0))
+           !(lnet_isrouter(lp) && lp->lpni_alive_count == 0)) {
+               spin_unlock(&lp->lpni_lock);
                lnet_notify_locked(lp, 0, 1, lp->lpni_last_alive);
+       } else {
+               spin_unlock(&lp->lpni_lock);
+       }
 
        return alive;
 }
@@ -805,6 +820,10 @@ lnet_post_send_locked(lnet_msg_t *msg, int do_send)
                the_lnet.ln_counters[cpt]->drop_count++;
                the_lnet.ln_counters[cpt]->drop_length += msg->msg_len;
                lnet_net_unlock(cpt);
+               if (msg->msg_txpeer)
+                       atomic_inc(&msg->msg_txpeer->lpni_stats.drop_count);
+               if (msg->msg_txni)
+                       atomic_inc(&msg->msg_txni->ni_stats.drop_count);
 
                CNETERR("Dropping message for %s: peer not alive\n",
                        libcfs_id2str(msg->msg_target));
@@ -853,6 +872,7 @@ lnet_post_send_locked(lnet_msg_t *msg, int do_send)
 
                msg->msg_txcredit = 1;
                tq->tq_credits--;
+               atomic_dec(&ni->ni_tx_credits);
 
                if (tq->tq_credits < tq->tq_credits_min)
                        tq->tq_credits_min = tq->tq_credits;
@@ -985,6 +1005,7 @@ lnet_return_tx_credits_locked(lnet_msg_t *msg)
                        !list_empty(&tq->tq_delayed));
 
                tq->tq_credits++;
+               atomic_inc(&ni->ni_tx_credits);
                if (tq->tq_credits <= 0) {
                        msg2 = list_entry(tq->tq_delayed.next,
                                          lnet_msg_t, msg_list);
@@ -1289,6 +1310,10 @@ lnet_select_pathway(lnet_nid_t src_nid, lnet_nid_t dst_nid,
        int                     best_credits = 0;
        __u32                   seq, seq2;
        int                     best_lpni_credits = INT_MIN;
+       int                     md_cpt = 0;
+       int                     shortest_distance = INT_MAX;
+       int                     distance = 0;
+       bool                    found_ir = false;
 
 again:
        /*
@@ -1307,12 +1332,20 @@ again:
        routing = false;
        local_net = NULL;
        best_ni = NULL;
+       shortest_distance = INT_MAX;
+       found_ir = false;
 
        if (the_lnet.ln_shutdown) {
                lnet_net_unlock(cpt);
                return -ESHUTDOWN;
        }
 
+       if (msg->msg_md != NULL)
+               /* get the cpt of the MD, used during NUMA based selection */
+               md_cpt = lnet_cpt_of_cookie(msg->msg_md->md_lh.lh_cookie);
+       else
+               md_cpt = CFS_CPT_ANY;
+
        /*
         * initialize the variables which could be reused if we go to
         * again
@@ -1320,10 +1353,10 @@ again:
        lpni = NULL;
        seq = lnet_get_dlc_seq_locked();
 
-       rc = lnet_find_or_create_peer_locked(dst_nid, cpt, &peer);
-       if (rc != 0) {
+       peer = lnet_find_or_create_peer_locked(dst_nid, cpt);
+       if (IS_ERR(peer)) {
                lnet_net_unlock(cpt);
-               return rc;
+               return PTR_ERR(peer);
        }
 
        /* If peer is not healthy then can not send anything to it */
@@ -1332,6 +1365,13 @@ again:
                return -EHOSTUNREACH;
        }
 
+       if (!peer->lp_multi_rail && lnet_get_num_peer_nis(peer) > 1) {
+               CERROR("peer %s is declared to be non MR capable, "
+                      "yet configured with more than one NID\n",
+                      libcfs_nid2str(dst_nid));
+               return -EINVAL;
+       }
+
        /*
         * STEP 1: first jab at determineing best_ni
         * if src_nid is explicitly specified, then best_ni is already
@@ -1357,23 +1397,6 @@ again:
                }
        }
 
-       if (best_ni == the_lnet.ln_loni) {
-               /* No send credit hassles with LOLND */
-               msg->msg_hdr.dest_nid = cpu_to_le64(best_ni->ni_nid);
-               if (!msg->msg_routing)
-                       msg->msg_hdr.src_nid = cpu_to_le64(best_ni->ni_nid);
-               msg->msg_target.nid = best_ni->ni_nid;
-               lnet_msg_commit(msg, cpt);
-
-               lnet_ni_addref_locked(best_ni, cpt);
-               lnet_net_unlock(cpt);
-               msg->msg_txni = best_ni;
-               lnet_ni_send(best_ni, msg);
-
-               *lo_sent = true;
-               return 0;
-       }
-
        if (best_ni)
                goto pick_peer;
 
@@ -1438,34 +1461,116 @@ again:
                        continue;
 
                /*
-                * Second jab at determining best_ni
-                * if we get here then the peer we're trying to send
-                * to is on a directly connected network, and we'll
-                * need to pick the local_ni on that network to send
-                * from
+                * Iterate through the NIs in this local Net and select
+                * the NI to send from. The selection is determined by
+                * these 3 criterion in the following priority:
+                *      1. NUMA
+                *      2. NI available credits
+                *      3. Round Robin
                 */
                while ((ni = lnet_get_next_ni_locked(local_net, ni))) {
+                       int ni_credits;
+
                        if (!lnet_is_ni_healthy_locked(ni))
                                continue;
-                       /* TODO: compare NUMA distance */
-                       if (ni->ni_tx_queues[cpt]->tq_credits <=
-                           best_credits) {
+
+                       ni_credits = atomic_read(&ni->ni_tx_credits);
+
+                       /*
+                        * calculate the distance from the cpt on which
+                        * the message memory is allocated to the CPT of
+                        * the NI's physical device
+                        */
+                       distance = cfs_cpt_distance(lnet_cpt_table(),
+                                                   md_cpt,
+                                                   ni->dev_cpt);
+
+                       /*
+                        * If we already have a closer NI within the NUMA
+                        * range provided, then there is no need to
+                        * consider the current NI. Move on to the next
+                        * one.
+                        */
+                       if (distance > shortest_distance &&
+                           distance > lnet_get_numa_range())
+                               continue;
+
+                       if (distance < shortest_distance &&
+                           distance > lnet_get_numa_range()) {
                                /*
-                                * all we want is to read tq_credits
-                                * value as an approximation of how
-                                * busy the NI is. No need to grab a lock
+                                * The current NI is the closest one that we
+                                * have found, even though it's not in the
+                                * NUMA range specified. This occurs if
+                                * the NUMA range is less than the least
+                                * of the distances in the system.
+                                * In effect NUMA range consideration is
+                                * turned off.
                                 */
-                               continue;
-                       } else if (best_ni) {
-                               if ((best_ni)->ni_seq - ni->ni_seq <= 0)
+                               shortest_distance = distance;
+                       } else if ((distance <= shortest_distance &&
+                                   distance < lnet_get_numa_range()) ||
+                                  distance == shortest_distance) {
+                               /*
+                                * This NI is either within range or it's
+                                * equidistant. In both of these cases we
+                                * would want to select the NI based on
+                                * its available credits first, and then
+                                * via Round Robin.
+                                */
+                               if (distance <= shortest_distance &&
+                                   distance < lnet_get_numa_range()) {
+                                       /*
+                                        * If this is the first NI that's
+                                        * within range, then set the
+                                        * shortest distance to the range
+                                        * specified by the user. In
+                                        * effect we're saying that all
+                                        * NIs that fall within this NUMA
+                                        * range shall be dealt with as
+                                        * having equal NUMA weight. Which
+                                        * will mean that we should select
+                                        * through that set by their
+                                        * available credits first
+                                        * followed by Round Robin.
+                                        *
+                                        * And since this is the first NI
+                                        * in the range, let's just set it
+                                        * as our best_ni for now. The
+                                        * following NIs found in the
+                                        * range will be dealt with as
+                                        * mentioned previously.
+                                        */
+                                       shortest_distance = lnet_get_numa_range();
+                                       if (!found_ir) {
+                                               found_ir = true;
+                                               goto set_ni;
+                                       }
+                               }
+                               /*
+                                * This NI is NUMA equidistant let's
+                                * select using credits followed by Round
+                                * Robin.
+                                */
+                               if (ni_credits < best_credits) {
                                        continue;
-                               (best_ni)->ni_seq = ni->ni_seq + 1;
+                               } else if (ni_credits == best_credits) {
+                                       if (best_ni) {
+                                               if (best_ni->ni_seq <= ni->ni_seq)
+                                                       continue;
+                                       }
+                               }
                        }
-
+set_ni:
                        best_ni = ni;
-                       best_credits = ni->ni_tx_queues[cpt]->tq_credits;
+                       best_credits = ni_credits;
                }
        }
+       /*
+        * if the peer is not MR capable, then we should always send to it
+        * using the first NI in the NET we determined.
+        */
+       if (!peer->lp_multi_rail && local_net != NULL)
+               best_ni = lnet_net2ni_locked(local_net->net_id, cpt);
 
        if (!best_ni) {
                lnet_net_unlock(cpt);
@@ -1474,10 +1579,34 @@ again:
                return -EINVAL;
        }
 
+       /*
+        * Now that we selected the NI to use increment its sequence
+        * number so the Round Robin algorithm will detect that it has
+        * been used and pick the next NI.
+        */
+       best_ni->ni_seq++;
+
        if (routing)
                goto send;
 
 pick_peer:
+       if (best_ni == the_lnet.ln_loni) {
+               /* No send credit hassles with LOLND */
+               lnet_ni_addref_locked(best_ni, cpt);
+               msg->msg_hdr.dest_nid = cpu_to_le64(best_ni->ni_nid);
+               if (!msg->msg_routing)
+                       msg->msg_hdr.src_nid = cpu_to_le64(best_ni->ni_nid);
+               msg->msg_target.nid = best_ni->ni_nid;
+               lnet_msg_commit(msg, cpt);
+
+               lnet_net_unlock(cpt);
+               msg->msg_txni = best_ni;
+               lnet_ni_send(best_ni, msg);
+
+               *lo_sent = true;
+               return 0;
+       }
+
        lpni = NULL;
 
        if (msg->msg_type == LNET_MSG_REPLY ||
@@ -1487,7 +1616,7 @@ pick_peer:
                 * received the message on if possible. If not, then pick
                 * a peer_ni to send to
                 */
-               best_lpni = lnet_find_peer_ni_locked(dst_nid, cpt);
+               best_lpni = lnet_find_peer_ni_locked(dst_nid);
                if (best_lpni) {
                        lnet_peer_ni_decref_locked(best_lpni);
                        goto send;
@@ -1523,11 +1652,11 @@ pick_peer:
                }
 
                CDEBUG(D_NET, "Best route to %s via %s for %s %d\n",
-                       libcfs_nid2str(lpni->lpni_nid),
+                       libcfs_nid2str(dst_nid),
                        libcfs_nid2str(best_gw->lpni_nid),
                        lnet_msgtyp2str(msg->msg_type), msg->msg_len);
 
-               best_lpni = lnet_find_peer_ni_locked(dst_nid, cpt);
+               best_lpni = lnet_find_peer_ni_locked(dst_nid);
                LASSERT(best_lpni != NULL);
                lnet_peer_ni_decref_locked(best_lpni);
 
@@ -1550,47 +1679,72 @@ pick_peer:
        best_lpni = NULL;
        while ((lpni = lnet_get_next_peer_ni_locked(peer, peer_net, lpni))) {
                /*
-                * if this peer ni is not healty just skip it, no point in
+                * if this peer ni is not healthy just skip it, no point in
                 * examining it further
                 */
                if (!lnet_is_peer_ni_healthy_locked(lpni))
                        continue;
                ni_is_pref = lnet_peer_is_ni_pref_locked(lpni, best_ni);
 
+               /* if this is a preferred peer use it */
                if (!preferred && ni_is_pref) {
                        preferred = true;
                } else if (preferred && !ni_is_pref) {
+                       /*
+                        * this is not the preferred peer so let's ignore
+                        * it.
+                        */
                        continue;
-               } if (lpni->lpni_txcredits <= best_lpni_credits)
+               } if (lpni->lpni_txcredits < best_lpni_credits)
+                       /*
+                        * We already have a peer that has more credits
+                        * available than this one. No need to consider
+                        * this peer further.
+                        */
                        continue;
-               else if (best_lpni) {
-                       if (best_lpni->lpni_seq - lpni->lpni_seq <= 0)
-                               continue;
-                       best_lpni->lpni_seq = lpni->lpni_seq + 1;
+               else if (lpni->lpni_txcredits == best_lpni_credits) {
+                       /*
+                        * The best peer found so far and the current peer
+                        * have the same number of available credits let's
+                        * make sure to select between them using Round
+                        * Robin
+                        */
+                       if (best_lpni) {
+                               if (best_lpni->lpni_seq <= lpni->lpni_seq)
+                                       continue;
+                       }
                }
 
                best_lpni = lpni;
                best_lpni_credits = lpni->lpni_txcredits;
        }
 
+       /*
+        * Increment sequence number of the peer selected so that we can
+        * pick the next one in Round Robin.
+        */
+       best_lpni->lpni_seq++;
+
        /* if we still can't find a peer ni then we can't reach it */
        if (!best_lpni) {
                __u32 net_id = peer_net->lpn_net_id;
                lnet_net_unlock(cpt);
                LCONSOLE_WARN("no peer_ni found on peer net %s\n",
                                libcfs_net2str(net_id));
-               goto again;
+               return -EHOSTUNREACH;
        }
 
 send:
        /*
-        * determine the cpt to use and if it has changed then
-        * lock the new cpt and check if the config has changed.
-        * If it has changed then repeat the algorithm since the
-        * ni or peer list could have changed and the algorithm
-        * would endup picking a different ni/peer_ni pair.
+        * Use lnet_cpt_of_nid() to determine the CPT used to commit the
+        * message. This ensures that we get a CPT that is correct for
+        * the NI when the NI has been restricted to a subset of all CPTs.
+        * If the selected CPT differs from the one currently locked, we
+        * must unlock and relock the lnet_net_lock(), and then check whether
+        * the configuration has changed. We don't have a hold on the best_ni
+        * or best_peer_ni yet, and they may have vanished.
         */
-       cpt2 = best_lpni->lpni_cpt;
+       cpt2 = lnet_cpt_of_nid_locked(best_lpni->lpni_nid, best_ni);
        if (cpt != cpt2) {
                lnet_net_unlock(cpt);
                cpt = cpt2;
@@ -1734,14 +1888,15 @@ lnet_parse_put(lnet_ni_t *ni, lnet_msg_t *msg)
        hdr->msg.put.ptl_index  = le32_to_cpu(hdr->msg.put.ptl_index);
        hdr->msg.put.offset     = le32_to_cpu(hdr->msg.put.offset);
 
-       info.mi_id.nid  = hdr->src_nid;
+       /* Primary peer NID. */
+       info.mi_id.nid  = msg->msg_initiator;
        info.mi_id.pid  = hdr->src_pid;
        info.mi_opc     = LNET_MD_OP_PUT;
        info.mi_portal  = hdr->msg.put.ptl_index;
        info.mi_rlength = hdr->payload_length;
        info.mi_roffset = hdr->msg.put.offset;
        info.mi_mbits   = hdr->msg.put.match_bits;
-       info.mi_cpt     = msg->msg_rxpeer->lpni_cpt;
+       info.mi_cpt     = lnet_cpt_of_nid(msg->msg_rxpeer->lpni_nid, ni);
 
        msg->msg_rx_ready_delay = ni->ni_net->net_lnd->lnd_eager_recv == NULL;
        ready_delay = msg->msg_rx_ready_delay;
@@ -1784,6 +1939,7 @@ lnet_parse_get(lnet_ni_t *ni, lnet_msg_t *msg, int rdma_get)
 {
        struct lnet_match_info  info;
        lnet_hdr_t              *hdr = &msg->msg_hdr;
+       lnet_process_id_t       source_id;
        struct lnet_handle_wire reply_wmd;
        int                     rc;
 
@@ -1793,7 +1949,10 @@ lnet_parse_get(lnet_ni_t *ni, lnet_msg_t *msg, int rdma_get)
        hdr->msg.get.sink_length  = le32_to_cpu(hdr->msg.get.sink_length);
        hdr->msg.get.src_offset   = le32_to_cpu(hdr->msg.get.src_offset);
 
-       info.mi_id.nid  = hdr->src_nid;
+       source_id.nid = hdr->src_nid;
+       source_id.pid = hdr->src_pid;
+       /* Primary peer NID */
+       info.mi_id.nid  = msg->msg_initiator;
        info.mi_id.pid  = hdr->src_pid;
        info.mi_opc     = LNET_MD_OP_GET;
        info.mi_portal  = hdr->msg.get.ptl_index;
@@ -1816,7 +1975,7 @@ lnet_parse_get(lnet_ni_t *ni, lnet_msg_t *msg, int rdma_get)
 
        reply_wmd = hdr->msg.get.return_wmd;
 
-       lnet_prep_send(msg, LNET_MSG_REPLY, info.mi_id,
+       lnet_prep_send(msg, LNET_MSG_REPLY, source_id,
                       msg->msg_offset, msg->msg_wanted);
 
        msg->msg_hdr.msg.reply.dst_wmd = reply_wmd;
@@ -2109,8 +2268,9 @@ lnet_parse(lnet_ni_t *ni, lnet_hdr_t *hdr, lnet_nid_t from_nid,
        lnet_pid_t     dest_pid;
        lnet_nid_t     dest_nid;
        lnet_nid_t     src_nid;
-       __u32          payload_length;
-       __u32          type;
+       struct lnet_peer_ni *lpni;
+       __u32          payload_length;
+       __u32          type;
 
        LASSERT (!in_interrupt ());
 
@@ -2268,21 +2428,24 @@ lnet_parse(lnet_ni_t *ni, lnet_hdr_t *hdr, lnet_nid_t from_nid,
                msg->msg_hdr.dest_pid   = dest_pid;
                msg->msg_hdr.payload_length = payload_length;
        }
+       /* Multi-Rail: Primary NID of source. */
+       msg->msg_initiator = lnet_peer_primary_nid(src_nid);
 
        lnet_net_lock(cpt);
-       rc = lnet_nid2peerni_locked(&msg->msg_rxpeer, from_nid, cpt);
-       if (rc != 0) {
+       lpni = lnet_nid2peerni_locked(from_nid, cpt);
+       if (IS_ERR(lpni)) {
                lnet_net_unlock(cpt);
                CERROR("%s, src %s: Dropping %s "
-                      "(error %d looking up sender)\n",
+                      "(error %ld looking up sender)\n",
                       libcfs_nid2str(from_nid), libcfs_nid2str(src_nid),
-                      lnet_msgtyp2str(type), rc);
+                      lnet_msgtyp2str(type), PTR_ERR(lpni));
                lnet_msg_free(msg);
                if (rc == -ESHUTDOWN)
                        /* We are shutting down.  Don't do anything more */
                        return 0;
                goto drop;
        }
+       msg->msg_rxpeer = lpni;
        msg->msg_rxni = ni;
        lnet_ni_addref_locked(ni, cpt);
 
@@ -2369,8 +2532,7 @@ lnet_drop_delayed_msg_list(struct list_head *head, char *reason)
                 * called lnet_drop_message(), so I just hang onto msg as well
                 * until that's done */
 
-               lnet_drop_message(msg->msg_rxni,
-                                 msg->msg_rxpeer->lpni_cpt,
+               lnet_drop_message(msg->msg_rxni, msg->msg_rx_cpt,
                                  msg->msg_private, msg->msg_len);
                /*
                 * NB: message will not generate event because w/o attached MD,
@@ -2586,6 +2748,8 @@ lnet_create_reply_msg (lnet_ni_t *ni, lnet_msg_t *getmsg)
               libcfs_nid2str(ni->ni_nid), libcfs_id2str(peer_id), getmd);
 
        /* setup information for lnet_build_msg_event */
+       msg->msg_initiator = lnet_peer_primary_nid(peer_id.nid);
+       /* Cheaper: msg->msg_initiator = getmsg->msg_txpeer->lp_nid; */
        msg->msg_from = peer_id.nid;
        msg->msg_type = LNET_MSG_GET; /* flag this msg as an "optimized" GET */
        msg->msg_hdr.src_nid = peer_id.nid;