Whamcloud - gitweb
LU-13362 lnet: Disc reply race with finalize and routed recv
[fs/lustre-release.git] / lnet / lnet / lib-move.c
index 332c038..d74cfcc 100644 (file)
@@ -1012,8 +1012,6 @@ lnet_post_routed_recv_locked(struct lnet_msg *msg, int do_recv)
        if (!msg->msg_peerrtrcredit) {
                /* lpni_lock protects the credit manipulation */
                spin_lock(&lpni->lpni_lock);
-               /* lp_lock protects the lp_rtrq */
-               spin_lock(&lp->lp_lock);
 
                msg->msg_peerrtrcredit = 1;
                lpni->lpni_rtrcredits--;
@@ -1021,15 +1019,16 @@ lnet_post_routed_recv_locked(struct lnet_msg *msg, int do_recv)
                        lpni->lpni_minrtrcredits = lpni->lpni_rtrcredits;
 
                if (lpni->lpni_rtrcredits < 0) {
+                       spin_unlock(&lpni->lpni_lock);
                        /* must have checked eager_recv before here */
                        LASSERT(msg->msg_rx_ready_delay);
                        msg->msg_rx_delayed = 1;
+                       /* lp_lock protects the lp_rtrq */
+                       spin_lock(&lp->lp_lock);
                        list_add_tail(&msg->msg_list, &lp->lp_rtrq);
                        spin_unlock(&lp->lp_lock);
-                       spin_unlock(&lpni->lpni_lock);
                        return LNET_CREDIT_WAIT;
                }
-               spin_unlock(&lp->lp_lock);
                spin_unlock(&lpni->lpni_lock);
        }
 
@@ -1256,15 +1255,15 @@ routing_off:
                LASSERT(rxpeerni->lpni_peer_net);
                LASSERT(rxpeerni->lpni_peer_net->lpn_peer);
 
-               lp = rxpeerni->lpni_peer_net->lpn_peer;
-
                /* give back peer router credits */
                msg->msg_peerrtrcredit = 0;
 
                spin_lock(&rxpeerni->lpni_lock);
-               spin_lock(&lp->lp_lock);
-
                rxpeerni->lpni_rtrcredits++;
+               spin_unlock(&rxpeerni->lpni_lock);
+
+               lp = rxpeerni->lpni_peer_net->lpn_peer;
+               spin_lock(&lp->lp_lock);
 
                /* drop all messages which are queued to be routed on that
                 * peer. */
@@ -1272,7 +1271,6 @@ routing_off:
                        LIST_HEAD(drop);
                        list_splice_init(&lp->lp_rtrq, &drop);
                        spin_unlock(&lp->lp_lock);
-                       spin_unlock(&rxpeerni->lpni_lock);
                        lnet_drop_routed_msgs_locked(&drop, msg->msg_rx_cpt);
                } else if (!list_empty(&lp->lp_rtrq)) {
                        int msg2_cpt;
@@ -1282,7 +1280,6 @@ routing_off:
                        list_del(&msg2->msg_list);
                        msg2_cpt = msg2->msg_rx_cpt;
                        spin_unlock(&lp->lp_lock);
-                       spin_unlock(&rxpeerni->lpni_lock);
                        /*
                         * messages on the lp_rtrq can be from any NID in
                         * the peer, which means they might have different
@@ -1300,7 +1297,6 @@ routing_off:
                        }
                } else {
                        spin_unlock(&lp->lp_lock);
-                       spin_unlock(&rxpeerni->lpni_lock);
                }
        }
        if (rxni != NULL) {
@@ -1442,13 +1438,6 @@ lnet_find_best_lpni_on_net(struct lnet_ni *lni, lnet_nid_t dst_nid,
         */
        peer_net = lnet_peer_get_net_locked(peer, net_id);
 
-       if (!peer_net) {
-               CERROR("gateway peer %s has no NI on net %s\n",
-                      libcfs_nid2str(peer->lp_primary_nid),
-                      libcfs_net2str(net_id));
-               return NULL;
-       }
-
        return lnet_select_peer_ni(lni, dst_nid, peer, peer_net);
 }
 
@@ -1484,29 +1473,28 @@ lnet_find_route_locked(struct lnet_remotenet *rnet, __u32 src_net,
        struct lnet_route *last_route;
        struct lnet_route *route;
        int rc;
-       __u32 restrict_net;
-       __u32 any_net = LNET_NIDNET(LNET_NID_ANY);
 
        best_route = last_route = NULL;
        list_for_each_entry(route, &rnet->lrn_routes, lr_list) {
                if (!lnet_is_route_alive(route))
                        continue;
 
-               /* If the src_net is specified then we need to find an lpni
-                * on that network
+               /*
+                * Restrict the selection of the router NI on the src_net
+                * provided. If the src_net is LNET_NID_ANY, then select
+                * the best interface available.
                 */
-               restrict_net = src_net == any_net ? route->lr_lnet : src_net;
                if (!best_route) {
                        lpni = lnet_find_best_lpni_on_net(NULL, LNET_NID_ANY,
                                                          route->lr_gateway,
-                                                         restrict_net);
+                                                         src_net);
                        if (lpni) {
                                best_route = last_route = route;
                                best_gw_ni = lpni;
                        } else
                                CERROR("Gateway %s does not have a peer NI on net %s\n",
                                       libcfs_nid2str(route->lr_gateway->lp_primary_nid),
-                                      libcfs_net2str(restrict_net));
+                                      libcfs_net2str(src_net));
 
                        continue;
                }
@@ -1521,11 +1509,11 @@ lnet_find_route_locked(struct lnet_remotenet *rnet, __u32 src_net,
 
                lpni = lnet_find_best_lpni_on_net(NULL, LNET_NID_ANY,
                                                  route->lr_gateway,
-                                                 restrict_net);
+                                                 src_net);
                if (!lpni) {
                        CERROR("Gateway %s does not have a peer NI on net %s\n",
                               libcfs_nid2str(route->lr_gateway->lp_primary_nid),
-                              libcfs_net2str(restrict_net));
+                              libcfs_net2str(src_net));
                        continue;
                }
 
@@ -1951,12 +1939,10 @@ lnet_find_best_ni_on_spec_net(struct lnet_ni *cur_best_ni,
 }
 
 static int
-lnet_initiate_peer_discovery(struct lnet_peer_ni *lpni,
-                            struct lnet_msg *msg, lnet_nid_t rtr_nid,
+lnet_initiate_peer_discovery(struct lnet_peer_ni *lpni, struct lnet_msg *msg,
                             int cpt)
 {
        struct lnet_peer *peer;
-       lnet_nid_t primary_nid;
        int rc;
 
        lnet_peer_ni_addref_locked(lpni);
@@ -1987,17 +1973,15 @@ lnet_initiate_peer_discovery(struct lnet_peer_ni *lpni,
                return 0;
        }
        /* queue message and return */
-       msg->msg_rtr_nid_param = rtr_nid;
        msg->msg_sending = 0;
        msg->msg_txpeer = NULL;
        list_add_tail(&msg->msg_list, &peer->lp_dc_pendq);
-       primary_nid = peer->lp_primary_nid;
        spin_unlock(&peer->lp_lock);
 
        lnet_peer_ni_decref_locked(lpni);
 
        CDEBUG(D_NET, "msg %p delayed. %s pending discovery\n",
-               msg, libcfs_nid2str(primary_nid));
+              msg, libcfs_nid2str(peer->lp_primary_nid));
 
        return LNET_DC_WAIT;
 }
@@ -2106,8 +2090,7 @@ lnet_handle_find_routed_path(struct lnet_send_data *sd,
         * completed
         */
        sd->sd_msg->msg_src_nid_param = sd->sd_src_nid;
-       rc = lnet_initiate_peer_discovery(gwni, sd->sd_msg, sd->sd_rtr_nid,
-                                         sd->sd_cpt);
+       rc = lnet_initiate_peer_discovery(gwni, sd->sd_msg, sd->sd_cpt);
        if (rc)
                return rc;
 
@@ -2202,6 +2185,7 @@ lnet_find_best_ni_on_local_net(struct lnet_peer *peer, int md_cpt,
 {
        struct lnet_peer_net *peer_net = NULL;
        struct lnet_ni *best_ni = NULL;
+       int lpn_healthv = 0;
 
        /*
         * The peer can have multiple interfaces, some of them can be on
@@ -2218,8 +2202,15 @@ lnet_find_best_ni_on_local_net(struct lnet_peer *peer, int md_cpt,
                 */
                if (!lnet_get_net_locked(peer_net->lpn_net_id))
                        continue;
-               best_ni = lnet_find_best_ni_on_spec_net(best_ni, peer,
-                                                  peer_net, md_cpt, false);
+
+               /* always select the lpn with the best health */
+               if (lpn_healthv <= peer_net->lpn_healthv)
+                       lpn_healthv = peer_net->lpn_healthv;
+               else
+                       continue;
+
+               best_ni = lnet_find_best_ni_on_spec_net(best_ni, peer, peer_net,
+                                                       md_cpt, false);
 
                /*
                 * if this is a discovery message and lp_disc_net_id is
@@ -2664,20 +2655,22 @@ again:
        }
 
        /*
-        * Cache the original src_nid. If we need to resend the message
-        * then we'll need to know whether the src_nid was originally
+        * Cache the original src_nid and rtr_nid. If we need to resend the
+        * message then we'll need to know whether the src_nid was originally
         * specified for this message. If it was originally specified,
         * then we need to keep using the same src_nid since it's
-        * continuing the same sequence of messages.
+        * continuing the same sequence of messages. Similarly, rtr_nid will
+        * affect our choice of next hop.
         */
        msg->msg_src_nid_param = src_nid;
+       msg->msg_rtr_nid_param = rtr_nid;
 
        /*
         * If necessary, perform discovery on the peer that owns this peer_ni.
         * Note, this can result in the ownership of this peer_ni changing
         * to another peer object.
         */
-       rc = lnet_initiate_peer_discovery(lpni, msg, rtr_nid, cpt);
+       rc = lnet_initiate_peer_discovery(lpni, msg, cpt);
        if (rc) {
                lnet_peer_ni_decref_locked(lpni);
                lnet_net_unlock(cpt);
@@ -2701,11 +2694,16 @@ again:
                send_case |= REMOTE_DST;
 
        /*
-        * if this is a non-MR peer or if we're recovering a peer ni then
-        * let's consider this an NMR case so we can hit the destination
-        * NID.
+        * Deal with the peer as NMR in the following cases:
+        * 1. the peer is NMR
+        * 2. We're trying to recover a specific peer NI
+        * 3. I'm a router sending to the final destination
+        *    In this case the source of the message would've
+        *    already selected the final destination so my job
+        *    is to honor the selection.
         */
-       if (!lnet_peer_is_multi_rail(peer) || msg->msg_recovery)
+       if (!lnet_peer_is_multi_rail(peer) || msg->msg_recovery ||
+           (msg->msg_routing && (send_case & LOCAL_DST)))
                send_case |= NMR_DST;
        else
                send_case |= MR_DST;
@@ -3005,40 +3003,19 @@ lnet_resend_pending_msgs_locked(struct list_head *resendq, int cpt)
                        lnet_finalize(msg, -EFAULT);
                        lnet_net_lock(cpt);
                } else {
-                       struct lnet_peer *peer;
                        int rc;
-                       lnet_nid_t src_nid = LNET_NID_ANY;
 
-                       /*
-                        * if this message is not being routed and the
-                        * peer is non-MR then we must use the same
-                        * src_nid that was used in the original send.
-                        * Otherwise if we're routing the message (IE
-                        * we're a router) then we can use any of our
-                        * local interfaces. It doesn't matter to the
-                        * final destination.
-                        */
-                       peer = lpni->lpni_peer_net->lpn_peer;
-                       if (!msg->msg_routing &&
-                           !lnet_peer_is_multi_rail(peer))
-                               src_nid = le64_to_cpu(msg->msg_hdr.src_nid);
-
-                       /*
-                        * If we originally specified a src NID, then we
-                        * must attempt to reuse it in the resend as well.
-                        */
-                       if (msg->msg_src_nid_param != LNET_NID_ANY)
-                               src_nid = msg->msg_src_nid_param;
                        lnet_peer_ni_decref_locked(lpni);
 
                        lnet_net_unlock(cpt);
                        CDEBUG(D_NET, "resending %s->%s: %s recovery %d try# %d\n",
-                              libcfs_nid2str(src_nid),
+                              libcfs_nid2str(msg->msg_src_nid_param),
                               libcfs_id2str(msg->msg_target),
                               lnet_msgtyp2str(msg->msg_type),
                               msg->msg_recovery,
                               msg->msg_retry_count);
-                       rc = lnet_send(src_nid, msg, LNET_NID_ANY);
+                       rc = lnet_send(msg->msg_src_nid_param, msg,
+                                      msg->msg_rtr_nid_param);
                        if (rc) {
                                CERROR("Error sending %s to %s: %d\n",
                                       lnet_msgtyp2str(msg->msg_type),
@@ -3200,7 +3177,7 @@ lnet_recover_local_nis(void)
                        ev_info->mt_type = MT_TYPE_LOCAL_NI;
                        ev_info->mt_nid = nid;
                        rc = lnet_send_ping(nid, &mdh, LNET_INTERFACES_MIN,
-                                           ev_info, the_lnet.ln_mt_eqh, true);
+                                           ev_info, the_lnet.ln_mt_eq, true);
                        /* lookup the nid again */
                        lnet_net_lock(0);
                        ni = lnet_nid2ni_locked(nid, 0);
@@ -3433,7 +3410,7 @@ lnet_recover_peer_nis(void)
                        ev_info->mt_type = MT_TYPE_PEER_NI;
                        ev_info->mt_nid = nid;
                        rc = lnet_send_ping(nid, &mdh, LNET_INTERFACES_MIN,
-                                           ev_info, the_lnet.ln_mt_eqh, true);
+                                           ev_info, the_lnet.ln_mt_eq, true);
                        lnet_net_lock(0);
                        /*
                         * lnet_find_peer_ni_locked() grabs a refcount for
@@ -3497,8 +3474,6 @@ lnet_monitor_thread(void *arg)
         *  4. Checks if there are any NIs on the remote recovery queue
         *     and pings them.
         */
-       cfs_block_allsigs();
-
        while (the_lnet.ln_mt_state == LNET_MT_STATE_RUNNING) {
                now = ktime_get_real_seconds();
 
@@ -3565,7 +3540,7 @@ lnet_monitor_thread(void *arg)
 int
 lnet_send_ping(lnet_nid_t dest_nid,
               struct lnet_handle_md *mdh, int nnis,
-              void *user_data, struct lnet_handle_eq eqh, bool recovery)
+              void *user_data, struct lnet_eq *eq, bool recovery)
 {
        struct lnet_md md = { NULL };
        struct lnet_process_id id;
@@ -3590,7 +3565,7 @@ lnet_send_ping(lnet_nid_t dest_nid,
        md.max_size  = 0;
        md.options   = LNET_MD_TRUNCATE;
        md.user_ptr  = user_data;
-       md.eq_handle = eqh;
+       md.eq_handle = eq;
 
        rc = LNetMDBind(md, LNET_UNLINK, mdh);
        if (rc) {
@@ -3785,7 +3760,7 @@ clean_thread:
        lnet_clean_local_ni_recoveryq();
        lnet_clean_peer_ni_recoveryq();
        lnet_clean_resendqs();
-       LNetInvalidateEQHandle(&the_lnet.ln_mt_eqh);
+       the_lnet.ln_mt_eq = NULL;
        return rc;
 clean_queues:
        lnet_rsp_tracker_clean();