Whamcloud - gitweb
LU-12739 lnet: Don't queue msg when discovery has completed
[fs/lustre-release.git] / lnet / lnet / lib-move.c
index 49941de..88f20e7 100644 (file)
@@ -2009,15 +2009,21 @@ lnet_initiate_peer_discovery(struct lnet_peer_ni *lpni,
        }
        /* The peer may have changed. */
        peer = lpni->lpni_peer_net->lpn_peer;
+       spin_lock(&peer->lp_lock);
+       if (lnet_peer_is_uptodate_locked(peer)) {
+               spin_unlock(&peer->lp_lock);
+               lnet_peer_ni_decref_locked(lpni);
+               return 0;
+       }
        /* queue message and return */
        msg->msg_rtr_nid_param = rtr_nid;
        msg->msg_sending = 0;
        msg->msg_txpeer = NULL;
-       spin_lock(&peer->lp_lock);
        list_add_tail(&msg->msg_list, &peer->lp_dc_pendq);
+       primary_nid = peer->lp_primary_nid;
        spin_unlock(&peer->lp_lock);
+
        lnet_peer_ni_decref_locked(lpni);
-       primary_nid = peer->lp_primary_nid;
 
        CDEBUG(D_NET, "msg %p delayed. %s pending discovery\n",
                msg, libcfs_nid2str(primary_nid));
@@ -2671,11 +2677,10 @@ again:
        msg->msg_src_nid_param = src_nid;
 
        /*
-        * Now that we have a peer_ni, check if we want to discover
-        * the peer. Traffic to the LNET_RESERVED_PORTAL should not
-        * trigger discovery.
+        * If necessary, perform discovery on the peer that owns this peer_ni.
+        * Note, this can result in the ownership of this peer_ni changing
+        * to another peer object.
         */
-       peer = lpni->lpni_peer_net->lpn_peer;
        rc = lnet_initiate_peer_discovery(lpni, msg, rtr_nid, cpt);
        if (rc) {
                lnet_peer_ni_decref_locked(lpni);
@@ -2684,6 +2689,8 @@ again:
        }
        lnet_peer_ni_decref_locked(lpni);
 
+       peer = lpni->lpni_peer_net->lpn_peer;
+
        /*
         * Identify the different send cases
         */
@@ -3542,9 +3549,13 @@ lnet_monitor_thread(void *arg)
                               min((unsigned int) alive_router_check_interval /
                                        lnet_current_net_count,
                                   lnet_transaction_timeout / 2));
-               wait_event_interruptible_timeout(the_lnet.ln_mt_waitq,
-                                               false,
-                                               cfs_time_seconds(interval));
+               wait_for_completion_interruptible_timeout(
+                       &the_lnet.ln_mt_wait_complete,
+                       cfs_time_seconds(interval));
+               /* Must re-init the completion before testing anything,
+                * including ln_mt_state.
+                */
+               reinit_completion(&the_lnet.ln_mt_wait_complete);
        }
 
        /* Shutting down */
@@ -3699,6 +3710,7 @@ lnet_mt_event_handler(struct lnet_event *event)
        case LNET_EVENT_UNLINK:
                CDEBUG(D_NET, "%s recovery ping unlinked\n",
                       libcfs_nid2str(ev_info->mt_nid));
+               /* fallthrough */
        case LNET_EVENT_REPLY:
                lnet_handle_recovery_reply(ev_info, event->status,
                                           event->type == LNET_EVENT_UNLINK);
@@ -3808,7 +3820,7 @@ void lnet_monitor_thr_stop(void)
        lnet_net_unlock(LNET_LOCK_EX);
 
        /* tell the monitor thread that we're shutting down */
-       wake_up(&the_lnet.ln_mt_waitq);
+       complete(&the_lnet.ln_mt_wait_complete);
 
        /* block until monitor thread signals that it's done */
        down(&the_lnet.ln_mt_signal);