#include <linux/nsproxy.h>
#include <net/net_namespace.h>
-extern unsigned int lnet_current_net_count;
-
static int local_nid_dist_zero = 1;
module_param(local_nid_dist_zero, int, 0444);
MODULE_PARM_DESC(local_nid_dist_zero, "Reserved");
return rc;
}
+/*
+ * Return true if @lpni's aliveness deadline has already passed at @now,
+ * false while the peer_ni is still within its configured window. The
+ * deadline is the time the peer_ni was last known alive plus its net's
+ * lct_peer_timeout tunable (both in seconds).
+ */
+static bool
+lnet_is_peer_deadline_passed(struct lnet_peer_ni *lpni, time64_t now)
+{
+ time64_t deadline;
+
+ deadline = lpni->lpni_last_alive +
+ lpni->lpni_net->net_tunables.lct_peer_timeout;
+
+ /*
+ * assume peer_ni is alive as long as we're within the configured
+ * peer timeout
+ */
+ if (deadline > now)
+ return false;
+
+ return true;
+}
+
/* NB: returns 1 when alive, 0 when dead, negative when error;
* may drop the lnet_net_lock */
static int
lnet_peer_alive_locked(struct lnet_ni *ni, struct lnet_peer_ni *lpni,
struct lnet_msg *msg)
{
+ time64_t now = ktime_get_seconds();
+
if (!lnet_peer_aliveness_enabled(lpni))
return -ENODEV;
msg->msg_type == LNET_MSG_REPLY)
return 1;
+ if (!lnet_is_peer_deadline_passed(lpni, now))
+ return true;
+
return lnet_is_peer_ni_alive(lpni);
}
* Local Destination
* MR Peer
*
- * Run the selection algorithm on the peer NIs unless we're sending
- * a response, in this case just send to the destination
+ * Don't run the selection algorithm on the peer NIs. By specifying the
+ * local NID, we're also saying that we should always use the destination NID
+ * provided. This handles the case where we should be using the same
+ * destination NID for all the messages which belong to the same RPC
+ * request.
*/
static int
lnet_handle_spec_local_mr_dst(struct lnet_send_data *sd)
return -EINVAL;
}
- /*
- * only run the selection algorithm to pick the peer_ni if we're
- * sending a GET or a PUT. Responses are sent to the same
- * destination NID provided.
- */
- if (!(sd->sd_send_case & SND_RESP)) {
- sd->sd_best_lpni =
- lnet_find_best_lpni_on_net(sd, sd->sd_peer,
- sd->sd_best_ni->ni_net->net_id);
- }
-
if (sd->sd_best_lpni &&
sd->sd_best_lpni->lpni_nid == the_lnet.ln_loni->ni_nid)
return lnet_handle_lo_send(sd);
}
/* The peer may have changed. */
peer = lpni->lpni_peer_net->lpn_peer;
+ spin_lock(&peer->lp_lock);
+ if (lnet_peer_is_uptodate_locked(peer)) {
+ spin_unlock(&peer->lp_lock);
+ lnet_peer_ni_decref_locked(lpni);
+ return 0;
+ }
/* queue message and return */
msg->msg_rtr_nid_param = rtr_nid;
msg->msg_sending = 0;
msg->msg_txpeer = NULL;
- spin_lock(&peer->lp_lock);
list_add_tail(&msg->msg_list, &peer->lp_dc_pendq);
+ primary_nid = peer->lp_primary_nid;
spin_unlock(&peer->lp_lock);
+
lnet_peer_ni_decref_locked(lpni);
- primary_nid = peer->lp_primary_nid;
CDEBUG(D_NET, "msg %p delayed. %s pending discovery\n",
msg, libcfs_nid2str(primary_nid));
{
int rc;
struct lnet_peer *gw;
+ struct lnet_peer *lp;
+ struct lnet_peer_net *lpn;
+ struct lnet_peer_net *best_lpn = NULL;
+ struct lnet_remotenet *rnet;
struct lnet_route *best_route;
struct lnet_route *last_route;
struct lnet_peer_ni *lpni = NULL;
+ struct lnet_peer_ni *gwni = NULL;
lnet_nid_t src_nid = sd->sd_src_nid;
- best_route = lnet_find_route_locked(NULL, LNET_NIDNET(dst_nid),
+ /* we've already looked up the initial lpni using dst_nid */
+ lpni = sd->sd_best_lpni;
+ /* the peer tree must be in existence */
+ LASSERT(lpni && lpni->lpni_peer_net && lpni->lpni_peer_net->lpn_peer);
+ lp = lpni->lpni_peer_net->lpn_peer;
+
+ list_for_each_entry(lpn, &lp->lp_peer_nets, lpn_peer_nets) {
+ /* is this remote network reachable? */
+ rnet = lnet_find_rnet_locked(lpn->lpn_net_id);
+ if (!rnet)
+ continue;
+
+ if (!best_lpn)
+ best_lpn = lpn;
+
+ if (best_lpn->lpn_seq <= lpn->lpn_seq)
+ continue;
+
+ best_lpn = lpn;
+ }
+
+ if (!best_lpn) {
+ CERROR("peer %s has no available nets \n",
+ libcfs_nid2str(sd->sd_dst_nid));
+ return -EHOSTUNREACH;
+ }
+
+ sd->sd_best_lpni = lnet_find_best_lpni_on_net(sd, lp, best_lpn->lpn_net_id);
+ if (!sd->sd_best_lpni) {
+ CERROR("peer %s down\n", libcfs_nid2str(sd->sd_dst_nid));
+ return -EHOSTUNREACH;
+ }
+
+ best_route = lnet_find_route_locked(NULL, best_lpn->lpn_net_id,
sd->sd_rtr_nid, &last_route,
- &lpni);
+ &gwni);
if (!best_route) {
CERROR("no route to %s from %s\n",
libcfs_nid2str(dst_nid), libcfs_nid2str(src_nid));
return -EHOSTUNREACH;
}
- if (!lpni) {
+ if (!gwni) {
CERROR("Internal Error. Route expected to %s from %s\n",
libcfs_nid2str(dst_nid),
libcfs_nid2str(src_nid));
}
gw = best_route->lr_gateway;
- LASSERT(gw == lpni->lpni_peer_net->lpn_peer);
+ LASSERT(gw == gwni->lpni_peer_net->lpn_peer);
/*
* Discover this gateway if it hasn't already been discovered.
* completed
*/
sd->sd_msg->msg_src_nid_param = sd->sd_src_nid;
- rc = lnet_initiate_peer_discovery(lpni, sd->sd_msg, sd->sd_rtr_nid,
+ rc = lnet_initiate_peer_discovery(gwni, sd->sd_msg, sd->sd_rtr_nid,
sd->sd_cpt);
if (rc)
return rc;
return -EFAULT;
}
- *gw_lpni = lpni;
+ *gw_lpni = gwni;
*gw_peer = gw;
/*
- * increment the route sequence number since now we're sure we're
- * going to use it
+ * increment the sequence numbers since now we're sure we're
+ * going to use this path
*/
LASSERT(best_route && last_route);
best_route->lr_seq = last_route->lr_seq + 1;
+ best_lpn->lpn_seq++;
return 0;
}
return rc;
/*
- * TODO; One possible enhancement is to run the selection
- * algorithm on the peer. However for remote peers the credits are
- * not decremented, so we'll be basically going over the peer NIs
- * in round robin. An MR router will run the selection algorithm
- * on the next-hop interfaces.
+ * Now that we must route to the destination, we must consider the
+ * MR case, where the destination has multiple interfaces, some of
+ * which we can route to and others we do not. For this reason we
+ * need to select the destination which we can route to and if
+ * there are multiple, we need to round robin.
*/
rc = lnet_handle_find_routed_path(sd, sd->sd_dst_nid, &gw_lpni,
&gw_peer);
msg->msg_src_nid_param = src_nid;
/*
- * Now that we have a peer_ni, check if we want to discover
- * the peer. Traffic to the LNET_RESERVED_PORTAL should not
- * trigger discovery.
+ * If necessary, perform discovery on the peer that owns this peer_ni.
+ * Note, this can result in the ownership of this peer_ni changing
+ * to another peer object.
*/
- peer = lpni->lpni_peer_net->lpn_peer;
rc = lnet_initiate_peer_discovery(lpni, msg, rtr_nid, cpt);
if (rc) {
lnet_peer_ni_decref_locked(lpni);
}
lnet_peer_ni_decref_locked(lpni);
+ peer = lpni->lpni_peer_net->lpn_peer;
+
/*
* Identify the different send cases
*/
LASSERT(!msg->msg_tx_committed);
rc = lnet_select_pathway(src_nid, dst_nid, msg, rtr_nid);
- if (rc < 0)
+ if (rc < 0) {
+ if (rc == -EHOSTUNREACH)
+ msg->msg_health_status = LNET_MSG_STATUS_REMOTE_ERROR;
+ else
+ msg->msg_health_status = LNET_MSG_STATUS_LOCAL_ERROR;
return rc;
+ }
if (rc == LNET_CREDIT_OK)
lnet_ni_send(msg->msg_txni, msg);
return;
rspt = md->md_rspt_ptr;
- md->md_rspt_ptr = NULL;
/* debug code */
LASSERT(rspt->rspt_cpt == cpt);
- /*
- * invalidate the handle to indicate that a response has been
- * received, which will then lead the monitor thread to clean up
- * the rspt block.
- */
- LNetInvalidateMDHandle(&rspt->rspt_mdh);
+ md->md_rspt_ptr = NULL;
+
+ if (LNetMDHandleIsInvalid(rspt->rspt_mdh)) {
+ /*
+ * The monitor thread has invalidated this handle because the
+ * response timed out, but it failed to lookup the MD. That
+ * means this response tracker is on the zombie list. We can
+ * safely remove it under the resource lock (held by caller) and
+ * free the response tracker block.
+ */
+ list_del(&rspt->rspt_on_list);
+ lnet_rspt_free(rspt, cpt);
+ } else {
+ /*
+ * invalidate the handle to indicate that a response has been
+ * received, which will then lead the monitor thread to clean up
+ * the rspt block.
+ */
+ LNetInvalidateMDHandle(&rspt->rspt_mdh);
+ }
+}
+
+/*
+ * Free every response tracker remaining on the per-CPT zombie queues,
+ * then release the per-CPT queue array itself. Per the comments where
+ * trackers are moved onto ln_mt_zombie_rstqs, this runs at shutdown,
+ * after which the zombie lists see no further concurrent access —
+ * NOTE(review): no lock is taken here; confirm callers guarantee that.
+ */
+void
+lnet_clean_zombie_rstqs(void)
+{
+ struct lnet_rsp_tracker *rspt, *tmp;
+ int i;
+
+ cfs_cpt_for_each(i, lnet_cpt_table()) {
+ /* detach and free each stranded tracker on this CPT's list */
+ list_for_each_entry_safe(rspt, tmp,
+ the_lnet.ln_mt_zombie_rstqs[i],
+ rspt_on_list) {
+ list_del(&rspt->rspt_on_list);
+ lnet_rspt_free(rspt, i);
+ }
+ }
+
+ cfs_percpt_free(the_lnet.ln_mt_zombie_rstqs);
+}
static void
-lnet_finalize_expired_responses(bool force)
+lnet_finalize_expired_responses(void)
{
struct lnet_libmd *md;
struct list_head local_queue;
struct lnet_rsp_tracker *rspt, *tmp;
+ ktime_t now;
int i;
if (the_lnet.ln_mt_rstq == NULL)
list_splice_init(the_lnet.ln_mt_rstq[i], &local_queue);
lnet_net_unlock(i);
+ now = ktime_get();
+
list_for_each_entry_safe(rspt, tmp, &local_queue, rspt_on_list) {
/*
* The rspt mdh will be invalidated when a response
lnet_res_lock(i);
if (LNetMDHandleIsInvalid(rspt->rspt_mdh)) {
lnet_res_unlock(i);
- list_del_init(&rspt->rspt_on_list);
+ list_del(&rspt->rspt_on_list);
lnet_rspt_free(rspt, i);
continue;
}
- if (ktime_compare(ktime_get(), rspt->rspt_deadline) >= 0 ||
- force) {
+ if (ktime_compare(now, rspt->rspt_deadline) >= 0 ||
+ the_lnet.ln_mt_state == LNET_MT_STATE_SHUTDOWN) {
struct lnet_peer_ni *lpni;
lnet_nid_t nid;
md = lnet_handle2md(&rspt->rspt_mdh);
if (!md) {
+ /* MD has been queued for unlink, but
+ * rspt hasn't been detached (Note we've
+ * checked above that the rspt_mdh is
+ * valid). Since we cannot lookup the MD
+ * we're unable to detach the rspt
+ * ourselves. Thus, move the rspt to the
+ * zombie list where we'll wait for
+ * either:
+ * 1. The remaining operations on the
+ * MD to complete. In this case the
+ * final operation will result in
+ * lnet_msg_detach_md()->
+ * lnet_detach_rsp_tracker() where
+ * we will clean up this response
+ * tracker.
+ * 2. LNet to shutdown. In this case
+ * we'll wait until after all LND Nets
+ * have shutdown and then we can
+ * safely free any remaining response
+ * tracker blocks on the zombie list.
+ * Note: We need to hold the resource
+ * lock when adding to the zombie list
+ * because we may have concurrent access
+ * with lnet_detach_rsp_tracker().
+ */
LNetInvalidateMDHandle(&rspt->rspt_mdh);
+ list_move(&rspt->rspt_on_list,
+ the_lnet.ln_mt_zombie_rstqs[i]);
lnet_res_unlock(i);
- list_del_init(&rspt->rspt_on_list);
- lnet_rspt_free(rspt, i);
continue;
}
LASSERT(md->md_rspt_ptr == rspt);
md->md_rspt_ptr = NULL;
lnet_res_unlock(i);
- lnet_net_lock(i);
- the_lnet.ln_counters[i]->lct_health.lch_response_timeout_count++;
- lnet_net_unlock(i);
-
- list_del_init(&rspt->rspt_on_list);
+ LNetMDUnlink(rspt->rspt_mdh);
nid = rspt->rspt_next_hop_nid;
- CNETERR("Response timed out: md = %p: nid = %s\n",
- md, libcfs_nid2str(nid));
- LNetMDUnlink(rspt->rspt_mdh);
+ list_del(&rspt->rspt_on_list);
lnet_rspt_free(rspt, i);
+ /* If we're shutting down we just want to clean
+ * up the rspt blocks
+ */
+ if (the_lnet.ln_mt_state == LNET_MT_STATE_SHUTDOWN)
+ continue;
+
+ lnet_net_lock(i);
+ the_lnet.ln_counters[i]->lct_health.lch_response_timeout_count++;
+ lnet_net_unlock(i);
+
+ CDEBUG(D_NET,
+ "Response timeout: md = %p: nid = %s\n",
+ md, libcfs_nid2str(nid));
+
/*
* If there is a timeout on the response
* from the next hop decrement its health
}
}
- lnet_net_lock(i);
- if (!list_empty(&local_queue))
+ if (!list_empty(&local_queue)) {
+ lnet_net_lock(i);
list_splice(&local_queue, the_lnet.ln_mt_rstq[i]);
- lnet_net_unlock(i);
+ lnet_net_unlock(i);
+ }
}
}
lnet_net_unlock(0);
}
-static struct list_head **
-lnet_create_array_of_queues(void)
-{
- struct list_head **qs;
- struct list_head *q;
- int i;
-
- qs = cfs_percpt_alloc(lnet_cpt_table(),
- sizeof(struct list_head));
- if (!qs) {
- CERROR("Failed to allocate queues\n");
- return NULL;
- }
-
- cfs_percpt_for_each(q, i, qs)
- INIT_LIST_HEAD(q);
-
- return qs;
-}
-
static int
lnet_resendqs_create(void)
{
int interval;
time64_t now;
+ wait_for_completion(&the_lnet.ln_started);
/*
* The monitor thread takes care of the following:
* 1. Checks the aliveness of routers
lnet_resend_pending_msgs();
if (now >= rsp_timeout) {
- lnet_finalize_expired_responses(false);
+ lnet_finalize_expired_responses();
rsp_timeout = now + (lnet_transaction_timeout / 2);
}
min((unsigned int) alive_router_check_interval /
lnet_current_net_count,
lnet_transaction_timeout / 2));
- wait_event_interruptible_timeout(the_lnet.ln_mt_waitq,
- false,
- cfs_time_seconds(interval));
+ wait_for_completion_interruptible_timeout(
+ &the_lnet.ln_mt_wait_complete,
+ cfs_time_seconds(interval));
+ /* Must re-init the completion before testing anything,
+ * including ln_mt_state.
+ */
+ reinit_completion(&the_lnet.ln_mt_wait_complete);
}
/* Shutting down */
case LNET_EVENT_UNLINK:
CDEBUG(D_NET, "%s recovery ping unlinked\n",
libcfs_nid2str(ev_info->mt_nid));
+ /* fallthrough */
case LNET_EVENT_REPLY:
lnet_handle_recovery_reply(ev_info, event->status,
event->type == LNET_EVENT_UNLINK);
static void
lnet_rsp_tracker_clean(void)
{
- lnet_finalize_expired_responses(true);
+ lnet_finalize_expired_responses();
cfs_percpt_free(the_lnet.ln_mt_rstq);
the_lnet.ln_mt_rstq = NULL;
lnet_net_unlock(LNET_LOCK_EX);
/* tell the monitor thread that we're shutting down */
- wake_up(&the_lnet.ln_mt_waitq);
+ complete(&the_lnet.ln_mt_wait_complete);
/* block until monitor thread signals that it's done */
down(&the_lnet.ln_mt_signal);
lnet_clean_local_ni_recoveryq();
lnet_clean_peer_ni_recoveryq();
lnet_clean_resendqs();
-
- return;
}
void
return 0;
goto drop;
}
+
+ if (the_lnet.ln_routing)
+ lpni->lpni_last_alive = ktime_get_seconds();
+
msg->msg_rxpeer = lpni;
msg->msg_rxni = ni;
lnet_ni_addref_locked(ni, cpt);
* current net namespace.
* If not, assign order above 0xffff0000,
* to make this ni not a priority. */
- if (!net_eq(ni->ni_net_ns, current->nsproxy->net_ns))
- order += 0xffff0000;
-
+ if (current->nsproxy &&
+ !net_eq(ni->ni_net_ns, current->nsproxy->net_ns))
+ order += 0xffff0000;
if (srcnidp != NULL)
*srcnidp = ni->ni_nid;
if (orderp != NULL)