Whamcloud - gitweb
i=maxim,b=18460,b=20171:
[fs/lustre-release.git] / lnet / lnet / router.c
index 9714c21..b89c5f3 100644 (file)
@@ -77,15 +77,19 @@ static int check_routers_before_use = 0;
 CFS_MODULE_PARM(check_routers_before_use, "i", int, 0444,
                 "Assume routers are down and ping them before use");
 
-static int dead_router_check_interval = 0;
+static int avoid_asym_router_failure = 0;
+CFS_MODULE_PARM(avoid_asym_router_failure, "i", int, 0444,
+                "Avoid asymmetrical failures: reserved, use at your own risk");
+
+int dead_router_check_interval = 0;
 CFS_MODULE_PARM(dead_router_check_interval, "i", int, 0444,
                 "Seconds between dead router health checks (<= 0 to disable)");
 
-static int live_router_check_interval = 0;
+int live_router_check_interval = 0;
 CFS_MODULE_PARM(live_router_check_interval, "i", int, 0444,
                 "Seconds between live router health checks (<= 0 to disable)");
 
-static int router_ping_timeout = 50;
+int router_ping_timeout = 50;
 CFS_MODULE_PARM(router_ping_timeout, "i", int, 0444,
                 "Seconds to wait for the reply to a router health query");
 
@@ -96,9 +100,9 @@ lnet_peers_start_down(void)
 }
 
 void
-lnet_notify_locked(lnet_peer_t *lp, int notifylnd, int alive, time_t when)
+lnet_notify_locked(lnet_peer_t *lp, int notifylnd, int alive, cfs_time_t when)
 {
-        if (when < lp->lp_timestamp) {          /* out of date information */
+        if (cfs_time_before(when, lp->lp_timestamp)) { /* out of date information */
                 CDEBUG(D_NET, "Out of date\n");
                 return;
         }
@@ -201,6 +205,12 @@ lnet_rtr_decref_locked(lnet_peer_t *lp)
 
         lp->lp_rtr_refcount--;
         if (lp->lp_rtr_refcount == 0) {
+                if (lp->lp_rcd != NULL) {
+                        list_add(&lp->lp_rcd->rcd_list,
+                                 &the_lnet.ln_zombie_rcd);
+                        lp->lp_rcd = NULL;
+                }
+
                 list_del(&lp->lp_rtr_list);
                 /* decref for the_lnet.ln_routers */
                 lnet_peer_decref_locked(lp);
@@ -496,6 +506,103 @@ lnet_get_route (int idx, __u32 *net, __u32 *hops,
 }
 
 void
+lnet_swap_pinginfo(lnet_ping_info_t *info)
+{
+        int               i;
+        lnet_ni_status_t *stat;
+
+        __swab32s(&info->pi_version);
+        __swab32s(&info->pi_pid);
+        __swab32s(&info->pi_nnis);
+        for (i = 0; i < info->pi_nnis && i < LNET_MAX_RTR_NIS; i++) {
+                stat = &info->pi_ni[i];
+                __swab64s(&stat->ns_nid);
+                __swab32s(&stat->ns_status);
+        }
+        return;
+}
+
+/* Returns # of down NIs, or negative error codes; ignore downed NIs
+ * if a NI in 'net' is up */
+int
+lnet_router_down_ni(lnet_peer_t *rtr, __u32 net)
+{
+        int               i;
+        int               down = 0;
+        int               ptl_up = 0;
+        int               ptl_down = 0;
+        lnet_ping_info_t *info;
+
+        if (!avoid_asym_router_failure)
+                return -ENOENT;
+
+        if (rtr->lp_rcd == NULL)
+                return -EINVAL;
+
+        if (!rtr->lp_alive)
+                return -EINVAL;  /* stale lp_rcd */
+
+        info = rtr->lp_rcd->rcd_pinginfo;
+        LASSERT (info != NULL);
+
+        /* NB always racing with network! */
+        if (info->pi_magic == __swab32(LNET_PROTO_PING_MAGIC)) {
+                lnet_swap_pinginfo(info);
+        } else if (info->pi_magic != LNET_PROTO_PING_MAGIC) {
+                CDEBUG(D_NETERROR, "%s: Unexpected magic %08x\n",
+                       libcfs_nid2str(rtr->lp_nid), info->pi_magic);
+                return -EPROTO;
+        }
+
+        if (info->pi_version == LNET_PROTO_PING_VERSION1)
+                return -ENOENT;  /* v1 doesn't carry NI status info */
+
+        if (info->pi_version != LNET_PROTO_PING_VERSION) {
+                CDEBUG(D_NETERROR, "%s: Unexpected version 0x%x\n",
+                       libcfs_nid2str(rtr->lp_nid), info->pi_version);
+                return -EPROTO;
+        }
+
+        for (i = 0; i < info->pi_nnis && i < LNET_MAX_RTR_NIS; i++) {
+                lnet_ni_status_t *stat = &info->pi_ni[i];
+                lnet_nid_t        nid = stat->ns_nid;
+
+                if (nid == LNET_NID_ANY) {
+                        CDEBUG(D_NETERROR, "%s: unexpected LNET_NID_ANY\n",
+                               libcfs_nid2str(rtr->lp_nid));
+                        return -EPROTO;
+                }
+
+                if (LNET_NETTYP(LNET_NIDNET(nid)) == LOLND)
+                        continue;
+
+                if (stat->ns_status == LNET_NI_STATUS_DOWN) {
+                        if (LNET_NETTYP(LNET_NIDNET(nid)) == PTLLND)
+                                ptl_down = 1;
+                        else
+                                down++;
+                        continue;
+                }
+
+                if (stat->ns_status != LNET_NI_STATUS_UP) {
+                        CDEBUG(D_NETERROR, "%s: Unexpected status 0x%x\n",
+                               libcfs_nid2str(rtr->lp_nid), stat->ns_status);
+                        return -EPROTO;
+                }
+
+                /* ignore downed NIs if there's a NI up for dest network */
+                if (LNET_NIDNET(nid) == net)
+                        return 0;
+
+                if (LNET_NETTYP(LNET_NIDNET(nid)) == PTLLND)
+                        ptl_up = 1;
+        }
+
+        /* ptl NIs are considered down only when they're all down */
+        return down + (ptl_up ? 0 : ptl_down);
+}
+
+void
 lnet_wait_known_routerstate(void)
 {
         lnet_peer_t         *rtr;
@@ -535,11 +642,17 @@ lnet_router_checker_event (lnet_event_t *event)
         /* CAVEAT EMPTOR: I'm called with LNET_LOCKed and I'm not allowed to
          * drop it (that's how come I see _every_ event, even ones that would
          * overflow my EQ) */
-        lnet_peer_t   *lp;
-        lnet_nid_t     nid;
+        lnet_rc_data_t *rcd = event->md.user_ptr;
+        lnet_peer_t    *lp;
+        lnet_nid_t      nid;
 
         if (event->unlinked) {
-                /* The router checker thread has unlinked the rc_md
+                if (rcd != NULL) {
+                        LNetInvalidateHandle(&rcd->rcd_mdh);
+                        return;
+                }
+
+                /* The router checker thread has unlinked the default rc_md
                  * and exited. */
                 LASSERT (the_lnet.ln_rc_state == LNET_RC_STATE_UNLINKING);
                 the_lnet.ln_rc_state = LNET_RC_STATE_UNLINKED;
@@ -575,7 +688,7 @@ lnet_router_checker_event (lnet_event_t *event)
                  * apps get burned). */
 
                 lnet_notify_locked(lp, 1, (event->status == 0),
-                                   cfs_time_current_sec());
+                                   cfs_time_current());
 
                 /* The router checker will wake up very shortly and do the
                  * actual notification.  
@@ -591,6 +704,100 @@ lnet_router_checker_event (lnet_event_t *event)
         lnet_peer_decref_locked(lp);
 }
 
+void
+lnet_update_ni_status(void)
+{
+        cfs_time_t now = cfs_time_current();
+        lnet_ni_t *ni;
+        int        status;
+        int        timeout;
+
+        LASSERT (the_lnet.ln_routing);
+
+        timeout = router_ping_timeout +
+                  MAX(live_router_check_interval, dead_router_check_interval);
+
+        LNET_LOCK();
+
+        list_for_each_entry (ni, &the_lnet.ln_nis, ni_list) {
+                lnet_ni_status_t *ns = ni->ni_status;
+
+                LASSERT (ns != NULL);
+
+                status = LNET_NI_STATUS_UP;
+                if (ni->ni_lnd->lnd_type != LOLND &&  /* @lo forever alive */
+                    cfs_time_after(now, cfs_time_add(ni->ni_last_alive,
+                                                     cfs_time_seconds(timeout))))
+                        status = LNET_NI_STATUS_DOWN;
+
+                if (ns->ns_status != status) {
+                        ns->ns_status = status;
+                        CDEBUG(D_NET, "NI(%s:%d) status changed to %s\n",
+                               libcfs_nid2str(ni->ni_nid), timeout,
+                               status == LNET_NI_STATUS_UP ? "up" : "down");
+                }
+        }
+
+        LNET_UNLOCK();
+}
+
+void
+lnet_destroy_rc_data (lnet_rc_data_t *rcd)
+{
+        LASSERT (list_empty(&rcd->rcd_list));
+        /* detached from network */
+        LASSERT (LNetHandleIsInvalid(rcd->rcd_mdh));
+
+        LIBCFS_FREE(rcd->rcd_pinginfo, LNET_PINGINFO_SIZE);
+        LIBCFS_FREE(rcd, sizeof(*rcd));
+        return;
+}
+
+lnet_rc_data_t *
+lnet_create_rc_data (void)
+{
+        int               i;
+        int               rc;
+        lnet_ping_info_t *pi;
+        lnet_rc_data_t   *rcd;
+
+        LIBCFS_ALLOC(rcd, sizeof(*rcd));
+        if (rcd == NULL)
+                return NULL;
+
+        LIBCFS_ALLOC(pi, LNET_PINGINFO_SIZE);
+        if (pi == NULL) {
+                LIBCFS_FREE(rcd, sizeof(*rcd));
+                return NULL;
+        }
+
+        memset(pi, 0, LNET_PINGINFO_SIZE);
+        for (i = 0; i < LNET_MAX_RTR_NIS; i++) {
+                pi->pi_ni[i].ns_nid = LNET_NID_ANY;
+                pi->pi_ni[i].ns_status = LNET_NI_STATUS_INVALID;
+        }
+        rcd->rcd_pinginfo = pi;
+        LNetInvalidateHandle(&rcd->rcd_mdh);
+        CFS_INIT_LIST_HEAD(&rcd->rcd_list);
+
+        LASSERT (!LNetHandleIsInvalid(the_lnet.ln_rc_eqh));
+        rc = LNetMDBind((lnet_md_t){.start     = pi,
+                                    .user_ptr  = rcd,
+                                    .length    = LNET_PINGINFO_SIZE,
+                                    .threshold = LNET_MD_THRESH_INF,
+                                    .options   = LNET_MD_TRUNCATE,
+                                    .eq_handle = the_lnet.ln_rc_eqh},
+                        LNET_UNLINK,
+                        &rcd->rcd_mdh);
+        if (rc < 0) {
+                CERROR("Can't bind MD: %d\n", rc);
+                lnet_destroy_rc_data(rcd);
+                return NULL;
+        }
+        LASSERT (rc == 0);
+        return rcd;
+}
+
 static int
 lnet_router_check_interval (lnet_peer_t *rtr)
 {
@@ -607,23 +814,42 @@ lnet_router_check_interval (lnet_peer_t *rtr)
 static void
 lnet_ping_router_locked (lnet_peer_t *rtr)
 {
-        lnet_process_id_t id;
-        int               secs;
-        time_t            now = cfs_time_current_sec();
+        int             newrcd = 0;
+        lnet_rc_data_t *rcd = NULL;
+        cfs_time_t      now = cfs_time_current();
+        int             secs;
 
         lnet_peer_addref_locked(rtr);
 
         if (rtr->lp_ping_deadline != 0 && /* ping timed out? */
-            now > rtr->lp_ping_deadline)
+            cfs_time_after(now, rtr->lp_ping_deadline))
                 lnet_notify_locked(rtr, 1, 0, now);
 
+        if (avoid_asym_router_failure && rtr->lp_rcd == NULL)
+                newrcd = 1;
+
         LNET_UNLOCK();
 
         /* Run any outstanding notifications */
         lnet_do_notify(rtr);
 
+        if (newrcd)
+                rcd = lnet_create_rc_data();
+
         LNET_LOCK();
 
+        if (!lnet_isrouter(rtr)) {
+                lnet_peer_decref_locked(rtr);
+                if (rcd != NULL)
+                        list_add(&rcd->rcd_list, &the_lnet.ln_zombie_rcd);
+                return; /* router table changed! */
+        }
+
+        if (rcd != NULL) {
+                LASSERT (rtr->lp_rcd == NULL);
+                rtr->lp_rcd = rcd;
+        }
+
         secs = lnet_router_check_interval(rtr);
 
         CDEBUG(D_NET,
@@ -634,23 +860,32 @@ lnet_ping_router_locked (lnet_peer_t *rtr)
                rtr->lp_alive, rtr->lp_alive_count, rtr->lp_ping_timestamp);
 
         if (secs != 0 && !rtr->lp_ping_notsent &&
-            now > rtr->lp_ping_timestamp + secs) {
+            cfs_time_after(now, cfs_time_add(rtr->lp_ping_timestamp,
+                                             cfs_time_seconds(secs)))) {
+                int               rc;
+                lnet_process_id_t id;
+                lnet_handle_md_t  mdh;
+
                 id.nid = rtr->lp_nid;
                 id.pid = LUSTRE_SRV_LNET_PID;
                 CDEBUG(D_NET, "Check: %s\n", libcfs_id2str(id));
 
                 rtr->lp_ping_notsent   = 1;
                 rtr->lp_ping_timestamp = now;
+                mdh = (rtr->lp_rcd == NULL) ? the_lnet.ln_rc_mdh :
+                                              rtr->lp_rcd->rcd_mdh;
 
                 if (rtr->lp_ping_deadline == 0)
-                        rtr->lp_ping_deadline = now + router_ping_timeout;
+                        rtr->lp_ping_deadline = cfs_time_shift(router_ping_timeout);
 
                 LNET_UNLOCK();
 
-                LNetGet(LNET_NID_ANY, the_lnet.ln_rc_mdh, id,
-                        LNET_RESERVED_PORTAL, LNET_PROTO_PING_MATCHBITS, 0);
+                rc = LNetGet(LNET_NID_ANY, mdh, id, LNET_RESERVED_PORTAL,
+                             LNET_PROTO_PING_MATCHBITS, 0);
 
                 LNET_LOCK();
+                if (rc != 0)
+                        rtr->lp_ping_notsent = 0; /* no event pending */
         }
 
         lnet_peer_decref_locked(rtr);
@@ -732,7 +967,8 @@ lnet_router_checker_start(void)
                 return -EINVAL;
         }
 
-        if (live_router_check_interval <= 0 &&
+        if (!the_lnet.ln_routing &&
+            live_router_check_interval <= 0 &&
             dead_router_check_interval <= 0)
                 return 0;
 
@@ -753,6 +989,7 @@ lnet_router_checker_start(void)
         }
 
         memset(&md, 0, sizeof(md));
+        md.user_ptr  = NULL;
         md.start     = &pinginfo;
         md.length    = sizeof(pinginfo);
         md.options   = LNET_MD_TRUNCATE;
@@ -824,6 +1061,68 @@ lnet_router_checker_stop (void)
 
 #if defined(__KERNEL__) && defined(LNET_ROUTER)
 
+static void
+lnet_prune_zombie_rcd (int wait_unlink)
+{
+        lnet_rc_data_t   *rcd;
+        lnet_rc_data_t   *tmp;
+        struct list_head  free_rcd;
+        int               i;
+        __u64             version;
+
+        CFS_INIT_LIST_HEAD(&free_rcd);
+
+        LNET_LOCK();
+rescan:
+        version = the_lnet.ln_routers_version;
+        list_for_each_entry_safe (rcd, tmp, &the_lnet.ln_zombie_rcd, rcd_list) {
+                if (LNetHandleIsInvalid(rcd->rcd_mdh)) {
+                        list_del(&rcd->rcd_list);
+                        list_add(&rcd->rcd_list, &free_rcd);
+                        continue;
+                }
+
+                LNET_UNLOCK();
+
+                LNetMDUnlink(rcd->rcd_mdh);
+
+                LNET_LOCK();
+                if (version != the_lnet.ln_routers_version)
+                        goto rescan;
+        }
+
+        i = 2;
+        while (wait_unlink && !list_empty(&the_lnet.ln_zombie_rcd)) {
+                rcd = list_entry(the_lnet.ln_zombie_rcd.next,
+                                 lnet_rc_data_t, rcd_list);
+                if (LNetHandleIsInvalid(rcd->rcd_mdh)) {
+                        list_del(&rcd->rcd_list);
+                        list_add(&rcd->rcd_list, &free_rcd);
+                        continue;
+                }
+
+                LNET_UNLOCK();
+
+                LNetMDUnlink(rcd->rcd_mdh);
+
+                i++;
+                CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET,
+                       "Waiting for rc buffers to unlink\n");
+                cfs_pause(cfs_time_seconds(1));
+
+                LNET_LOCK();
+        }
+
+        LNET_UNLOCK();
+
+        while (!list_empty(&free_rcd)) {
+                rcd = list_entry(free_rcd.next, lnet_rc_data_t, rcd_list);
+                list_del_init(&rcd->rcd_list);
+                lnet_destroy_rc_data(rcd);
+        }
+        return;
+}
+
 static int
 lnet_router_checker(void *arg)
 {
@@ -859,6 +1158,11 @@ rescan:
 
                 LNET_UNLOCK();
 
+                if (the_lnet.ln_routing)
+                        lnet_update_ni_status();
+
+                lnet_prune_zombie_rcd(0); /* don't wait for UNLINK */
+
                 /* Call cfs_pause() here always adds 1 to load average 
                  * because kernel counts # active tasks as nr_running 
                  * + nr_uninterruptible. */
@@ -866,6 +1170,23 @@ rescan:
                                      cfs_time_seconds(1));
         }
 
+        LNET_LOCK();
+
+        list_for_each (entry, &the_lnet.ln_routers) {
+                rtr = list_entry(entry, lnet_peer_t, lp_rtr_list);
+
+                if (rtr->lp_rcd == NULL)
+                        continue;
+
+                LASSERT (list_empty(&rtr->lp_rcd->rcd_list));
+                list_add(&rtr->lp_rcd->rcd_list, &the_lnet.ln_zombie_rcd);
+                rtr->lp_rcd = NULL;
+        }
+
+        LNET_UNLOCK();
+
+        lnet_prune_zombie_rcd(1); /* wait for UNLINK */
+
         LASSERT (the_lnet.ln_rc_state == LNET_RC_STATE_STOPTHREAD);
         the_lnet.ln_rc_state = LNET_RC_STATE_UNLINKING;
 
@@ -1079,10 +1400,10 @@ lnet_alloc_rtrpools(int im_a_router)
 }
 
 int
-lnet_notify (lnet_ni_t *ni, lnet_nid_t nid, int alive, time_t when)
+lnet_notify (lnet_ni_t *ni, lnet_nid_t nid, int alive, cfs_time_t when)
 {
-        lnet_peer_t         *lp = NULL;
-        time_t               now = cfs_time_current_sec();
+        lnet_peer_t *lp = NULL;
+        cfs_time_t   now = cfs_time_current();
 
         LASSERT (!in_interrupt ());
 
@@ -1100,12 +1421,12 @@ lnet_notify (lnet_ni_t *ni, lnet_nid_t nid, int alive, time_t when)
         }
 
         /* can't do predictions... */
-        if (when > now) {
+        if (cfs_time_after(when, now)) {
                 CWARN ("Ignoring prediction from %s of %s %s "
                        "%ld seconds in the future\n",
                        (ni == NULL) ? "userspace" : libcfs_nid2str(ni->ni_nid),
                        libcfs_nid2str(nid), alive ? "up" : "down",
-                       when - now);
+                       cfs_duration_sec(cfs_time_sub(when, now)));
                 return -EINVAL;
         }
 
@@ -1156,7 +1477,7 @@ lnet_get_tunables (void)
 #else
 
 int
-lnet_notify (lnet_ni_t *ni, lnet_nid_t nid, int alive, time_t when)
+lnet_notify (lnet_ni_t *ni, lnet_nid_t nid, int alive, cfs_time_t when)
 {
         return -EOPNOTSUPP;
 }