Whamcloud - gitweb
LU-2466 lnet: Use Hash Table for Remote Route List
[fs/lustre-release.git] / lnet / lnet / router.c
index 49fa60f..80c0a59 100644 (file)
@@ -1,7 +1,7 @@
 /*
  * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
  *
- * Copyright (c) 2011, Whamcloud, Inc.
+ * Copyright (c) 2011, 2012, Intel Corporation.
  *
  *   This file is part of Portals
  *   http://sourceforge.net/projects/sandiaportals/
@@ -82,23 +82,23 @@ lnet_peer_buffer_credits(lnet_ni_t *ni)
 
 static int check_routers_before_use = 0;
 CFS_MODULE_PARM(check_routers_before_use, "i", int, 0444,
-                "Assume routers are down and ping them before use");
+               "Assume routers are down and ping them before use");
 
-static int avoid_asym_router_failure = 0;
-CFS_MODULE_PARM(avoid_asym_router_failure, "i", int, 0444,
-                "Avoid asymmetrical failures: reserved, use at your own risk");
+static int avoid_asym_router_failure = 1;
+CFS_MODULE_PARM(avoid_asym_router_failure, "i", int, 0644,
+               "Avoid asymmetrical router failures (0 to disable)");
 
-static int dead_router_check_interval = 0;
-CFS_MODULE_PARM(dead_router_check_interval, "i", int, 0444,
-                "Seconds between dead router health checks (<= 0 to disable)");
+static int dead_router_check_interval = 60;
+CFS_MODULE_PARM(dead_router_check_interval, "i", int, 0644,
+               "Seconds between dead router health checks (<= 0 to disable)");
 
-static int live_router_check_interval = 0;
-CFS_MODULE_PARM(live_router_check_interval, "i", int, 0444,
-                "Seconds between live router health checks (<= 0 to disable)");
+static int live_router_check_interval = 60;
+CFS_MODULE_PARM(live_router_check_interval, "i", int, 0644,
+               "Seconds between live router health checks (<= 0 to disable)");
 
 static int router_ping_timeout = 50;
-CFS_MODULE_PARM(router_ping_timeout, "i", int, 0444,
-                "Seconds to wait for the reply to a router health query");
+CFS_MODULE_PARM(router_ping_timeout, "i", int, 0644,
+               "Seconds to wait for the reply to a router health query");
 
 int
 lnet_peers_start_down(void)
@@ -130,9 +130,9 @@ lnet_notify_locked(lnet_peer_t *lp, int notifylnd, int alive, cfs_time_t when)
         lp->lp_notify = 1;
         lp->lp_notifylnd |= notifylnd;
        if (lp->lp_alive)
-               lp->lp_ping_version = LNET_PROTO_PING_UNKNOWN; /* reset */
+               lp->lp_ping_feats = LNET_PING_FEAT_INVAL; /* reset */
 
-        CDEBUG(D_NET, "set %s %d\n", libcfs_nid2str(lp->lp_nid), alive);
+       CDEBUG(D_NET, "set %s %d\n", libcfs_nid2str(lp->lp_nid), alive);
 }
 
 void
@@ -227,18 +227,20 @@ lnet_rtr_decref_locked(lnet_peer_t *lp)
 lnet_remotenet_t *
 lnet_find_net_locked (__u32 net)
 {
-        lnet_remotenet_t *rnet;
-        cfs_list_t       *tmp;
+       lnet_remotenet_t        *rnet;
+       cfs_list_t              *tmp;
+       cfs_list_t              *rn_list;
 
-        LASSERT (!the_lnet.ln_shutdown);
+       LASSERT(!the_lnet.ln_shutdown);
 
-        cfs_list_for_each (tmp, &the_lnet.ln_remote_nets) {
-                rnet = cfs_list_entry(tmp, lnet_remotenet_t, lrn_list);
+       rn_list = lnet_net2rnethash(net);
+       cfs_list_for_each(tmp, rn_list) {
+               rnet = cfs_list_entry(tmp, lnet_remotenet_t, lrn_list);
 
-                if (rnet->lrn_net == net)
-                        return rnet;
-        }
-        return NULL;
+               if (rnet->lrn_net == net)
+                       return rnet;
+       }
+       return NULL;
 }
 
 static void lnet_shuffle_seed(void)
@@ -294,8 +296,8 @@ lnet_add_route_to_rnet (lnet_remotenet_t *rnet, lnet_route_t *route)
         cfs_list_add(&route->lr_list, e);
        cfs_list_add(&route->lr_gwlist, &route->lr_gateway->lp_routes);
 
-        the_lnet.ln_remote_nets_version++;
-        lnet_rtr_addref_locked(route->lr_gateway);
+       the_lnet.ln_remote_nets_version++;
+       lnet_rtr_addref_locked(route->lr_gateway);
 }
 
 int
@@ -351,7 +353,7 @@ lnet_add_route (__u32 net, unsigned int hops, lnet_nid_t gateway)
                LIBCFS_FREE(rnet, sizeof(*rnet));
 
                if (rc == -EHOSTUNREACH) { /* gateway is not on a local net */
-                        return 0;               /* ignore the route entry */
+                       return 0;       /* ignore the route entry */
                } else {
                        CERROR("Error %d creating route %s %d %s\n", rc,
                               libcfs_net2str(net), hops,
@@ -365,7 +367,7 @@ lnet_add_route (__u32 net, unsigned int hops, lnet_nid_t gateway)
         rnet2 = lnet_find_net_locked(net);
         if (rnet2 == NULL) {
                 /* new network */
-                cfs_list_add_tail(&rnet->lrn_list, &the_lnet.ln_remote_nets);
+               cfs_list_add_tail(&rnet->lrn_list, lnet_net2rnethash(net));
                 rnet2 = rnet;
         }
 
@@ -404,10 +406,10 @@ lnet_add_route (__u32 net, unsigned int hops, lnet_nid_t gateway)
        if (!add_route)
                LIBCFS_FREE(route, sizeof(*route));
 
-        if (rnet != rnet2)
-                LIBCFS_FREE(rnet, sizeof(*rnet));
+       if (rnet != rnet2)
+               LIBCFS_FREE(rnet, sizeof(*rnet));
 
-        return 0;
+       return 0;
 }
 
 int
@@ -419,39 +421,47 @@ lnet_check_routes(void)
        cfs_list_t              *e1;
        cfs_list_t              *e2;
        int                     cpt;
+       cfs_list_t              *rn_list;
+       int                     i;
 
        cpt = lnet_net_lock_current();
 
-       cfs_list_for_each(e1, &the_lnet.ln_remote_nets) {
-               rnet = cfs_list_entry(e1, lnet_remotenet_t, lrn_list);
+       for (i = 0; i < LNET_REMOTE_NETS_HASH_SIZE; i++) {
+               rn_list = &the_lnet.ln_remote_nets_hash[i];
+               cfs_list_for_each(e1, rn_list) {
+                       rnet = cfs_list_entry(e1, lnet_remotenet_t, lrn_list);
 
-               route2 = NULL;
-               cfs_list_for_each(e2, &rnet->lrn_routes) {
-                       lnet_nid_t      nid1;
-                       lnet_nid_t      nid2;
-                       int             net;
+                       route2 = NULL;
+                       cfs_list_for_each(e2, &rnet->lrn_routes) {
+                               lnet_nid_t      nid1;
+                               lnet_nid_t      nid2;
+                               int             net;
 
-                       route = cfs_list_entry(e2, lnet_route_t, lr_list);
+                               route = cfs_list_entry(e2, lnet_route_t,
+                                                      lr_list);
 
-                       if (route2 == NULL) {
-                               route2 = route;
-                               continue;
-                       }
+                               if (route2 == NULL) {
+                                       route2 = route;
+                                       continue;
+                               }
 
-                       if (route->lr_gateway->lp_ni ==
-                           route2->lr_gateway->lp_ni)
-                               continue;
+                               if (route->lr_gateway->lp_ni ==
+                                   route2->lr_gateway->lp_ni)
+                                       continue;
 
-                       nid1 = route->lr_gateway->lp_nid;
-                       nid2 = route2->lr_gateway->lp_nid;
-                       net = rnet->lrn_net;
+                               nid1 = route->lr_gateway->lp_nid;
+                               nid2 = route2->lr_gateway->lp_nid;
+                               net = rnet->lrn_net;
 
-                       lnet_net_unlock(cpt);
+                               lnet_net_unlock(cpt);
 
-                       CERROR("Routes to %s via %s and %s not supported\n",
-                              libcfs_net2str(net), libcfs_nid2str(nid1),
-                              libcfs_nid2str(nid2));
-                       return -EINVAL;
+                               CERROR("Routes to %s via %s and %s not "
+                                      "supported\n",
+                                      libcfs_net2str(net),
+                                      libcfs_nid2str(nid1),
+                                      libcfs_nid2str(nid2));
+                               return -EINVAL;
+                       }
                }
        }
 
@@ -460,33 +470,39 @@ lnet_check_routes(void)
 }
 
 int
-lnet_del_route (__u32 net, lnet_nid_t gw_nid)
+lnet_del_route(__u32 net, lnet_nid_t gw_nid)
 {
        struct lnet_peer        *gateway;
-        lnet_remotenet_t    *rnet;
-        lnet_route_t        *route;
-        cfs_list_t          *e1;
-        cfs_list_t          *e2;
-        int                  rc = -ENOENT;
+       lnet_remotenet_t        *rnet;
+       lnet_route_t            *route;
+       cfs_list_t              *e1;
+       cfs_list_t              *e2;
+       int                     rc = -ENOENT;
+       cfs_list_t              *rn_list;
+       int                     idx = 0;
 
-        CDEBUG(D_NET, "Del route: net %s : gw %s\n",
-               libcfs_net2str(net), libcfs_nid2str(gw_nid));
+       CDEBUG(D_NET, "Del route: net %s : gw %s\n",
+              libcfs_net2str(net), libcfs_nid2str(gw_nid));
 
-        /* NB Caller may specify either all routes via the given gateway
-         * or a specific route entry actual NIDs) */
+       /* NB Caller may specify either all routes via the given gateway
+        * or a specific route entry (actual NIDs) */
 
- again:
        lnet_net_lock(LNET_LOCK_EX);
+       if (net == LNET_NIDNET(LNET_NID_ANY))
+               rn_list = &the_lnet.ln_remote_nets_hash[0];
+       else
+               rn_list = lnet_net2rnethash(net);
 
-        cfs_list_for_each (e1, &the_lnet.ln_remote_nets) {
-                rnet = cfs_list_entry(e1, lnet_remotenet_t, lrn_list);
+ again:
+       cfs_list_for_each(e1, rn_list) {
+               rnet = cfs_list_entry(e1, lnet_remotenet_t, lrn_list);
 
-                if (!(net == LNET_NIDNET(LNET_NID_ANY) ||
-                      net == rnet->lrn_net))
-                        continue;
+               if (!(net == LNET_NIDNET(LNET_NID_ANY) ||
+                       net == rnet->lrn_net))
+                       continue;
 
-                cfs_list_for_each (e2, &rnet->lrn_routes) {
-                        route = cfs_list_entry(e2, lnet_route_t, lr_list);
+               cfs_list_for_each(e2, &rnet->lrn_routes) {
+                       route = cfs_list_entry(e2, lnet_route_t, lr_list);
 
                        gateway = route->lr_gateway;
                        if (!(gw_nid == LNET_NID_ANY ||
@@ -495,29 +511,36 @@ lnet_del_route (__u32 net, lnet_nid_t gw_nid)
 
                        cfs_list_del(&route->lr_list);
                        cfs_list_del(&route->lr_gwlist);
-                        the_lnet.ln_remote_nets_version++;
+                       the_lnet.ln_remote_nets_version++;
 
-                        if (cfs_list_empty(&rnet->lrn_routes))
-                                cfs_list_del(&rnet->lrn_list);
-                        else
-                                rnet = NULL;
+                       if (cfs_list_empty(&rnet->lrn_routes))
+                               cfs_list_del(&rnet->lrn_list);
+                       else
+                               rnet = NULL;
 
                        lnet_rtr_decref_locked(gateway);
                        lnet_peer_decref_locked(gateway);
 
                        lnet_net_unlock(LNET_LOCK_EX);
 
-                        LIBCFS_FREE(route, sizeof (*route));
+                       LIBCFS_FREE(route, sizeof(*route));
 
-                        if (rnet != NULL)
-                                LIBCFS_FREE(rnet, sizeof(*rnet));
+                       if (rnet != NULL)
+                               LIBCFS_FREE(rnet, sizeof(*rnet));
 
-                        rc = 0;
-                        goto again;
-                }
-        }
+                       rc = 0;
+                       lnet_net_lock(LNET_LOCK_EX);
+                       goto again;
+               }
+       }
 
+       if (net == LNET_NIDNET(LNET_NID_ANY) &&
+           ++idx < LNET_REMOTE_NETS_HASH_SIZE) {
+               rn_list = &the_lnet.ln_remote_nets_hash[idx];
+               goto again;
+       }
        lnet_net_unlock(LNET_LOCK_EX);
+
        return rc;
 }
 
@@ -536,22 +559,28 @@ lnet_get_route(int idx, __u32 *net, __u32 *hops,
        lnet_remotenet_t        *rnet;
        lnet_route_t            *route;
        int                     cpt;
+       int                     i;
+       cfs_list_t              *rn_list;
 
        cpt = lnet_net_lock_current();
 
-        cfs_list_for_each (e1, &the_lnet.ln_remote_nets) {
-                rnet = cfs_list_entry(e1, lnet_remotenet_t, lrn_list);
-
-                cfs_list_for_each (e2, &rnet->lrn_routes) {
-                        route = cfs_list_entry(e2, lnet_route_t, lr_list);
-
-                        if (idx-- == 0) {
-                                *net     = rnet->lrn_net;
-                                *hops    = route->lr_hops;
-                                *gateway = route->lr_gateway->lp_nid;
-                                *alive   = route->lr_gateway->lp_alive;
-                               lnet_net_unlock(cpt);
-                               return 0;
+       for (i = 0; i < LNET_REMOTE_NETS_HASH_SIZE; i++) {
+               rn_list = &the_lnet.ln_remote_nets_hash[i];
+               cfs_list_for_each(e1, rn_list) {
+                       rnet = cfs_list_entry(e1, lnet_remotenet_t, lrn_list);
+
+                       cfs_list_for_each(e2, &rnet->lrn_routes) {
+                               route = cfs_list_entry(e2, lnet_route_t,
+                                                      lr_list);
+
+                               if (idx-- == 0) {
+                                       *net     = rnet->lrn_net;
+                                       *hops    = route->lr_hops;
+                                       *gateway = route->lr_gateway->lp_nid;
+                                       *alive   = route->lr_gateway->lp_alive;
+                                       lnet_net_unlock(cpt);
+                                       return 0;
+                               }
                        }
                }
        }
@@ -563,11 +592,11 @@ lnet_get_route(int idx, __u32 *net, __u32 *hops,
 void
 lnet_swap_pinginfo(lnet_ping_info_t *info)
 {
-        int               i;
-        lnet_ni_status_t *stat;
+       int               i;
+       lnet_ni_status_t *stat;
 
        __swab32s(&info->pi_magic);
-        __swab32s(&info->pi_version);
+       __swab32s(&info->pi_features);
         __swab32s(&info->pi_pid);
         __swab32s(&info->pi_nnis);
         for (i = 0; i < info->pi_nnis && i < LNET_MAX_RTR_NIS; i++) {
@@ -599,21 +628,20 @@ lnet_parse_rc_info(lnet_rc_data_t *rcd)
        if (info->pi_magic != LNET_PROTO_PING_MAGIC) {
                CDEBUG(D_NET, "%s: Unexpected magic %08x\n",
                       libcfs_nid2str(gw->lp_nid), info->pi_magic);
-               gw->lp_ping_version = LNET_PROTO_PING_UNKNOWN;
+               gw->lp_ping_feats = LNET_PING_FEAT_INVAL;
                return;
        }
 
-       gw->lp_ping_version = info->pi_version;
-       if (gw->lp_ping_version == LNET_PROTO_PING_VERSION_1)
-               return; /* v1 doesn't carry NI status info */
-
-       if (gw->lp_ping_version != LNET_PROTO_PING_VERSION) {
-               CDEBUG(D_NET, "%s: Unexpected version 0x%x\n",
-                      libcfs_nid2str(gw->lp_nid), gw->lp_ping_version);
-               gw->lp_ping_version = LNET_PROTO_PING_UNKNOWN;
-               return;
+       gw->lp_ping_feats = info->pi_features;
+       if ((gw->lp_ping_feats & LNET_PING_FEAT_MASK) == 0) {
+               CDEBUG(D_NET, "%s: Unexpected features 0x%x\n",
+                      libcfs_nid2str(gw->lp_nid), gw->lp_ping_feats);
+               return; /* nothing I can understand */
        }
 
+       if ((gw->lp_ping_feats & LNET_PING_FEAT_NI_STATUS) == 0)
+               return; /* can't carry NI status info */
+
        cfs_list_for_each_entry(rtr, &gw->lp_routes, lr_gwlist) {
                int     ptl_status = LNET_NI_STATUS_INVALID;
                int     down = 0;
@@ -627,7 +655,7 @@ lnet_parse_rc_info(lnet_rc_data_t *rcd)
                        if (nid == LNET_NID_ANY) {
                                CDEBUG(D_NET, "%s: unexpected LNET_NID_ANY\n",
                                       libcfs_nid2str(gw->lp_nid));
-                               gw->lp_ping_version = LNET_PROTO_PING_UNKNOWN;
+                               gw->lp_ping_feats = LNET_PING_FEAT_INVAL;
                                return;
                        }
 
@@ -656,7 +684,7 @@ lnet_parse_rc_info(lnet_rc_data_t *rcd)
 
                        CDEBUG(D_NET, "%s: Unexpected status 0x%x\n",
                               libcfs_nid2str(gw->lp_nid), stat->ns_status);
-                       gw->lp_ping_version = LNET_PROTO_PING_UNKNOWN;
+                       gw->lp_ping_feats = LNET_PING_FEAT_INVAL;
                        return;
                }
 
@@ -679,7 +707,7 @@ lnet_router_checker_event(lnet_event_t *event)
        if (event->unlinked) {
                LNetInvalidateHandle(&rcd->rcd_mdh);
                return;
-        }
+       }
 
        LASSERT(event->type == LNET_EVENT_SEND ||
                event->type == LNET_EVENT_REPLY);
@@ -954,7 +982,7 @@ lnet_ping_router_locked (lnet_peer_t *rtr)
                mdh = rcd->rcd_mdh;
 
                if (rtr->lp_ping_deadline == 0) {
-                       rtr->lp_ping_deadline = \
+                       rtr->lp_ping_deadline =
                                cfs_time_shift(router_ping_timeout);
                }
 
@@ -1050,7 +1078,7 @@ lnet_router_checker_start(void)
                 return 0;
 
 #ifdef __KERNEL__
-        cfs_sema_init(&the_lnet.ln_rc_signal, 0);
+       sema_init(&the_lnet.ln_rc_signal, 0);
         /* EQ size doesn't matter; the callback is guaranteed to get every
          * event */
        eqsz = 0;
@@ -1071,7 +1099,7 @@ lnet_router_checker_start(void)
         if (rc < 0) {
                 CERROR("Can't start router checker thread: %d\n", rc);
                 /* block until event callback signals exit */
-                cfs_down(&the_lnet.ln_rc_signal);
+               down(&the_lnet.ln_rc_signal);
                 rc = LNetEQFree(the_lnet.ln_rc_eqh);
                 LASSERT (rc == 0);
                 the_lnet.ln_rc_state = LNET_RC_STATE_SHUTDOWN;
@@ -1102,7 +1130,7 @@ lnet_router_checker_stop (void)
 
 #ifdef __KERNEL__
        /* block until event callback signals exit */
-       cfs_down(&the_lnet.ln_rc_signal);
+       down(&the_lnet.ln_rc_signal);
 #else
        lnet_router_checker();
 #endif
@@ -1155,7 +1183,7 @@ lnet_prune_rc_data(int wait_unlink)
                        LNetMDUnlink(rcd->rcd_mdh);
 
                lnet_net_lock(LNET_LOCK_EX);
-        }
+       }
 
        cfs_list_splice_init(&head, &the_lnet.ln_rcd_zombie);
 
@@ -1163,9 +1191,9 @@ lnet_prune_rc_data(int wait_unlink)
        while (!cfs_list_empty(&the_lnet.ln_rcd_zombie)) {
                cfs_list_for_each_entry_safe(rcd, tmp, &the_lnet.ln_rcd_zombie,
                                             rcd_list) {
-                       if (!LNetHandleIsInvalid(rcd->rcd_mdh))
+                       if (LNetHandleIsInvalid(rcd->rcd_mdh))
                                cfs_list_move(&rcd->rcd_list, &head);
-                }
+               }
 
                wait_unlink = wait_unlink &&
                              !cfs_list_empty(&the_lnet.ln_rcd_zombie);
@@ -1180,7 +1208,7 @@ lnet_prune_rc_data(int wait_unlink)
                }
 
                if (!wait_unlink)
-                       break;
+                       return;
 
                i++;
                CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET,
@@ -1189,6 +1217,8 @@ lnet_prune_rc_data(int wait_unlink)
 
                lnet_net_lock(LNET_LOCK_EX);
        }
+
+       lnet_net_unlock(LNET_LOCK_EX);
 }
 
 
@@ -1255,9 +1285,9 @@ rescan:
        lnet_prune_rc_data(1); /* wait for UNLINK */
 
        the_lnet.ln_rc_state = LNET_RC_STATE_SHUTDOWN;
-       cfs_up(&the_lnet.ln_rc_signal);
-        /* The unlink event callback will signal final completion */
-        return 0;
+       up(&the_lnet.ln_rc_signal);
+       /* The unlink event callback will signal final completion */
+       return 0;
 }
 
 void
@@ -1383,7 +1413,7 @@ void
 lnet_rtrpools_free(void)
 {
        lnet_rtrbufpool_t *rtrp;
-       int                i;
+       int               i;
 
        if (the_lnet.ln_rtrpools == NULL) /* uninitialized or freed */
                return;
@@ -1571,6 +1601,11 @@ lnet_notify(lnet_ni_t *ni, lnet_nid_t nid, int alive, cfs_time_t when)
 
        lnet_net_lock(cpt);
 
+       if (the_lnet.ln_shutdown) {
+               lnet_net_unlock(cpt);
+               return -ESHUTDOWN;
+       }
+
        lp = lnet_find_peer_locked(the_lnet.ln_peer_tables[cpt], nid);
        if (lp == NULL) {
                /* nid not found */