X-Git-Url: https://git.whamcloud.com/?a=blobdiff_plain;f=lnet%2Flnet%2Frouter.c;h=80c0a59b00da0b49e00f8b4eb4dfa6bec6d94577;hb=b1e22bd4df5bcea4f32dbf510ba784817c529ce8;hp=08bb57cbfc3b4309c59f53a7bc2027d6a833708a;hpb=51a5b4df5bbbf5fd12c73d2722b230e93fe93327;p=fs%2Flustre-release.git diff --git a/lnet/lnet/router.c b/lnet/lnet/router.c index 08bb57c..80c0a59 100644 --- a/lnet/lnet/router.c +++ b/lnet/lnet/router.c @@ -1,7 +1,7 @@ /* * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. * - * Copyright (c) 2011, Whamcloud, Inc. + * Copyright (c) 2011, 2012, Intel Corporation. * * This file is part of Portals * http://sourceforge.net/projects/sandiaportals/ @@ -26,19 +26,26 @@ #if defined(__KERNEL__) && defined(LNET_ROUTER) +#define LNET_NRB_TINY_MIN 512 /* min value for each CPT */ +#define LNET_NRB_TINY (LNET_NRB_TINY_MIN * 4) +#define LNET_NRB_SMALL_MIN 4096 /* min value for each CPT */ +#define LNET_NRB_SMALL (LNET_NRB_SMALL_MIN * 4) +#define LNET_NRB_LARGE_MIN 256 /* min value for each CPT */ +#define LNET_NRB_LARGE (LNET_NRB_LARGE_MIN * 4) + static char *forwarding = ""; CFS_MODULE_PARM(forwarding, "s", charp, 0444, "Explicitly enable/disable forwarding between networks"); -static int tiny_router_buffers = 1024; +static int tiny_router_buffers; CFS_MODULE_PARM(tiny_router_buffers, "i", int, 0444, - "# of 0 payload messages to buffer in the router"); -static int small_router_buffers = 8192; + "# of 0 payload messages to buffer in the router"); +static int small_router_buffers; CFS_MODULE_PARM(small_router_buffers, "i", int, 0444, - "# of small (1 page) messages to buffer in the router"); -static int large_router_buffers = 512; + "# of small (1 page) messages to buffer in the router"); +static int large_router_buffers; CFS_MODULE_PARM(large_router_buffers, "i", int, 0444, - "# of large messages to buffer in the router"); + "# of large messages to buffer in the router"); static int peer_buffer_credits = 0; CFS_MODULE_PARM(peer_buffer_credits, "i", int, 0444, "# router buffer credits per peer"); @@ -75,23 +82,23 @@ lnet_peer_buffer_credits(lnet_ni_t *ni) static int check_routers_before_use = 0; CFS_MODULE_PARM(check_routers_before_use, "i", int, 0444, - "Assume routers are down and ping them before use"); + "Assume routers are down and ping them before use"); -static int avoid_asym_router_failure = 0; -CFS_MODULE_PARM(avoid_asym_router_failure, "i", int, 0444, - "Avoid asymmetrical failures: reserved, use at your own risk"); +static int avoid_asym_router_failure = 1; +CFS_MODULE_PARM(avoid_asym_router_failure, "i", int, 0644, + "Avoid asymmetrical router failures (0 to disable)"); -static int dead_router_check_interval = 0; -CFS_MODULE_PARM(dead_router_check_interval, "i", int, 0444, - "Seconds between dead router health checks (<= 0 to disable)"); +static int dead_router_check_interval = 60; +CFS_MODULE_PARM(dead_router_check_interval, "i", int, 0644, + "Seconds between dead router health checks (<= 0 to disable)"); -static int live_router_check_interval = 0; -CFS_MODULE_PARM(live_router_check_interval, "i", int, 0444, - "Seconds between live router health checks (<= 0 to disable)"); +static int live_router_check_interval = 60; +CFS_MODULE_PARM(live_router_check_interval, "i", int, 0644, + "Seconds between live router health checks (<= 0 to disable)"); static int router_ping_timeout = 50; -CFS_MODULE_PARM(router_ping_timeout, "i", int, 0444, - "Seconds to wait for the reply to a router health query"); +CFS_MODULE_PARM(router_ping_timeout, "i", int, 0644, + "Seconds to wait for the reply to a router health query"); int lnet_peers_start_down(void) @@ -123,9 +130,9 @@ lnet_notify_locked(lnet_peer_t *lp, int notifylnd, int alive, cfs_time_t when) lp->lp_notify = 1; lp->lp_notifylnd |= notifylnd; if (lp->lp_alive) - lp->lp_ping_version = LNET_PROTO_PING_UNKNOWN; /* reset */ + lp->lp_ping_feats = LNET_PING_FEAT_INVAL; /* reset */ - CDEBUG(D_NET, "set %s %d\n", libcfs_nid2str(lp->lp_nid), alive); + CDEBUG(D_NET, "set %s %d\n", libcfs_nid2str(lp->lp_nid), alive); } void @@ -151,27 +158,28 @@ lnet_ni_notify_locked(lnet_ni_t *ni, lnet_peer_t *lp) lp->lp_notify = 0; if (notifylnd && ni->ni_lnd->lnd_notify != NULL) { - LNET_UNLOCK(); + lnet_net_unlock(lp->lp_cpt); - /* A new notification could happen now; I'll handle it - * when control returns to me */ + /* A new notification could happen now; I'll handle it + * when control returns to me */ - (ni->ni_lnd->lnd_notify)(ni, lp->lp_nid, alive); + (ni->ni_lnd->lnd_notify)(ni, lp->lp_nid, alive); - LNET_LOCK(); - } - } + lnet_net_lock(lp->lp_cpt); + } + } - lp->lp_notifying = 0; + lp->lp_notifying = 0; } static void lnet_rtr_addref_locked(lnet_peer_t *lp) { - LASSERT (lp->lp_refcount > 0); - LASSERT (lp->lp_rtr_refcount >= 0); + LASSERT(lp->lp_refcount > 0); + LASSERT(lp->lp_rtr_refcount >= 0); + /* lnet_net_lock must be exclusively locked */ lp->lp_rtr_refcount++; if (lp->lp_rtr_refcount == 1) { cfs_list_t *pos; @@ -195,15 +203,16 @@ lnet_rtr_addref_locked(lnet_peer_t *lp) static void lnet_rtr_decref_locked(lnet_peer_t *lp) { - LASSERT (lp->lp_refcount > 0); - LASSERT (lp->lp_rtr_refcount > 0); + LASSERT(lp->lp_refcount > 0); + LASSERT(lp->lp_rtr_refcount > 0); - lp->lp_rtr_refcount--; - if (lp->lp_rtr_refcount == 0) { + /* lnet_net_lock must be exclusively locked */ + lp->lp_rtr_refcount--; + if (lp->lp_rtr_refcount == 0) { LASSERT(cfs_list_empty(&lp->lp_routes)); - if (lp->lp_rcd != NULL) { - cfs_list_add(&lp->lp_rcd->rcd_list, + if (lp->lp_rcd != NULL) { + cfs_list_add(&lp->lp_rcd->rcd_list, &the_lnet.ln_rcd_deathrow); lp->lp_rcd = NULL; } @@ -218,18 +227,20 @@ lnet_rtr_decref_locked(lnet_peer_t *lp) lnet_remotenet_t * lnet_find_net_locked (__u32 net) { - lnet_remotenet_t *rnet; - cfs_list_t *tmp; + lnet_remotenet_t *rnet; + cfs_list_t *tmp; + cfs_list_t *rn_list; - LASSERT (!the_lnet.ln_shutdown); + LASSERT(!the_lnet.ln_shutdown); - cfs_list_for_each (tmp, &the_lnet.ln_remote_nets) { - rnet = cfs_list_entry(tmp, lnet_remotenet_t, lrn_list); + rn_list = lnet_net2rnethash(net); + cfs_list_for_each(tmp, rn_list) { + rnet = cfs_list_entry(tmp, lnet_remotenet_t, lrn_list); - if (rnet->lrn_net == net) - return rnet; - } - return NULL; + if (rnet->lrn_net == net) + return rnet; + } + return NULL; } static void lnet_shuffle_seed(void) @@ -285,8 +296,8 @@ lnet_add_route_to_rnet (lnet_remotenet_t *rnet, lnet_route_t *route) cfs_list_add(&route->lr_list, e); cfs_list_add(&route->lr_gwlist, &route->lr_gateway->lp_routes); - the_lnet.ln_remote_nets_version++; - lnet_rtr_addref_locked(route->lr_gateway); + the_lnet.ln_remote_nets_version++; + lnet_rtr_addref_locked(route->lr_gateway); } int @@ -332,17 +343,17 @@ lnet_add_route (__u32 net, unsigned int hops, lnet_nid_t gateway) route->lr_hops = hops; route->lr_net = net; - LNET_LOCK(); + lnet_net_lock(LNET_LOCK_EX); - rc = lnet_nid2peer_locked(&route->lr_gateway, gateway); - if (rc != 0) { - LNET_UNLOCK(); + rc = lnet_nid2peer_locked(&route->lr_gateway, gateway, LNET_LOCK_EX); + if (rc != 0) { + lnet_net_unlock(LNET_LOCK_EX); - LIBCFS_FREE(route, sizeof(*route)); - LIBCFS_FREE(rnet, sizeof(*rnet)); + LIBCFS_FREE(route, sizeof(*route)); + LIBCFS_FREE(rnet, sizeof(*rnet)); if (rc == -EHOSTUNREACH) { /* gateway is not on a local net */ - return 0; /* ignore the route entry */ + return 0; /* ignore the route entry */ } else { CERROR("Error %d creating route %s %d %s\n", rc, libcfs_net2str(net), hops, @@ -356,7 +367,7 @@ lnet_add_route (__u32 net, unsigned int hops, lnet_nid_t gateway) rnet2 = lnet_find_net_locked(net); if (rnet2 == NULL) { /* new network */ - cfs_list_add_tail(&rnet->lrn_list, &the_lnet.ln_remote_nets); + cfs_list_add_tail(&rnet->lrn_list, lnet_net2rnethash(net)); rnet2 = rnet; } @@ -379,104 +390,119 @@ lnet_add_route (__u32 net, unsigned int hops, lnet_nid_t gateway) lnet_add_route_to_rnet(rnet2, route); ni = route->lr_gateway->lp_ni; - LNET_UNLOCK(); + lnet_net_unlock(LNET_LOCK_EX); /* XXX Assume alive */ if (ni->ni_lnd->lnd_notify != NULL) (ni->ni_lnd->lnd_notify)(ni, gateway, 1); - LNET_LOCK(); + lnet_net_lock(LNET_LOCK_EX); } /* -1 for notify or !add_route */ lnet_peer_decref_locked(route->lr_gateway); - LNET_UNLOCK(); + lnet_net_unlock(LNET_LOCK_EX); if (!add_route) LIBCFS_FREE(route, sizeof(*route)); - if (rnet != rnet2) - LIBCFS_FREE(rnet, sizeof(*rnet)); + if (rnet != rnet2) + LIBCFS_FREE(rnet, sizeof(*rnet)); - return 0; + return 0; } int -lnet_check_routes (void) +lnet_check_routes(void) { - lnet_remotenet_t *rnet; - lnet_route_t *route; - lnet_route_t *route2; - cfs_list_t *e1; - cfs_list_t *e2; + lnet_remotenet_t *rnet; + lnet_route_t *route; + lnet_route_t *route2; + cfs_list_t *e1; + cfs_list_t *e2; + int cpt; + cfs_list_t *rn_list; + int i; - LNET_LOCK(); + cpt = lnet_net_lock_current(); - cfs_list_for_each (e1, &the_lnet.ln_remote_nets) { - rnet = cfs_list_entry(e1, lnet_remotenet_t, lrn_list); + for (i = 0; i < LNET_REMOTE_NETS_HASH_SIZE; i++) { + rn_list = &the_lnet.ln_remote_nets_hash[i]; + cfs_list_for_each(e1, rn_list) { + rnet = cfs_list_entry(e1, lnet_remotenet_t, lrn_list); - route2 = NULL; - cfs_list_for_each (e2, &rnet->lrn_routes) { - lnet_nid_t nid1; - lnet_nid_t nid2; - int net; + route2 = NULL; + cfs_list_for_each(e2, &rnet->lrn_routes) { + lnet_nid_t nid1; + lnet_nid_t nid2; + int net; - route = cfs_list_entry(e2, lnet_route_t, lr_list); + route = cfs_list_entry(e2, lnet_route_t, + lr_list); - if (route2 == NULL) { - route2 = route; - continue; - } + if (route2 == NULL) { + route2 = route; + continue; + } - if (route->lr_gateway->lp_ni == - route2->lr_gateway->lp_ni) - continue; + if (route->lr_gateway->lp_ni == + route2->lr_gateway->lp_ni) + continue; - nid1 = route->lr_gateway->lp_nid; - nid2 = route2->lr_gateway->lp_nid; - net = rnet->lrn_net; + nid1 = route->lr_gateway->lp_nid; + nid2 = route2->lr_gateway->lp_nid; + net = rnet->lrn_net; - LNET_UNLOCK(); + lnet_net_unlock(cpt); - CERROR("Routes to %s via %s and %s not supported\n", - libcfs_net2str(net), libcfs_nid2str(nid1), - libcfs_nid2str(nid2)); - return -EINVAL; - } - } + CERROR("Routes to %s via %s and %s not " + "supported\n", + libcfs_net2str(net), + libcfs_nid2str(nid1), + libcfs_nid2str(nid2)); + return -EINVAL; + } + } + } - LNET_UNLOCK(); - return 0; + lnet_net_unlock(cpt); + return 0; } int -lnet_del_route (__u32 net, lnet_nid_t gw_nid) +lnet_del_route(__u32 net, lnet_nid_t gw_nid) { struct lnet_peer *gateway; - lnet_remotenet_t *rnet; - lnet_route_t *route; - cfs_list_t *e1; - cfs_list_t *e2; - int rc = -ENOENT; - - CDEBUG(D_NET, "Del route: net %s : gw %s\n", - libcfs_net2str(net), libcfs_nid2str(gw_nid)); - - /* NB Caller may specify either all routes via the given gateway - * or a specific route entry actual NIDs) */ + lnet_remotenet_t *rnet; + lnet_route_t *route; + cfs_list_t *e1; + cfs_list_t *e2; + int rc = -ENOENT; + cfs_list_t *rn_list; + int idx = 0; + + CDEBUG(D_NET, "Del route: net %s : gw %s\n", + libcfs_net2str(net), libcfs_nid2str(gw_nid)); + + /* NB Caller may specify either all routes via the given gateway + * or a specific route entry actual NIDs) */ + + lnet_net_lock(LNET_LOCK_EX); + if (net == LNET_NIDNET(LNET_NID_ANY)) + rn_list = &the_lnet.ln_remote_nets_hash[0]; + else + rn_list = lnet_net2rnethash(net); again: - LNET_LOCK(); + cfs_list_for_each(e1, rn_list) { + rnet = cfs_list_entry(e1, lnet_remotenet_t, lrn_list); - cfs_list_for_each (e1, &the_lnet.ln_remote_nets) { - rnet = cfs_list_entry(e1, lnet_remotenet_t, lrn_list); - - if (!(net == LNET_NIDNET(LNET_NID_ANY) || - net == rnet->lrn_net)) - continue; + if (!(net == LNET_NIDNET(LNET_NID_ANY) || + net == rnet->lrn_net)) + continue; - cfs_list_for_each (e2, &rnet->lrn_routes) { - route = cfs_list_entry(e2, lnet_route_t, lr_list); + cfs_list_for_each(e2, &rnet->lrn_routes) { + route = cfs_list_entry(e2, lnet_route_t, lr_list); gateway = route->lr_gateway; if (!(gw_nid == LNET_NID_ANY || @@ -485,29 +511,37 @@ lnet_del_route (__u32 net, lnet_nid_t gw_nid) cfs_list_del(&route->lr_list); cfs_list_del(&route->lr_gwlist); - the_lnet.ln_remote_nets_version++; + the_lnet.ln_remote_nets_version++; - if (cfs_list_empty(&rnet->lrn_routes)) - cfs_list_del(&rnet->lrn_list); - else - rnet = NULL; + if (cfs_list_empty(&rnet->lrn_routes)) + cfs_list_del(&rnet->lrn_list); + else + rnet = NULL; lnet_rtr_decref_locked(gateway); lnet_peer_decref_locked(gateway); - LNET_UNLOCK(); - LIBCFS_FREE(route, sizeof (*route)); + lnet_net_unlock(LNET_LOCK_EX); - if (rnet != NULL) - LIBCFS_FREE(rnet, sizeof(*rnet)); + LIBCFS_FREE(route, sizeof(*route)); - rc = 0; - goto again; - } - } + if (rnet != NULL) + LIBCFS_FREE(rnet, sizeof(*rnet)); - LNET_UNLOCK(); - return rc; + rc = 0; + lnet_net_lock(LNET_LOCK_EX); + goto again; + } + } + + if (net == LNET_NIDNET(LNET_NID_ANY) && + ++idx < LNET_REMOTE_NETS_HASH_SIZE) { + rn_list = &the_lnet.ln_remote_nets_hash[idx]; + goto again; + } + lnet_net_unlock(LNET_LOCK_EX); + + return rc; } void @@ -517,45 +551,52 @@ lnet_destroy_routes (void) } int -lnet_get_route (int idx, __u32 *net, __u32 *hops, - lnet_nid_t *gateway, __u32 *alive) +lnet_get_route(int idx, __u32 *net, __u32 *hops, + lnet_nid_t *gateway, __u32 *alive) { - cfs_list_t *e1; - cfs_list_t *e2; - lnet_remotenet_t *rnet; - lnet_route_t *route; - - LNET_LOCK(); - - cfs_list_for_each (e1, &the_lnet.ln_remote_nets) { - rnet = cfs_list_entry(e1, lnet_remotenet_t, lrn_list); - - cfs_list_for_each (e2, &rnet->lrn_routes) { - route = cfs_list_entry(e2, lnet_route_t, lr_list); - - if (idx-- == 0) { - *net = rnet->lrn_net; - *hops = route->lr_hops; - *gateway = route->lr_gateway->lp_nid; - *alive = route->lr_gateway->lp_alive; - LNET_UNLOCK(); - return 0; - } - } - } + cfs_list_t *e1; + cfs_list_t *e2; + lnet_remotenet_t *rnet; + lnet_route_t *route; + int cpt; + int i; + cfs_list_t *rn_list; + + cpt = lnet_net_lock_current(); + + for (i = 0; i < LNET_REMOTE_NETS_HASH_SIZE; i++) { + rn_list = &the_lnet.ln_remote_nets_hash[i]; + cfs_list_for_each(e1, rn_list) { + rnet = cfs_list_entry(e1, lnet_remotenet_t, lrn_list); + + cfs_list_for_each(e2, &rnet->lrn_routes) { + route = cfs_list_entry(e2, lnet_route_t, + lr_list); + + if (idx-- == 0) { + *net = rnet->lrn_net; + *hops = route->lr_hops; + *gateway = route->lr_gateway->lp_nid; + *alive = route->lr_gateway->lp_alive; + lnet_net_unlock(cpt); + return 0; + } + } + } + } - LNET_UNLOCK(); - return -ENOENT; + lnet_net_unlock(cpt); + return -ENOENT; } void lnet_swap_pinginfo(lnet_ping_info_t *info) { - int i; - lnet_ni_status_t *stat; + int i; + lnet_ni_status_t *stat; __swab32s(&info->pi_magic); - __swab32s(&info->pi_version); + __swab32s(&info->pi_features); __swab32s(&info->pi_pid); __swab32s(&info->pi_nnis); for (i = 0; i < info->pi_nnis && i < LNET_MAX_RTR_NIS; i++) { @@ -587,21 +628,20 @@ lnet_parse_rc_info(lnet_rc_data_t *rcd) if (info->pi_magic != LNET_PROTO_PING_MAGIC) { CDEBUG(D_NET, "%s: Unexpected magic %08x\n", libcfs_nid2str(gw->lp_nid), info->pi_magic); - gw->lp_ping_version = LNET_PROTO_PING_UNKNOWN; + gw->lp_ping_feats = LNET_PING_FEAT_INVAL; return; } - gw->lp_ping_version = info->pi_version; - if (gw->lp_ping_version == LNET_PROTO_PING_VERSION_1) - return; /* v1 doesn't carry NI status info */ - - if (gw->lp_ping_version != LNET_PROTO_PING_VERSION) { - CDEBUG(D_NET, "%s: Unexpected version 0x%x\n", - libcfs_nid2str(gw->lp_nid), gw->lp_ping_version); - gw->lp_ping_version = LNET_PROTO_PING_UNKNOWN; - return; + gw->lp_ping_feats = info->pi_features; + if ((gw->lp_ping_feats & LNET_PING_FEAT_MASK) == 0) { + CDEBUG(D_NET, "%s: Unexpected features 0x%x\n", + libcfs_nid2str(gw->lp_nid), gw->lp_ping_feats); + return; /* nothing I can understand */ } + if ((gw->lp_ping_feats & LNET_PING_FEAT_NI_STATUS) == 0) + return; /* can't carry NI status info */ + cfs_list_for_each_entry(rtr, &gw->lp_routes, lr_gwlist) { int ptl_status = LNET_NI_STATUS_INVALID; int down = 0; @@ -615,7 +655,7 @@ lnet_parse_rc_info(lnet_rc_data_t *rcd) if (nid == LNET_NID_ANY) { CDEBUG(D_NET, "%s: unexpected LNET_NID_ANY\n", libcfs_nid2str(gw->lp_nid)); - gw->lp_ping_version = LNET_PROTO_PING_UNKNOWN; + gw->lp_ping_feats = LNET_PING_FEAT_INVAL; return; } @@ -644,7 +684,7 @@ lnet_parse_rc_info(lnet_rc_data_t *rcd) CDEBUG(D_NET, "%s: Unexpected status 0x%x\n", libcfs_nid2str(gw->lp_nid), stat->ns_status); - gw->lp_ping_version = LNET_PROTO_PING_UNKNOWN; + gw->lp_ping_feats = LNET_PING_FEAT_INVAL; return; } @@ -659,7 +699,6 @@ lnet_parse_rc_info(lnet_rc_data_t *rcd) static void lnet_router_checker_event(lnet_event_t *event) { - /* CAVEAT EMPTOR: I'm called with lnet_res_locked */ lnet_rc_data_t *rcd = event->md.user_ptr; struct lnet_peer *lp; @@ -668,7 +707,7 @@ lnet_router_checker_event(lnet_event_t *event) if (event->unlinked) { LNetInvalidateHandle(&rcd->rcd_mdh); return; - } + } LASSERT(event->type == LNET_EVENT_SEND || event->type == LNET_EVENT_REPLY); @@ -676,13 +715,19 @@ lnet_router_checker_event(lnet_event_t *event) lp = rcd->rcd_gateway; LASSERT(lp != NULL); - if (!lnet_isrouter(lp)) /* ignore if no longer a router */ - return; + /* NB: it's called with holding lnet_res_lock, we have a few + * places need to hold both locks at the same time, please take + * care of lock ordering */ + lnet_net_lock(lp->lp_cpt); + if (!lnet_isrouter(lp) || lp->lp_rcd != rcd) { + /* ignore if no longer a router or rcd is replaced */ + goto out; + } if (event->type == LNET_EVENT_SEND) { - lp->lp_ping_notsent = 0; /* NB: re-enable another ping */ + lp->lp_ping_notsent = 0; if (event->status == 0) - return; + goto out; } /* LNET_EVENT_REPLY */ @@ -699,6 +744,9 @@ lnet_router_checker_event(lnet_event_t *event) if (avoid_asym_router_failure && event->status == 0) lnet_parse_rc_info(rcd); + + out: + lnet_net_unlock(lp->lp_cpt); } void @@ -711,7 +759,7 @@ lnet_wait_known_routerstate(void) LASSERT (the_lnet.ln_rc_state == LNET_RC_STATE_RUNNING); for (;;) { - LNET_LOCK(); + int cpt = lnet_net_lock_current(); all_known = 1; cfs_list_for_each (entry, &the_lnet.ln_routers) { @@ -723,7 +771,7 @@ lnet_wait_known_routerstate(void) } } - LNET_UNLOCK(); + lnet_net_unlock(cpt); if (all_known) return; @@ -736,53 +784,58 @@ lnet_wait_known_routerstate(void) } void -lnet_update_ni_status(void) +lnet_update_ni_status_locked(void) { - cfs_time_t now = cfs_time_current(); - lnet_ni_t *ni; - int status; - int timeout; + lnet_ni_t *ni; + long now; + int timeout; - LASSERT (the_lnet.ln_routing); + LASSERT(the_lnet.ln_routing); - timeout = router_ping_timeout + - MAX(live_router_check_interval, dead_router_check_interval); + timeout = router_ping_timeout + + MAX(live_router_check_interval, dead_router_check_interval); - LNET_LOCK(); + now = cfs_time_current_sec(); + cfs_list_for_each_entry(ni, &the_lnet.ln_nis, ni_list) { + if (ni->ni_lnd->lnd_type == LOLND) + continue; - cfs_list_for_each_entry (ni, &the_lnet.ln_nis, ni_list) { - lnet_ni_status_t *ns = ni->ni_status; + if (now < ni->ni_last_alive + timeout) + continue; - LASSERT (ns != NULL); + lnet_ni_lock(ni); + /* re-check with lock */ + if (now < ni->ni_last_alive + timeout) { + lnet_ni_unlock(ni); + continue; + } - status = LNET_NI_STATUS_UP; - if (ni->ni_lnd->lnd_type != LOLND && /* @lo forever alive */ - cfs_time_after(now, cfs_time_add(ni->ni_last_alive, - cfs_time_seconds(timeout)))) - status = LNET_NI_STATUS_DOWN; + LASSERT(ni->ni_status != NULL); - if (ns->ns_status != status) { - ns->ns_status = status; - CDEBUG(D_NET, "NI(%s:%d) status changed to %s\n", - libcfs_nid2str(ni->ni_nid), timeout, - status == LNET_NI_STATUS_UP ? "up" : "down"); - } - } - - LNET_UNLOCK(); + if (ni->ni_status->ns_status != LNET_NI_STATUS_DOWN) { + CDEBUG(D_NET, "NI(%s:%d) status changed to down\n", + libcfs_nid2str(ni->ni_nid), timeout); + /* NB: so far, this is the only place to set + * NI status to "down" */ + ni->ni_status->ns_status = LNET_NI_STATUS_DOWN; + } + lnet_ni_unlock(ni); + } } void -lnet_destroy_rc_data (lnet_rc_data_t *rcd) +lnet_destroy_rc_data(lnet_rc_data_t *rcd) { LASSERT(cfs_list_empty(&rcd->rcd_list)); /* detached from network */ LASSERT(LNetHandleIsInvalid(rcd->rcd_mdh)); if (rcd->rcd_gateway != NULL) { - LNET_LOCK(); + int cpt = rcd->rcd_gateway->lp_cpt; + + lnet_net_lock(cpt); lnet_peer_decref_locked(rcd->rcd_gateway); - LNET_UNLOCK(); + lnet_net_unlock(cpt); } if (rcd->rcd_pinginfo != NULL) @@ -799,7 +852,7 @@ lnet_create_rc_data_locked(lnet_peer_t *gateway) int rc; int i; - LNET_UNLOCK(); + lnet_net_unlock(gateway->lp_cpt); LIBCFS_ALLOC(rcd, sizeof(*rcd)); if (rcd == NULL) @@ -834,16 +887,18 @@ lnet_create_rc_data_locked(lnet_peer_t *gateway) } LASSERT(rc == 0); - LNET_LOCK(); + lnet_net_lock(gateway->lp_cpt); /* router table changed or someone has created rcd for this gateway */ if (!lnet_isrouter(gateway) || gateway->lp_rcd != NULL) { - LNET_UNLOCK(); + lnet_net_unlock(gateway->lp_cpt); goto out; } lnet_peer_addref_locked(gateway); rcd->rcd_gateway = gateway; gateway->lp_rcd = rcd; + gateway->lp_ping_notsent = 0; + return rcd; out: @@ -855,7 +910,7 @@ lnet_create_rc_data_locked(lnet_peer_t *gateway) lnet_destroy_rc_data(rcd); } - LNET_LOCK(); + lnet_net_lock(gateway->lp_cpt); return gateway->lp_rcd; } @@ -927,16 +982,16 @@ lnet_ping_router_locked (lnet_peer_t *rtr) mdh = rcd->rcd_mdh; if (rtr->lp_ping_deadline == 0) { - rtr->lp_ping_deadline = \ + rtr->lp_ping_deadline = cfs_time_shift(router_ping_timeout); } - LNET_UNLOCK(); + lnet_net_unlock(rtr->lp_cpt); - rc = LNetGet(LNET_NID_ANY, mdh, id, LNET_RESERVED_PORTAL, - LNET_PROTO_PING_MATCHBITS, 0); + rc = LNetGet(LNET_NID_ANY, mdh, id, LNET_RESERVED_PORTAL, + LNET_PROTO_PING_MATCHBITS, 0); - LNET_LOCK(); + lnet_net_lock(rtr->lp_cpt); if (rc != 0) rtr->lp_ping_notsent = 0; /* no event pending */ } @@ -959,7 +1014,7 @@ lnet_router_checker_start(void) LASSERT (check_routers_before_use); LASSERT (dead_router_check_interval > 0); - LNET_LOCK(); + lnet_net_lock(0); /* As an approximation, allow each router the same number of * outstanding events as it is allowed outstanding sends */ @@ -976,7 +1031,7 @@ lnet_router_checker_start(void) id.nid = rtr->lp_nid; id.pid = LUSTRE_SRV_LNET_PID; - LNET_UNLOCK(); + lnet_net_unlock(0); rc = LNetSetAsync(id, 1); if (rc != 0) { @@ -985,12 +1040,12 @@ lnet_router_checker_start(void) return rc; } - LNET_LOCK(); - /* NB router list doesn't change in userspace */ - LASSERT (version == the_lnet.ln_routers_version); - } + lnet_net_lock(0); + /* NB router list doesn't change in userspace */ + LASSERT(version == the_lnet.ln_routers_version); + } - LNET_UNLOCK(); + lnet_net_unlock(0); if (nrtr == 0) { CDEBUG(D_NET, @@ -1023,7 +1078,7 @@ lnet_router_checker_start(void) return 0; #ifdef __KERNEL__ - cfs_sema_init(&the_lnet.ln_rc_signal, 0); + sema_init(&the_lnet.ln_rc_signal, 0); /* EQ size doesn't matter; the callback is guaranteed to get every * event */ eqsz = 0; @@ -1044,7 +1099,7 @@ lnet_router_checker_start(void) if (rc < 0) { CERROR("Can't start router checker thread: %d\n", rc); /* block until event callback signals exit */ - cfs_down(&the_lnet.ln_rc_signal); + down(&the_lnet.ln_rc_signal); rc = LNetEQFree(the_lnet.ln_rc_eqh); LASSERT (rc == 0); the_lnet.ln_rc_state = LNET_RC_STATE_SHUTDOWN; @@ -1075,7 +1130,7 @@ lnet_router_checker_stop (void) #ifdef __KERNEL__ /* block until event callback signals exit */ - cfs_down(&the_lnet.ln_rc_signal); + down(&the_lnet.ln_rc_signal); #else lnet_router_checker(); #endif @@ -1102,7 +1157,7 @@ lnet_prune_rc_data(int wait_unlink) CFS_INIT_LIST_HEAD(&head); - LNET_LOCK(); + lnet_net_lock(LNET_LOCK_EX); if (the_lnet.ln_rc_state != LNET_RC_STATE_RUNNING) { /* router checker is stopping, prune all */ @@ -1122,13 +1177,13 @@ lnet_prune_rc_data(int wait_unlink) cfs_list_splice_init(&the_lnet.ln_rcd_deathrow, &head); if (!cfs_list_empty(&head)) { - LNET_UNLOCK(); + lnet_net_unlock(LNET_LOCK_EX); cfs_list_for_each_entry(rcd, &head, rcd_list) LNetMDUnlink(rcd->rcd_mdh); - LNET_LOCK(); - } + lnet_net_lock(LNET_LOCK_EX); + } cfs_list_splice_init(&head, &the_lnet.ln_rcd_zombie); @@ -1136,14 +1191,14 @@ lnet_prune_rc_data(int wait_unlink) while (!cfs_list_empty(&the_lnet.ln_rcd_zombie)) { cfs_list_for_each_entry_safe(rcd, tmp, &the_lnet.ln_rcd_zombie, rcd_list) { - if (!LNetHandleIsInvalid(rcd->rcd_mdh)) + if (LNetHandleIsInvalid(rcd->rcd_mdh)) cfs_list_move(&rcd->rcd_list, &head); - } + } wait_unlink = wait_unlink && !cfs_list_empty(&the_lnet.ln_rcd_zombie); - LNET_UNLOCK(); + lnet_net_unlock(LNET_LOCK_EX); while (!cfs_list_empty(&head)) { rcd = cfs_list_entry(head.next, @@ -1153,15 +1208,17 @@ lnet_prune_rc_data(int wait_unlink) } if (!wait_unlink) - break; + return; i++; CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, "Waiting for rc buffers to unlink\n"); cfs_pause(cfs_time_seconds(1) / 4); - LNET_LOCK(); + lnet_net_lock(LNET_LOCK_EX); } + + lnet_net_unlock(LNET_LOCK_EX); } @@ -1179,14 +1236,27 @@ lnet_router_checker(void *arg) LASSERT (the_lnet.ln_rc_state == LNET_RC_STATE_RUNNING); while (the_lnet.ln_rc_state == LNET_RC_STATE_RUNNING) { - __u64 version; + __u64 version; + int cpt; + int cpt2; - LNET_LOCK(); + cpt = lnet_net_lock_current(); rescan: - version = the_lnet.ln_routers_version; + version = the_lnet.ln_routers_version; + + cfs_list_for_each(entry, &the_lnet.ln_routers) { + rtr = cfs_list_entry(entry, lnet_peer_t, lp_rtr_list); + + cpt2 = lnet_cpt_of_nid_locked(rtr->lp_nid); + if (cpt != cpt2) { + lnet_net_unlock(cpt); + cpt = cpt2; + lnet_net_lock(cpt); + /* the routers list has changed */ + if (version != the_lnet.ln_routers_version) + goto rescan; + } - cfs_list_for_each (entry, &the_lnet.ln_routers) { - rtr = cfs_list_entry(entry, lnet_peer_t, lp_rtr_list); lnet_ping_router_locked(rtr); /* NB dropped lock */ @@ -1196,10 +1266,10 @@ rescan: } } - LNET_UNLOCK(); + if (the_lnet.ln_routing) + lnet_update_ni_status_locked(); - if (the_lnet.ln_routing) - lnet_update_ni_status(); + lnet_net_unlock(cpt); lnet_prune_rc_data(0); /* don't wait for UNLINK */ @@ -1215,9 +1285,9 @@ rescan: lnet_prune_rc_data(1); /* wait for UNLINK */ the_lnet.ln_rc_state = LNET_RC_STATE_SHUTDOWN; - cfs_up(&the_lnet.ln_rc_signal); - /* The unlink event callback will signal final completion */ - return 0; + up(&the_lnet.ln_rc_signal); + /* The unlink event callback will signal final completion */ + return 0; } void @@ -1232,7 +1302,7 @@ lnet_destroy_rtrbuf(lnet_rtrbuf_t *rb, int npages) } lnet_rtrbuf_t * -lnet_new_rtrbuf(lnet_rtrbufpool_t *rbp) +lnet_new_rtrbuf(lnet_rtrbufpool_t *rbp, int cpt) { int npages = rbp->rbp_npages; int sz = offsetof(lnet_rtrbuf_t, rb_kiov[npages]); @@ -1240,14 +1310,15 @@ lnet_new_rtrbuf(lnet_rtrbufpool_t *rbp) lnet_rtrbuf_t *rb; int i; - LIBCFS_ALLOC(rb, sz); - if (rb == NULL) - return NULL; + LIBCFS_CPT_ALLOC(rb, lnet_cpt_table(), cpt, sz); + if (rb == NULL) + return NULL; - rb->rb_pool = rbp; + rb->rb_pool = rbp; - for (i = 0; i < npages; i++) { - page = cfs_alloc_page(CFS_ALLOC_ZERO | CFS_ALLOC_STD); + for (i = 0; i < npages; i++) { + page = cfs_page_cpt_alloc(lnet_cpt_table(), cpt, + CFS_ALLOC_ZERO | CFS_ALLOC_STD); if (page == NULL) { while (--i >= 0) cfs_free_page(rb->rb_kiov[i].kiov_page); @@ -1267,9 +1338,12 @@ lnet_new_rtrbuf(lnet_rtrbufpool_t *rbp) void lnet_rtrpool_free_bufs(lnet_rtrbufpool_t *rbp) { - int npages = rbp->rbp_npages; - int nbuffers = 0; - lnet_rtrbuf_t *rb; + int npages = rbp->rbp_npages; + int nbuffers = 0; + lnet_rtrbuf_t *rb; + + if (rbp->rbp_nbuffers == 0) /* not initialized or already freed */ + return; LASSERT (cfs_list_empty(&rbp->rbp_msgs)); LASSERT (rbp->rbp_credits == rbp->rbp_nbuffers); @@ -1291,7 +1365,7 @@ lnet_rtrpool_free_bufs(lnet_rtrbufpool_t *rbp) } int -lnet_rtrpool_alloc_bufs(lnet_rtrbufpool_t *rbp, int nbufs) +lnet_rtrpool_alloc_bufs(lnet_rtrbufpool_t *rbp, int nbufs, int cpt) { lnet_rtrbuf_t *rb; int i; @@ -1302,7 +1376,7 @@ lnet_rtrpool_alloc_bufs(lnet_rtrbufpool_t *rbp, int nbufs) } for (i = 0; i < nbufs; i++) { - rb = lnet_new_rtrbuf(rbp); + rb = lnet_new_rtrbuf(rbp, cpt); if (rb == NULL) { CERROR("Failed to allocate %d router bufs of %d pages\n", @@ -1336,29 +1410,92 @@ lnet_rtrpool_init(lnet_rtrbufpool_t *rbp, int npages) } void -lnet_free_rtrpools(void) +lnet_rtrpools_free(void) { - lnet_rtrpool_free_bufs(&the_lnet.ln_rtrpools[0]); - lnet_rtrpool_free_bufs(&the_lnet.ln_rtrpools[1]); - lnet_rtrpool_free_bufs(&the_lnet.ln_rtrpools[2]); + lnet_rtrbufpool_t *rtrp; + int i; + + if (the_lnet.ln_rtrpools == NULL) /* uninitialized or freed */ + return; + + cfs_percpt_for_each(rtrp, i, the_lnet.ln_rtrpools) { + lnet_rtrpool_free_bufs(&rtrp[0]); + lnet_rtrpool_free_bufs(&rtrp[1]); + lnet_rtrpool_free_bufs(&rtrp[2]); + } + + cfs_percpt_free(the_lnet.ln_rtrpools); + the_lnet.ln_rtrpools = NULL; } -void -lnet_init_rtrpools(void) +static int +lnet_nrb_tiny_calculate(int npages) +{ + int nrbs = LNET_NRB_TINY; + + if (tiny_router_buffers < 0) { + LCONSOLE_ERROR_MSG(0x10c, + "tiny_router_buffers=%d invalid when " + "routing enabled\n", tiny_router_buffers); + return -1; + } + + if (tiny_router_buffers > 0) + nrbs = tiny_router_buffers; + + nrbs /= LNET_CPT_NUMBER; + return max(nrbs, LNET_NRB_TINY_MIN); +} + +static int +lnet_nrb_small_calculate(int npages) { - int small_pages = 1; - int large_pages = (LNET_MTU + CFS_PAGE_SIZE - 1) >> CFS_PAGE_SHIFT; + int nrbs = LNET_NRB_SMALL; - lnet_rtrpool_init(&the_lnet.ln_rtrpools[0], 0); - lnet_rtrpool_init(&the_lnet.ln_rtrpools[1], small_pages); - lnet_rtrpool_init(&the_lnet.ln_rtrpools[2], large_pages); + if (small_router_buffers < 0) { + LCONSOLE_ERROR_MSG(0x10c, + "small_router_buffers=%d invalid when " + "routing enabled\n", small_router_buffers); + return -1; + } + + if (small_router_buffers > 0) + nrbs = small_router_buffers; + + nrbs /= LNET_CPT_NUMBER; + return max(nrbs, LNET_NRB_SMALL_MIN); } +static int +lnet_nrb_large_calculate(int npages) +{ + int nrbs = LNET_NRB_LARGE; + + if (large_router_buffers < 0) { + LCONSOLE_ERROR_MSG(0x10c, + "large_router_buffers=%d invalid when " + "routing enabled\n", large_router_buffers); + return -1; + } + + if (large_router_buffers > 0) + nrbs = large_router_buffers; + + nrbs /= LNET_CPT_NUMBER; + return max(nrbs, LNET_NRB_LARGE_MIN); +} int -lnet_alloc_rtrpools(int im_a_router) +lnet_rtrpools_alloc(int im_a_router) { - int rc; + lnet_rtrbufpool_t *rtrp; + int large_pages = (LNET_MTU + CFS_PAGE_SIZE - 1) >> CFS_PAGE_SHIFT; + int small_pages = 1; + int nrb_tiny; + int nrb_small; + int nrb_large; + int rc; + int i; if (!strcmp(forwarding, "")) { /* not set either way */ @@ -1375,58 +1512,61 @@ lnet_alloc_rtrpools(int im_a_router) return -EINVAL; } - if (tiny_router_buffers <= 0) { - LCONSOLE_ERROR_MSG(0x10c, "tiny_router_buffers=%d invalid when " - "routing enabled\n", tiny_router_buffers); - rc = -EINVAL; - goto failed; - } - - rc = lnet_rtrpool_alloc_bufs(&the_lnet.ln_rtrpools[0], - tiny_router_buffers); - if (rc != 0) - goto failed; - - if (small_router_buffers <= 0) { - LCONSOLE_ERROR_MSG(0x10d, "small_router_buffers=%d invalid when" - " routing enabled\n", small_router_buffers); - rc = -EINVAL; - goto failed; - } - - rc = lnet_rtrpool_alloc_bufs(&the_lnet.ln_rtrpools[1], - small_router_buffers); - if (rc != 0) - goto failed; - - if (large_router_buffers <= 0) { - LCONSOLE_ERROR_MSG(0x10e, "large_router_buffers=%d invalid when" - " routing enabled\n", large_router_buffers); - rc = -EINVAL; - goto failed; - } + nrb_tiny = lnet_nrb_tiny_calculate(0); + if (nrb_tiny < 0) + return -EINVAL; + + nrb_small = lnet_nrb_small_calculate(small_pages); + if (nrb_small < 0) + return -EINVAL; + + nrb_large = lnet_nrb_large_calculate(large_pages); + if (nrb_large < 0) + return -EINVAL; + + the_lnet.ln_rtrpools = cfs_percpt_alloc(lnet_cpt_table(), + LNET_NRBPOOLS * + sizeof(lnet_rtrbufpool_t)); + if (the_lnet.ln_rtrpools == NULL) { + LCONSOLE_ERROR_MSG(0x10c, + "Failed to initialize router buffe pool\n"); + return -ENOMEM; + } - rc = lnet_rtrpool_alloc_bufs(&the_lnet.ln_rtrpools[2], - large_router_buffers); - if (rc != 0) - goto failed; + cfs_percpt_for_each(rtrp, i, the_lnet.ln_rtrpools) { + lnet_rtrpool_init(&rtrp[0], 0); + rc = lnet_rtrpool_alloc_bufs(&rtrp[0], nrb_tiny, i); + if (rc != 0) + goto failed; + + lnet_rtrpool_init(&rtrp[1], small_pages); + rc = lnet_rtrpool_alloc_bufs(&rtrp[1], nrb_small, i); + if (rc != 0) + goto failed; + + lnet_rtrpool_init(&rtrp[2], large_pages); + rc = lnet_rtrpool_alloc_bufs(&rtrp[2], nrb_large, i); + if (rc != 0) + goto failed; + } - LNET_LOCK(); - the_lnet.ln_routing = 1; - LNET_UNLOCK(); + lnet_net_lock(LNET_LOCK_EX); + the_lnet.ln_routing = 1; + lnet_net_unlock(LNET_LOCK_EX); - return 0; + return 0; failed: - lnet_free_rtrpools(); - return rc; + lnet_rtrpools_free(); + return rc; } int -lnet_notify (lnet_ni_t *ni, lnet_nid_t nid, int alive, cfs_time_t when) +lnet_notify(lnet_ni_t *ni, lnet_nid_t nid, int alive, cfs_time_t when) { - lnet_peer_t *lp = NULL; - cfs_time_t now = cfs_time_current(); + struct lnet_peer *lp = NULL; + cfs_time_t now = cfs_time_current(); + int cpt = lnet_cpt_of_nid(nid); LASSERT (!cfs_in_interrupt ()); @@ -1459,12 +1599,17 @@ lnet_notify (lnet_ni_t *ni, lnet_nid_t nid, int alive, cfs_time_t when) return 0; } - LNET_LOCK(); + lnet_net_lock(cpt); - lp = lnet_find_peer_locked(nid); - if (lp == NULL) { - /* nid not found */ - LNET_UNLOCK(); + if (the_lnet.ln_shutdown) { + lnet_net_unlock(cpt); + return -ESHUTDOWN; + } + + lp = lnet_find_peer_locked(the_lnet.ln_peer_tables[cpt], nid); + if (lp == NULL) { + /* nid not found */ + lnet_net_unlock(cpt); CDEBUG(D_NET, "%s not found\n", libcfs_nid2str(nid)); return 0; } @@ -1480,10 +1625,10 @@ lnet_notify (lnet_ni_t *ni, lnet_nid_t nid, int alive, cfs_time_t when) lnet_ni_notify_locked(ni, lp); - lnet_peer_decref_locked(lp); + lnet_peer_decref_locked(lp); - LNET_UNLOCK(); - return 0; + lnet_net_unlock(cpt); + return 0; } EXPORT_SYMBOL(lnet_notify); @@ -1525,12 +1670,14 @@ lnet_router_checker (void) live_router_check_interval, dead_router_check_interval, interval); - LNET_LOCK(); - LASSERT (!running); /* recursion check */ - running = 1; - LNET_UNLOCK(); + LASSERT(LNET_CPT_NUMBER == 1); - last = now; + lnet_net_lock(0); + LASSERT(!running); /* recursion check */ + running = 1; + lnet_net_unlock(0); + + last = now; if (the_lnet.ln_rc_state == LNET_RC_STATE_STOPPING) lnet_prune_rc_data(0); /* unlink all rcd and nowait */ @@ -1554,9 +1701,7 @@ lnet_router_checker (void) LASSERT (rc == 1); - LNET_LOCK(); lnet_router_checker_event(&ev); - LNET_UNLOCK(); } if (the_lnet.ln_rc_state == LNET_RC_STATE_STOPPING) { @@ -1568,7 +1713,7 @@ lnet_router_checker (void) LASSERT (the_lnet.ln_rc_state == LNET_RC_STATE_RUNNING); - LNET_LOCK(); + lnet_net_lock(0); version = the_lnet.ln_routers_version; cfs_list_for_each_entry (rtr, &the_lnet.ln_routers, lp_rtr_list) { @@ -1576,10 +1721,10 @@ lnet_router_checker (void) LASSERT (version == the_lnet.ln_routers_version); } - LNET_UNLOCK(); + lnet_net_unlock(0); - running = 0; /* lock only needed for the recursion check */ - return; + running = 0; /* lock only needed for the recursion check */ + return; } /* NB lnet_peers_start_down depends on me, @@ -1605,17 +1750,12 @@ lnet_get_tunables (void) } void -lnet_free_rtrpools (void) -{ -} - -void -lnet_init_rtrpools (void) +lnet_rtrpools_free(void) { } int -lnet_alloc_rtrpools (int im_a_arouter) +lnet_rtrpools_alloc(int im_a_arouter) { return 0; }