From: isaac Date: Tue, 1 Dec 2009 15:10:57 +0000 (+0000) Subject: i=liang,b=15332,b=21103: X-Git-Tag: GIT_EPOCH_B_HD_KDMU~41 X-Git-Url: https://git.whamcloud.com/?p=fs%2Flustre-release.git;a=commitdiff_plain;h=9e78291e0f1f0b12472d6a0b15eaf1aec8076ca1 i=liang,b=15332,b=21103: - LNet router shuffler. --- diff --git a/lnet/ChangeLog b/lnet/ChangeLog index ae041fa..0bc3834 100644 --- a/lnet/ChangeLog +++ b/lnet/ChangeLog @@ -19,6 +19,10 @@ Details : Severity : enhancement Bugzilla : 15332 +Description: LNet router shuffler. + +Severity : enhancement +Bugzilla : 15332 Description: LNet fine grain routing support. Severity : normal diff --git a/lnet/include/lnet/lib-lnet.h b/lnet/include/lnet/lib-lnet.h index 5332664..69b4829 100644 --- a/lnet/include/lnet/lib-lnet.h +++ b/lnet/include/lnet/lib-lnet.h @@ -674,9 +674,6 @@ void lnet_get_tunables(void); int lnet_peers_start_down(void); int lnet_peer_buffer_credits(lnet_ni_t *ni); -extern int router_ping_timeout; -extern int dead_router_check_interval; -extern int live_router_check_interval; int lnet_router_checker_start(void); void lnet_router_checker_stop(void); void lnet_swap_pinginfo(lnet_ping_info_t *info); diff --git a/lnet/lnet/lib-md.c b/lnet/lnet/lib-md.c index 6e64ad2..1d5eb7a 100644 --- a/lnet/lnet/lib-md.c +++ b/lnet/lnet/lib-md.c @@ -222,7 +222,7 @@ lnet_md_validate(lnet_md_t *umd) if ((umd->options & (LNET_MD_KIOV | LNET_MD_IOVEC)) != 0 && umd->length > LNET_MAX_IOV) { - CERROR("Invalid option: too many fragments %d, %d max\n", + CERROR("Invalid option: too many fragments %u, %d max\n", umd->length, LNET_MAX_IOV); return -EINVAL; } diff --git a/lnet/lnet/router.c b/lnet/lnet/router.c index a8f816c..9a231e4 100644 --- a/lnet/lnet/router.c +++ b/lnet/lnet/router.c @@ -81,15 +81,15 @@ static int avoid_asym_router_failure = 0; CFS_MODULE_PARM(avoid_asym_router_failure, "i", int, 0444, "Avoid asymmetrical failures: reserved, use at your own risk"); -int dead_router_check_interval = 0; +static int dead_router_check_interval = 0; CFS_MODULE_PARM(dead_router_check_interval, "i", int, 0444, "Seconds between dead router health checks (<= 0 to disable)"); -int live_router_check_interval = 0; +static int live_router_check_interval = 0; CFS_MODULE_PARM(live_router_check_interval, "i", int, 0444, "Seconds between live router health checks (<= 0 to disable)"); -int router_ping_timeout = 50; +static int router_ping_timeout = 50; CFS_MODULE_PARM(router_ping_timeout, "i", int, 0444, "Seconds to wait for the reply to a router health query"); @@ -235,6 +235,34 @@ lnet_find_net_locked (__u32 net) return NULL; } +/* NB expects LNET_LOCK held */ +void +lnet_add_route_to_rnet (lnet_remotenet_t *rnet, lnet_route_t *route) +{ + unsigned int len = 0; + unsigned int offset = 0; + struct list_head *e; + extern __u64 lnet_create_interface_cookie(void); + + list_for_each (e, &rnet->lrn_routes) { + len++; + } + + /* FIXME use Lustre random function when it's moved to libcfs. + * See bug 18751 */ + /* len+1 positions to add a new entry, also prevents division by 0 */ + offset = ((unsigned int) lnet_create_interface_cookie()) % (len + 1); + list_for_each (e, &rnet->lrn_routes) { + if (offset == 0) + break; + offset--; + } + list_add(&route->lr_list, e); + + the_lnet.ln_remote_nets_version++; + lnet_rtr_addref_locked(route->lr_gateway); +} + int lnet_add_route (__u32 net, unsigned int hops, lnet_nid_t gateway) { @@ -321,11 +349,7 @@ lnet_add_route (__u32 net, unsigned int hops, lnet_nid_t gateway) ni = route->lr_gateway->lp_ni; lnet_ni_addref_locked(ni); - list_add_tail(&route->lr_list, &rnet2->lrn_routes); - the_lnet.ln_remote_nets_version++; - - lnet_rtr_addref_locked(route->lr_gateway); - + lnet_add_route_to_rnet(rnet2, route); LNET_UNLOCK(); /* XXX Assume alive */ diff --git a/lnet/lnet/router_proc.c b/lnet/lnet/router_proc.c index fce91c2..76d82b3 100644 --- a/lnet/lnet/router_proc.c +++ b/lnet/lnet/router_proc.c @@ -168,9 +168,9 @@ int LL_PROC_PROTO(proc_lnet_routes) if (skip == 0) { route = re; break; - } else - skip--; + } + skip--; r = r->next; } @@ -264,10 +264,10 @@ int LL_PROC_PROTO(proc_lnet_routers) if (skip == 0) { peer = lp; - break; - } else - skip--; + break; + } + skip--; r = r->next; } @@ -280,8 +280,10 @@ int LL_PROC_PROTO(proc_lnet_routers) int alive_cnt = peer->lp_alive_count; int alive = peer->lp_alive; int pingsent = !peer->lp_ping_notsent; - int last_ping = cfs_duration_sec(now - peer->lp_ping_timestamp); - int down_ni = lnet_router_down_ni(peer, LNET_NIDNET(LNET_NID_ANY)); + int last_ping = cfs_duration_sec(cfs_time_sub(now, + peer->lp_ping_timestamp)); + int down_ni = lnet_router_down_ni(peer, + LNET_NIDNET(LNET_NID_ANY)); if (deadline == 0) s += snprintf(s, tmpstr + tmpsiz - s, @@ -296,7 +298,7 @@ int LL_PROC_PROTO(proc_lnet_routers) nrefs, nrtrrefs, alive_cnt, alive ? "up" : "down", last_ping, pingsent, - cfs_duration_sec(deadline - now), + cfs_duration_sec(cfs_time_sub(deadline, now)), down_ni, libcfs_nid2str(nid)); LASSERT (tmpstr + tmpsiz - s > 0); } @@ -411,13 +413,14 @@ int LL_PROC_PROTO(proc_lnet_peers) &the_lnet.ln_peer_hash[idx]) { num = 1; idx++; - } else + } else { num++; + } break; - } else - skip--; + } + skip--; p = lp->lp_hashlist.next; } @@ -572,14 +575,15 @@ int LL_PROC_PROTO(proc_lnet_nis) if (skip == 0) { ni = a_ni; break; - } else - skip--; + } + skip--; n = n->next; } if (ni != NULL) { cfs_time_t now = cfs_time_current(); + int last_alive = -1; int maxtxcr = ni->ni_maxtxcredits; int txcr = ni->ni_txcredits; int mintxcr = ni->ni_mintxcredits; @@ -587,11 +591,11 @@ int LL_PROC_PROTO(proc_lnet_nis) int npeerrtrcr = ni->ni_peerrtrcredits; lnet_nid_t nid = ni->ni_nid; int nref = ni->ni_refcount; - int last_alive; char *stat; - last_alive = (the_lnet.ln_routing) ? - cfs_duration_sec(now - ni->ni_last_alive) : -1; + if (the_lnet.ln_routing) + last_alive = cfs_duration_sec(cfs_time_sub(now, + ni->ni_last_alive)); if (ni->ni_lnd->lnd_type == LOLND) /* @lo forever alive */ last_alive = 0;