X-Git-Url: https://git.whamcloud.com/?p=fs%2Flustre-release.git;a=blobdiff_plain;f=lnet%2Flnet%2Fpeer.c;h=b373f49ee764e35da8244d345f5c70aba7c0df35;hp=01fcad3edc04d05a132c265e7fa4b295a166fcfc;hb=3c580c93b8d3e94fac0ac2cf3cca2ff706c6497a;hpb=2b5b551b15d96588f8f309b5a08c11cab203efeb diff --git a/lnet/lnet/peer.c b/lnet/lnet/peer.c index 01fcad3..b373f49 100644 --- a/lnet/lnet/peer.c +++ b/lnet/lnet/peer.c @@ -258,6 +258,10 @@ lnet_peer_alloc(lnet_nid_t nid) init_waitqueue_head(&lp->lp_dc_waitq); spin_lock_init(&lp->lp_lock); lp->lp_primary_nid = nid; + if (lnet_peers_start_down()) + lp->lp_alive = false; + else + lp->lp_alive = true; /* * all peers created on a router should have health on @@ -400,8 +404,6 @@ lnet_peer_ni_del_locked(struct lnet_peer_ni *lpni, bool force) /* decrement the ref count on the peer table */ ptable = the_lnet.ln_peer_tables[lpni->lpni_cpt]; - LASSERT(ptable->pt_number > 0); - ptable->pt_number--; /* * The peer_ni can no longer be found with a lookup. But there @@ -651,7 +653,8 @@ lnet_get_peer_ni_locked(struct lnet_peer_table *ptable, lnet_nid_t nid) struct list_head *peers; struct lnet_peer_ni *lp; - LASSERT(the_lnet.ln_state == LNET_STATE_RUNNING); + if (the_lnet.ln_state != LNET_STATE_RUNNING) + return NULL; peers = &ptable->pt_hash[lnet_nid2peerhash(nid)]; list_for_each_entry(lp, peers, lpni_hashlist) { @@ -882,6 +885,8 @@ lnet_push_update_to_peers(int force) int cpt; lnet_net_lock(LNET_LOCK_EX); + if (lnet_peer_discovery_disabled) + force = 0; lncpt = cfs_percpt_number(the_lnet.ln_peer_tables); for (cpt = 0; cpt < lncpt; cpt++) { ptable = the_lnet.ln_peer_tables[cpt]; @@ -1150,6 +1155,7 @@ lnet_peer_primary_nid_locked(lnet_nid_t nid) bool lnet_is_discovery_disabled_locked(struct lnet_peer *lp) +__must_hold(&lp->lp_lock) { if (lnet_peer_discovery_disabled) return true; @@ -1254,7 +1260,6 @@ lnet_peer_attach_peer_ni(struct lnet_peer *lp, ptable = the_lnet.ln_peer_tables[lpni->lpni_cpt]; list_add_tail(&lpni->lpni_hashlist, &ptable->pt_hash[hash]); ptable->pt_version++; - ptable->pt_number++; /* This is the 1st refcount on lpni. */ atomic_inc(&lpni->lpni_refcount); } @@ -1525,11 +1530,7 @@ lnet_peer_ni_traffic_add(lnet_nid_t nid, lnet_nid_t pref) struct lnet_peer *lp; struct lnet_peer_net *lpn; struct lnet_peer_ni *lpni; - /* - * Assume peer is Multi-Rail capable and let discovery find out - * otherwise. - */ - unsigned flags = LNET_PEER_MULTI_RAIL; + unsigned flags = 0; int rc = 0; if (nid == LNET_NID_ANY) { @@ -1842,6 +1843,17 @@ lnet_peer_gw_discovery(struct lnet_peer *lp) return rc; } +bool +lnet_peer_is_uptodate(struct lnet_peer *lp) +{ + bool rc; + + spin_lock(&lp->lp_lock); + rc = lnet_peer_is_uptodate_locked(lp); + spin_unlock(&lp->lp_lock); + return rc; +} + /* * Is a peer uptodate from the point of view of discovery? * @@ -1851,11 +1863,11 @@ lnet_peer_gw_discovery(struct lnet_peer *lp) * Otherwise look at whether the peer needs rediscovering. */ bool -lnet_peer_is_uptodate(struct lnet_peer *lp) +lnet_peer_is_uptodate_locked(struct lnet_peer *lp) +__must_hold(&lp->lp_lock) { bool rc; - spin_lock(&lp->lp_lock); if (lp->lp_state & (LNET_PEER_DISCOVERING | LNET_PEER_FORCE_PING | LNET_PEER_FORCE_PUSH)) { @@ -1872,7 +1884,6 @@ lnet_peer_is_uptodate(struct lnet_peer *lp) } else { rc = false; } - spin_unlock(&lp->lp_lock); return rc; } @@ -2150,6 +2161,7 @@ lnet_discover_peer_locked(struct lnet_peer_ni *lpni, int cpt, bool block) DEFINE_WAIT(wait); struct lnet_peer *lp; int rc = 0; + int count = 0; again: lnet_net_unlock(cpt); @@ -2169,11 +2181,21 @@ again: break; if (the_lnet.ln_dc_state != LNET_DC_STATE_RUNNING) break; + /* + * Don't repeat discovery if discovery is disabled. This is + * done to ensure we can use discovery as a standard ping as + * well for backwards compatibility with routers which do not + * have discovery or have discovery disabled + */ + if (lnet_is_discovery_disabled(lp) && count > 0) + break; if (lp->lp_dc_error) break; if (lnet_peer_is_uptodate(lp)) break; lnet_peer_queue_for_discovery(lp); + count++; + CDEBUG(D_NET, "Discovery attempt # %d\n", count); /* * If caller requested a non-blocking operation then @@ -2191,16 +2213,6 @@ again: lnet_peer_decref_locked(lp); /* Peer may have changed */ lp = lpni->lpni_peer_net->lpn_peer; - - /* - * Wait for discovery to complete, but don't repeat if - * discovery is disabled. This is done to ensure we can - * use discovery as a standard ping as well for backwards - * compatibility with routers which do not have discovery - * or have discovery disabled - */ - if (lnet_is_discovery_disabled(lp)) - break; } finish_wait(&lp->lp_dc_waitq, &wait); @@ -2298,20 +2310,6 @@ lnet_discovery_event_reply(struct lnet_peer *lp, struct lnet_event *ev) /* - * Only enable the multi-rail feature on the peer if both sides of - * the connection have discovery on - */ - if (pbuf->pb_info.pi_features & LNET_PING_FEAT_MULTI_RAIL) { - CDEBUG(D_NET, "Peer %s has Multi-Rail feature enabled\n", - libcfs_nid2str(lp->lp_primary_nid)); - lp->lp_state |= LNET_PEER_MULTI_RAIL; - } else { - CDEBUG(D_NET, "Peer %s has Multi-Rail feature disabled\n", - libcfs_nid2str(lp->lp_primary_nid)); - lp->lp_state &= ~LNET_PEER_MULTI_RAIL; - } - - /* * The peer may have discovery disabled at its end. Set * NO_DISCOVERY as appropriate. */ @@ -2333,22 +2331,24 @@ lnet_discovery_event_reply(struct lnet_peer *lp, struct lnet_event *ev) */ if (pbuf->pb_info.pi_features & LNET_PING_FEAT_MULTI_RAIL) { if (lp->lp_state & LNET_PEER_MULTI_RAIL) { - /* Everything's fine */ + CDEBUG(D_NET, "peer %s(%p) is MR\n", + libcfs_nid2str(lp->lp_primary_nid), lp); } else if (lp->lp_state & LNET_PEER_CONFIGURED) { CWARN("Reply says %s is Multi-Rail, DLC says not\n", libcfs_nid2str(lp->lp_primary_nid)); + } else if (lnet_peer_discovery_disabled) { + CDEBUG(D_NET, + "peer %s(%p) not MR: DD disabled locally\n", + libcfs_nid2str(lp->lp_primary_nid), lp); + } else if (lp->lp_state & LNET_PEER_NO_DISCOVERY) { + CDEBUG(D_NET, + "peer %s(%p) not MR: DD disabled remotely\n", + libcfs_nid2str(lp->lp_primary_nid), lp); } else { - /* - * if discovery is disabled then we don't want to - * update the state of the peer. All we'll do is - * update the peer_nis which were reported back in - * the initial ping - */ - - if (!lnet_is_discovery_disabled_locked(lp)) { - lp->lp_state |= LNET_PEER_MULTI_RAIL; - lnet_peer_clr_non_mr_pref_nids(lp); - } + CDEBUG(D_NET, "peer %s(%p) is MR capable\n", + libcfs_nid2str(lp->lp_primary_nid), lp); + lp->lp_state |= LNET_PEER_MULTI_RAIL; + lnet_peer_clr_non_mr_pref_nids(lp); } } else if (lp->lp_state & LNET_PEER_MULTI_RAIL) { if (lp->lp_state & LNET_PEER_CONFIGURED) { @@ -2411,6 +2411,15 @@ lnet_discovery_event_reply(struct lnet_peer *lp, struct lnet_event *ev) out: lp->lp_state &= ~LNET_PEER_PING_SENT; spin_unlock(&lp->lp_lock); + + lnet_net_lock(LNET_LOCK_EX); + /* + * If this peer is a gateway, call the routing callback to + * handle the ping reply + */ + if (lp->lp_rtr_refcount > 0) + lnet_router_discovery_ping_reply(lp); + lnet_net_unlock(LNET_LOCK_EX); } /* @@ -3273,6 +3282,8 @@ static int lnet_peer_discovery(void *arg) struct lnet_peer *lp; int rc; + wait_for_completion(&the_lnet.ln_started); + CDEBUG(D_NET, "started\n"); cfs_block_allsigs(); @@ -3445,7 +3456,14 @@ void lnet_peer_discovery_stop(void) LASSERT(the_lnet.ln_dc_state == LNET_DC_STATE_RUNNING); the_lnet.ln_dc_state = LNET_DC_STATE_STOPPING; - wake_up(&the_lnet.ln_dc_waitq); + + /* In the LNetNIInit() path we may be stopping discovery before it + * entered its work loop + */ + if (!completion_done(&the_lnet.ln_started)) + complete(&the_lnet.ln_started); + else + wake_up(&the_lnet.ln_dc_waitq); wait_event(the_lnet.ln_dc_waitq, the_lnet.ln_dc_state == LNET_DC_STATE_SHUTDOWN);