X-Git-Url: https://git.whamcloud.com/?p=fs%2Flustre-release.git;a=blobdiff_plain;f=lnet%2Flnet%2Fpeer.c;h=20ae5588b3eb87ee3f67156f6c51d448ba4254c7;hp=f2b081996031aa1dd134b1bc397919b4d2263b90;hb=9eb9474c41c823c70f34e6bb102a8861ca21a3d1;hpb=1cf929df259a9aaa5446a4cda637930ca5b27d7a diff --git a/lnet/lnet/peer.c b/lnet/lnet/peer.c index f2b0819..20ae558 100644 --- a/lnet/lnet/peer.c +++ b/lnet/lnet/peer.c @@ -27,7 +27,6 @@ */ /* * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. * * lnet/lnet/peer.c */ @@ -40,6 +39,7 @@ #endif #include +#include #include #include @@ -104,7 +104,7 @@ lnet_peer_tables_destroy(void) for (j = 0; j < LNET_PEER_HASH_SIZE; j++) LASSERT(list_empty(&hash[j])); - LIBCFS_FREE(hash, LNET_PEER_HASH_SIZE * sizeof(*hash)); + CFS_FREE_PTR_ARRAY(hash, LNET_PEER_HASH_SIZE); } cfs_percpt_free(the_lnet.ln_peer_tables); @@ -162,20 +162,25 @@ lnet_peer_ni_alloc(lnet_nid_t nid) return NULL; INIT_LIST_HEAD(&lpni->lpni_txq); - INIT_LIST_HEAD(&lpni->lpni_rtrq); - INIT_LIST_HEAD(&lpni->lpni_routes); INIT_LIST_HEAD(&lpni->lpni_hashlist); INIT_LIST_HEAD(&lpni->lpni_peer_nis); + INIT_LIST_HEAD(&lpni->lpni_recovery); INIT_LIST_HEAD(&lpni->lpni_on_remote_peer_ni_list); + INIT_LIST_HEAD(&lpni->lpni_rtr_pref_nids); + LNetInvalidateMDHandle(&lpni->lpni_recovery_ping_mdh); + kref_init(&lpni->lpni_kref); + lpni->lpni_sel_priority = LNET_MAX_SELECTION_PRIORITY; spin_lock_init(&lpni->lpni_lock); - lpni->lpni_alive = !lnet_peers_start_down(); /* 1 bit!! */ - lpni->lpni_last_alive = ktime_get_seconds(); /* assumes alive */ + if (lnet_peers_start_down()) + lpni->lpni_ns_status = LNET_NI_STATUS_DOWN; + else + lpni->lpni_ns_status = LNET_NI_STATUS_UP; lpni->lpni_ping_feats = LNET_PING_FEAT_INVAL; lpni->lpni_nid = nid; lpni->lpni_cpt = cpt; - lnet_set_peer_ni_health_locked(lpni, true); + atomic_set(&lpni->lpni_healthv, LNET_MAX_HEALTH_VALUE); net = lnet_get_net_locked(LNET_NIDNET(nid)); lpni->lpni_net = net; @@ -192,7 +197,7 @@ lnet_peer_ni_alloc(lnet_nid_t nid) * list so it can be easily found and revisited. */ /* FIXME: per-net implementation instead? */ - atomic_inc(&lpni->lpni_refcount); + lnet_peer_ni_addref_locked(lpni); list_add_tail(&lpni->lpni_on_remote_peer_ni_list, &the_lnet.ln_remote_peer_ni_list); } @@ -214,6 +219,7 @@ lnet_peer_net_alloc(__u32 net_id) INIT_LIST_HEAD(&lpn->lpn_peer_nets); INIT_LIST_HEAD(&lpn->lpn_peer_nis); lpn->lpn_net_id = net_id; + lpn->lpn_sel_priority = LNET_MAX_SELECTION_PRIORITY; CDEBUG(D_NET, "%p net %s\n", lpn, libcfs_net2str(lpn->lpn_net_id)); @@ -246,13 +252,29 @@ lnet_peer_alloc(lnet_nid_t nid) if (!lp) return NULL; + INIT_LIST_HEAD(&lp->lp_rtrq); + INIT_LIST_HEAD(&lp->lp_routes); INIT_LIST_HEAD(&lp->lp_peer_list); INIT_LIST_HEAD(&lp->lp_peer_nets); INIT_LIST_HEAD(&lp->lp_dc_list); INIT_LIST_HEAD(&lp->lp_dc_pendq); + INIT_LIST_HEAD(&lp->lp_rtr_list); init_waitqueue_head(&lp->lp_dc_waitq); spin_lock_init(&lp->lp_lock); lp->lp_primary_nid = nid; + lp->lp_disc_src_nid = LNET_NID_ANY; + if (lnet_peers_start_down()) + lp->lp_alive = false; + else + lp->lp_alive = true; + + /* + * all peers created on a router should have health on + * if it's not already on. + */ + if (the_lnet.ln_routing && !lnet_health_sensitivity) + lp->lp_health_sensitivity = 1; + /* * Turn off discovery for loopback peer. If you're creating a peer * for the loopback interface then that was initiated when we @@ -260,7 +282,7 @@ lnet_peer_alloc(lnet_nid_t nid) * to ever use a different interface when sending messages to * myself. 
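 * (NB: LNET_NID_LO_0 is the fixed 0@lo loopback NID, so only that exact
 * NID, not any NID on an LOLND network, is treated as loopback here.)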
*/ - if (LNET_NETTYP(LNET_NIDNET(nid)) == LOLND) + if (nid == LNET_NID_LO_0) lp->lp_state = LNET_PEER_NO_DISCOVERY; lp->lp_cpt = lnet_nid_cpt_hash(nid, LNET_CPT_NUMBER); @@ -275,6 +297,7 @@ lnet_destroy_peer_locked(struct lnet_peer *lp) CDEBUG(D_NET, "%p nid %s\n", lp, libcfs_nid2str(lp->lp_primary_nid)); LASSERT(atomic_read(&lp->lp_refcount) == 0); + LASSERT(lp->lp_rtr_refcount == 0); LASSERT(list_empty(&lp->lp_peer_nets)); LASSERT(list_empty(&lp->lp_peer_list)); LASSERT(list_empty(&lp->lp_dc_list)); @@ -295,7 +318,9 @@ lnet_destroy_peer_locked(struct lnet_peer *lp) * Releasing the lock can cause an inconsistent state */ spin_lock(&the_lnet.ln_msg_resend_lock); + spin_lock(&lp->lp_lock); list_splice(&lp->lp_dc_pendq, &the_lnet.ln_msg_resend); + spin_unlock(&lp->lp_lock); spin_unlock(&the_lnet.ln_msg_resend_lock); wake_up(&the_lnet.ln_dc_waitq); @@ -358,12 +383,12 @@ lnet_peer_detach_peer_ni_locked(struct lnet_peer_ni *lpni) /* called with lnet_net_lock LNET_LOCK_EX held */ static int -lnet_peer_ni_del_locked(struct lnet_peer_ni *lpni) +lnet_peer_ni_del_locked(struct lnet_peer_ni *lpni, bool force) { struct lnet_peer_table *ptable = NULL; /* don't remove a peer_ni if it's also a gateway */ - if (lpni->lpni_rtr_refcount > 0) { + if (lnet_isrouter(lpni) && !force) { CERROR("Peer NI %s is a gateway. Can not delete it\n", libcfs_nid2str(lpni->lpni_nid)); return -EBUSY; @@ -374,10 +399,16 @@ lnet_peer_ni_del_locked(struct lnet_peer_ni *lpni) /* remove peer ni from the hash list. */ list_del_init(&lpni->lpni_hashlist); + /* + * indicate the peer is being deleted so the monitor thread can + * remove it from the recovery queue. + */ + spin_lock(&lpni->lpni_lock); + lpni->lpni_state |= LNET_PEER_NI_DELETING; + spin_unlock(&lpni->lpni_lock); + /* decrement the ref count on the peer table */ ptable = the_lnet.ln_peer_tables[lpni->lpni_cpt]; - LASSERT(ptable->pt_number > 0); - ptable->pt_number--; /* * The peer_ni can no longer be found with a lookup. But there @@ -412,7 +443,7 @@ void lnet_peer_uninit(void) /* remove all peer_nis from the remote peer and the hash list */ list_for_each_entry_safe(lpni, tmp, &the_lnet.ln_remote_peer_ni_list, lpni_on_remote_peer_ni_list) - lnet_peer_ni_del_locked(lpni); + lnet_peer_ni_del_locked(lpni, false); lnet_peer_tables_destroy(); @@ -427,10 +458,14 @@ lnet_peer_del_locked(struct lnet_peer *peer) CDEBUG(D_NET, "peer %s\n", libcfs_nid2str(peer->lp_primary_nid)); + spin_lock(&peer->lp_lock); + peer->lp_state |= LNET_PEER_MARK_DELETED; + spin_unlock(&peer->lp_lock); + lpni = lnet_get_next_peer_ni_locked(peer, NULL, lpni); while (lpni != NULL) { lpni2 = lnet_get_next_peer_ni_locked(peer, NULL, lpni); - rc = lnet_peer_ni_del_locked(lpni); + rc = lnet_peer_ni_del_locked(lpni, false); if (rc != 0) rc2 = rc; lpni = lpni2; @@ -439,9 +474,41 @@ lnet_peer_del_locked(struct lnet_peer *peer) return rc2; } +/* + * Discovering this peer is taking too long. Cancel any Ping or Push + * that discovery is waiting on by unlinking the relevant MDs. The + * lnet_discovery_event_handler() will proceed from here and complete + * the cleanup. 
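+ * The MD handles are copied and the stored handles invalidated while
+ * holding lp_lock; the LNetMDUnlink() calls themselves are made only
+ * after the spinlock has been dropped.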
+ */ +static void lnet_peer_cancel_discovery(struct lnet_peer *lp) +{ + struct lnet_handle_md ping_mdh; + struct lnet_handle_md push_mdh; + + LNetInvalidateMDHandle(&ping_mdh); + LNetInvalidateMDHandle(&push_mdh); + + spin_lock(&lp->lp_lock); + if (lp->lp_state & LNET_PEER_PING_SENT) { + ping_mdh = lp->lp_ping_mdh; + LNetInvalidateMDHandle(&lp->lp_ping_mdh); + } + if (lp->lp_state & LNET_PEER_PUSH_SENT) { + push_mdh = lp->lp_push_mdh; + LNetInvalidateMDHandle(&lp->lp_push_mdh); + } + spin_unlock(&lp->lp_lock); + + if (!LNetMDHandleIsInvalid(ping_mdh)) + LNetMDUnlink(ping_mdh); + if (!LNetMDHandleIsInvalid(push_mdh)) + LNetMDUnlink(push_mdh); +} + static int lnet_peer_del(struct lnet_peer *peer) { + lnet_peer_cancel_discovery(peer); lnet_net_lock(LNET_LOCK_EX); lnet_peer_del_locked(peer); lnet_net_unlock(LNET_LOCK_EX); @@ -464,6 +531,7 @@ lnet_peer_del_nid(struct lnet_peer *lp, lnet_nid_t nid, unsigned flags) struct lnet_peer_ni *lpni; lnet_nid_t primary_nid = lp->lp_primary_nid; int rc = 0; + bool force = (flags & LNET_PEER_RTR_NI_FORCE_DEL) ? true : false; if (!(flags & LNET_PEER_CONFIGURED)) { if (lp->lp_state & LNET_PEER_CONFIGURED) { @@ -486,13 +554,22 @@ lnet_peer_del_nid(struct lnet_peer *lp, lnet_nid_t nid, unsigned flags) * This function only allows deletion of the primary NID if it * is the only NID. */ - if (nid == lp->lp_primary_nid && lp->lp_nnis != 1) { + if (nid == lp->lp_primary_nid && lp->lp_nnis != 1 && !force) { rc = -EBUSY; goto out; } lnet_net_lock(LNET_LOCK_EX); - lnet_peer_ni_del_locked(lpni); + + if (nid == lp->lp_primary_nid && lp->lp_nnis != 1 && force) { + struct lnet_peer_ni *lpni2; + /* assign the next peer_ni to be the primary */ + lpni2 = lnet_get_next_peer_ni_locked(lp, NULL, lpni); + LASSERT(lpni2); + lp->lp_primary_nid = lpni2->lpni_nid; + } + rc = lnet_peer_ni_del_locked(lpni, force); + lnet_net_unlock(LNET_LOCK_EX); out: @@ -519,7 +596,7 @@ lnet_peer_table_cleanup_locked(struct lnet_net *net, peer = lpni->lpni_peer_net->lpn_peer; if (peer->lp_primary_nid != lpni->lpni_nid) { - lnet_peer_ni_del_locked(lpni); + lnet_peer_ni_del_locked(lpni, false); continue; } /* @@ -540,22 +617,10 @@ lnet_peer_table_cleanup_locked(struct lnet_net *net, static void lnet_peer_ni_finalize_wait(struct lnet_peer_table *ptable) { - int i = 3; - - spin_lock(&ptable->pt_zombie_lock); - while (ptable->pt_zombies) { - spin_unlock(&ptable->pt_zombie_lock); - - if (is_power_of_2(i)) { - CDEBUG(D_WARNING, + wait_var_event_warning(&ptable->pt_zombies, + ptable->pt_zombies == 0, "Waiting for %d zombies on peer table\n", ptable->pt_zombies); - } - set_current_state(TASK_UNINTERRUPTIBLE); - schedule_timeout(cfs_time_seconds(1) >> 1); - spin_lock(&ptable->pt_zombie_lock); - } - spin_unlock(&ptable->pt_zombie_lock); } static void @@ -564,7 +629,7 @@ lnet_peer_table_del_rtrs_locked(struct lnet_net *net, { struct lnet_peer_ni *lp; struct lnet_peer_ni *tmp; - lnet_nid_t lpni_nid; + lnet_nid_t gw_nid; int i; for (i = 0; i < LNET_PEER_HASH_SIZE; i++) { @@ -573,13 +638,13 @@ lnet_peer_table_del_rtrs_locked(struct lnet_net *net, if (net != lp->lpni_net) continue; - if (lp->lpni_rtr_refcount == 0) + if (!lnet_isrouter(lp)) continue; - lpni_nid = lp->lpni_nid; + gw_nid = lp->lpni_peer_net->lpn_peer->lp_primary_nid; lnet_net_unlock(LNET_LOCK_EX); - lnet_del_route(LNET_NIDNET(LNET_NID_ANY), lpni_nid); + lnet_del_route(LNET_NET_ANY, gw_nid); lnet_net_lock(LNET_LOCK_EX); } } @@ -617,7 +682,8 @@ lnet_get_peer_ni_locked(struct lnet_peer_table *ptable, lnet_nid_t nid) struct list_head *peers; struct 
lnet_peer_ni *lp; - LASSERT(the_lnet.ln_state == LNET_STATE_RUNNING); + if (the_lnet.ln_state != LNET_STATE_RUNNING) + return NULL; peers = &ptable->pt_hash[lnet_nid2peerhash(nid)]; list_for_each_entry(lp, peers, lpni_hashlist) { @@ -645,6 +711,24 @@ lnet_find_peer_ni_locked(lnet_nid_t nid) return lpni; } +struct lnet_peer_ni * +lnet_peer_get_ni_locked(struct lnet_peer *lp, lnet_nid_t nid) +{ + struct lnet_peer_net *lpn; + struct lnet_peer_ni *lpni; + + lpn = lnet_peer_get_net_locked(lp, LNET_NIDNET(nid)); + if (!lpn) + return NULL; + + list_for_each_entry(lpni, &lpn->lpn_peer_nis, lpni_peer_nis) { + if (lpni->lpni_nid == nid) + return lpni; + } + + return NULL; +} + struct lnet_peer * lnet_find_peer(lnet_nid_t nid) { @@ -664,6 +748,39 @@ lnet_find_peer(lnet_nid_t nid) return lp; } +struct lnet_peer_net * +lnet_get_next_peer_net_locked(struct lnet_peer *lp, __u32 prev_lpn_id) +{ + struct lnet_peer_net *net; + + if (!prev_lpn_id) { + /* no net id provided return the first net */ + net = list_first_entry_or_null(&lp->lp_peer_nets, + struct lnet_peer_net, + lpn_peer_nets); + + return net; + } + + /* find the net after the one provided */ + list_for_each_entry(net, &lp->lp_peer_nets, lpn_peer_nets) { + if (net->lpn_net_id == prev_lpn_id) { + /* + * if we reached the end of the list loop to the + * beginning. + */ + if (net->lpn_peer_nets.next == &lp->lp_peer_nets) + return list_first_entry_or_null(&lp->lp_peer_nets, + struct lnet_peer_net, + lpn_peer_nets); + else + return list_next_entry(net, lpn_peer_nets); + } + } + + return NULL; +} + struct lnet_peer_ni * lnet_get_next_peer_ni_locked(struct lnet_peer *peer, struct lnet_peer_net *peer_net, @@ -797,6 +914,8 @@ lnet_push_update_to_peers(int force) int cpt; lnet_net_lock(LNET_LOCK_EX); + if (lnet_peer_discovery_disabled) + force = 0; lncpt = cfs_percpt_number(the_lnet.ln_peer_tables); for (cpt = 0; cpt < lncpt; cpt++) { ptable = the_lnet.ln_peer_tables[cpt]; @@ -815,6 +934,94 @@ lnet_push_update_to_peers(int force) wake_up(&the_lnet.ln_dc_waitq); } +/* find the NID in the preferred gateways for the remote peer + * return: + * false: list is not empty and NID is not preferred + * false: list is empty + * true: nid is found in the list + */ +bool +lnet_peer_is_pref_rtr_locked(struct lnet_peer_ni *lpni, + lnet_nid_t gw_nid) +{ + struct lnet_nid_list *ne; + + CDEBUG(D_NET, "%s: rtr pref emtpy: %d\n", + libcfs_nid2str(lpni->lpni_nid), + list_empty(&lpni->lpni_rtr_pref_nids)); + + if (list_empty(&lpni->lpni_rtr_pref_nids)) + return false; + + /* iterate through all the preferred NIDs and see if any of them + * matches the provided gw_nid + */ + list_for_each_entry(ne, &lpni->lpni_rtr_pref_nids, nl_list) { + CDEBUG(D_NET, "Comparing pref %s with gw %s\n", + libcfs_nid2str(ne->nl_nid), + libcfs_nid2str(gw_nid)); + if (ne->nl_nid == gw_nid) + return true; + } + + return false; +} + +void +lnet_peer_clr_pref_rtrs(struct lnet_peer_ni *lpni) +{ + struct list_head zombies; + struct lnet_nid_list *ne; + struct lnet_nid_list *tmp; + int cpt = lpni->lpni_cpt; + + INIT_LIST_HEAD(&zombies); + + lnet_net_lock(cpt); + list_splice_init(&lpni->lpni_rtr_pref_nids, &zombies); + lnet_net_unlock(cpt); + + list_for_each_entry_safe(ne, tmp, &zombies, nl_list) { + list_del(&ne->nl_list); + LIBCFS_FREE(ne, sizeof(*ne)); + } +} + +int +lnet_peer_add_pref_rtr(struct lnet_peer_ni *lpni, + lnet_nid_t gw_nid) +{ + int cpt = lpni->lpni_cpt; + struct lnet_nid_list *ne = NULL; + + /* This function is called with api_mutex held. 
When the api_mutex + * is held the list can not be modified, as it is only modified as + * a result of applying a UDSP and that happens under api_mutex + * lock. + */ + __must_hold(&the_lnet.ln_api_mutex); + + list_for_each_entry(ne, &lpni->lpni_rtr_pref_nids, nl_list) { + if (ne->nl_nid == gw_nid) + return -EEXIST; + } + + LIBCFS_CPT_ALLOC(ne, lnet_cpt_table(), cpt, sizeof(*ne)); + if (!ne) + return -ENOMEM; + + ne->nl_nid = gw_nid; + + /* Lock the cpt to protect against addition and checks in the + * selection algorithm + */ + lnet_net_lock(cpt); + list_add(&ne->nl_list, &lpni->lpni_rtr_pref_nids); + lnet_net_unlock(cpt); + + return 0; +} + /* * Test whether a ni is a preferred ni for this peer_ni, e.g, whether * this is a preferred point-to-point path. Call with lnet_net_lock in @@ -823,14 +1030,14 @@ lnet_push_update_to_peers(int force) bool lnet_peer_is_pref_nid_locked(struct lnet_peer_ni *lpni, lnet_nid_t nid) { - int i; + struct lnet_nid_list *ne; if (lpni->lpni_pref_nnids == 0) return false; if (lpni->lpni_pref_nnids == 1) return lpni->lpni_pref.nid == nid; - for (i = 0; i < lpni->lpni_pref_nnids; i++) { - if (lpni->lpni_pref.nids[i] == nid) + list_for_each_entry(ne, &lpni->lpni_pref.nids, nl_list) { + if (ne->nl_nid == nid) return true; } return false; @@ -887,6 +1094,12 @@ lnet_peer_ni_clr_non_mr_pref_nid(struct lnet_peer_ni *lpni) return rc; } +void +lnet_peer_ni_set_selection_priority(struct lnet_peer_ni *lpni, __u32 priority) +{ + lpni->lpni_sel_priority = priority; +} + /* * Clear the preferred NIDs from a non-multi-rail peer. */ @@ -902,11 +1115,10 @@ lnet_peer_clr_non_mr_pref_nids(struct lnet_peer *lp) int lnet_peer_add_pref_nid(struct lnet_peer_ni *lpni, lnet_nid_t nid) { - lnet_nid_t *nids = NULL; - lnet_nid_t *oldnids = NULL; struct lnet_peer *lp = lpni->lpni_peer_net->lpn_peer; - int size; - int i; + struct lnet_nid_list *ne1 = NULL; + struct lnet_nid_list *ne2 = NULL; + lnet_nid_t tmp_nid = LNET_NID_ANY; int rc = 0; if (nid == LNET_NID_ANY) { @@ -920,29 +1132,47 @@ lnet_peer_add_pref_nid(struct lnet_peer_ni *lpni, lnet_nid_t nid) } /* A non-MR node may have only one preferred NI per peer_ni */ - if (lpni->lpni_pref_nnids > 0) { - if (!(lp->lp_state & LNET_PEER_MULTI_RAIL)) { - rc = -EPERM; - goto out; - } + if (lpni->lpni_pref_nnids > 0 && + !(lp->lp_state & LNET_PEER_MULTI_RAIL)) { + rc = -EPERM; + goto out; } + /* add the new preferred nid to the list of preferred nids */ if (lpni->lpni_pref_nnids != 0) { - size = sizeof(*nids) * (lpni->lpni_pref_nnids + 1); - LIBCFS_CPT_ALLOC(nids, lnet_cpt_table(), lpni->lpni_cpt, size); - if (!nids) { + size_t alloc_size = sizeof(*ne1); + + if (lpni->lpni_pref_nnids == 1) { + tmp_nid = lpni->lpni_pref.nid; + INIT_LIST_HEAD(&lpni->lpni_pref.nids); + } + + list_for_each_entry(ne1, &lpni->lpni_pref.nids, nl_list) { + if (ne1->nl_nid == nid) { + rc = -EEXIST; + goto out; + } + } + + LIBCFS_CPT_ALLOC(ne1, lnet_cpt_table(), lpni->lpni_cpt, + alloc_size); + if (!ne1) { rc = -ENOMEM; goto out; } - for (i = 0; i < lpni->lpni_pref_nnids; i++) { - if (lpni->lpni_pref.nids[i] == nid) { - LIBCFS_FREE(nids, size); - rc = -EEXIST; + + /* move the originally stored nid to the list */ + if (lpni->lpni_pref_nnids == 1) { + LIBCFS_CPT_ALLOC(ne2, lnet_cpt_table(), + lpni->lpni_cpt, alloc_size); + if (!ne2) { + rc = -ENOMEM; goto out; } - nids[i] = lpni->lpni_pref.nids[i]; + INIT_LIST_HEAD(&ne2->nl_list); + ne2->nl_nid = tmp_nid; } - nids[i] = nid; + ne1->nl_nid = nid; } lnet_net_lock(LNET_LOCK_EX); @@ -950,18 +1180,15 @@ lnet_peer_add_pref_nid(struct 
lnet_peer_ni *lpni, lnet_nid_t nid) if (lpni->lpni_pref_nnids == 0) { lpni->lpni_pref.nid = nid; } else { - oldnids = lpni->lpni_pref.nids; - lpni->lpni_pref.nids = nids; + if (ne2) + list_add_tail(&ne2->nl_list, &lpni->lpni_pref.nids); + list_add_tail(&ne1->nl_list, &lpni->lpni_pref.nids); } lpni->lpni_pref_nnids++; lpni->lpni_state &= ~LNET_PEER_NI_NON_MR_PREF; spin_unlock(&lpni->lpni_lock); lnet_net_unlock(LNET_LOCK_EX); - if (oldnids) { - size = sizeof(*nids) * (lpni->lpni_pref_nnids - 1); - LIBCFS_FREE(oldnids, sizeof(*oldnids) * size); - } out: if (rc == -EEXIST && (lpni->lpni_state & LNET_PEER_NI_NON_MR_PREF)) { spin_lock(&lpni->lpni_lock); @@ -976,11 +1203,8 @@ out: int lnet_peer_del_pref_nid(struct lnet_peer_ni *lpni, lnet_nid_t nid) { - lnet_nid_t *nids = NULL; - lnet_nid_t *oldnids = NULL; struct lnet_peer *lp = lpni->lpni_peer_net->lpn_peer; - int size; - int i, j; + struct lnet_nid_list *ne = NULL; int rc = 0; if (lpni->lpni_pref_nnids == 0) { @@ -993,61 +1217,71 @@ lnet_peer_del_pref_nid(struct lnet_peer_ni *lpni, lnet_nid_t nid) rc = -ENOENT; goto out; } - } else if (lpni->lpni_pref_nnids == 2) { - if (lpni->lpni_pref.nids[0] != nid && - lpni->lpni_pref.nids[1] != nid) { - rc = -ENOENT; - goto out; - } } else { - size = sizeof(*nids) * (lpni->lpni_pref_nnids - 1); - LIBCFS_CPT_ALLOC(nids, lnet_cpt_table(), lpni->lpni_cpt, size); - if (!nids) { - rc = -ENOMEM; - goto out; - } - for (i = 0, j = 0; i < lpni->lpni_pref_nnids; i++) { - if (lpni->lpni_pref.nids[i] != nid) - continue; - nids[j++] = lpni->lpni_pref.nids[i]; - } - /* Check if we actually removed a nid. */ - if (j == lpni->lpni_pref_nnids) { - LIBCFS_FREE(nids, size); - rc = -ENOENT; - goto out; + list_for_each_entry(ne, &lpni->lpni_pref.nids, nl_list) { + if (ne->nl_nid == nid) + goto remove_nid_entry; } + rc = -ENOENT; + ne = NULL; + goto out; } +remove_nid_entry: lnet_net_lock(LNET_LOCK_EX); spin_lock(&lpni->lpni_lock); - if (lpni->lpni_pref_nnids == 1) { + if (lpni->lpni_pref_nnids == 1) lpni->lpni_pref.nid = LNET_NID_ANY; - } else if (lpni->lpni_pref_nnids == 2) { - oldnids = lpni->lpni_pref.nids; - if (oldnids[0] == nid) - lpni->lpni_pref.nid = oldnids[1]; - else - lpni->lpni_pref.nid = oldnids[2]; - } else { - oldnids = lpni->lpni_pref.nids; - lpni->lpni_pref.nids = nids; + else { + list_del_init(&ne->nl_list); + if (lpni->lpni_pref_nnids == 2) { + struct lnet_nid_list *ne, *tmp; + + list_for_each_entry_safe(ne, tmp, + &lpni->lpni_pref.nids, + nl_list) { + lpni->lpni_pref.nid = ne->nl_nid; + list_del_init(&ne->nl_list); + LIBCFS_FREE(ne, sizeof(*ne)); + } + } } lpni->lpni_pref_nnids--; lpni->lpni_state &= ~LNET_PEER_NI_NON_MR_PREF; spin_unlock(&lpni->lpni_lock); lnet_net_unlock(LNET_LOCK_EX); - if (oldnids) { - size = sizeof(*nids) * (lpni->lpni_pref_nnids + 1); - LIBCFS_FREE(oldnids, sizeof(*oldnids) * size); - } + if (ne) + LIBCFS_FREE(ne, sizeof(*ne)); out: CDEBUG(D_NET, "peer %s nid %s: %d\n", libcfs_nid2str(lp->lp_primary_nid), libcfs_nid2str(nid), rc); return rc; } +void +lnet_peer_clr_pref_nids(struct lnet_peer_ni *lpni) +{ + struct list_head zombies; + struct lnet_nid_list *ne; + struct lnet_nid_list *tmp; + + INIT_LIST_HEAD(&zombies); + + lnet_net_lock(LNET_LOCK_EX); + if (lpni->lpni_pref_nnids == 1) + lpni->lpni_pref.nid = LNET_NID_ANY; + else if (lpni->lpni_pref_nnids > 1) + list_splice_init(&lpni->lpni_pref.nids, &zombies); + lpni->lpni_pref_nnids = 0; + lnet_net_unlock(LNET_LOCK_EX); + + list_for_each_entry_safe(ne, tmp, &zombies, nl_list) { + list_del_init(&ne->nl_list); + LIBCFS_FREE(ne, 
sizeof(*ne)); + } +} + lnet_nid_t lnet_peer_primary_nid_locked(lnet_nid_t nid) { @@ -1063,6 +1297,36 @@ lnet_peer_primary_nid_locked(lnet_nid_t nid) return primary_nid; } +bool +lnet_is_discovery_disabled_locked(struct lnet_peer *lp) +__must_hold(&lp->lp_lock) +{ + if (lnet_peer_discovery_disabled) + return true; + + if (!(lp->lp_state & LNET_PEER_MULTI_RAIL) || + (lp->lp_state & LNET_PEER_NO_DISCOVERY)) { + return true; + } + + return false; +} + +/* + * Peer Discovery + */ +bool +lnet_is_discovery_disabled(struct lnet_peer *lp) +{ + bool rc = false; + + spin_lock(&lp->lp_lock); + rc = lnet_is_discovery_disabled_locked(lp); + spin_unlock(&lp->lp_lock); + + return rc; +} + lnet_nid_t LNetPrimaryNID(lnet_nid_t nid) { @@ -1072,6 +1336,9 @@ LNetPrimaryNID(lnet_nid_t nid) int rc = 0; int cpt; + if (nid == LNET_NID_LO_0) + return LNET_NID_LO_0; + cpt = lnet_net_lock_current(); lpni = lnet_nid2peerni_locked(nid, LNET_NID_ANY, cpt); if (IS_ERR(lpni)) { @@ -1079,11 +1346,31 @@ LNetPrimaryNID(lnet_nid_t nid) goto out_unlock; } lp = lpni->lpni_peer_net->lpn_peer; + while (!lnet_peer_is_uptodate(lp)) { + spin_lock(&lp->lp_lock); + /* force a full discovery cycle */ + lp->lp_state |= LNET_PEER_FORCE_PING | LNET_PEER_FORCE_PUSH; + spin_unlock(&lp->lp_lock); + rc = lnet_discover_peer_locked(lpni, cpt, true); if (rc) goto out_decref; + /* The lpni (or lp) for this NID may have changed and our ref is + * the only thing keeping the old one around. Release the ref + * and lookup the lpni again + */ + lnet_peer_ni_decref_locked(lpni); + lpni = lnet_find_peer_ni_locked(nid); + if (!lpni) { + rc = -ENOENT; + goto out_unlock; + } lp = lpni->lpni_peer_net->lpn_peer; + + /* Only try once if discovery is disabled */ + if (lnet_is_discovery_disabled(lp)) + break; } primary_nid = lp->lp_primary_nid; out_decref: @@ -1114,9 +1401,9 @@ lnet_peer_get_net_locked(struct lnet_peer *peer, __u32 net_id) * may be attached to a different peer, in which case it will be * properly detached first. The whole operation is done atomically. * - * Always returns 0. This is the last function called from functions - * that do return an int, so returning 0 here allows the compiler to - * do a tail call. + * This function consumes the reference on lpni and Always returns 0. + * This is the last function called from functions that do return an + * int, so returning 0 here allows the compiler to do a tail call. */ static int lnet_peer_attach_peer_ni(struct lnet_peer *lp, @@ -1125,6 +1412,8 @@ lnet_peer_attach_peer_ni(struct lnet_peer *lp, unsigned flags) { struct lnet_peer_table *ptable; + bool new_lpn = false; + int rc; /* Install the new peer_ni */ lnet_net_lock(LNET_LOCK_EX); @@ -1135,9 +1424,7 @@ lnet_peer_attach_peer_ni(struct lnet_peer *lp, ptable = the_lnet.ln_peer_tables[lpni->lpni_cpt]; list_add_tail(&lpni->lpni_hashlist, &ptable->pt_hash[hash]); ptable->pt_version++; - ptable->pt_number++; - /* This is the 1st refcount on lpni. */ - atomic_inc(&lpni->lpni_refcount); + lnet_peer_ni_addref_locked(lpni); } /* Detach the peer_ni from an existing peer, if necessary. 
*/ @@ -1151,13 +1438,21 @@ lnet_peer_attach_peer_ni(struct lnet_peer *lp, /* Add peer_ni to peer_net */ lpni->lpni_peer_net = lpn; - list_add_tail(&lpni->lpni_peer_nis, &lpn->lpn_peer_nis); + if (lp->lp_primary_nid == lpni->lpni_nid) + list_add(&lpni->lpni_peer_nis, &lpn->lpn_peer_nis); + else + list_add_tail(&lpni->lpni_peer_nis, &lpn->lpn_peer_nis); + lnet_update_peer_net_healthv(lpni); lnet_peer_net_addref_locked(lpn); /* Add peer_net to peer */ if (!lpn->lpn_peer) { + new_lpn = true; lpn->lpn_peer = lp; - list_add_tail(&lpn->lpn_peer_nets, &lp->lp_peer_nets); + if (lp->lp_primary_nid == lpni->lpni_nid) + list_add(&lpn->lpn_peer_nets, &lp->lp_peer_nets); + else + list_add_tail(&lpn->lpn_peer_nets, &lp->lp_peer_nets); lnet_peer_addref_locked(lp); } @@ -1184,11 +1479,24 @@ lnet_peer_attach_peer_ni(struct lnet_peer *lp, spin_unlock(&lp->lp_lock); lp->lp_nnis++; - lnet_net_unlock(LNET_LOCK_EX); + + /* apply UDSPs */ + if (new_lpn) { + rc = lnet_udsp_apply_policies_on_lpn(lpn); + if (rc) + CERROR("Failed to apply UDSPs on lpn %s\n", + libcfs_net2str(lpn->lpn_net_id)); + } + rc = lnet_udsp_apply_policies_on_lpni(lpni); + if (rc) + CERROR("Failed to apply UDSPs on lpni %s\n", + libcfs_nid2str(lpni->lpni_nid)); CDEBUG(D_NET, "peer %s NID %s flags %#x\n", libcfs_nid2str(lp->lp_primary_nid), libcfs_nid2str(lpni->lpni_nid), flags); + lnet_peer_ni_decref_locked(lpni); + lnet_net_unlock(LNET_LOCK_EX); return 0; } @@ -1309,27 +1617,39 @@ lnet_peer_add_nid(struct lnet_peer *lp, lnet_nid_t nid, unsigned flags) * it is not connected to this peer and was configured * by DLC. */ - lnet_peer_ni_decref_locked(lpni); if (lpni->lpni_peer_net->lpn_peer == lp) - goto out; + goto out_free_lpni; if (lnet_peer_ni_is_configured(lpni)) { rc = -EEXIST; - goto out; + goto out_free_lpni; } /* If this is the primary NID, destroy the peer. 
*/ if (lnet_peer_ni_is_primary(lpni)) { + struct lnet_peer *rtr_lp = + lpni->lpni_peer_net->lpn_peer; + int rtr_refcount = rtr_lp->lp_rtr_refcount; + /* + * if we're trying to delete a router it means + * we're moving this peer NI to a new peer so must + * transfer router properties to the new peer + */ + if (rtr_refcount > 0) { + flags |= LNET_PEER_RTR_NI_FORCE_DEL; + lnet_rtr_transfer_to_peer(rtr_lp, lp); + } lnet_peer_del(lpni->lpni_peer_net->lpn_peer); + lnet_peer_ni_decref_locked(lpni); lpni = lnet_peer_ni_alloc(nid); if (!lpni) { rc = -ENOMEM; - goto out; + goto out_free_lpni; } } } else { lpni = lnet_peer_ni_alloc(nid); if (!lpni) { rc = -ENOMEM; - goto out; + goto out_free_lpni; } } @@ -1352,9 +1672,7 @@ lnet_peer_add_nid(struct lnet_peer *lp, lnet_nid_t nid, unsigned flags) return lnet_peer_attach_peer_ni(lp, lpn, lpni, flags); out_free_lpni: - /* If the peer_ni was allocated above its peer_net pointer is NULL */ - if (!lpni->lpni_peer_net) - LIBCFS_FREE(lpni, sizeof(*lpni)); + lnet_peer_ni_decref_locked(lpni); out: CDEBUG(D_NET, "peer %s NID %s flags %#x: %d\n", libcfs_nid2str(lp->lp_primary_nid), libcfs_nid2str(nid), @@ -1375,10 +1693,14 @@ lnet_peer_set_primary_nid(struct lnet_peer *lp, lnet_nid_t nid, unsigned flags) if (lp->lp_primary_nid == nid) goto out; + + lp->lp_primary_nid = nid; + rc = lnet_peer_add_nid(lp, nid, flags); - if (rc) + if (rc) { + lp->lp_primary_nid = old; goto out; - lp->lp_primary_nid = nid; + } out: CDEBUG(D_NET, "peer %s NID %s: %d\n", libcfs_nid2str(old), libcfs_nid2str(nid), rc); @@ -1540,6 +1862,15 @@ lnet_del_peer_ni(lnet_nid_t prim_nid, lnet_nid_t nid) return -ENODEV; } + lnet_net_lock(LNET_LOCK_EX); + if (lp->lp_rtr_refcount > 0) { + lnet_net_unlock(LNET_LOCK_EX); + CERROR("%s is a router. Can not be deleted\n", + libcfs_nid2str(prim_nid)); + return -EBUSY; + } + lnet_net_unlock(LNET_LOCK_EX); + if (nid == LNET_NID_ANY || nid == lp->lp_primary_nid) return lnet_peer_del(lp); @@ -1551,15 +1882,16 @@ lnet_del_peer_ni(lnet_nid_t prim_nid, lnet_nid_t nid) } void -lnet_destroy_peer_ni_locked(struct lnet_peer_ni *lpni) +lnet_destroy_peer_ni_locked(struct kref *ref) { + struct lnet_peer_ni *lpni = container_of(ref, struct lnet_peer_ni, + lpni_kref); struct lnet_peer_table *ptable; struct lnet_peer_net *lpn; CDEBUG(D_NET, "%p nid %s\n", lpni, libcfs_nid2str(lpni->lpni_nid)); - LASSERT(atomic_read(&lpni->lpni_refcount) == 0); - LASSERT(lpni->lpni_rtr_refcount == 0); + LASSERT(kref_read(&lpni->lpni_kref) == 0); LASSERT(list_empty(&lpni->lpni_txq)); LASSERT(lpni->lpni_txqnob == 0); LASSERT(list_empty(&lpni->lpni_peer_nis)); @@ -1569,20 +1901,28 @@ lnet_destroy_peer_ni_locked(struct lnet_peer_ni *lpni) lpni->lpni_peer_net = NULL; lpni->lpni_net = NULL; - /* remove the peer ni from the zombie list */ - ptable = the_lnet.ln_peer_tables[lpni->lpni_cpt]; - spin_lock(&ptable->pt_zombie_lock); - list_del_init(&lpni->lpni_hashlist); - ptable->pt_zombies--; - spin_unlock(&ptable->pt_zombie_lock); + if (!list_empty(&lpni->lpni_hashlist)) { + /* remove the peer ni from the zombie list */ + ptable = the_lnet.ln_peer_tables[lpni->lpni_cpt]; + spin_lock(&ptable->pt_zombie_lock); + list_del_init(&lpni->lpni_hashlist); + ptable->pt_zombies--; + spin_unlock(&ptable->pt_zombie_lock); + } if (lpni->lpni_pref_nnids > 1) { - LIBCFS_FREE(lpni->lpni_pref.nids, - sizeof(*lpni->lpni_pref.nids) * lpni->lpni_pref_nnids); + struct lnet_nid_list *ne, *tmp; + + list_for_each_entry_safe(ne, tmp, &lpni->lpni_pref.nids, + nl_list) { + list_del_init(&ne->nl_list); + LIBCFS_FREE(ne, 
sizeof(*ne)); + } } LIBCFS_FREE(lpni, sizeof(*lpni)); - lnet_peer_net_decref_locked(lpn); + if (lpn) + lnet_peer_net_decref_locked(lpn); } struct lnet_peer_ni * @@ -1686,9 +2026,29 @@ out_mutex_unlock: return lpni; } -/* - * Peer Discovery - */ +bool +lnet_peer_gw_discovery(struct lnet_peer *lp) +{ + bool rc = false; + + spin_lock(&lp->lp_lock); + if (lp->lp_state & LNET_PEER_RTR_DISCOVERY) + rc = true; + spin_unlock(&lp->lp_lock); + + return rc; +} + +bool +lnet_peer_is_uptodate(struct lnet_peer *lp) +{ + bool rc; + + spin_lock(&lp->lp_lock); + rc = lnet_peer_is_uptodate_locked(lp); + spin_unlock(&lp->lp_lock); + return rc; +} /* * Is a peer uptodate from the point of view of discovery? @@ -1699,22 +2059,17 @@ out_mutex_unlock: * Otherwise look at whether the peer needs rediscovering. */ bool -lnet_peer_is_uptodate(struct lnet_peer *lp) +lnet_peer_is_uptodate_locked(struct lnet_peer *lp) +__must_hold(&lp->lp_lock) { bool rc; - spin_lock(&lp->lp_lock); if (lp->lp_state & (LNET_PEER_DISCOVERING | LNET_PEER_FORCE_PING | LNET_PEER_FORCE_PUSH)) { rc = false; - } else if (lp->lp_state & LNET_PEER_NO_DISCOVERY) { - rc = true; } else if (lp->lp_state & LNET_PEER_REDISCOVER) { - if (lnet_peer_discovery_disabled) - rc = true; - else - rc = false; + rc = false; } else if (lnet_peer_needs_push(lp)) { rc = false; } else if (lp->lp_state & LNET_PEER_DISCOVERED) { @@ -1725,11 +2080,30 @@ lnet_peer_is_uptodate(struct lnet_peer *lp) } else { rc = false; } - spin_unlock(&lp->lp_lock); return rc; } +/* Add the message to the peer's lp_dc_pendq and queue the peer for discovery */ +void +lnet_peer_queue_message(struct lnet_peer *lp, struct lnet_msg *msg) +{ + /* The discovery thread holds net_lock/EX and lp_lock when it splices + * the lp_dc_pendq onto a local list for resending. Thus, we do the same + * when adding to the list and queuing the peer to ensure that we do not + * strand any messages on the lp_dc_pendq. This scheme ensures the + * message will be resent even if the peer is already being discovered. + * Therefore we needn't check the return value of + * lnet_peer_queue_for_discovery(lp). + */ + lnet_net_lock(LNET_LOCK_EX); + spin_lock(&lp->lp_lock); + list_add_tail(&msg->msg_list, &lp->lp_dc_pendq); + spin_unlock(&lp->lp_lock); + lnet_peer_queue_for_discovery(lp); + lnet_net_unlock(LNET_LOCK_EX); +} + /* * Queue a peer for the attention of the discovery thread. Call with * lnet_net_lock/EX held. Returns 0 if the peer was queued, and @@ -1766,16 +2140,19 @@ static void lnet_peer_discovery_complete(struct lnet_peer *lp) { struct lnet_msg *msg, *tmp; int rc = 0; - struct list_head pending_msgs; - - INIT_LIST_HEAD(&pending_msgs); + LIST_HEAD(pending_msgs); CDEBUG(D_NET, "Discovery complete. Dequeue peer %s\n", libcfs_nid2str(lp->lp_primary_nid)); list_del_init(&lp->lp_dc_list); + spin_lock(&lp->lp_lock); list_splice_init(&lp->lp_dc_pendq, &pending_msgs); - wake_up_all(&lp->lp_dc_waitq); + spin_unlock(&lp->lp_lock); + wake_up(&lp->lp_dc_waitq); + + if (lp->lp_rtr_refcount > 0) + lnet_router_discovery_complete(lp); lnet_net_unlock(LNET_LOCK_EX); @@ -1809,15 +2186,18 @@ static void lnet_peer_discovery_complete(struct lnet_peer *lp) */ void lnet_peer_push_event(struct lnet_event *ev) { - struct lnet_ping_buffer *pbuf = ev->md.user_ptr; + struct lnet_ping_buffer *pbuf; struct lnet_peer *lp; + pbuf = LNET_PING_INFO_TO_BUFFER(ev->md_start + ev->offset); + /* lnet_find_peer() adds a refcount */ lp = lnet_find_peer(ev->source.nid); if (!lp) { CDEBUG(D_NET, "Push Put from unknown %s (source %s). 
Ignoring...\n", libcfs_nid2str(ev->initiator.nid), libcfs_nid2str(ev->source.nid)); + pbuf->pb_needs_post = true; return; } @@ -1869,26 +2249,24 @@ void lnet_peer_push_event(struct lnet_event *ev) } /* - * Check the MULTIRAIL flag. Complain if the peer was DLC - * configured without it. - */ - if (!(lp->lp_state & LNET_PEER_MULTI_RAIL)) { - if (lp->lp_state & LNET_PEER_CONFIGURED) { - CERROR("Push says %s is Multi-Rail, DLC says not\n", - libcfs_nid2str(lp->lp_primary_nid)); - } else { - lp->lp_state |= LNET_PEER_MULTI_RAIL; - lnet_peer_clr_non_mr_pref_nids(lp); - } - } - - /* * The peer may have discovery disabled at its end. Set * NO_DISCOVERY as appropriate. */ if (!(pbuf->pb_info.pi_features & LNET_PING_FEAT_DISCOVERY)) { CDEBUG(D_NET, "Peer %s has discovery disabled\n", libcfs_nid2str(lp->lp_primary_nid)); + /* + * Mark the peer for deletion if we already know about it + * and it's going from discovery set to no discovery set + */ + if (!(lp->lp_state & (LNET_PEER_NO_DISCOVERY | + LNET_PEER_DISCOVERING)) && + lp->lp_state & LNET_PEER_DISCOVERED) { + CDEBUG(D_NET, "Marking %s:0x%x for deletion\n", + libcfs_nid2str(lp->lp_primary_nid), + lp->lp_state); + lp->lp_state |= LNET_PEER_MARK_DELETION; + } lp->lp_state |= LNET_PEER_NO_DISCOVERY; } else if (lp->lp_state & LNET_PEER_NO_DISCOVERY) { CDEBUG(D_NET, "Peer %s has discovery enabled\n", @@ -1897,11 +2275,36 @@ void lnet_peer_push_event(struct lnet_event *ev) } /* + * Update the MULTI_RAIL flag based on the push. If the peer + * was configured with DLC then the setting should match what + * DLC put in. + * NB: We verified above that the MR feature bit is set in pi_features + */ + if (lp->lp_state & LNET_PEER_MULTI_RAIL) { + CDEBUG(D_NET, "peer %s(%p) is MR\n", + libcfs_nid2str(lp->lp_primary_nid), lp); + } else if (lp->lp_state & LNET_PEER_CONFIGURED) { + CWARN("Push says %s is Multi-Rail, DLC says not\n", + libcfs_nid2str(lp->lp_primary_nid)); + } else if (lnet_peer_discovery_disabled) { + CDEBUG(D_NET, "peer %s(%p) not MR: DD disabled locally\n", + libcfs_nid2str(lp->lp_primary_nid), lp); + } else if (lp->lp_state & LNET_PEER_NO_DISCOVERY) { + CDEBUG(D_NET, "peer %s(%p) not MR: DD disabled remotely\n", + libcfs_nid2str(lp->lp_primary_nid), lp); + } else { + CDEBUG(D_NET, "peer %s(%p) is MR capable\n", + libcfs_nid2str(lp->lp_primary_nid), lp); + lp->lp_state |= LNET_PEER_MULTI_RAIL; + lnet_peer_clr_non_mr_pref_nids(lp); + } + + /* * Check for truncation of the Put message. Clear the * NIDS_UPTODATE flag and set FORCE_PING to trigger a ping, * and tell discovery to allocate a bigger buffer. */ - if (pbuf->pb_nnis < pbuf->pb_info.pi_nnis) { + if (ev->mlength < ev->rlength) { if (the_lnet.ln_push_target_nnis < pbuf->pb_info.pi_nnis) the_lnet.ln_push_target_nnis = pbuf->pb_info.pi_nnis; lp->lp_state &= ~LNET_PEER_NIDS_UPTODATE; @@ -1912,38 +2315,9 @@ void lnet_peer_push_event(struct lnet_event *ev) goto out; } - /* - * Check whether the Put data is stale. Stale data can just be - * dropped. - */ - if (pbuf->pb_info.pi_nnis > 1 && - lp->lp_primary_nid == pbuf->pb_info.pi_ni[1].ns_nid && - LNET_PING_BUFFER_SEQNO(pbuf) < lp->lp_peer_seqno) { - CDEBUG(D_NET, "Stale Push from %s: got %u have %u\n", - libcfs_nid2str(lp->lp_primary_nid), - LNET_PING_BUFFER_SEQNO(pbuf), - lp->lp_peer_seqno); - goto out; - } - - /* - * Check whether the Put data is new, in which case we clear - * the UPTODATE flag and prepare to process it. 
- * - * If the Put data is current, and the peer is UPTODATE then - * we assome everything is all right and drop the data as - * stale. - */ - if (LNET_PING_BUFFER_SEQNO(pbuf) > lp->lp_peer_seqno) { - lp->lp_peer_seqno = LNET_PING_BUFFER_SEQNO(pbuf); - lp->lp_state &= ~LNET_PEER_NIDS_UPTODATE; - } else if (lp->lp_state & LNET_PEER_NIDS_UPTODATE) { - CDEBUG(D_NET, "Stale Push from %s: got %u have %u\n", - libcfs_nid2str(lp->lp_primary_nid), - LNET_PING_BUFFER_SEQNO(pbuf), - lp->lp_peer_seqno); - goto out; - } + /* always assume new data */ + lp->lp_peer_seqno = LNET_PING_BUFFER_SEQNO(pbuf); + lp->lp_state &= ~LNET_PEER_NIDS_UPTODATE; /* * If there is data present that hasn't been processed yet, @@ -1988,6 +2362,9 @@ void lnet_peer_push_event(struct lnet_event *ev) LNET_PING_BUFFER_SEQNO(pbuf)); out: + /* We've processed this buffer. It can be reposted */ + pbuf->pb_needs_post = true; + /* * Queue the peer for discovery if not done, force it on the request * queue and wake the discovery thread if the peer was already queued, @@ -2027,6 +2404,7 @@ lnet_discover_peer_locked(struct lnet_peer_ni *lpni, int cpt, bool block) DEFINE_WAIT(wait); struct lnet_peer *lp; int rc = 0; + int count = 0; again: lnet_net_unlock(cpt); @@ -2039,26 +2417,38 @@ again: * zombie if we race with DLC, so we must check for that. */ for (;;) { + /* Keep lp alive when the lnet_net_lock is unlocked */ + lnet_peer_addref_locked(lp); prepare_to_wait(&lp->lp_dc_waitq, &wait, TASK_INTERRUPTIBLE); if (signal_pending(current)) break; if (the_lnet.ln_dc_state != LNET_DC_STATE_RUNNING) break; + /* + * Don't repeat discovery if discovery is disabled. This is + * done to ensure we can use discovery as a standard ping as + * well for backwards compatibility with routers which do not + * have discovery or have discovery disabled + */ + if (lnet_is_discovery_disabled(lp) && count > 0) + break; if (lp->lp_dc_error) break; if (lnet_peer_is_uptodate(lp)) break; lnet_peer_queue_for_discovery(lp); + count++; + CDEBUG(D_NET, "Discovery attempt # %d\n", count); + /* - * if caller requested a non-blocking operation then - * return immediately. Once discovery is complete then the - * peer ref will be decremented and any pending messages - * that were stopped due to discovery will be transmitted. + * If caller requested a non-blocking operation then + * return immediately. Once discovery is complete any + * pending messages that were stopped due to discovery + * will be transmitted. */ if (!block) break; - lnet_peer_addref_locked(lp); lnet_net_unlock(LNET_LOCK_EX); schedule(); finish_wait(&lp->lp_dc_waitq, &wait); @@ -2071,11 +2461,13 @@ again: lnet_net_unlock(LNET_LOCK_EX); lnet_net_lock(cpt); - + lnet_peer_decref_locked(lp); /* - * If the peer has changed after we've discovered the older peer, - * then we need to discovery the new peer to make sure the - * interface information is up to date + * The peer may have changed, so re-check and rediscover if that turns + * out to have been the case. The reference count on lp ensured that + * even if it was unlinked from lpni the memory could not be recycled. + * Thus the check below is sufficient to determine whether the peer + * changed. If the peer changed, then lp must not be dereferenced. 
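+ * (If the peer did change we simply "goto again" below; lp is
+ * dereferenced again only once the comparison confirms it is still
+ * lpni's current peer.)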
*/ if (lp != lpni->lpni_peer_net->lpn_peer) goto again; @@ -2088,7 +2480,7 @@ again: rc = lp->lp_dc_error; else if (!block) CDEBUG(D_NET, "non-blocking discovery\n"); - else if (!lnet_peer_is_uptodate(lp)) + else if (!lnet_peer_is_uptodate(lp) && !lnet_is_discovery_disabled(lp)) goto again; CDEBUG(D_NET, "peer %s NID %s: %d. %s\n", @@ -2105,7 +2497,7 @@ lnet_discovery_event_ack(struct lnet_peer *lp, struct lnet_event *ev) { struct lnet_ping_buffer *pbuf; - pbuf = LNET_PING_INFO_TO_BUFFER(ev->md.start); + pbuf = LNET_PING_INFO_TO_BUFFER(ev->md_start); spin_lock(&lp->lp_lock); lp->lp_state &= ~LNET_PEER_PUSH_SENT; lp->lp_push_error = ev->status; @@ -2128,6 +2520,8 @@ lnet_discovery_event_reply(struct lnet_peer *lp, struct lnet_event *ev) spin_lock(&lp->lp_lock); + lp->lp_disc_src_nid = ev->target.nid; + /* * If some kind of error happened the contents of message * cannot be used. Set PING_FAILED to trigger a retry. @@ -2142,7 +2536,7 @@ lnet_discovery_event_reply(struct lnet_peer *lp, struct lnet_event *ev) goto out; } - pbuf = LNET_PING_INFO_TO_BUFFER(ev->md.start); + pbuf = LNET_PING_INFO_TO_BUFFER(ev->md_start); if (pbuf->pb_info.pi_magic == __swab32(LNET_PROTO_PING_MAGIC)) lnet_swap_pinginfo(pbuf); @@ -2159,6 +2553,22 @@ lnet_discovery_event_reply(struct lnet_peer *lp, struct lnet_event *ev) goto out; } + + /* + * The peer may have discovery disabled at its end. Set + * NO_DISCOVERY as appropriate. + */ + if ((pbuf->pb_info.pi_features & LNET_PING_FEAT_DISCOVERY) && + !lnet_peer_discovery_disabled) { + CDEBUG(D_NET, "Peer %s has discovery enabled\n", + libcfs_nid2str(lp->lp_primary_nid)); + lp->lp_state &= ~LNET_PEER_NO_DISCOVERY; + } else { + CDEBUG(D_NET, "Peer %s has discovery disabled\n", + libcfs_nid2str(lp->lp_primary_nid)); + lp->lp_state |= LNET_PEER_NO_DISCOVERY; + } + /* * Update the MULTI_RAIL flag based on the reply. If the peer * was configured with DLC then the setting should match what @@ -2166,11 +2576,22 @@ lnet_discovery_event_reply(struct lnet_peer *lp, struct lnet_event *ev) */ if (pbuf->pb_info.pi_features & LNET_PING_FEAT_MULTI_RAIL) { if (lp->lp_state & LNET_PEER_MULTI_RAIL) { - /* Everything's fine */ + CDEBUG(D_NET, "peer %s(%p) is MR\n", + libcfs_nid2str(lp->lp_primary_nid), lp); } else if (lp->lp_state & LNET_PEER_CONFIGURED) { CWARN("Reply says %s is Multi-Rail, DLC says not\n", libcfs_nid2str(lp->lp_primary_nid)); + } else if (lnet_peer_discovery_disabled) { + CDEBUG(D_NET, + "peer %s(%p) not MR: DD disabled locally\n", + libcfs_nid2str(lp->lp_primary_nid), lp); + } else if (lp->lp_state & LNET_PEER_NO_DISCOVERY) { + CDEBUG(D_NET, + "peer %s(%p) not MR: DD disabled remotely\n", + libcfs_nid2str(lp->lp_primary_nid), lp); } else { + CDEBUG(D_NET, "peer %s(%p) is MR capable\n", + libcfs_nid2str(lp->lp_primary_nid), lp); lp->lp_state |= LNET_PEER_MULTI_RAIL; lnet_peer_clr_non_mr_pref_nids(lp); } @@ -2193,20 +2614,6 @@ lnet_discovery_event_reply(struct lnet_peer *lp, struct lnet_event *ev) lp->lp_data_nnis = pbuf->pb_info.pi_nnis; /* - * The peer may have discovery disabled at its end. Set - * NO_DISCOVERY as appropriate. - */ - if (!(pbuf->pb_info.pi_features & LNET_PING_FEAT_DISCOVERY)) { - CDEBUG(D_NET, "Peer %s has discovery disabled\n", - libcfs_nid2str(lp->lp_primary_nid)); - lp->lp_state |= LNET_PEER_NO_DISCOVERY; - } else if (lp->lp_state & LNET_PEER_NO_DISCOVERY) { - CDEBUG(D_NET, "Peer %s has discovery enabled\n", - libcfs_nid2str(lp->lp_primary_nid)); - lp->lp_state &= ~LNET_PEER_NO_DISCOVERY; - } - - /* * Check for truncation of the Reply. 
Clear PING_SENT and set * PING_FAILED to trigger a retry. */ @@ -2228,21 +2635,18 @@ lnet_discovery_event_reply(struct lnet_peer *lp, struct lnet_event *ev) if (pbuf->pb_info.pi_features & LNET_PING_FEAT_MULTI_RAIL && pbuf->pb_info.pi_nnis > 1 && lp->lp_primary_nid == pbuf->pb_info.pi_ni[1].ns_nid) { - if (LNET_PING_BUFFER_SEQNO(pbuf) < lp->lp_peer_seqno) { - CDEBUG(D_NET, "Stale Reply from %s: got %u have %u\n", + if (LNET_PING_BUFFER_SEQNO(pbuf) < lp->lp_peer_seqno) + CDEBUG(D_NET, "peer %s: seq# got %u have %u. peer rebooted?\n", libcfs_nid2str(lp->lp_primary_nid), LNET_PING_BUFFER_SEQNO(pbuf), lp->lp_peer_seqno); - goto out; - } - if (LNET_PING_BUFFER_SEQNO(pbuf) > lp->lp_peer_seqno) - lp->lp_peer_seqno = LNET_PING_BUFFER_SEQNO(pbuf); + lp->lp_peer_seqno = LNET_PING_BUFFER_SEQNO(pbuf); } /* We're happy with the state of the data in the buffer. */ - CDEBUG(D_NET, "peer %s data present %u\n", - libcfs_nid2str(lp->lp_primary_nid), lp->lp_peer_seqno); + CDEBUG(D_NET, "peer %s data present %u. state = 0x%x\n", + libcfs_nid2str(lp->lp_primary_nid), lp->lp_peer_seqno, lp->lp_state); if (lp->lp_state & LNET_PEER_DATA_PRESENT) lnet_ping_buffer_decref(lp->lp_data); else @@ -2252,6 +2656,15 @@ lnet_discovery_event_reply(struct lnet_peer *lp, struct lnet_event *ev) out: lp->lp_state &= ~LNET_PEER_PING_SENT; spin_unlock(&lp->lp_lock); + + lnet_net_lock(LNET_LOCK_EX); + /* + * If this peer is a gateway, call the routing callback to + * handle the ping reply + */ + if (lp->lp_rtr_refcount > 0) + lnet_router_discovery_ping_reply(lp); + lnet_net_unlock(LNET_LOCK_EX); } /* @@ -2324,7 +2737,7 @@ lnet_discovery_event_unlink(struct lnet_peer *lp, struct lnet_event *ev) */ static void lnet_discovery_event_handler(struct lnet_event *event) { - struct lnet_peer *lp = event->md.user_ptr; + struct lnet_peer *lp = event->md_user_ptr; struct lnet_ping_buffer *pbuf; int rc; @@ -2354,7 +2767,7 @@ static void lnet_discovery_event_handler(struct lnet_event *event) } lnet_net_lock(LNET_LOCK_EX); if (event->unlinked) { - pbuf = LNET_PING_INFO_TO_BUFFER(event->md.start); + pbuf = LNET_PING_INFO_TO_BUFFER(event->md_start); lnet_ping_buffer_decref(pbuf); lnet_peer_decref_locked(lp); } @@ -2389,9 +2802,10 @@ static void lnet_discovery_event_handler(struct lnet_event *event) static int lnet_peer_merge_data(struct lnet_peer *lp, struct lnet_ping_buffer *pbuf) { + struct lnet_peer_net *lpn; struct lnet_peer_ni *lpni; lnet_nid_t *curnis = NULL; - lnet_nid_t *addnis = NULL; + struct lnet_ni_status *addnis = NULL; lnet_nid_t *delnis = NULL; unsigned flags; int ncurnis; @@ -2406,10 +2820,21 @@ static int lnet_peer_merge_data(struct lnet_peer *lp, if (pbuf->pb_info.pi_features & LNET_PING_FEAT_MULTI_RAIL) flags |= LNET_PEER_MULTI_RAIL; - nnis = MAX(lp->lp_nnis, pbuf->pb_info.pi_nnis); - LIBCFS_ALLOC(curnis, nnis * sizeof(lnet_nid_t)); - LIBCFS_ALLOC(addnis, nnis * sizeof(lnet_nid_t)); - LIBCFS_ALLOC(delnis, nnis * sizeof(lnet_nid_t)); + /* + * Cache the routing feature for the peer; whether it is enabled + * for disabled as reported by the remote peer. 
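+ * (LNET_PEER_ROUTER_ENABLED is set below when the remote peer did not
+ * advertise LNET_PING_FEAT_RTE_DISABLED in its ping feature bits.)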
+ */ + spin_lock(&lp->lp_lock); + if (!(pbuf->pb_info.pi_features & LNET_PING_FEAT_RTE_DISABLED)) + lp->lp_state |= LNET_PEER_ROUTER_ENABLED; + else + lp->lp_state &= ~LNET_PEER_ROUTER_ENABLED; + spin_unlock(&lp->lp_lock); + + nnis = max_t(int, lp->lp_nnis, pbuf->pb_info.pi_nnis); + CFS_ALLOC_PTR_ARRAY(curnis, nnis); + CFS_ALLOC_PTR_ARRAY(addnis, nnis); + CFS_ALLOC_PTR_ARRAY(delnis, nnis); if (!curnis || !addnis || !delnis) { rc = -ENOMEM; goto out; @@ -2432,7 +2857,7 @@ static int lnet_peer_merge_data(struct lnet_peer *lp, if (pbuf->pb_info.pi_ni[i].ns_nid == curnis[j]) break; if (j == ncurnis) - addnis[naddnis++] = pbuf->pb_info.pi_ni[i].ns_nid; + addnis[naddnis++] = pbuf->pb_info.pi_ni[i]; } /* * Check for NIDs in curnis[] not present in pbuf. @@ -2442,26 +2867,61 @@ static int lnet_peer_merge_data(struct lnet_peer *lp, * present in curnis[] then this peer is for this node. */ for (i = 0; i < ncurnis; i++) { - if (LNET_NETTYP(LNET_NIDNET(curnis[i])) == LOLND) + if (curnis[i] == LNET_NID_LO_0) continue; - for (j = 1; j < pbuf->pb_info.pi_nnis; j++) - if (curnis[i] == pbuf->pb_info.pi_ni[j].ns_nid) + for (j = 1; j < pbuf->pb_info.pi_nnis; j++) { + if (curnis[i] == pbuf->pb_info.pi_ni[j].ns_nid) { + /* + * update the information we cache for the + * peer with the latest information we + * received + */ + lpni = lnet_find_peer_ni_locked(curnis[i]); + if (lpni) { + lpni->lpni_ns_status = pbuf->pb_info.pi_ni[j].ns_status; + lnet_peer_ni_decref_locked(lpni); + } break; + } + } if (j == pbuf->pb_info.pi_nnis) delnis[ndelnis++] = curnis[i]; } + /* + * If we get here and the discovery is disabled then we don't want + * to add or delete any NIs. We just updated the ones we have some + * information on, and call it a day + */ + rc = 0; + if (lnet_is_discovery_disabled(lp)) + goto out; + for (i = 0; i < naddnis; i++) { - rc = lnet_peer_add_nid(lp, addnis[i], flags); + rc = lnet_peer_add_nid(lp, addnis[i].ns_nid, flags); if (rc) { CERROR("Error adding NID %s to peer %s: %d\n", - libcfs_nid2str(addnis[i]), + libcfs_nid2str(addnis[i].ns_nid), libcfs_nid2str(lp->lp_primary_nid), rc); if (rc == -ENOMEM) goto out; } + lpni = lnet_find_peer_ni_locked(addnis[i].ns_nid); + if (lpni) { + lpni->lpni_ns_status = addnis[i].ns_status; + lnet_peer_ni_decref_locked(lpni); + } } + for (i = 0; i < ndelnis; i++) { + /* + * for routers it's okay to delete the primary_nid because + * the upper layers don't really rely on it. So if we're + * being told that the router changed its primary_nid + * then it's okay to delete it. + */ + if (lp->lp_rtr_refcount > 0) + flags |= LNET_PEER_RTR_NI_FORCE_DEL; rc = lnet_peer_del_nid(lp, delnis[i], flags); if (rc) { CERROR("Error deleting NID %s from peer %s: %d\n", @@ -2471,6 +2931,28 @@ static int lnet_peer_merge_data(struct lnet_peer *lp, goto out; } } + + /* The peer net for the primary NID should be the first entry in the + * peer's lp_peer_nets list, and the peer NI for the primary NID should + * be the first entry in its peer net's lpn_peer_nis list. 
+ */ + lpni = lnet_find_peer_ni_locked(pbuf->pb_info.pi_ni[1].ns_nid); + if (!lpni) { + CERROR("Internal error: Failed to lookup peer NI for primary NID: %s\n", + libcfs_nid2str(pbuf->pb_info.pi_ni[1].ns_nid)); + goto out; + } + + lnet_peer_ni_decref_locked(lpni); + + lpn = lpni->lpni_peer_net; + if (lpn->lpn_peer_nets.prev != &lp->lp_peer_nets) + list_move(&lpn->lpn_peer_nets, &lp->lp_peer_nets); + + if (lpni->lpni_peer_nis.prev != &lpni->lpni_peer_net->lpn_peer_nis) + list_move(&lpni->lpni_peer_nis, + &lpni->lpni_peer_net->lpn_peer_nis); + /* * Errors other than -ENOMEM are due to peers having been * configured with DLC. Ignore these because DLC overrides @@ -2478,11 +2960,11 @@ static int lnet_peer_merge_data(struct lnet_peer *lp, */ rc = 0; out: - LIBCFS_FREE(curnis, nnis * sizeof(lnet_nid_t)); - LIBCFS_FREE(addnis, nnis * sizeof(lnet_nid_t)); - LIBCFS_FREE(delnis, nnis * sizeof(lnet_nid_t)); + CFS_FREE_PTR_ARRAY(curnis, nnis); + CFS_FREE_PTR_ARRAY(addnis, nnis); + CFS_FREE_PTR_ARRAY(delnis, nnis); lnet_ping_buffer_decref(pbuf); - CDEBUG(D_NET, "peer %s: %d\n", libcfs_nid2str(lp->lp_primary_nid), rc); + CDEBUG(D_NET, "peer %s (%p): %d\n", libcfs_nid2str(lp->lp_primary_nid), lp, rc); if (rc) { spin_lock(&lp->lp_lock); @@ -2555,6 +3037,84 @@ lnet_peer_set_primary_data(struct lnet_peer *lp, struct lnet_ping_buffer *pbuf) return 0; } +static bool lnet_is_nid_in_ping_info(lnet_nid_t nid, struct lnet_ping_info *pinfo) +{ + int i; + + for (i = 0; i < pinfo->pi_nnis; i++) { + if (pinfo->pi_ni[i].ns_nid == nid) + return true; + } + + return false; +} + +/* Delete a peer that has been marked for deletion. NB: when this peer was added + * to the discovery queue a reference was taken that will prevent the peer from + * actually being freed by this function. After this function exits the + * discovery thread should call lnet_peer_discovery_complete() which will + * drop that reference as well as wake any waiters that may also be holding a + * ref on the peer + */ +static int lnet_peer_deletion(struct lnet_peer *lp) +__must_hold(&lp->lp_lock) +{ + struct list_head rlist; + struct lnet_route *route, *tmp; + int sensitivity = lp->lp_health_sensitivity; + + INIT_LIST_HEAD(&rlist); + + lp->lp_state &= ~(LNET_PEER_DISCOVERING | LNET_PEER_FORCE_PING | + LNET_PEER_FORCE_PUSH); + CDEBUG(D_NET, "peer %s(%p) state %#x\n", + libcfs_nid2str(lp->lp_primary_nid), lp, lp->lp_state); + + /* no-op if lnet_peer_del() has already been called on this peer */ + if (lp->lp_state & LNET_PEER_MARK_DELETED) + return 0; + + if (the_lnet.ln_dc_state != LNET_DC_STATE_RUNNING) + return -ESHUTDOWN; + + spin_unlock(&lp->lp_lock); + + mutex_lock(&the_lnet.ln_api_mutex); + + lnet_net_lock(LNET_LOCK_EX); + /* remove the peer from the discovery work + * queue if it's on there in preparation + * of deleting it. + */ + if (!list_empty(&lp->lp_dc_list)) + list_del(&lp->lp_dc_list); + list_for_each_entry_safe(route, tmp, + &lp->lp_routes, + lr_gwlist) + lnet_move_route(route, NULL, &rlist); + lnet_net_unlock(LNET_LOCK_EX); + + /* lnet_peer_del() deletes all the peer NIs owned by this peer */ + lnet_peer_del(lp); + + list_for_each_entry_safe(route, tmp, + &rlist, lr_list) { + /* re-add these routes */ + lnet_add_route(route->lr_net, + route->lr_hops, + route->lr_nid, + route->lr_priority, + sensitivity); + LIBCFS_FREE(route, sizeof(*route)); + } + + mutex_unlock(&the_lnet.ln_api_mutex); + + spin_lock(&lp->lp_lock); + + return 0; +} + /* * Update a peer using the data received. 
*/ @@ -2618,15 +3178,25 @@ __must_hold(&lp->lp_lock) if (pbuf->pb_info.pi_nnis <= 1) goto out; nid = pbuf->pb_info.pi_ni[1].ns_nid; - if (LNET_NETTYP(LNET_NIDNET(lp->lp_primary_nid)) == LOLND) { + if (lp->lp_primary_nid == LNET_NID_LO_0) { rc = lnet_peer_set_primary_nid(lp, nid, flags); if (!rc) rc = lnet_peer_merge_data(lp, pbuf); - } else if (lp->lp_primary_nid == nid) { + /* + * if the primary nid of the peer is present in the ping info returned + * from the peer, but it's not the local primary peer we have + * cached and discovery is disabled, then we don't want to update + * our local peer info, by adding or removing NIDs, we just want + * to update the status of the nids that we currently have + * recorded in that peer. + */ + } else if (lp->lp_primary_nid == nid || + (lnet_is_nid_in_ping_info(lp->lp_primary_nid, &pbuf->pb_info) && + lnet_is_discovery_disabled(lp))) { rc = lnet_peer_merge_data(lp, pbuf); } else { lpni = lnet_find_peer_ni_locked(nid); - if (!lpni) { + if (!lpni || lp == lpni->lpni_peer_net->lpn_peer) { rc = lnet_peer_set_primary_nid(lp, nid, flags); if (rc) { CERROR("Primary NID error %s versus %s: %d\n", @@ -2635,14 +3205,41 @@ __must_hold(&lp->lp_lock) } else { rc = lnet_peer_merge_data(lp, pbuf); } + if (lpni) + lnet_peer_ni_decref_locked(lpni); } else { - rc = lnet_peer_set_primary_data( - lpni->lpni_peer_net->lpn_peer, pbuf); + struct lnet_peer *new_lp; + new_lp = lpni->lpni_peer_net->lpn_peer; + /* + * if lp has discovery/MR enabled that means new_lp + * should have discovery/MR enabled as well, since + * it's the same peer, which we're about to merge + */ + spin_lock(&lp->lp_lock); + spin_lock(&new_lp->lp_lock); + if (!(lp->lp_state & LNET_PEER_NO_DISCOVERY)) + new_lp->lp_state &= ~LNET_PEER_NO_DISCOVERY; + if (lp->lp_state & LNET_PEER_MULTI_RAIL) + new_lp->lp_state |= LNET_PEER_MULTI_RAIL; + /* If we're processing a ping reply then we may be + * about to send a push to the peer that we ping'd. + * Since the ping reply that we're processing was + * received by lp, we need to set the discovery source + * NID for new_lp to the NID stored in lp. + */ + if (lp->lp_disc_src_nid != LNET_NID_ANY) + new_lp->lp_disc_src_nid = lp->lp_disc_src_nid; + spin_unlock(&new_lp->lp_lock); + spin_unlock(&lp->lp_lock); + + rc = lnet_peer_set_primary_data(new_lp, pbuf); + lnet_consolidate_routes_locked(lp, new_lp); lnet_peer_ni_decref_locked(lpni); } } out: - CDEBUG(D_NET, "peer %s: %d\n", libcfs_nid2str(lp->lp_primary_nid), rc); + CDEBUG(D_NET, "peer %s(%p): %d. state = 0x%x\n", libcfs_nid2str(lp->lp_primary_nid), lp, rc, + lp->lp_state); mutex_unlock(&the_lnet.ln_api_mutex); spin_lock(&lp->lp_lock); @@ -2691,8 +3288,6 @@ static lnet_nid_t lnet_peer_select_nid(struct lnet_peer *lp) /* Look for a direct-connected NID for this peer. */ lpni = NULL; while ((lpni = lnet_get_next_peer_ni_locked(lp, NULL, lpni)) != NULL) { - if (!lnet_is_peer_ni_healthy_locked(lpni)) - continue; if (!lnet_get_net_locked(lpni->lpni_peer_net->lpn_net_id)) continue; break; @@ -2703,8 +3298,6 @@ static lnet_nid_t lnet_peer_select_nid(struct lnet_peer *lp) /* Look for a routed-connected NID for this peer. 
*/ lpni = NULL; while ((lpni = lnet_get_next_peer_ni_locked(lp, NULL, lpni)) != NULL) { - if (!lnet_is_peer_ni_healthy_locked(lpni)) - continue; if (!lnet_find_rnet_locked(lpni->lpni_peer_net->lpn_net_id)) continue; break; @@ -2719,9 +3312,7 @@ static lnet_nid_t lnet_peer_select_nid(struct lnet_peer *lp) static int lnet_peer_send_ping(struct lnet_peer *lp) __must_hold(&lp->lp_lock) { - struct lnet_md md = { NULL }; - struct lnet_process_id id; - struct lnet_ping_buffer *pbuf; + lnet_nid_t pnid; int nnis; int rc; int cpt; @@ -2730,55 +3321,37 @@ __must_hold(&lp->lp_lock) lp->lp_state &= ~LNET_PEER_FORCE_PING; spin_unlock(&lp->lp_lock); - nnis = MAX(lp->lp_data_nnis, LNET_INTERFACES_MIN); - pbuf = lnet_ping_buffer_alloc(nnis, GFP_NOFS); - if (!pbuf) { - rc = -ENOMEM; - goto fail_error; - } - - /* initialize md content */ - md.start = &pbuf->pb_info; - md.length = LNET_PING_INFO_SIZE(nnis); - md.threshold = 2; /* GET/REPLY */ - md.max_size = 0; - md.options = LNET_MD_TRUNCATE; - md.user_ptr = lp; - md.eq_handle = the_lnet.ln_dc_eqh; - - rc = LNetMDBind(md, LNET_UNLINK, &lp->lp_ping_mdh); - if (rc != 0) { - lnet_ping_buffer_decref(pbuf); - CERROR("Can't bind MD: %d\n", rc); - goto fail_error; - } cpt = lnet_net_lock_current(); /* Refcount for MD. */ lnet_peer_addref_locked(lp); - id.pid = LNET_PID_LUSTRE; - id.nid = lnet_peer_select_nid(lp); + pnid = lnet_peer_select_nid(lp); lnet_net_unlock(cpt); - if (id.nid == LNET_NID_ANY) { - rc = -EHOSTUNREACH; - goto fail_unlink_md; - } + nnis = max(lp->lp_data_nnis, LNET_INTERFACES_MIN); - rc = LNetGet(LNET_NID_ANY, lp->lp_ping_mdh, id, - LNET_RESERVED_PORTAL, - LNET_PROTO_PING_MATCHBITS, 0); + rc = lnet_send_ping(pnid, &lp->lp_ping_mdh, nnis, lp, + the_lnet.ln_dc_handler, false); - if (rc) - goto fail_unlink_md; + /* + * if LNetMDBind in lnet_send_ping fails we need to decrement the + * refcount on the peer, otherwise LNetMDUnlink will be called + * which will eventually do that. + */ + if (rc > 0) { + lnet_net_lock(cpt); + lnet_peer_decref_locked(lp); + lnet_net_unlock(cpt); + rc = -rc; /* change the rc to negative value */ + goto fail_error; + } else if (rc < 0) { + goto fail_error; + } CDEBUG(D_NET, "peer %s\n", libcfs_nid2str(lp->lp_primary_nid)); spin_lock(&lp->lp_lock); return 0; -fail_unlink_md: - LNetMDUnlink(lp->lp_ping_mdh); - LNetInvalidateMDHandle(&lp->lp_ping_mdh); fail_error: CDEBUG(D_NET, "peer %s: %d\n", libcfs_nid2str(lp->lp_primary_nid), rc); /* @@ -2818,6 +3391,23 @@ __must_hold(&lp->lp_lock) return rc ? rc : LNET_REDISCOVER_PEER; } +/* + * Mark the peer as discovered. + */ +static int lnet_peer_discovered(struct lnet_peer *lp) +__must_hold(&lp->lp_lock) +{ + lp->lp_state |= LNET_PEER_DISCOVERED; + lp->lp_state &= ~(LNET_PEER_DISCOVERING | + LNET_PEER_REDISCOVER); + + lp->lp_dc_error = 0; + + CDEBUG(D_NET, "peer %s\n", libcfs_nid2str(lp->lp_primary_nid)); + + return 0; +} + /* Active side of push. */ static int lnet_peer_send_push(struct lnet_peer *lp) __must_hold(&lp->lp_lock) @@ -2831,6 +3421,12 @@ __must_hold(&lp->lp_lock) /* Don't push to a non-multi-rail peer. 
*/ if (!(lp->lp_state & LNET_PEER_MULTI_RAIL)) { lp->lp_state &= ~LNET_PEER_FORCE_PUSH; + /* if peer's NIDs are uptodate then peer is discovered */ + if (lp->lp_state & LNET_PEER_NIDS_UPTODATE) { + rc = lnet_peer_discovered(lp); + return rc; + } + return 0; } @@ -2848,11 +3444,11 @@ __must_hold(&lp->lp_lock) md.length = LNET_PING_INFO_SIZE(pbuf->pb_nnis); md.threshold = 2; /* Put/Ack */ md.max_size = 0; - md.options = 0; - md.eq_handle = the_lnet.ln_dc_eqh; + md.options = LNET_MD_TRACK_RESPONSE; + md.handler = the_lnet.ln_dc_handler; md.user_ptr = lp; - rc = LNetMDBind(md, LNET_UNLINK, &lp->lp_push_mdh); + rc = LNetMDBind(&md, LNET_UNLINK, &lp->lp_push_mdh); if (rc) { lnet_ping_buffer_decref(pbuf); CERROR("Can't bind push source MD: %d\n", rc); @@ -2870,10 +3466,18 @@ __must_hold(&lp->lp_lock) goto fail_unlink; } - rc = LNetPut(LNET_NID_ANY, lp->lp_push_mdh, + rc = LNetPut(lp->lp_disc_src_nid, lp->lp_push_mdh, LNET_ACK_REQ, id, LNET_RESERVED_PORTAL, LNET_PROTO_PING_MATCHBITS, 0, 0); + /* + * reset the discovery nid. There is no need to restrict sending + * from that source, if we call lnet_push_update_to_peers(). It'll + * get set to a specific NID, if we initiate discovery from the + * scratch + */ + lp->lp_disc_src_nid = LNET_NID_ANY; + if (rc) goto fail_unlink; @@ -2886,7 +3490,7 @@ fail_unlink: LNetMDUnlink(lp->lp_push_mdh); LNetInvalidateMDHandle(&lp->lp_push_mdh); fail_error: - CDEBUG(D_NET, "peer %s: %d\n", libcfs_nid2str(lp->lp_primary_nid), rc); + CDEBUG(D_NET, "peer %s(%p): %d\n", libcfs_nid2str(lp->lp_primary_nid), lp, rc); /* * The errors that get us here are considered hard errors and * cause Discovery to terminate. So we clear PUSH_SENT, but do @@ -2916,85 +3520,6 @@ static void lnet_peer_discovery_error(struct lnet_peer *lp, int error) } /* - * Mark the peer as discovered. - */ -static int lnet_peer_discovered(struct lnet_peer *lp) -__must_hold(&lp->lp_lock) -{ - lp->lp_state |= LNET_PEER_DISCOVERED; - lp->lp_state &= ~(LNET_PEER_DISCOVERING | - LNET_PEER_REDISCOVER); - - CDEBUG(D_NET, "peer %s\n", libcfs_nid2str(lp->lp_primary_nid)); - - return 0; -} - -/* - * Mark the peer as to be rediscovered. - */ -static int lnet_peer_rediscover(struct lnet_peer *lp) -__must_hold(&lp->lp_lock) -{ - lp->lp_state |= LNET_PEER_REDISCOVER; - lp->lp_state &= ~LNET_PEER_DISCOVERING; - - CDEBUG(D_NET, "peer %s\n", libcfs_nid2str(lp->lp_primary_nid)); - - return 0; -} - -/* - * Returns the first peer on the ln_dc_working queue if its timeout - * has expired. Takes the current time as an argument so as to not - * obsessively re-check the clock. The oldest discovery request will - * be at the head of the queue. - */ -static struct lnet_peer *lnet_peer_get_dc_timed_out(time64_t now) -{ - struct lnet_peer *lp; - - if (list_empty(&the_lnet.ln_dc_working)) - return NULL; - lp = list_first_entry(&the_lnet.ln_dc_working, - struct lnet_peer, lp_dc_list); - if (now < lp->lp_last_queued + lnet_transaction_timeout) - return NULL; - return lp; -} - -/* - * Discovering this peer is taking too long. Cancel any Ping or Push - * that discovery is waiting on by unlinking the relevant MDs. The - * lnet_discovery_event_handler() will proceed from here and complete - * the cleanup. 
- */ -static void lnet_peer_cancel_discovery(struct lnet_peer *lp) -{ - struct lnet_handle_md ping_mdh; - struct lnet_handle_md push_mdh; - - LNetInvalidateMDHandle(&ping_mdh); - LNetInvalidateMDHandle(&push_mdh); - - spin_lock(&lp->lp_lock); - if (lp->lp_state & LNET_PEER_PING_SENT) { - ping_mdh = lp->lp_ping_mdh; - LNetInvalidateMDHandle(&lp->lp_ping_mdh); - } - if (lp->lp_state & LNET_PEER_PUSH_SENT) { - push_mdh = lp->lp_push_mdh; - LNetInvalidateMDHandle(&lp->lp_push_mdh); - } - spin_unlock(&lp->lp_lock); - - if (!LNetMDHandleIsInvalid(ping_mdh)) - LNetMDUnlink(ping_mdh); - if (!LNetMDHandleIsInvalid(push_mdh)) - LNetMDUnlink(push_mdh); -} - -/* * Wait for work to be queued or some other change that must be * attended to. Returns non-zero if the discovery thread should shut * down. @@ -3012,14 +3537,13 @@ static int lnet_peer_discovery_wait_for_work(void) TASK_INTERRUPTIBLE); if (the_lnet.ln_dc_state == LNET_DC_STATE_STOPPING) break; - if (lnet_push_target_resize_needed()) + if (lnet_push_target_resize_needed() || + the_lnet.ln_push_target->pb_needs_post) break; if (!list_empty(&the_lnet.ln_dc_request)) break; if (!list_empty(&the_lnet.ln_msg_resend)) break; - if (lnet_peer_get_dc_timed_out(ktime_get_real_seconds())) - break; lnet_net_unlock(cpt); /* @@ -3063,11 +3587,9 @@ static int lnet_peer_discovery_wait_for_work(void) static void lnet_resend_msgs(void) { struct lnet_msg *msg, *tmp; - struct list_head resend; + LIST_HEAD(resend); int rc; - INIT_LIST_HEAD(&resend); - spin_lock(&the_lnet.ln_msg_resend_lock); list_splice(&the_lnet.ln_msg_resend, &resend); spin_unlock(&the_lnet.ln_msg_resend_lock); @@ -3089,24 +3611,29 @@ static void lnet_resend_msgs(void) static int lnet_peer_discovery(void *arg) { struct lnet_peer *lp; - time64_t now; int rc; + wait_for_completion(&the_lnet.ln_started); + CDEBUG(D_NET, "started\n"); - cfs_block_allsigs(); for (;;) { if (lnet_peer_discovery_wait_for_work()) break; - lnet_resend_msgs(); - if (lnet_push_target_resize_needed()) lnet_push_target_resize(); + else if (the_lnet.ln_push_target->pb_needs_post) + lnet_push_target_post(the_lnet.ln_push_target, + &the_lnet.ln_push_target_md); + + lnet_resend_msgs(); lnet_net_lock(LNET_LOCK_EX); - if (the_lnet.ln_dc_state == LNET_DC_STATE_STOPPING) + if (the_lnet.ln_dc_state == LNET_DC_STATE_STOPPING) { + lnet_net_unlock(LNET_LOCK_EX); break; + } /* * Process all incoming discovery work requests. When @@ -3126,13 +3653,16 @@ static int lnet_peer_discovery(void *arg) * forever, in case the GET message (for ping) * doesn't get a REPLY or the PUT message (for * push) doesn't get an ACK. - * - * TODO: LNet Health will deal with this scenario - * in a generic way. */ lp->lp_last_queued = ktime_get_real_seconds(); lnet_net_unlock(LNET_LOCK_EX); + if (lnet_push_target_resize_needed()) + lnet_push_target_resize(); + else if (the_lnet.ln_push_target->pb_needs_post) + lnet_push_target_post(the_lnet.ln_push_target, + &the_lnet.ln_push_target_md); + /* * Select an action depending on the state of * the peer and whether discovery is disabled. @@ -3142,10 +3672,13 @@ static int lnet_peer_discovery(void *arg) * forcing a Ping or Push. 
*/ spin_lock(&lp->lp_lock); - CDEBUG(D_NET, "peer %s state %#x\n", - libcfs_nid2str(lp->lp_primary_nid), + CDEBUG(D_NET, "peer %s(%p) state %#x\n", + libcfs_nid2str(lp->lp_primary_nid), lp, lp->lp_state); - if (lp->lp_state & LNET_PEER_DATA_PRESENT) + if (lp->lp_state & (LNET_PEER_MARK_DELETION | + LNET_PEER_MARK_DELETED)) + rc = lnet_peer_deletion(lp); + else if (lp->lp_state & LNET_PEER_DATA_PRESENT) rc = lnet_peer_data_present(lp); else if (lp->lp_state & LNET_PEER_PING_FAILED) rc = lnet_peer_ping_failed(lp); @@ -3155,16 +3688,14 @@ static int lnet_peer_discovery(void *arg) rc = lnet_peer_send_ping(lp); else if (lp->lp_state & LNET_PEER_FORCE_PUSH) rc = lnet_peer_send_push(lp); - else if (lnet_peer_discovery_disabled) - rc = lnet_peer_rediscover(lp); else if (!(lp->lp_state & LNET_PEER_NIDS_UPTODATE)) rc = lnet_peer_send_ping(lp); else if (lnet_peer_needs_push(lp)) rc = lnet_peer_send_push(lp); else rc = lnet_peer_discovered(lp); - CDEBUG(D_NET, "peer %s state %#x rc %d\n", - libcfs_nid2str(lp->lp_primary_nid), + CDEBUG(D_NET, "peer %s(%p) state %#x rc %d\n", + libcfs_nid2str(lp->lp_primary_nid), lp, lp->lp_state, rc); spin_unlock(&lp->lp_lock); @@ -3179,23 +3710,7 @@ static int lnet_peer_discovery(void *arg) lnet_peer_discovery_complete(lp); if (the_lnet.ln_dc_state == LNET_DC_STATE_STOPPING) break; - } - /* - * Now that the ln_dc_request queue has been emptied - * check the ln_dc_working queue for peers that are - * taking too long. Move all that are found to the - * ln_dc_expired queue and time out any pending - * Ping or Push. We have to drop the lnet_net_lock - * in the loop because lnet_peer_cancel_discovery() - * calls LNetMDUnlink(). - */ - now = ktime_get_real_seconds(); - while ((lp = lnet_peer_get_dc_timed_out(now)) != NULL) { - list_move(&lp->lp_dc_list, &the_lnet.ln_dc_expired); - lnet_net_unlock(LNET_LOCK_EX); - lnet_peer_cancel_discovery(lp); - lnet_net_lock(LNET_LOCK_EX); } lnet_net_unlock(LNET_LOCK_EX); @@ -3208,8 +3723,6 @@ static int lnet_peer_discovery(void *arg) * size of the thundering herd if there are multiple threads * waiting on discovery of a single peer. */ - LNetEQFree(the_lnet.ln_dc_eqh); - LNetInvalidateEQHandle(&the_lnet.ln_dc_eqh); /* Queue cleanup 1: stop all pending pings and pushes. */ lnet_net_lock(LNET_LOCK_EX); @@ -3225,7 +3738,7 @@ static int lnet_peer_discovery(void *arg) /* Queue cleanup 2: wait for the expired queue to clear. */ while (!list_empty(&the_lnet.ln_dc_expired)) - schedule_timeout(cfs_time_seconds(1)); + schedule_timeout_uninterruptible(cfs_time_seconds(1)); /* Queue cleanup 3: clear the request queue. 
*/ lnet_net_lock(LNET_LOCK_EX); @@ -3237,6 +3750,9 @@ static int lnet_peer_discovery(void *arg) } lnet_net_unlock(LNET_LOCK_EX); + lnet_assert_handler_unused(the_lnet.ln_dc_handler); + the_lnet.ln_dc_handler = NULL; + the_lnet.ln_dc_state = LNET_DC_STATE_SHUTDOWN; wake_up(&the_lnet.ln_dc_waitq); @@ -3249,25 +3765,19 @@ static int lnet_peer_discovery(void *arg) int lnet_peer_discovery_start(void) { struct task_struct *task; - int rc; + int rc = 0; if (the_lnet.ln_dc_state != LNET_DC_STATE_SHUTDOWN) return -EALREADY; - rc = LNetEQAlloc(0, lnet_discovery_event_handler, &the_lnet.ln_dc_eqh); - if (rc != 0) { - CERROR("Can't allocate discovery EQ: %d\n", rc); - return rc; - } - + the_lnet.ln_dc_handler = lnet_discovery_event_handler; the_lnet.ln_dc_state = LNET_DC_STATE_RUNNING; task = kthread_run(lnet_peer_discovery, NULL, "lnet_discovery"); if (IS_ERR(task)) { rc = PTR_ERR(task); CERROR("Can't start peer discovery thread: %d\n", rc); - LNetEQFree(the_lnet.ln_dc_eqh); - LNetInvalidateEQHandle(&the_lnet.ln_dc_eqh); + the_lnet.ln_dc_handler = NULL; the_lnet.ln_dc_state = LNET_DC_STATE_SHUTDOWN; } @@ -3285,7 +3795,14 @@ void lnet_peer_discovery_stop(void) LASSERT(the_lnet.ln_dc_state == LNET_DC_STATE_RUNNING); the_lnet.ln_dc_state = LNET_DC_STATE_STOPPING; - wake_up(&the_lnet.ln_dc_waitq); + + /* In the LNetNIInit() path we may be stopping discovery before it + * entered its work loop + */ + if (!completion_done(&the_lnet.ln_started)) + complete(&the_lnet.ln_started); + else + wake_up(&the_lnet.ln_dc_waitq); wait_event(the_lnet.ln_dc_waitq, the_lnet.ln_dc_state == LNET_DC_STATE_SHUTDOWN); @@ -3317,10 +3834,10 @@ lnet_debug_peer(lnet_nid_t nid) } if (lnet_isrouter(lp) || lnet_peer_aliveness_enabled(lp)) - aliveness = lp->lpni_alive ? "up" : "down"; + aliveness = (lnet_is_peer_ni_alive(lp)) ? "up" : "down"; CDEBUG(D_WARNING, "%-24s %4d %5s %5d %5d %5d %5d %5d %ld\n", - libcfs_nid2str(lp->lpni_nid), atomic_read(&lp->lpni_refcount), + libcfs_nid2str(lp->lpni_nid), kref_read(&lp->lpni_kref), aliveness, lp->lpni_net->net_tunables.lct_peer_tx_credits, lp->lpni_rtrcredits, lp->lpni_minrtrcredits, lp->lpni_txcredits, lp->lpni_mintxcredits, lp->lpni_txqnob); @@ -3373,10 +3890,10 @@ int lnet_get_peer_ni_info(__u32 peer_index, __u64 *nid, if (lnet_isrouter(lp) || lnet_peer_aliveness_enabled(lp)) snprintf(aliveness, LNET_MAX_STR_LEN, - lp->lpni_alive ? "up" : "down"); + lnet_is_peer_ni_alive(lp) ? 
"up" : "down"); *nid = lp->lpni_nid; - *refcount = atomic_read(&lp->lpni_refcount); + *refcount = kref_read(&lp->lpni_kref); *ni_peer_tx_credits = lp->lpni_net->net_tunables.lct_peer_tx_credits; *peer_tx_credits = lp->lpni_txcredits; @@ -3400,6 +3917,7 @@ int lnet_get_peer_info(struct lnet_ioctl_peer_cfg *cfg, void __user *bulk) { struct lnet_ioctl_element_stats *lpni_stats; struct lnet_ioctl_element_msg_stats *lpni_msg_stats; + struct lnet_ioctl_peer_ni_hstats *lpni_hstats; struct lnet_peer_ni_credit_info *lpni_info; struct lnet_peer_ni *lpni; struct lnet_peer *lp; @@ -3415,7 +3933,7 @@ int lnet_get_peer_info(struct lnet_ioctl_peer_cfg *cfg, void __user *bulk) } size = sizeof(nid) + sizeof(*lpni_info) + sizeof(*lpni_stats) - + sizeof(*lpni_msg_stats); + + sizeof(*lpni_msg_stats) + sizeof(*lpni_hstats); size *= lp->lp_nnis; if (size > cfg->prcfg_size) { cfg->prcfg_size = size; @@ -3441,6 +3959,9 @@ int lnet_get_peer_info(struct lnet_ioctl_peer_cfg *cfg, void __user *bulk) LIBCFS_ALLOC(lpni_msg_stats, sizeof(*lpni_msg_stats)); if (!lpni_msg_stats) goto out_free_stats; + LIBCFS_ALLOC(lpni_hstats, sizeof(*lpni_hstats)); + if (!lpni_hstats) + goto out_free_msg_stats; lpni = NULL; @@ -3448,7 +3969,7 @@ int lnet_get_peer_info(struct lnet_ioctl_peer_cfg *cfg, void __user *bulk) while ((lpni = lnet_get_next_peer_ni_locked(lp, NULL, lpni)) != NULL) { nid = lpni->lpni_nid; if (copy_to_user(bulk, &nid, sizeof(nid))) - goto out_free_msg_stats; + goto out_free_hstats; bulk += sizeof(nid); memset(lpni_info, 0, sizeof(*lpni_info)); @@ -3456,9 +3977,9 @@ int lnet_get_peer_info(struct lnet_ioctl_peer_cfg *cfg, void __user *bulk) if (lnet_isrouter(lpni) || lnet_peer_aliveness_enabled(lpni)) snprintf(lpni_info->cr_aliveness, LNET_MAX_STR_LEN, - lpni->lpni_alive ? "up" : "down"); + lnet_is_peer_ni_alive(lpni) ? "up" : "down"); - lpni_info->cr_refcount = atomic_read(&lpni->lpni_refcount); + lpni_info->cr_refcount = kref_read(&lpni->lpni_kref); lpni_info->cr_ni_peer_tx_credits = (lpni->lpni_net != NULL) ? 
lpni->lpni_net->net_tunables.lct_peer_tx_credits : 0; lpni_info->cr_peer_tx_credits = lpni->lpni_txcredits; @@ -3467,7 +3988,7 @@ int lnet_get_peer_info(struct lnet_ioctl_peer_cfg *cfg, void __user *bulk) lpni_info->cr_peer_min_tx_credits = lpni->lpni_mintxcredits; lpni_info->cr_peer_tx_qnob = lpni->lpni_txqnob; if (copy_to_user(bulk, lpni_info, sizeof(*lpni_info))) - goto out_free_msg_stats; + goto out_free_hstats; bulk += sizeof(*lpni_info); memset(lpni_stats, 0, sizeof(*lpni_stats)); @@ -3478,15 +3999,30 @@ int lnet_get_peer_info(struct lnet_ioctl_peer_cfg *cfg, void __user *bulk) lpni_stats->iel_drop_count = lnet_sum_stats(&lpni->lpni_stats, LNET_STATS_TYPE_DROP); if (copy_to_user(bulk, lpni_stats, sizeof(*lpni_stats))) - goto out_free_msg_stats; + goto out_free_hstats; bulk += sizeof(*lpni_stats); lnet_usr_translate_stats(lpni_msg_stats, &lpni->lpni_stats); if (copy_to_user(bulk, lpni_msg_stats, sizeof(*lpni_msg_stats))) - goto out_free_msg_stats; + goto out_free_hstats; bulk += sizeof(*lpni_msg_stats); + lpni_hstats->hlpni_network_timeout = + atomic_read(&lpni->lpni_hstats.hlt_network_timeout); + lpni_hstats->hlpni_remote_dropped = + atomic_read(&lpni->lpni_hstats.hlt_remote_dropped); + lpni_hstats->hlpni_remote_timeout = + atomic_read(&lpni->lpni_hstats.hlt_remote_timeout); + lpni_hstats->hlpni_remote_error = + atomic_read(&lpni->lpni_hstats.hlt_remote_error); + lpni_hstats->hlpni_health_value = + atomic_read(&lpni->lpni_healthv); + if (copy_to_user(bulk, lpni_hstats, sizeof(*lpni_hstats))) + goto out_free_hstats; + bulk += sizeof(*lpni_hstats); } rc = 0; +out_free_hstats: + LIBCFS_FREE(lpni_hstats, sizeof(*lpni_hstats)); out_free_msg_stats: LIBCFS_FREE(lpni_msg_stats, sizeof(*lpni_msg_stats)); out_free_stats: @@ -3498,3 +4034,109 @@ out_lp_decref: out: return rc; } + +/* must hold net_lock/0 */ +void +lnet_peer_ni_add_to_recoveryq_locked(struct lnet_peer_ni *lpni, + struct list_head *recovery_queue, + time64_t now) +{ + /* the mt could've shutdown and cleaned up the queues */ + if (the_lnet.ln_mt_state != LNET_MT_STATE_RUNNING) + return; + + if (!list_empty(&lpni->lpni_recovery)) + return; + + if (atomic_read(&lpni->lpni_healthv) == LNET_MAX_HEALTH_VALUE) + return; + + if (!lpni->lpni_last_alive) { + CDEBUG(D_NET, + "lpni %s(%p) not eligible for recovery last alive %lld\n", + libcfs_nid2str(lpni->lpni_nid), lpni, + lpni->lpni_last_alive); + return; + } + + if (now > lpni->lpni_last_alive + lnet_recovery_limit) { + CDEBUG(D_NET, "lpni %s aged out last alive %lld\n", + libcfs_nid2str(lpni->lpni_nid), + lpni->lpni_last_alive); + /* Reset the ping count so that if this peer NI is added back to + * the recovery queue we will send the first ping right away. + */ + lpni->lpni_ping_count = 0; + return; + } + + /* This peer NI is going on the recovery queue, so take a ref on it */ + lnet_peer_ni_addref_locked(lpni); + + lnet_peer_ni_set_next_ping(lpni, now); + + CDEBUG(D_NET, "%s added to recovery queue. 
ping count: %u next ping: %lld last alive: %lld health: %d\n", + libcfs_nid2str(lpni->lpni_nid), + lpni->lpni_ping_count, + lpni->lpni_next_ping, + lpni->lpni_last_alive, + atomic_read(&lpni->lpni_healthv)); + + list_add_tail(&lpni->lpni_recovery, recovery_queue); +} + +/* Call with the ln_api_mutex held */ +void +lnet_peer_ni_set_healthv(lnet_nid_t nid, int value, bool all) +{ + struct lnet_peer_table *ptable; + struct lnet_peer *lp; + struct lnet_peer_net *lpn; + struct lnet_peer_ni *lpni; + int lncpt; + int cpt; + time64_t now; + + if (the_lnet.ln_state != LNET_STATE_RUNNING) + return; + + now = ktime_get_seconds(); + + if (!all) { + lnet_net_lock(LNET_LOCK_EX); + lpni = lnet_find_peer_ni_locked(nid); + if (!lpni) { + lnet_net_unlock(LNET_LOCK_EX); + return; + } + atomic_set(&lpni->lpni_healthv, value); + lnet_peer_ni_add_to_recoveryq_locked(lpni, + &the_lnet.ln_mt_peerNIRecovq, now); + lnet_peer_ni_decref_locked(lpni); + lnet_net_unlock(LNET_LOCK_EX); + return; + } + + lncpt = cfs_percpt_number(the_lnet.ln_peer_tables); + + /* + * Walk all the peers and reset the health value for each one to the + * specified value. + */ + lnet_net_lock(LNET_LOCK_EX); + for (cpt = 0; cpt < lncpt; cpt++) { + ptable = the_lnet.ln_peer_tables[cpt]; + list_for_each_entry(lp, &ptable->pt_peer_list, lp_peer_list) { + list_for_each_entry(lpn, &lp->lp_peer_nets, lpn_peer_nets) { + list_for_each_entry(lpni, &lpn->lpn_peer_nis, + lpni_peer_nis) { + atomic_set(&lpni->lpni_healthv, value); + lnet_peer_ni_add_to_recoveryq_locked(lpni, + &the_lnet.ln_mt_peerNIRecovq, now); + } + } + } + } + lnet_net_unlock(LNET_LOCK_EX); +} +
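
A note on the lnet_peer_data_present() hunk above: the rewritten dispatch has three cases rather than two. Condensed to its control flow (locking, CERROR reporting and the route consolidation are stripped; every helper named here is the one used in the hunk, only the wrapper function itself is illustrative):

	/* Illustrative condensation of the dispatch in lnet_peer_data_present(). */
	static int data_present_dispatch(struct lnet_peer *lp,
					 struct lnet_ping_buffer *pbuf,
					 unsigned int flags)
	{
		lnet_nid_t nid = pbuf->pb_info.pi_ni[1].ns_nid;
		struct lnet_peer_ni *lpni;
		int rc;

		if (lp->lp_primary_nid == LNET_NID_LO_0) {
			/* Case 1: loopback peer - adopt the first real NID from
			 * the ping data as primary, then merge the rest. */
			rc = lnet_peer_set_primary_nid(lp, nid, flags);
			return rc ? rc : lnet_peer_merge_data(lp, pbuf);
		}

		if (lp->lp_primary_nid == nid ||
		    (lnet_is_nid_in_ping_info(lp->lp_primary_nid, &pbuf->pb_info) &&
		     lnet_is_discovery_disabled(lp)))
			/* Case 2: our cached primary is confirmed by the ping
			 * data, or it at least appears there and discovery is
			 * disabled - keep our NID list, only refresh status. */
			return lnet_peer_merge_data(lp, pbuf);

		/* Case 3: the remote advertises a different primary NID.  If it
		 * is unknown, or already belongs to this peer, adopt it;
		 * otherwise merge into the other peer record, carrying over the
		 * MULTI_RAIL/discovery state and lp_disc_src_nid before
		 * lnet_peer_set_primary_data() runs (see the hunk above). */
		lpni = lnet_find_peer_ni_locked(nid);
		if (!lpni || lp == lpni->lpni_peer_net->lpn_peer) {
			rc = lnet_peer_set_primary_nid(lp, nid, flags);
			if (!rc)
				rc = lnet_peer_merge_data(lp, pbuf);
		} else {
			rc = lnet_peer_set_primary_data(lpni->lpni_peer_net->lpn_peer,
							pbuf);
		}
		if (lpni)
			lnet_peer_ni_decref_locked(lpni);
		return rc;
	}
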
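The rewritten lnet_peer_send_ping() no longer builds the ping MD by hand; it calls lnet_send_ping() and relies on a sign convention for the return value: a positive value means LNetMDBind() failed inside the helper, so no MD exists, no event will ever fire, and the reference taken on behalf of the MD must be dropped by the caller; zero and negative errno follow the usual rules. A minimal sketch of the caller-side handling (the wrapper name is illustrative; the lnet_send_ping() call is the one from the hunk):

	/* Illustrative caller of lnet_send_ping() showing the rc > 0 convention. */
	static int send_ping_sketch(struct lnet_peer *lp, lnet_nid_t pnid, int nnis)
	{
		int cpt;
		int rc;

		cpt = lnet_net_lock_current();
		lnet_peer_addref_locked(lp);	/* reference held for the ping MD */
		lnet_net_unlock(cpt);

		rc = lnet_send_ping(pnid, &lp->lp_ping_mdh, nnis, lp,
				    the_lnet.ln_dc_handler, false);
		if (rc > 0) {
			/* MD bind failed: no unlink event will drop the ref,
			 * so do it here and turn the status into an errno. */
			lnet_net_lock(cpt);
			lnet_peer_decref_locked(lp);
			lnet_net_unlock(cpt);
			return -rc;
		}
		return rc;	/* 0: GET posted; <0: failure, ref dropped via unlink */
	}
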
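lnet_resend_msgs() now declares its temporary list with LIST_HEAD(resend) instead of declaring a bare struct list_head and calling INIT_LIST_HEAD() on it; the behaviour is identical, the head is simply initialized at definition. Its splice-then-drain pattern, reduced to a self-contained sketch (the message type and lock are illustrative; the list primitives are the standard <linux/list.h> ones):

	#include <linux/list.h>
	#include <linux/slab.h>
	#include <linux/spinlock.h>

	struct resend_msg {			/* illustrative stand-in for lnet_msg */
		struct list_head	rm_link;
	};

	static LIST_HEAD(resend_queue);		/* producers append here */
	static DEFINE_SPINLOCK(resend_lock);

	static void drain_resend_queue(void)
	{
		struct resend_msg *msg, *tmp;
		LIST_HEAD(local);		/* on-stack head, ready to use */

		spin_lock(&resend_lock);
		list_splice_init(&resend_queue, &local);	/* steal the queue */
		spin_unlock(&resend_lock);

		list_for_each_entry_safe(msg, tmp, &local, rm_link) {
			list_del_init(&msg->rm_link);
			/* resend or free msg here */
			kfree(msg);
		}
	}
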
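lnet_peer_discovery() now parks on wait_for_completion(&the_lnet.ln_started) before entering its loop, and lnet_peer_discovery_stop() either completes that completion (if the thread never got going, e.g. an early failure in the LNetNIInit() path) or wakes the discovery wait queue (if the thread is already in its loop). A reduced sketch of that shutdown handshake; it assumes the normal startup path signals the completion with complete_all(), which is not visible in this hunk, and every name other than the <linux/completion.h> and wait-queue primitives is illustrative:

	#include <linux/completion.h>
	#include <linux/wait.h>

	static DECLARE_COMPLETION(setup_done);
	static DECLARE_WAIT_QUEUE_HEAD(work_waitq);
	static bool stop_requested;

	static int discovery_worker(void *arg)
	{
		/* Do nothing until the rest of initialization has finished. */
		wait_for_completion(&setup_done);

		while (!READ_ONCE(stop_requested)) {
			wait_event_interruptible(work_waitq,
						 READ_ONCE(stop_requested));
			/* ... process queued discovery work ... */
		}
		return 0;
	}

	/* Called once initialization succeeds (assumed, not shown in the hunk). */
	static void setup_finished(void)
	{
		complete_all(&setup_done);
	}

	static void stop_discovery_worker(void)
	{
		WRITE_ONCE(stop_requested, true);
		if (!completion_done(&setup_done))
			complete(&setup_done);	/* thread may still be parked above */
		else
			wake_up(&work_waitq);	/* thread is waiting for work */
	}
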
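Several hunks above replace the open-coded atomic refcount on struct lnet_peer_ni with a struct kref: kref_init() when the peer NI is allocated, kref_read() where the debug and ioctl paths report the count. The standard pattern behind those calls, on a stand-alone structure (the type and helpers below are illustrative; only the <linux/kref.h> API is real):

	#include <linux/kernel.h>
	#include <linux/kref.h>
	#include <linux/slab.h>

	struct pni_sketch {
		struct kref	ps_kref;
		/* ... payload ... */
	};

	static void pni_sketch_release(struct kref *kref)
	{
		struct pni_sketch *p = container_of(kref, struct pni_sketch, ps_kref);

		kfree(p);
	}

	static struct pni_sketch *pni_sketch_alloc(void)
	{
		struct pni_sketch *p = kzalloc(sizeof(*p), GFP_NOFS);

		if (p)
			kref_init(&p->ps_kref);		/* count starts at 1 */
		return p;
	}

	static void pni_sketch_addref(struct pni_sketch *p)
	{
		kref_get(&p->ps_kref);
	}

	static void pni_sketch_decref(struct pni_sketch *p)
	{
		kref_put(&p->ps_kref, pni_sketch_release);	/* frees on last put */
	}

kref_read() only reports the current count and takes no reference of its own, which is all the cr_refcount and debug-print paths above need.
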
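The new lnet_peer_ni_add_to_recoveryq_locked() applies a series of gates before queueing a peer NI for recovery: the monitor thread must still be running, the NI must not already be queued, its health must actually be below LNET_MAX_HEALTH_VALUE, it must have been seen alive at least once, and that last-alive time must be within lnet_recovery_limit seconds; otherwise the ping count is reset and the NI stays off the queue. Collapsed into a single predicate for readability (the helper name is illustrative; every field and constant is the one used in the hunk):

	/* Illustrative predicate: would this peer NI be queued for recovery now? */
	static bool pni_wants_recovery(struct lnet_peer_ni *lpni, time64_t now)
	{
		if (the_lnet.ln_mt_state != LNET_MT_STATE_RUNNING)
			return false;		/* monitor thread is gone */
		if (!list_empty(&lpni->lpni_recovery))
			return false;		/* already queued */
		if (atomic_read(&lpni->lpni_healthv) == LNET_MAX_HEALTH_VALUE)
			return false;		/* nothing to recover */
		if (!lpni->lpni_last_alive)
			return false;		/* never seen alive */
		if (now > lpni->lpni_last_alive + lnet_recovery_limit)
			return false;		/* aged out of recovery */
		return true;
	}

lnet_peer_ni_set_healthv() relies on the same gates: after forcing lpni_healthv (for one NID or for every peer NI in the walk over pt_peer_list, lp_peer_nets and lpn_peer_nis) it calls lnet_peer_ni_add_to_recoveryq_locked(), so only NIs that pass these checks actually land on ln_mt_peerNIRecovq.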