X-Git-Url: https://git.whamcloud.com/?a=blobdiff_plain;f=lnet%2Flnet%2Fpeer.c;h=709fe247418ab77d376ffa833a6712e484ac18fd;hb=2ce6957b69370b0ce75725d1d91866bf55c07fa8;hp=826589f16dc44969d6aa37b065dcdffd002136ac;hpb=51b2c0f75f727f0562b3145015357cbff5cbb3b5;p=fs%2Flustre-release.git

diff --git a/lnet/lnet/peer.c b/lnet/lnet/peer.c
index 826589f..709fe24 100644
--- a/lnet/lnet/peer.c
+++ b/lnet/lnet/peer.c
@@ -27,7 +27,6 @@
  */
 /*
  * This file is part of Lustre, http://www.lustre.org/
- * Lustre is a trademark of Sun Microsystems, Inc.
  *
  * lnet/lnet/peer.c
  */
@@ -40,6 +39,7 @@
 #endif
 
 #include
+#include
 #include
 #include
 
@@ -168,7 +168,7 @@ lnet_peer_ni_alloc(lnet_nid_t nid)
 	INIT_LIST_HEAD(&lpni->lpni_on_remote_peer_ni_list);
 	INIT_LIST_HEAD(&lpni->lpni_rtr_pref_nids);
 	LNetInvalidateMDHandle(&lpni->lpni_recovery_ping_mdh);
-	atomic_set(&lpni->lpni_refcount, 1);
+	kref_init(&lpni->lpni_kref);
 	lpni->lpni_sel_priority = LNET_MAX_SELECTION_PRIORITY;
 	spin_lock_init(&lpni->lpni_lock);
 
@@ -263,6 +263,7 @@ lnet_peer_alloc(lnet_nid_t nid)
 	spin_lock_init(&lp->lp_lock);
 	lp->lp_primary_nid = nid;
 	lp->lp_disc_src_nid = LNET_NID_ANY;
+	lp->lp_disc_dst_nid = LNET_NID_ANY;
 	if (lnet_peers_start_down())
 		lp->lp_alive = false;
 	else
@@ -458,6 +459,10 @@ lnet_peer_del_locked(struct lnet_peer *peer)
 	CDEBUG(D_NET, "peer %s\n",
 	       libcfs_nid2str(peer->lp_primary_nid));
 
+	spin_lock(&peer->lp_lock);
+	peer->lp_state |= LNET_PEER_MARK_DELETED;
+	spin_unlock(&peer->lp_lock);
+
 	lpni = lnet_get_next_peer_ni_locked(peer, NULL, lpni);
 	while (lpni != NULL) {
 		lpni2 = lnet_get_next_peer_ni_locked(peer, NULL, lpni);
@@ -470,9 +475,41 @@ lnet_peer_del_locked(struct lnet_peer *peer)
 	return rc2;
 }
 
+/*
+ * Discovering this peer is taking too long. Cancel any Ping or Push
+ * that discovery is waiting on by unlinking the relevant MDs. The
+ * lnet_discovery_event_handler() will proceed from here and complete
+ * the cleanup.
+ */ +static void lnet_peer_cancel_discovery(struct lnet_peer *lp) +{ + struct lnet_handle_md ping_mdh; + struct lnet_handle_md push_mdh; + + LNetInvalidateMDHandle(&ping_mdh); + LNetInvalidateMDHandle(&push_mdh); + + spin_lock(&lp->lp_lock); + if (lp->lp_state & LNET_PEER_PING_SENT) { + ping_mdh = lp->lp_ping_mdh; + LNetInvalidateMDHandle(&lp->lp_ping_mdh); + } + if (lp->lp_state & LNET_PEER_PUSH_SENT) { + push_mdh = lp->lp_push_mdh; + LNetInvalidateMDHandle(&lp->lp_push_mdh); + } + spin_unlock(&lp->lp_lock); + + if (!LNetMDHandleIsInvalid(ping_mdh)) + LNetMDUnlink(ping_mdh); + if (!LNetMDHandleIsInvalid(push_mdh)) + LNetMDUnlink(push_mdh); +} + static int lnet_peer_del(struct lnet_peer *peer) { + lnet_peer_cancel_discovery(peer); lnet_net_lock(LNET_LOCK_EX); lnet_peer_del_locked(peer); lnet_net_unlock(LNET_LOCK_EX); @@ -898,6 +935,94 @@ lnet_push_update_to_peers(int force) wake_up(&the_lnet.ln_dc_waitq); } +/* find the NID in the preferred gateways for the remote peer + * return: + * false: list is not empty and NID is not preferred + * false: list is empty + * true: nid is found in the list + */ +bool +lnet_peer_is_pref_rtr_locked(struct lnet_peer_ni *lpni, + lnet_nid_t gw_nid) +{ + struct lnet_nid_list *ne; + + CDEBUG(D_NET, "%s: rtr pref emtpy: %d\n", + libcfs_nid2str(lpni->lpni_nid), + list_empty(&lpni->lpni_rtr_pref_nids)); + + if (list_empty(&lpni->lpni_rtr_pref_nids)) + return false; + + /* iterate through all the preferred NIDs and see if any of them + * matches the provided gw_nid + */ + list_for_each_entry(ne, &lpni->lpni_rtr_pref_nids, nl_list) { + CDEBUG(D_NET, "Comparing pref %s with gw %s\n", + libcfs_nid2str(ne->nl_nid), + libcfs_nid2str(gw_nid)); + if (ne->nl_nid == gw_nid) + return true; + } + + return false; +} + +void +lnet_peer_clr_pref_rtrs(struct lnet_peer_ni *lpni) +{ + struct list_head zombies; + struct lnet_nid_list *ne; + struct lnet_nid_list *tmp; + int cpt = lpni->lpni_cpt; + + INIT_LIST_HEAD(&zombies); + + lnet_net_lock(cpt); + list_splice_init(&lpni->lpni_rtr_pref_nids, &zombies); + lnet_net_unlock(cpt); + + list_for_each_entry_safe(ne, tmp, &zombies, nl_list) { + list_del(&ne->nl_list); + LIBCFS_FREE(ne, sizeof(*ne)); + } +} + +int +lnet_peer_add_pref_rtr(struct lnet_peer_ni *lpni, + lnet_nid_t gw_nid) +{ + int cpt = lpni->lpni_cpt; + struct lnet_nid_list *ne = NULL; + + /* This function is called with api_mutex held. When the api_mutex + * is held the list can not be modified, as it is only modified as + * a result of applying a UDSP and that happens under api_mutex + * lock. + */ + __must_hold(&the_lnet.ln_api_mutex); + + list_for_each_entry(ne, &lpni->lpni_rtr_pref_nids, nl_list) { + if (ne->nl_nid == gw_nid) + return -EEXIST; + } + + LIBCFS_CPT_ALLOC(ne, lnet_cpt_table(), cpt, sizeof(*ne)); + if (!ne) + return -ENOMEM; + + ne->nl_nid = gw_nid; + + /* Lock the cpt to protect against addition and checks in the + * selection algorithm + */ + lnet_net_lock(cpt); + list_add(&ne->nl_list, &lpni->lpni_rtr_pref_nids); + lnet_net_unlock(cpt); + + return 0; +} + /* * Test whether a ni is a preferred ni for this peer_ni, e.g, whether * this is a preferred point-to-point path. Call with lnet_net_lock in @@ -970,6 +1095,12 @@ lnet_peer_ni_clr_non_mr_pref_nid(struct lnet_peer_ni *lpni) return rc; } +void +lnet_peer_ni_set_selection_priority(struct lnet_peer_ni *lpni, __u32 priority) +{ + lpni->lpni_sel_priority = priority; +} + /* * Clear the preferred NIDs from a non-multi-rail peer. 
*/ @@ -1129,6 +1260,29 @@ out: return rc; } +void +lnet_peer_clr_pref_nids(struct lnet_peer_ni *lpni) +{ + struct list_head zombies; + struct lnet_nid_list *ne; + struct lnet_nid_list *tmp; + + INIT_LIST_HEAD(&zombies); + + lnet_net_lock(LNET_LOCK_EX); + if (lpni->lpni_pref_nnids == 1) + lpni->lpni_pref.nid = LNET_NID_ANY; + else if (lpni->lpni_pref_nnids > 1) + list_splice_init(&lpni->lpni_pref.nids, &zombies); + lpni->lpni_pref_nnids = 0; + lnet_net_unlock(LNET_LOCK_EX); + + list_for_each_entry_safe(ne, tmp, &zombies, nl_list) { + list_del_init(&ne->nl_list); + LIBCFS_FREE(ne, sizeof(*ne)); + } +} + lnet_nid_t lnet_peer_primary_nid_locked(lnet_nid_t nid) { @@ -1194,7 +1348,13 @@ LNetPrimaryNID(lnet_nid_t nid) } lp = lpni->lpni_peer_net->lpn_peer; - while (!lnet_peer_is_uptodate(lp)) { + /* If discovery is disabled locally then we needn't bother running + * discovery here because discovery will not modify whatever + * primary NID is currently set for this peer. If the specified peer is + * down then this discovery can introduce long delays into the mount + * process, so skip it if it isn't necessary. + */ + while (!lnet_peer_discovery_disabled && !lnet_peer_is_uptodate(lp)) { spin_lock(&lp->lp_lock); /* force a full discovery cycle */ lp->lp_state |= LNET_PEER_FORCE_PING | LNET_PEER_FORCE_PUSH; @@ -1203,9 +1363,23 @@ LNetPrimaryNID(lnet_nid_t nid) rc = lnet_discover_peer_locked(lpni, cpt, true); if (rc) goto out_decref; + /* The lpni (or lp) for this NID may have changed and our ref is + * the only thing keeping the old one around. Release the ref + * and lookup the lpni again + */ + lnet_peer_ni_decref_locked(lpni); + lpni = lnet_find_peer_ni_locked(nid); + if (!lpni) { + rc = -ENOENT; + goto out_unlock; + } lp = lpni->lpni_peer_net->lpn_peer; - /* Only try once if discovery is disabled */ + /* If we find that the peer has discovery disabled then we will + * not modify whatever primary NID is currently set for this + * peer. Thus, we can break out of this loop even if the peer + * is not fully up to date. 
+ */ if (lnet_is_discovery_disabled(lp)) break; } @@ -1249,6 +1423,8 @@ lnet_peer_attach_peer_ni(struct lnet_peer *lp, unsigned flags) { struct lnet_peer_table *ptable; + bool new_lpn = false; + int rc; /* Install the new peer_ni */ lnet_net_lock(LNET_LOCK_EX); @@ -1273,14 +1449,21 @@ lnet_peer_attach_peer_ni(struct lnet_peer *lp, /* Add peer_ni to peer_net */ lpni->lpni_peer_net = lpn; - list_add_tail(&lpni->lpni_peer_nis, &lpn->lpn_peer_nis); + if (lp->lp_primary_nid == lpni->lpni_nid) + list_add(&lpni->lpni_peer_nis, &lpn->lpn_peer_nis); + else + list_add_tail(&lpni->lpni_peer_nis, &lpn->lpn_peer_nis); lnet_update_peer_net_healthv(lpni); lnet_peer_net_addref_locked(lpn); /* Add peer_net to peer */ if (!lpn->lpn_peer) { + new_lpn = true; lpn->lpn_peer = lp; - list_add_tail(&lpn->lpn_peer_nets, &lp->lp_peer_nets); + if (lp->lp_primary_nid == lpni->lpni_nid) + list_add(&lpn->lpn_peer_nets, &lp->lp_peer_nets); + else + list_add_tail(&lpn->lpn_peer_nets, &lp->lp_peer_nets); lnet_peer_addref_locked(lp); } @@ -1308,6 +1491,18 @@ lnet_peer_attach_peer_ni(struct lnet_peer *lp, lp->lp_nnis++; + /* apply UDSPs */ + if (new_lpn) { + rc = lnet_udsp_apply_policies_on_lpn(lpn); + if (rc) + CERROR("Failed to apply UDSPs on lpn %s\n", + libcfs_net2str(lpn->lpn_net_id)); + } + rc = lnet_udsp_apply_policies_on_lpni(lpni); + if (rc) + CERROR("Failed to apply UDSPs on lpni %s\n", + libcfs_nid2str(lpni->lpni_nid)); + CDEBUG(D_NET, "peer %s NID %s flags %#x\n", libcfs_nid2str(lp->lp_primary_nid), libcfs_nid2str(lpni->lpni_nid), flags); @@ -1509,10 +1704,14 @@ lnet_peer_set_primary_nid(struct lnet_peer *lp, lnet_nid_t nid, unsigned flags) if (lp->lp_primary_nid == nid) goto out; + + lp->lp_primary_nid = nid; + rc = lnet_peer_add_nid(lp, nid, flags); - if (rc) + if (rc) { + lp->lp_primary_nid = old; goto out; - lp->lp_primary_nid = nid; + } out: CDEBUG(D_NET, "peer %s NID %s: %d\n", libcfs_nid2str(old), libcfs_nid2str(nid), rc); @@ -1694,14 +1893,16 @@ lnet_del_peer_ni(lnet_nid_t prim_nid, lnet_nid_t nid) } void -lnet_destroy_peer_ni_locked(struct lnet_peer_ni *lpni) +lnet_destroy_peer_ni_locked(struct kref *ref) { + struct lnet_peer_ni *lpni = container_of(ref, struct lnet_peer_ni, + lpni_kref); struct lnet_peer_table *ptable; struct lnet_peer_net *lpn; CDEBUG(D_NET, "%p nid %s\n", lpni, libcfs_nid2str(lpni->lpni_nid)); - LASSERT(atomic_read(&lpni->lpni_refcount) == 0); + LASSERT(kref_read(&lpni->lpni_kref) == 0); LASSERT(list_empty(&lpni->lpni_txq)); LASSERT(lpni->lpni_txqnob == 0); LASSERT(list_empty(&lpni->lpni_peer_nis)); @@ -1894,6 +2095,26 @@ __must_hold(&lp->lp_lock) return rc; } +/* Add the message to the peer's lp_dc_pendq and queue the peer for discovery */ +void +lnet_peer_queue_message(struct lnet_peer *lp, struct lnet_msg *msg) +{ + /* The discovery thread holds net_lock/EX and lp_lock when it splices + * the lp_dc_pendq onto a local list for resending. Thus, we do the same + * when adding to the list and queuing the peer to ensure that we do not + * strand any messages on the lp_dc_pendq. This scheme ensures the + * message will be resent even if the peer is already being discovered. + * Therefore we needn't check the return value of + * lnet_peer_queue_for_discovery(lp). + */ + lnet_net_lock(LNET_LOCK_EX); + spin_lock(&lp->lp_lock); + list_add_tail(&msg->msg_list, &lp->lp_dc_pendq); + spin_unlock(&lp->lp_lock); + lnet_peer_queue_for_discovery(lp); + lnet_net_unlock(LNET_LOCK_EX); +} + /* * Queue a peer for the attention of the discovery thread. Call with * lnet_net_lock/EX held. 
Returns 0 if the peer was queued, and @@ -1939,7 +2160,7 @@ static void lnet_peer_discovery_complete(struct lnet_peer *lp) spin_lock(&lp->lp_lock); list_splice_init(&lp->lp_dc_pendq, &pending_msgs); spin_unlock(&lp->lp_lock); - wake_up_all(&lp->lp_dc_waitq); + wake_up(&lp->lp_dc_waitq); if (lp->lp_rtr_refcount > 0) lnet_router_discovery_complete(lp); @@ -2311,6 +2532,7 @@ lnet_discovery_event_reply(struct lnet_peer *lp, struct lnet_event *ev) spin_lock(&lp->lp_lock); lp->lp_disc_src_nid = ev->target.nid; + lp->lp_disc_dst_nid = ev->source.nid; /* * If some kind of error happened the contents of message @@ -2343,20 +2565,41 @@ lnet_discovery_event_reply(struct lnet_peer *lp, struct lnet_event *ev) goto out; } - /* * The peer may have discovery disabled at its end. Set * NO_DISCOVERY as appropriate. */ - if ((pbuf->pb_info.pi_features & LNET_PING_FEAT_DISCOVERY) && - !lnet_peer_discovery_disabled) { - CDEBUG(D_NET, "Peer %s has discovery enabled\n", - libcfs_nid2str(lp->lp_primary_nid)); - lp->lp_state &= ~LNET_PEER_NO_DISCOVERY; - } else { + if (!(pbuf->pb_info.pi_features & LNET_PING_FEAT_DISCOVERY) || + lnet_peer_discovery_disabled) { CDEBUG(D_NET, "Peer %s has discovery disabled\n", libcfs_nid2str(lp->lp_primary_nid)); + + /* Detect whether this peer has toggled discovery from on to + * off and whether we can delete and re-create the peer. Peers + * that were manually configured cannot be deleted by discovery. + * We need to delete this peer and re-create it if the peer was + * not configured manually, is currently considered DD capable, + * and either: + * 1. We've already discovered the peer (the peer has toggled + * the discovery feature from on to off), or + * 2. The peer is considered MR, but it was not user configured + * (this was a "temporary" peer created via the kernel APIs + * that we're discovering for the first time) + */ + if (!(lp->lp_state & (LNET_PEER_CONFIGURED | + LNET_PEER_NO_DISCOVERY)) && + (lp->lp_state & (LNET_PEER_DISCOVERED | + LNET_PEER_MULTI_RAIL))) { + CDEBUG(D_NET, "Marking %s:0x%x for deletion\n", + libcfs_nid2str(lp->lp_primary_nid), + lp->lp_state); + lp->lp_state |= LNET_PEER_MARK_DELETION; + } lp->lp_state |= LNET_PEER_NO_DISCOVERY; + } else { + CDEBUG(D_NET, "Peer %s has discovery enabled\n", + libcfs_nid2str(lp->lp_primary_nid)); + lp->lp_state &= ~LNET_PEER_NO_DISCOVERY; } /* @@ -2564,7 +2807,8 @@ static void lnet_discovery_event_handler(struct lnet_event *event) /* put peer back at end of request queue, if discovery not already * done */ - if (rc == LNET_REDISCOVER_PEER && !lnet_peer_is_uptodate(lp)) { + if (rc == LNET_REDISCOVER_PEER && !lnet_peer_is_uptodate(lp) && + lnet_peer_queue_for_discovery(lp)) { list_move_tail(&lp->lp_dc_list, &the_lnet.ln_dc_request); wake_up(&the_lnet.ln_dc_waitq); } @@ -2592,6 +2836,7 @@ static void lnet_discovery_event_handler(struct lnet_event *event) static int lnet_peer_merge_data(struct lnet_peer *lp, struct lnet_ping_buffer *pbuf) { + struct lnet_peer_net *lpn; struct lnet_peer_ni *lpni; lnet_nid_t *curnis = NULL; struct lnet_ni_status *addnis = NULL; @@ -2720,6 +2965,28 @@ static int lnet_peer_merge_data(struct lnet_peer *lp, goto out; } } + + /* The peer net for the primary NID should be the first entry in the + * peer's lp_peer_nets list, and the peer NI for the primary NID should + * be the first entry in its peer net's lpn_peer_nis list. 
+ */ + lpni = lnet_find_peer_ni_locked(pbuf->pb_info.pi_ni[1].ns_nid); + if (!lpni) { + CERROR("Internal error: Failed to lookup peer NI for primary NID: %s\n", + libcfs_nid2str(pbuf->pb_info.pi_ni[1].ns_nid)); + goto out; + } + + lnet_peer_ni_decref_locked(lpni); + + lpn = lpni->lpni_peer_net; + if (lpn->lpn_peer_nets.prev != &lp->lp_peer_nets) + list_move(&lpn->lpn_peer_nets, &lp->lp_peer_nets); + + if (lpni->lpni_peer_nis.prev != &lpni->lpni_peer_net->lpn_peer_nis) + list_move(&lpni->lpni_peer_nis, + &lpni->lpni_peer_net->lpn_peer_nis); + /* * Errors other than -ENOMEM are due to peers having been * configured with DLC. Ignore these because DLC overrides @@ -2816,6 +3083,72 @@ static bool lnet_is_nid_in_ping_info(lnet_nid_t nid, struct lnet_ping_info *pinf return false; } +/* Delete a peer that has been marked for deletion. NB: when this peer was added + * to the discovery queue a reference was taken that will prevent the peer from + * actually being freed by this function. After this function exits the + * discovery thread should call lnet_peer_discovery_complete() which will + * drop that reference as well as wake any waiters that may also be holding a + * ref on the peer + */ +static int lnet_peer_deletion(struct lnet_peer *lp) +__must_hold(&lp->lp_lock) +{ + struct list_head rlist; + struct lnet_route *route, *tmp; + int sensitivity = lp->lp_health_sensitivity; + + INIT_LIST_HEAD(&rlist); + + lp->lp_state &= ~(LNET_PEER_DISCOVERING | LNET_PEER_FORCE_PING | + LNET_PEER_FORCE_PUSH); + CDEBUG(D_NET, "peer %s(%p) state %#x\n", + libcfs_nid2str(lp->lp_primary_nid), lp, lp->lp_state); + + /* no-op if lnet_peer_del() has already been called on this peer */ + if (lp->lp_state & LNET_PEER_MARK_DELETED) + return 0; + + if (the_lnet.ln_dc_state != LNET_DC_STATE_RUNNING) + return -ESHUTDOWN; + + spin_unlock(&lp->lp_lock); + + mutex_lock(&the_lnet.ln_api_mutex); + + lnet_net_lock(LNET_LOCK_EX); + /* remove the peer from the discovery work + * queue if it's on there in preparation + * of deleting it. + */ + if (!list_empty(&lp->lp_dc_list)) + list_del_init(&lp->lp_dc_list); + list_for_each_entry_safe(route, tmp, + &lp->lp_routes, + lr_gwlist) + lnet_move_route(route, NULL, &rlist); + lnet_net_unlock(LNET_LOCK_EX); + + /* lnet_peer_del() deletes all the peer NIs owned by this peer */ + lnet_peer_del(lp); + + list_for_each_entry_safe(route, tmp, + &rlist, lr_list) { + /* re-add these routes */ + lnet_add_route(route->lr_net, + route->lr_hops, + route->lr_nid, + route->lr_priority, + sensitivity); + LIBCFS_FREE(route, sizeof(*route)); + } + + mutex_unlock(&the_lnet.ln_api_mutex); + + spin_lock(&lp->lp_lock); + + return 0; +} + /* * Update a peer using the data received. 
*/ @@ -2897,7 +3230,7 @@ __must_hold(&lp->lp_lock) rc = lnet_peer_merge_data(lp, pbuf); } else { lpni = lnet_find_peer_ni_locked(nid); - if (!lpni) { + if (!lpni || lp == lpni->lpni_peer_net->lpn_peer) { rc = lnet_peer_set_primary_nid(lp, nid, flags); if (rc) { CERROR("Primary NID error %s versus %s: %d\n", @@ -2906,6 +3239,8 @@ __must_hold(&lp->lp_lock) } else { rc = lnet_peer_merge_data(lp, pbuf); } + if (lpni) + lnet_peer_ni_decref_locked(lpni); } else { struct lnet_peer *new_lp; new_lp = lpni->lpni_peer_net->lpn_peer; @@ -2914,10 +3249,24 @@ __must_hold(&lp->lp_lock) * should have discovery/MR enabled as well, since * it's the same peer, which we're about to merge */ + spin_lock(&lp->lp_lock); + spin_lock(&new_lp->lp_lock); if (!(lp->lp_state & LNET_PEER_NO_DISCOVERY)) new_lp->lp_state &= ~LNET_PEER_NO_DISCOVERY; if (lp->lp_state & LNET_PEER_MULTI_RAIL) new_lp->lp_state |= LNET_PEER_MULTI_RAIL; + /* If we're processing a ping reply then we may be + * about to send a push to the peer that we ping'd. + * Since the ping reply that we're processing was + * received by lp, we need to set the discovery source + * NID for new_lp to the NID stored in lp. + */ + if (lp->lp_disc_src_nid != LNET_NID_ANY) { + new_lp->lp_disc_src_nid = lp->lp_disc_src_nid; + new_lp->lp_disc_dst_nid = lp->lp_disc_dst_nid; + } + spin_unlock(&new_lp->lp_lock); + spin_unlock(&lp->lp_lock); rc = lnet_peer_set_primary_data(new_lp, pbuf); lnet_consolidate_routes_locked(lp, new_lp); @@ -2965,41 +3314,10 @@ __must_hold(&lp->lp_lock) return rc ? rc : LNET_REDISCOVER_PEER; } -/* - * Select NID to send a Ping or Push to. - */ -static lnet_nid_t lnet_peer_select_nid(struct lnet_peer *lp) -{ - struct lnet_peer_ni *lpni; - - /* Look for a direct-connected NID for this peer. */ - lpni = NULL; - while ((lpni = lnet_get_next_peer_ni_locked(lp, NULL, lpni)) != NULL) { - if (!lnet_get_net_locked(lpni->lpni_peer_net->lpn_net_id)) - continue; - break; - } - if (lpni) - return lpni->lpni_nid; - - /* Look for a routed-connected NID for this peer. */ - lpni = NULL; - while ((lpni = lnet_get_next_peer_ni_locked(lp, NULL, lpni)) != NULL) { - if (!lnet_find_rnet_locked(lpni->lpni_peer_net->lpn_net_id)) - continue; - break; - } - if (lpni) - return lpni->lpni_nid; - - return LNET_NID_ANY; -} - /* Active side of ping. */ static int lnet_peer_send_ping(struct lnet_peer *lp) __must_hold(&lp->lp_lock) { - lnet_nid_t pnid; int nnis; int rc; int cpt; @@ -3011,12 +3329,11 @@ __must_hold(&lp->lp_lock) cpt = lnet_net_lock_current(); /* Refcount for MD. */ lnet_peer_addref_locked(lp); - pnid = lnet_peer_select_nid(lp); lnet_net_unlock(cpt); nnis = max(lp->lp_data_nnis, LNET_INTERFACES_MIN); - rc = lnet_send_ping(pnid, &lp->lp_ping_mdh, nnis, lp, + rc = lnet_send_ping(lp->lp_primary_nid, &lp->lp_ping_mdh, nnis, lp, the_lnet.ln_dc_handler, false); /* @@ -3141,18 +3458,17 @@ __must_hold(&lp->lp_lock) CERROR("Can't bind push source MD: %d\n", rc); goto fail_error; } + cpt = lnet_net_lock_current(); /* Refcount for MD. 
*/ lnet_peer_addref_locked(lp); id.pid = LNET_PID_LUSTRE; - id.nid = lnet_peer_select_nid(lp); + if (lp->lp_disc_dst_nid != LNET_NID_ANY) + id.nid = lp->lp_disc_dst_nid; + else + id.nid = lp->lp_primary_nid; lnet_net_unlock(cpt); - if (id.nid == LNET_NID_ANY) { - rc = -EHOSTUNREACH; - goto fail_unlink; - } - rc = LNetPut(lp->lp_disc_src_nid, lp->lp_push_mdh, LNET_ACK_REQ, id, LNET_RESERVED_PORTAL, LNET_PROTO_PING_MATCHBITS, 0, 0); @@ -3164,6 +3480,7 @@ __must_hold(&lp->lp_lock) * scratch */ lp->lp_disc_src_nid = LNET_NID_ANY; + lp->lp_disc_dst_nid = LNET_NID_ANY; if (rc) goto fail_unlink; @@ -3207,37 +3524,6 @@ static void lnet_peer_discovery_error(struct lnet_peer *lp, int error) } /* - * Discovering this peer is taking too long. Cancel any Ping or Push - * that discovery is waiting on by unlinking the relevant MDs. The - * lnet_discovery_event_handler() will proceed from here and complete - * the cleanup. - */ -static void lnet_peer_cancel_discovery(struct lnet_peer *lp) -{ - struct lnet_handle_md ping_mdh; - struct lnet_handle_md push_mdh; - - LNetInvalidateMDHandle(&ping_mdh); - LNetInvalidateMDHandle(&push_mdh); - - spin_lock(&lp->lp_lock); - if (lp->lp_state & LNET_PEER_PING_SENT) { - ping_mdh = lp->lp_ping_mdh; - LNetInvalidateMDHandle(&lp->lp_ping_mdh); - } - if (lp->lp_state & LNET_PEER_PUSH_SENT) { - push_mdh = lp->lp_push_mdh; - LNetInvalidateMDHandle(&lp->lp_push_mdh); - } - spin_unlock(&lp->lp_lock); - - if (!LNetMDHandleIsInvalid(ping_mdh)) - LNetMDUnlink(ping_mdh); - if (!LNetMDHandleIsInvalid(push_mdh)) - LNetMDUnlink(push_mdh); -} - -/* * Wait for work to be queued or some other change that must be * attended to. Returns non-zero if the discovery thread should shut * down. @@ -3393,7 +3679,10 @@ static int lnet_peer_discovery(void *arg) CDEBUG(D_NET, "peer %s(%p) state %#x\n", libcfs_nid2str(lp->lp_primary_nid), lp, lp->lp_state); - if (lp->lp_state & LNET_PEER_DATA_PRESENT) + if (lp->lp_state & (LNET_PEER_MARK_DELETION | + LNET_PEER_MARK_DELETED)) + rc = lnet_peer_deletion(lp); + else if (lp->lp_state & LNET_PEER_DATA_PRESENT) rc = lnet_peer_data_present(lp); else if (lp->lp_state & LNET_PEER_PING_FAILED) rc = lnet_peer_ping_failed(lp); @@ -3426,49 +3715,6 @@ static int lnet_peer_discovery(void *arg) if (the_lnet.ln_dc_state == LNET_DC_STATE_STOPPING) break; - if (lp->lp_state & LNET_PEER_MARK_DELETION) { - struct list_head rlist; - struct lnet_route *route, *tmp; - int sensitivity = lp->lp_health_sensitivity; - - INIT_LIST_HEAD(&rlist); - - /* - * remove the peer from the discovery work - * queue if it's on there in preparation - * of deleting it. - */ - if (!list_empty(&lp->lp_dc_list)) - list_del(&lp->lp_dc_list); - - lnet_net_unlock(LNET_LOCK_EX); - - mutex_lock(&the_lnet.ln_api_mutex); - - lnet_net_lock(LNET_LOCK_EX); - list_for_each_entry_safe(route, tmp, - &lp->lp_routes, - lr_gwlist) - lnet_move_route(route, NULL, &rlist); - lnet_net_unlock(LNET_LOCK_EX); - - /* delete the peer */ - lnet_peer_del(lp); - - list_for_each_entry_safe(route, tmp, - &rlist, lr_list) { - /* re-add these routes */ - lnet_add_route(route->lr_net, - route->lr_hops, - route->lr_nid, - route->lr_priority, - sensitivity); - LIBCFS_FREE(route, sizeof(*route)); - } - mutex_unlock(&the_lnet.ln_api_mutex); - - lnet_net_lock(LNET_LOCK_EX); - } } lnet_net_unlock(LNET_LOCK_EX); @@ -3595,7 +3841,7 @@ lnet_debug_peer(lnet_nid_t nid) aliveness = (lnet_is_peer_ni_alive(lp)) ? 
"up" : "down"; CDEBUG(D_WARNING, "%-24s %4d %5s %5d %5d %5d %5d %5d %ld\n", - libcfs_nid2str(lp->lpni_nid), atomic_read(&lp->lpni_refcount), + libcfs_nid2str(lp->lpni_nid), kref_read(&lp->lpni_kref), aliveness, lp->lpni_net->net_tunables.lct_peer_tx_credits, lp->lpni_rtrcredits, lp->lpni_minrtrcredits, lp->lpni_txcredits, lp->lpni_mintxcredits, lp->lpni_txqnob); @@ -3651,7 +3897,7 @@ int lnet_get_peer_ni_info(__u32 peer_index, __u64 *nid, lnet_is_peer_ni_alive(lp) ? "up" : "down"); *nid = lp->lpni_nid; - *refcount = atomic_read(&lp->lpni_refcount); + *refcount = kref_read(&lp->lpni_kref); *ni_peer_tx_credits = lp->lpni_net->net_tunables.lct_peer_tx_credits; *peer_tx_credits = lp->lpni_txcredits; @@ -3737,7 +3983,7 @@ int lnet_get_peer_info(struct lnet_ioctl_peer_cfg *cfg, void __user *bulk) snprintf(lpni_info->cr_aliveness, LNET_MAX_STR_LEN, lnet_is_peer_ni_alive(lpni) ? "up" : "down"); - lpni_info->cr_refcount = atomic_read(&lpni->lpni_refcount); + lpni_info->cr_refcount = kref_read(&lpni->lpni_kref); lpni_info->cr_ni_peer_tx_credits = (lpni->lpni_net != NULL) ? lpni->lpni_net->net_tunables.lct_peer_tx_credits : 0; lpni_info->cr_peer_tx_credits = lpni->lpni_txcredits; @@ -3773,6 +4019,8 @@ int lnet_get_peer_info(struct lnet_ioctl_peer_cfg *cfg, void __user *bulk) atomic_read(&lpni->lpni_hstats.hlt_remote_error); lpni_hstats->hlpni_health_value = atomic_read(&lpni->lpni_healthv); + lpni_hstats->hlpni_ping_count = lpni->lpni_ping_count; + lpni_hstats->hlpni_next_ping = lpni->lpni_next_ping; if (copy_to_user(bulk, lpni_hstats, sizeof(*lpni_hstats))) goto out_free_hstats; bulk += sizeof(*lpni_hstats); @@ -3793,21 +4041,54 @@ out: return rc; } +/* must hold net_lock/0 */ void -lnet_peer_ni_add_to_recoveryq_locked(struct lnet_peer_ni *lpni) +lnet_peer_ni_add_to_recoveryq_locked(struct lnet_peer_ni *lpni, + struct list_head *recovery_queue, + time64_t now) { /* the mt could've shutdown and cleaned up the queues */ if (the_lnet.ln_mt_state != LNET_MT_STATE_RUNNING) return; - if (list_empty(&lpni->lpni_recovery) && - atomic_read(&lpni->lpni_healthv) < LNET_MAX_HEALTH_VALUE) { - CDEBUG(D_NET, "lpni %s added to recovery queue. Health = %d\n", - libcfs_nid2str(lpni->lpni_nid), - atomic_read(&lpni->lpni_healthv)); - list_add_tail(&lpni->lpni_recovery, &the_lnet.ln_mt_peerNIRecovq); - lnet_peer_ni_addref_locked(lpni); + if (!list_empty(&lpni->lpni_recovery)) + return; + + if (atomic_read(&lpni->lpni_healthv) == LNET_MAX_HEALTH_VALUE) + return; + + if (!lpni->lpni_last_alive) { + CDEBUG(D_NET, + "lpni %s(%p) not eligible for recovery last alive %lld\n", + libcfs_nid2str(lpni->lpni_nid), lpni, + lpni->lpni_last_alive); + return; + } + + if (now > lpni->lpni_last_alive + lnet_recovery_limit) { + CDEBUG(D_NET, "lpni %s aged out last alive %lld\n", + libcfs_nid2str(lpni->lpni_nid), + lpni->lpni_last_alive); + /* Reset the ping count so that if this peer NI is added back to + * the recovery queue we will send the first ping right away. + */ + lpni->lpni_ping_count = 0; + return; } + + /* This peer NI is going on the recovery queue, so take a ref on it */ + lnet_peer_ni_addref_locked(lpni); + + lnet_peer_ni_set_next_ping(lpni, now); + + CDEBUG(D_NET, "%s added to recovery queue. 
ping count: %u next ping: %lld last alive: %lld health: %d\n",
+	       libcfs_nid2str(lpni->lpni_nid),
+	       lpni->lpni_ping_count,
+	       lpni->lpni_next_ping,
+	       lpni->lpni_last_alive,
+	       atomic_read(&lpni->lpni_healthv));
+
+	list_add_tail(&lpni->lpni_recovery, recovery_queue);
 }
 
 /* Call with the ln_api_mutex held */
@@ -3820,10 +4101,13 @@ lnet_peer_ni_set_healthv(lnet_nid_t nid, int value, bool all)
 	struct lnet_peer_ni *lpni;
 	int lncpt;
 	int cpt;
+	time64_t now;
 
 	if (the_lnet.ln_state != LNET_STATE_RUNNING)
 		return;
 
+	now = ktime_get_seconds();
+
 	if (!all) {
 		lnet_net_lock(LNET_LOCK_EX);
 		lpni = lnet_find_peer_ni_locked(nid);
@@ -3831,8 +4115,9 @@ lnet_peer_ni_set_healthv(lnet_nid_t nid, int value, bool all)
 			lnet_net_unlock(LNET_LOCK_EX);
 			return;
 		}
-		atomic_set(&lpni->lpni_healthv, value);
-		lnet_peer_ni_add_to_recoveryq_locked(lpni);
+		lnet_set_lpni_healthv_locked(lpni, value);
+		lnet_peer_ni_add_to_recoveryq_locked(lpni,
+				&the_lnet.ln_mt_peerNIRecovq, now);
 		lnet_peer_ni_decref_locked(lpni);
 		lnet_net_unlock(LNET_LOCK_EX);
 		return;
@@ -3841,8 +4126,8 @@ lnet_peer_ni_set_healthv(lnet_nid_t nid, int value, bool all)
 	lncpt = cfs_percpt_number(the_lnet.ln_peer_tables);
 
 	/*
-	 * Walk all the peers and reset the healhv for each one to the
-	 * maximum value.
+	 * Walk all the peers and reset the health value for each one to the
+	 * specified value.
 	 */
 	lnet_net_lock(LNET_LOCK_EX);
 	for (cpt = 0; cpt < lncpt; cpt++) {
@@ -3851,8 +4136,10 @@ lnet_peer_ni_set_healthv(lnet_nid_t nid, int value, bool all)
 		list_for_each_entry(lpn, &lp->lp_peer_nets, lpn_peer_nets) {
 			list_for_each_entry(lpni, &lpn->lpn_peer_nis,
 					    lpni_peer_nis) {
-				atomic_set(&lpni->lpni_healthv, value);
-				lnet_peer_ni_add_to_recoveryq_locked(lpni);
+				lnet_set_lpni_healthv_locked(lpni,
+							     value);
+				lnet_peer_ni_add_to_recoveryq_locked(lpni,
+					&the_lnet.ln_mt_peerNIRecovq, now);
 			}
 		}
 	}
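
A central change in this patch is the switch of the peer NI reference count from a bare atomic_t (lpni_refcount) to a struct kref (lpni_kref), with lnet_destroy_peer_ni_locked() reworked to take a struct kref * and recover the peer NI via container_of(). The following is a minimal, self-contained sketch of that pattern. The demo_* names and the simplified structure are illustrative stand-ins rather than code from this patch, and the real addref/decref helpers live in the LNet headers, not in peer.c; this only shows how the kref primitives used above fit together.

#include <linux/kref.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/errno.h>

/* Simplified stand-in for struct lnet_peer_ni; only the embedded kref
 * matters for this sketch.
 */
struct demo_peer_ni {
	struct kref	lpni_kref;
	/* ... remaining peer NI state elided ... */
};

/* Release callback: kref_put() invokes this when the last reference is
 * dropped, mirroring how lnet_destroy_peer_ni_locked() now recovers the
 * peer NI from the embedded kref with container_of().
 */
static void demo_destroy_peer_ni(struct kref *ref)
{
	struct demo_peer_ni *lpni = container_of(ref, struct demo_peer_ni,
						 lpni_kref);

	kfree(lpni);
}

static int demo_peer_ni_lifecycle(void)
{
	struct demo_peer_ni *lpni;

	lpni = kzalloc(sizeof(*lpni), GFP_KERNEL);
	if (!lpni)
		return -ENOMEM;

	kref_init(&lpni->lpni_kref);	/* count starts at 1, as in lnet_peer_ni_alloc() */
	kref_get(&lpni->lpni_kref);	/* analogous to taking an extra peer NI ref */

	/* Each put is analogous to dropping a peer NI ref; the second put
	 * releases the final reference, so demo_destroy_peer_ni() runs and
	 * frees the structure.
	 */
	kref_put(&lpni->lpni_kref, demo_destroy_peer_ni);
	kref_put(&lpni->lpni_kref, demo_destroy_peer_ni);

	return 0;
}

In the same spirit, the debug and ioctl paths in the patch report the count with kref_read(&lpni->lpni_kref) where they previously used atomic_read(&lpni->lpni_refcount).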