X-Git-Url: https://git.whamcloud.com/?p=fs%2Flustre-release.git;a=blobdiff_plain;f=lnet%2Flnet%2Fpeer.c;h=e3eb262081f3bbf373225b0ebd6a5df7a6c6ef55;hp=3102e1d3e486ce7033fe23b2f8a359818e3a3ff1;hb=fd32cd817cba336c684fe3ab7aac79705061e8b5;hpb=9ab84fd822305a1afd5f69a1bc81b316a9a162e4

diff --git a/lnet/lnet/peer.c b/lnet/lnet/peer.c
index 3102e1d..e3eb262 100644
--- a/lnet/lnet/peer.c
+++ b/lnet/lnet/peer.c
@@ -40,6 +40,7 @@
 #endif
 
 #include <linux/uaccess.h>
 
+#include <lnet/udsp.h>
 #include <lnet/lib-lnet.h>
 #include <uapi/linux/lnet/lnet-dlc.h>
@@ -166,7 +167,10 @@ lnet_peer_ni_alloc(lnet_nid_t nid)
 	INIT_LIST_HEAD(&lpni->lpni_peer_nis);
 	INIT_LIST_HEAD(&lpni->lpni_recovery);
 	INIT_LIST_HEAD(&lpni->lpni_on_remote_peer_ni_list);
+	INIT_LIST_HEAD(&lpni->lpni_rtr_pref_nids);
 	LNetInvalidateMDHandle(&lpni->lpni_recovery_ping_mdh);
+	atomic_set(&lpni->lpni_refcount, 1);
+	lpni->lpni_sel_priority = LNET_MAX_SELECTION_PRIORITY;
 
 	spin_lock_init(&lpni->lpni_lock);
 
@@ -194,7 +198,7 @@ lnet_peer_ni_alloc(lnet_nid_t nid)
 	 * list so it can be easily found and revisited.
	 */
	/* FIXME: per-net implementation instead? */
-	atomic_inc(&lpni->lpni_refcount);
+	lnet_peer_ni_addref_locked(lpni);
	list_add_tail(&lpni->lpni_on_remote_peer_ni_list,
		      &the_lnet.ln_remote_peer_ni_list);
 }
@@ -216,6 +220,7 @@ lnet_peer_net_alloc(__u32 net_id)
	INIT_LIST_HEAD(&lpn->lpn_peer_nets);
	INIT_LIST_HEAD(&lpn->lpn_peer_nis);
	lpn->lpn_net_id = net_id;
+	lpn->lpn_sel_priority = LNET_MAX_SELECTION_PRIORITY;
 
	CDEBUG(D_NET, "%p net %s\n", lpn, libcfs_net2str(lpn->lpn_net_id));
 
@@ -258,6 +263,7 @@ lnet_peer_alloc(lnet_nid_t nid)
	init_waitqueue_head(&lp->lp_dc_waitq);
	spin_lock_init(&lp->lp_lock);
	lp->lp_primary_nid = nid;
+	lp->lp_disc_src_nid = LNET_NID_ANY;
	if (lnet_peers_start_down())
		lp->lp_alive = false;
	else
@@ -277,7 +283,7 @@ lnet_peer_alloc(lnet_nid_t nid)
	 * to ever use a different interface when sending messages to
	 * myself.
	 */
-	if (LNET_NETTYP(LNET_NIDNET(nid)) == LOLND)
+	if (nid == LNET_NID_LO_0)
		lp->lp_state = LNET_PEER_NO_DISCOVERY;
 
	lp->lp_cpt = lnet_nid_cpt_hash(nid, LNET_CPT_NUMBER);
@@ -453,6 +459,10 @@ lnet_peer_del_locked(struct lnet_peer *peer)
 
	CDEBUG(D_NET, "peer %s\n", libcfs_nid2str(peer->lp_primary_nid));
 
+	spin_lock(&peer->lp_lock);
+	peer->lp_state |= LNET_PEER_MARK_DELETED;
+	spin_unlock(&peer->lp_lock);
+
	lpni = lnet_get_next_peer_ni_locked(peer, NULL, lpni);
	while (lpni != NULL) {
		lpni2 = lnet_get_next_peer_ni_locked(peer, NULL, lpni);
@@ -465,9 +475,41 @@ lnet_peer_del_locked(struct lnet_peer *peer)
	return rc2;
 }
 
+/*
+ * Discovering this peer is taking too long. Cancel any Ping or Push
+ * that discovery is waiting on by unlinking the relevant MDs. The
+ * lnet_discovery_event_handler() will proceed from here and complete
+ * the cleanup.
+ */
+static void lnet_peer_cancel_discovery(struct lnet_peer *lp)
+{
+	struct lnet_handle_md ping_mdh;
+	struct lnet_handle_md push_mdh;
+
+	LNetInvalidateMDHandle(&ping_mdh);
+	LNetInvalidateMDHandle(&push_mdh);
+
+	spin_lock(&lp->lp_lock);
+	if (lp->lp_state & LNET_PEER_PING_SENT) {
+		ping_mdh = lp->lp_ping_mdh;
+		LNetInvalidateMDHandle(&lp->lp_ping_mdh);
+	}
+	if (lp->lp_state & LNET_PEER_PUSH_SENT) {
+		push_mdh = lp->lp_push_mdh;
+		LNetInvalidateMDHandle(&lp->lp_push_mdh);
+	}
+	spin_unlock(&lp->lp_lock);
+
+	if (!LNetMDHandleIsInvalid(ping_mdh))
+		LNetMDUnlink(ping_mdh);
+	if (!LNetMDHandleIsInvalid(push_mdh))
+		LNetMDUnlink(push_mdh);
+}
+
 static int
 lnet_peer_del(struct lnet_peer *peer)
 {
+	lnet_peer_cancel_discovery(peer);
	lnet_net_lock(LNET_LOCK_EX);
	lnet_peer_del_locked(peer);
	lnet_net_unlock(LNET_LOCK_EX);
@@ -603,7 +645,7 @@ lnet_peer_table_del_rtrs_locked(struct lnet_net *net,
			gw_nid = lp->lpni_peer_net->lpn_peer->lp_primary_nid;
 
			lnet_net_unlock(LNET_LOCK_EX);
-			lnet_del_route(LNET_NIDNET(LNET_NID_ANY), gw_nid);
+			lnet_del_route(LNET_NET_ANY, gw_nid);
			lnet_net_lock(LNET_LOCK_EX);
		}
	}
@@ -893,6 +935,94 @@ lnet_push_update_to_peers(int force)
	wake_up(&the_lnet.ln_dc_waitq);
 }
 
+/* find the NID in the preferred gateways for the remote peer
+ * return:
+ *	false: the list is empty or the NID is not in the list
+ *	true: the NID is found in the list
+ */
+bool
+lnet_peer_is_pref_rtr_locked(struct lnet_peer_ni *lpni,
+			     lnet_nid_t gw_nid)
+{
+	struct lnet_nid_list *ne;
+
+	CDEBUG(D_NET, "%s: rtr pref empty: %d\n",
+	       libcfs_nid2str(lpni->lpni_nid),
+	       list_empty(&lpni->lpni_rtr_pref_nids));
+
+	if (list_empty(&lpni->lpni_rtr_pref_nids))
+		return false;
+
+	/* iterate through all the preferred NIDs and see if any of them
+	 * matches the provided gw_nid
+	 */
+	list_for_each_entry(ne, &lpni->lpni_rtr_pref_nids, nl_list) {
+		CDEBUG(D_NET, "Comparing pref %s with gw %s\n",
+		       libcfs_nid2str(ne->nl_nid),
+		       libcfs_nid2str(gw_nid));
+		if (ne->nl_nid == gw_nid)
+			return true;
+	}
+
+	return false;
+}
+
+void
+lnet_peer_clr_pref_rtrs(struct lnet_peer_ni *lpni)
+{
+	struct list_head zombies;
+	struct lnet_nid_list *ne;
+	struct lnet_nid_list *tmp;
+	int cpt = lpni->lpni_cpt;
+
+	INIT_LIST_HEAD(&zombies);
+
+	lnet_net_lock(cpt);
+	list_splice_init(&lpni->lpni_rtr_pref_nids, &zombies);
+	lnet_net_unlock(cpt);
+
+	list_for_each_entry_safe(ne, tmp, &zombies, nl_list) {
+		list_del(&ne->nl_list);
+		LIBCFS_FREE(ne, sizeof(*ne));
+	}
+}
+
+int
+lnet_peer_add_pref_rtr(struct lnet_peer_ni *lpni,
+		       lnet_nid_t gw_nid)
+{
+	int cpt = lpni->lpni_cpt;
+	struct lnet_nid_list *ne = NULL;
+
+	/* This function is called with api_mutex held. When the api_mutex
+	 * is held the list cannot be modified, as it is only modified as
+	 * a result of applying a UDSP and that happens under api_mutex
+	 * lock.
+	 */
+	__must_hold(&the_lnet.ln_api_mutex);
+
+	list_for_each_entry(ne, &lpni->lpni_rtr_pref_nids, nl_list) {
+		if (ne->nl_nid == gw_nid)
+			return -EEXIST;
+	}
+
+	LIBCFS_CPT_ALLOC(ne, lnet_cpt_table(), cpt, sizeof(*ne));
+	if (!ne)
+		return -ENOMEM;
+
+	ne->nl_nid = gw_nid;
+
+	/* Lock the cpt to protect against addition and checks in the
+	 * selection algorithm
+	 */
+	lnet_net_lock(cpt);
+	list_add(&ne->nl_list, &lpni->lpni_rtr_pref_nids);
+	lnet_net_unlock(cpt);
+
+	return 0;
+}
+
 /*
  * Test whether a ni is a preferred ni for this peer_ni, e.g, whether
  * this is a preferred point-to-point path. Call with lnet_net_lock in
@@ -901,14 +1031,14 @@ lnet_push_update_to_peers(int force)
 bool
 lnet_peer_is_pref_nid_locked(struct lnet_peer_ni *lpni, lnet_nid_t nid)
 {
-	int i;
+	struct lnet_nid_list *ne;
 
	if (lpni->lpni_pref_nnids == 0)
		return false;
	if (lpni->lpni_pref_nnids == 1)
		return lpni->lpni_pref.nid == nid;
-	for (i = 0; i < lpni->lpni_pref_nnids; i++) {
-		if (lpni->lpni_pref.nids[i] == nid)
+	list_for_each_entry(ne, &lpni->lpni_pref.nids, nl_list) {
+		if (ne->nl_nid == nid)
			return true;
	}
	return false;
@@ -965,6 +1095,12 @@ lnet_peer_ni_clr_non_mr_pref_nid(struct lnet_peer_ni *lpni)
	return rc;
 }
 
+void
+lnet_peer_ni_set_selection_priority(struct lnet_peer_ni *lpni, __u32 priority)
+{
+	lpni->lpni_sel_priority = priority;
+}
+
 /*
  * Clear the preferred NIDs from a non-multi-rail peer.
  */
@@ -980,11 +1116,10 @@ lnet_peer_clr_non_mr_pref_nids(struct lnet_peer *lp)
 int
 lnet_peer_add_pref_nid(struct lnet_peer_ni *lpni, lnet_nid_t nid)
 {
-	lnet_nid_t *nids = NULL;
-	lnet_nid_t *oldnids = NULL;
	struct lnet_peer *lp = lpni->lpni_peer_net->lpn_peer;
-	int size;
-	int i;
+	struct lnet_nid_list *ne1 = NULL;
+	struct lnet_nid_list *ne2 = NULL;
+	lnet_nid_t tmp_nid = LNET_NID_ANY;
	int rc = 0;
 
	if (nid == LNET_NID_ANY) {
@@ -998,29 +1133,47 @@ lnet_peer_add_pref_nid(struct lnet_peer_ni *lpni, lnet_nid_t nid)
	}
 
	/* A non-MR node may have only one preferred NI per peer_ni */
-	if (lpni->lpni_pref_nnids > 0) {
-		if (!(lp->lp_state & LNET_PEER_MULTI_RAIL)) {
-			rc = -EPERM;
-			goto out;
-		}
+	if (lpni->lpni_pref_nnids > 0 &&
+	    !(lp->lp_state & LNET_PEER_MULTI_RAIL)) {
+		rc = -EPERM;
+		goto out;
	}
 
+	/* add the new preferred nid to the list of preferred nids */
	if (lpni->lpni_pref_nnids != 0) {
-		size = sizeof(*nids) * (lpni->lpni_pref_nnids + 1);
-		LIBCFS_CPT_ALLOC(nids, lnet_cpt_table(), lpni->lpni_cpt, size);
-		if (!nids) {
+		size_t alloc_size = sizeof(*ne1);
+
+		if (lpni->lpni_pref_nnids == 1) {
+			tmp_nid = lpni->lpni_pref.nid;
+			INIT_LIST_HEAD(&lpni->lpni_pref.nids);
+		}
+
+		list_for_each_entry(ne1, &lpni->lpni_pref.nids, nl_list) {
+			if (ne1->nl_nid == nid) {
+				rc = -EEXIST;
+				goto out;
+			}
+		}
+
+		LIBCFS_CPT_ALLOC(ne1, lnet_cpt_table(), lpni->lpni_cpt,
+				 alloc_size);
+		if (!ne1) {
			rc = -ENOMEM;
			goto out;
		}
-		for (i = 0; i < lpni->lpni_pref_nnids; i++) {
-			if (lpni->lpni_pref.nids[i] == nid) {
-				LIBCFS_FREE(nids, size);
-				rc = -EEXIST;
+
+		/* move the originally stored nid to the list */
+		if (lpni->lpni_pref_nnids == 1) {
+			LIBCFS_CPT_ALLOC(ne2, lnet_cpt_table(),
+					 lpni->lpni_cpt, alloc_size);
+			if (!ne2) {
+				rc = -ENOMEM;
				goto out;
			}
-			nids[i] = lpni->lpni_pref.nids[i];
+			INIT_LIST_HEAD(&ne2->nl_list);
+			ne2->nl_nid = tmp_nid;
		}
-		nids[i] = nid;
+
+		ne1->nl_nid = nid;
	}
 
	lnet_net_lock(LNET_LOCK_EX);
@@ -1028,18 +1181,15 @@ lnet_peer_add_pref_nid(struct lnet_peer_ni *lpni, lnet_nid_t nid)
	if (lpni->lpni_pref_nnids == 0) {
		lpni->lpni_pref.nid = nid;
	} else {
-		oldnids = lpni->lpni_pref.nids;
-		lpni->lpni_pref.nids = nids;
+		if (ne2)
+			list_add_tail(&ne2->nl_list, &lpni->lpni_pref.nids);
+		list_add_tail(&ne1->nl_list, &lpni->lpni_pref.nids);
	}
	lpni->lpni_pref_nnids++;
	lpni->lpni_state &= ~LNET_PEER_NI_NON_MR_PREF;
	spin_unlock(&lpni->lpni_lock);
	lnet_net_unlock(LNET_LOCK_EX);
 
-	if (oldnids) {
-		size = sizeof(*nids) * (lpni->lpni_pref_nnids - 1);
-		CFS_FREE_PTR_ARRAY(oldnids, size);
-	}
 out:
	if (rc == -EEXIST && (lpni->lpni_state & LNET_PEER_NI_NON_MR_PREF)) {
		spin_lock(&lpni->lpni_lock);
@@ -1054,11 +1204,8 @@ out:
 int
 lnet_peer_del_pref_nid(struct lnet_peer_ni *lpni, lnet_nid_t nid)
 {
-	lnet_nid_t *nids = NULL;
-	lnet_nid_t *oldnids = NULL;
	struct lnet_peer *lp = lpni->lpni_peer_net->lpn_peer;
-	int size;
-	int i, j;
+	struct lnet_nid_list *ne = NULL;
	int rc = 0;
 
	if (lpni->lpni_pref_nnids == 0) {
@@ -1071,61 +1218,71 @@ lnet_peer_del_pref_nid(struct lnet_peer_ni *lpni, lnet_nid_t nid)
			rc = -ENOENT;
			goto out;
		}
-	} else if (lpni->lpni_pref_nnids == 2) {
-		if (lpni->lpni_pref.nids[0] != nid &&
-		    lpni->lpni_pref.nids[1] != nid) {
-			rc = -ENOENT;
-			goto out;
-		}
	} else {
-		size = sizeof(*nids) * (lpni->lpni_pref_nnids - 1);
-		LIBCFS_CPT_ALLOC(nids, lnet_cpt_table(), lpni->lpni_cpt, size);
-		if (!nids) {
-			rc = -ENOMEM;
-			goto out;
-		}
-		for (i = 0, j = 0; i < lpni->lpni_pref_nnids; i++) {
-			if (lpni->lpni_pref.nids[i] == nid)
-				continue;
-			nids[j++] = lpni->lpni_pref.nids[i];
-		}
-		/* Check if we actually removed a nid. */
-		if (j == lpni->lpni_pref_nnids) {
-			LIBCFS_FREE(nids, size);
-			rc = -ENOENT;
-			goto out;
+		list_for_each_entry(ne, &lpni->lpni_pref.nids, nl_list) {
+			if (ne->nl_nid == nid)
+				goto remove_nid_entry;
		}
+		rc = -ENOENT;
+		ne = NULL;
+		goto out;
	}
 
+remove_nid_entry:
	lnet_net_lock(LNET_LOCK_EX);
	spin_lock(&lpni->lpni_lock);
-	if (lpni->lpni_pref_nnids == 1) {
+	if (lpni->lpni_pref_nnids == 1)
		lpni->lpni_pref.nid = LNET_NID_ANY;
-	} else if (lpni->lpni_pref_nnids == 2) {
-		oldnids = lpni->lpni_pref.nids;
-		if (oldnids[0] == nid)
-			lpni->lpni_pref.nid = oldnids[1];
-		else
-			lpni->lpni_pref.nid = oldnids[0];
-	} else {
-		oldnids = lpni->lpni_pref.nids;
-		lpni->lpni_pref.nids = nids;
+	else {
+		list_del_init(&ne->nl_list);
+		if (lpni->lpni_pref_nnids == 2) {
+			struct lnet_nid_list *ne, *tmp;
+
+			list_for_each_entry_safe(ne, tmp,
+						 &lpni->lpni_pref.nids,
+						 nl_list) {
+				lpni->lpni_pref.nid = ne->nl_nid;
+				list_del_init(&ne->nl_list);
+				LIBCFS_FREE(ne, sizeof(*ne));
+			}
+		}
	}
	lpni->lpni_pref_nnids--;
	lpni->lpni_state &= ~LNET_PEER_NI_NON_MR_PREF;
	spin_unlock(&lpni->lpni_lock);
	lnet_net_unlock(LNET_LOCK_EX);
 
-	if (oldnids) {
-		size = sizeof(*nids) * (lpni->lpni_pref_nnids + 1);
-		CFS_FREE_PTR_ARRAY(oldnids, size);
-	}
+	if (ne)
+		LIBCFS_FREE(ne, sizeof(*ne));
 out:
	CDEBUG(D_NET, "peer %s nid %s: %d\n",
	       libcfs_nid2str(lp->lp_primary_nid), libcfs_nid2str(nid), rc);
	return rc;
 }
 
+void
+lnet_peer_clr_pref_nids(struct lnet_peer_ni *lpni)
+{
+	struct list_head zombies;
+	struct lnet_nid_list *ne;
+	struct lnet_nid_list *tmp;
+
+	INIT_LIST_HEAD(&zombies);
+
+	lnet_net_lock(LNET_LOCK_EX);
+	if (lpni->lpni_pref_nnids == 1)
+		lpni->lpni_pref.nid = LNET_NID_ANY;
+	else if (lpni->lpni_pref_nnids > 1)
+		list_splice_init(&lpni->lpni_pref.nids, &zombies);
+	lpni->lpni_pref_nnids = 0;
+	lnet_net_unlock(LNET_LOCK_EX);
+
+	list_for_each_entry_safe(ne, tmp, &zombies, nl_list) {
+		list_del_init(&ne->nl_list);
+		LIBCFS_FREE(ne, sizeof(*ne));
+	}
+}
+
 lnet_nid_t
 lnet_peer_primary_nid_locked(lnet_nid_t nid)
 {
@@ -1180,6 +1337,9 @@ LNetPrimaryNID(lnet_nid_t nid)
	int rc = 0;
	int cpt;
 
+	if (nid == LNET_NID_LO_0)
+		return LNET_NID_LO_0;
+
	cpt = lnet_net_lock_current();
	lpni = lnet_nid2peerni_locked(nid, LNET_NID_ANY, cpt);
	if (IS_ERR(lpni)) {
@@ -1189,6 +1349,11 @@ LNetPrimaryNID(lnet_nid_t nid)
	lp = lpni->lpni_peer_net->lpn_peer;
 
	while (!lnet_peer_is_uptodate(lp)) {
+		spin_lock(&lp->lp_lock);
+		/* force a full discovery cycle */
+		lp->lp_state |= LNET_PEER_FORCE_PING | LNET_PEER_FORCE_PUSH;
+		spin_unlock(&lp->lp_lock);
+
		rc = lnet_discover_peer_locked(lpni, cpt, true);
		if (rc)
			goto out_decref;
@@ -1227,9 +1392,9 @@ lnet_peer_get_net_locked(struct lnet_peer *peer, __u32 net_id)
  * may be attached to a different peer, in which case it will be
  * properly detached first. The whole operation is done atomically.
  *
- * Always returns 0.  This is the last function called from functions
- * that do return an int, so returning 0 here allows the compiler to
- * do a tail call.
+ * This function consumes the reference on lpni and always returns 0.
+ * This is the last function called from functions that do return an
+ * int, so returning 0 here allows the compiler to do a tail call.
  */
 static int
 lnet_peer_attach_peer_ni(struct lnet_peer *lp,
@@ -1238,6 +1403,8 @@ lnet_peer_attach_peer_ni(struct lnet_peer *lp,
			 unsigned flags)
 {
	struct lnet_peer_table *ptable;
+	bool new_lpn = false;
+	int rc;
 
	/* Install the new peer_ni */
	lnet_net_lock(LNET_LOCK_EX);
@@ -1248,8 +1415,7 @@ lnet_peer_attach_peer_ni(struct lnet_peer *lp,
		ptable = the_lnet.ln_peer_tables[lpni->lpni_cpt];
		list_add_tail(&lpni->lpni_hashlist, &ptable->pt_hash[hash]);
		ptable->pt_version++;
-		/* This is the 1st refcount on lpni. */
-		atomic_inc(&lpni->lpni_refcount);
+		lnet_peer_ni_addref_locked(lpni);
	}
 
	/* Detach the peer_ni from an existing peer, if necessary. */
@@ -1269,6 +1435,7 @@ lnet_peer_attach_peer_ni(struct lnet_peer *lp,
 
	/* Add peer_net to peer */
	if (!lpn->lpn_peer) {
+		new_lpn = true;
		lpn->lpn_peer = lp;
		list_add_tail(&lpn->lpn_peer_nets, &lp->lp_peer_nets);
		lnet_peer_addref_locked(lp);
@@ -1297,11 +1464,24 @@ lnet_peer_attach_peer_ni(struct lnet_peer *lp,
	spin_unlock(&lp->lp_lock);
 
	lp->lp_nnis++;
-	lnet_net_unlock(LNET_LOCK_EX);
+
+	/* apply UDSPs */
+	if (new_lpn) {
+		rc = lnet_udsp_apply_policies_on_lpn(lpn);
+		if (rc)
+			CERROR("Failed to apply UDSPs on lpn %s\n",
+			       libcfs_net2str(lpn->lpn_net_id));
+	}
+	rc = lnet_udsp_apply_policies_on_lpni(lpni);
+	if (rc)
+		CERROR("Failed to apply UDSPs on lpni %s\n",
+		       libcfs_nid2str(lpni->lpni_nid));
 
	CDEBUG(D_NET, "peer %s NID %s flags %#x\n",
	       libcfs_nid2str(lp->lp_primary_nid),
	       libcfs_nid2str(lpni->lpni_nid), flags);
+	lnet_peer_ni_decref_locked(lpni);
+	lnet_net_unlock(LNET_LOCK_EX);
 
	return 0;
 }
@@ -1422,17 +1602,16 @@ lnet_peer_add_nid(struct lnet_peer *lp, lnet_nid_t nid, unsigned flags)
		 * it is not connected to this peer and was configured
		 * by DLC.
		 */
-		lnet_peer_ni_decref_locked(lpni);
		if (lpni->lpni_peer_net->lpn_peer == lp)
-			goto out;
+			goto out_free_lpni;
		if (lnet_peer_ni_is_configured(lpni)) {
			rc = -EEXIST;
-			goto out;
+			goto out_free_lpni;
		}
		/* If this is the primary NID, destroy the peer. */
		if (lnet_peer_ni_is_primary(lpni)) {
			struct lnet_peer *rtr_lp =
-					lpni->lpni_peer_net->lpn_peer;
+				lpni->lpni_peer_net->lpn_peer;
			int rtr_refcount = rtr_lp->lp_rtr_refcount;
			/*
			 * if we're trying to delete a router it means
			 * we're moving this peer NI to a new peer so must
			 * transfer router properties to the new peer
			 */
			if (rtr_refcount > 0) {
				flags |= LNET_PEER_RTR_NI_FORCE_DEL;
				lnet_rtr_transfer_to_peer(rtr_lp, lp);
			}
			lnet_peer_del(lpni->lpni_peer_net->lpn_peer);
+			lnet_peer_ni_decref_locked(lpni);
			lpni = lnet_peer_ni_alloc(nid);
			if (!lpni) {
				rc = -ENOMEM;
-				goto out;
+				goto out_free_lpni;
			}
		}
	} else {
		lpni = lnet_peer_ni_alloc(nid);
		if (!lpni) {
			rc = -ENOMEM;
-			goto out;
+			goto out_free_lpni;
		}
	}
 
@@ -1477,9 +1657,7 @@ lnet_peer_add_nid(struct lnet_peer *lp, lnet_nid_t nid, unsigned flags)
	return lnet_peer_attach_peer_ni(lp, lpn, lpni, flags);
 
 out_free_lpni:
-	/* If the peer_ni was allocated above its peer_net pointer is NULL */
-	if (!lpni->lpni_peer_net)
-		LIBCFS_FREE(lpni, sizeof(*lpni));
+	lnet_peer_ni_decref_locked(lpni);
 out:
	CDEBUG(D_NET, "peer %s NID %s flags %#x: %d\n",
	       libcfs_nid2str(lp->lp_primary_nid), libcfs_nid2str(nid),
@@ -1702,19 +1880,28 @@ lnet_destroy_peer_ni_locked(struct lnet_peer_ni *lpni)
	lpni->lpni_peer_net = NULL;
	lpni->lpni_net = NULL;
 
-	/* remove the peer ni from the zombie list */
-	ptable = the_lnet.ln_peer_tables[lpni->lpni_cpt];
-	spin_lock(&ptable->pt_zombie_lock);
-	list_del_init(&lpni->lpni_hashlist);
-	ptable->pt_zombies--;
-	spin_unlock(&ptable->pt_zombie_lock);
+	if (!list_empty(&lpni->lpni_hashlist)) {
+		/* remove the peer ni from the zombie list */
+		ptable = the_lnet.ln_peer_tables[lpni->lpni_cpt];
+		spin_lock(&ptable->pt_zombie_lock);
+		list_del_init(&lpni->lpni_hashlist);
+		ptable->pt_zombies--;
+		spin_unlock(&ptable->pt_zombie_lock);
+	}
 
-	if (lpni->lpni_pref_nnids > 1)
-		CFS_FREE_PTR_ARRAY(lpni->lpni_pref.nids, lpni->lpni_pref_nnids);
+	if (lpni->lpni_pref_nnids > 1) {
+		struct lnet_nid_list *ne, *tmp;
 
+		list_for_each_entry_safe(ne, tmp, &lpni->lpni_pref.nids,
+					 nl_list) {
+			list_del_init(&ne->nl_list);
+			LIBCFS_FREE(ne, sizeof(*ne));
+		}
+	}
	LIBCFS_FREE(lpni, sizeof(*lpni));
 
-	lnet_peer_net_decref_locked(lpn);
+	if (lpn)
+		lnet_peer_net_decref_locked(lpn);
 }
 
 struct lnet_peer_ni *
@@ -1961,7 +2148,7 @@ void lnet_peer_push_event(struct lnet_event *ev)
	struct lnet_ping_buffer *pbuf;
	struct lnet_peer *lp;
 
-	pbuf = LNET_PING_INFO_TO_BUFFER(ev->md.start + ev->offset);
+	pbuf = LNET_PING_INFO_TO_BUFFER(ev->md_start + ev->offset);
 
	/* lnet_find_peer() adds a refcount */
	lp = lnet_find_peer(ev->source.nid);
@@ -2027,6 +2214,18 @@ void lnet_peer_push_event(struct lnet_event *ev)
	if (!(pbuf->pb_info.pi_features & LNET_PING_FEAT_DISCOVERY)) {
		CDEBUG(D_NET, "Peer %s has discovery disabled\n",
		       libcfs_nid2str(lp->lp_primary_nid));
+		/*
+		 * Mark the peer for deletion if we already know about it
+		 * and it's going from discovery set to no discovery set
+		 */
+		if (!(lp->lp_state & (LNET_PEER_NO_DISCOVERY |
+				      LNET_PEER_DISCOVERING)) &&
+		    lp->lp_state & LNET_PEER_DISCOVERED) {
+			CDEBUG(D_NET, "Marking %s:0x%x for deletion\n",
+			       libcfs_nid2str(lp->lp_primary_nid),
+			       lp->lp_state);
+			lp->lp_state |= LNET_PEER_MARK_DELETION;
+		}
		lp->lp_state |= LNET_PEER_NO_DISCOVERY;
	} else if (lp->lp_state & LNET_PEER_NO_DISCOVERY) {
		CDEBUG(D_NET, "Peer %s has discovery enabled\n",
@@ -2257,7 +2456,7 @@ lnet_discovery_event_ack(struct lnet_peer *lp, struct lnet_event *ev)
 {
	struct lnet_ping_buffer *pbuf;
 
-	pbuf = LNET_PING_INFO_TO_BUFFER(ev->md.start);
+	pbuf = LNET_PING_INFO_TO_BUFFER(ev->md_start);
	spin_lock(&lp->lp_lock);
	lp->lp_state &= ~LNET_PEER_PUSH_SENT;
	lp->lp_push_error = ev->status;
@@ -2280,6 +2479,8 @@ lnet_discovery_event_reply(struct lnet_peer *lp, struct lnet_event *ev)
 
	spin_lock(&lp->lp_lock);
 
+	lp->lp_disc_src_nid = ev->target.nid;
+
	/*
	 * If some kind of error happened the contents of message
	 * cannot be used. Set PING_FAILED to trigger a retry.
@@ -2294,7 +2495,7 @@ lnet_discovery_event_reply(struct lnet_peer *lp, struct lnet_event *ev)
		goto out;
	}
 
-	pbuf = LNET_PING_INFO_TO_BUFFER(ev->md.start);
+	pbuf = LNET_PING_INFO_TO_BUFFER(ev->md_start);
	if (pbuf->pb_info.pi_magic == __swab32(LNET_PROTO_PING_MAGIC))
		lnet_swap_pinginfo(pbuf);
 
@@ -2495,7 +2696,7 @@ lnet_discovery_event_unlink(struct lnet_peer *lp, struct lnet_event *ev)
  */
 static void lnet_discovery_event_handler(struct lnet_event *event)
 {
-	struct lnet_peer *lp = event->md.user_ptr;
+	struct lnet_peer *lp = event->md_user_ptr;
	struct lnet_ping_buffer *pbuf;
	int rc;
 
@@ -2525,7 +2726,7 @@ static void lnet_discovery_event_handler(struct lnet_event *event)
	}
	lnet_net_lock(LNET_LOCK_EX);
	if (event->unlinked) {
-		pbuf = LNET_PING_INFO_TO_BUFFER(event->md.start);
+		pbuf = LNET_PING_INFO_TO_BUFFER(event->md_start);
		lnet_ping_buffer_decref(pbuf);
		lnet_peer_decref_locked(lp);
	}
@@ -2624,7 +2825,7 @@ static int lnet_peer_merge_data(struct lnet_peer *lp,
	 * present in curnis[] then this peer is for this node.
	 */
	for (i = 0; i < ncurnis; i++) {
-		if (LNET_NETTYP(LNET_NIDNET(curnis[i])) == LOLND)
+		if (curnis[i] == LNET_NID_LO_0)
			continue;
		for (j = 1; j < pbuf->pb_info.pi_nnis; j++) {
			if (curnis[i] == pbuf->pb_info.pi_ni[j].ns_nid) {
@@ -2784,6 +2985,72 @@ static bool lnet_is_nid_in_ping_info(lnet_nid_t nid, struct lnet_ping_info *pinf
	return false;
 }
 
+/* Delete a peer that has been marked for deletion. NB: when this peer was added
+ * to the discovery queue a reference was taken that will prevent the peer from
+ * actually being freed by this function. After this function exits the
+ * discovery thread should call lnet_peer_discovery_complete() which will
+ * drop that reference as well as wake any waiters that may also be holding a
+ * ref on the peer.
+ */
+static int lnet_peer_deletion(struct lnet_peer *lp)
+__must_hold(&lp->lp_lock)
+{
+	struct list_head rlist;
+	struct lnet_route *route, *tmp;
+	int sensitivity = lp->lp_health_sensitivity;
+
+	INIT_LIST_HEAD(&rlist);
+
+	lp->lp_state &= ~(LNET_PEER_DISCOVERING | LNET_PEER_FORCE_PING |
+			  LNET_PEER_FORCE_PUSH);
+	CDEBUG(D_NET, "peer %s(%p) state %#x\n",
+	       libcfs_nid2str(lp->lp_primary_nid), lp, lp->lp_state);
+
+	/* no-op if lnet_peer_del() has already been called on this peer */
+	if (lp->lp_state & LNET_PEER_MARK_DELETED)
+		return 0;
+
+	if (the_lnet.ln_dc_state != LNET_DC_STATE_RUNNING)
+		return -ESHUTDOWN;
+
+	spin_unlock(&lp->lp_lock);
+
+	mutex_lock(&the_lnet.ln_api_mutex);
+
+	lnet_net_lock(LNET_LOCK_EX);
+	/* remove the peer from the discovery work
+	 * queue if it's on there in preparation
+	 * of deleting it.
+ */ + if (!list_empty(&lp->lp_dc_list)) + list_del(&lp->lp_dc_list); + list_for_each_entry_safe(route, tmp, + &lp->lp_routes, + lr_gwlist) + lnet_move_route(route, NULL, &rlist); + lnet_net_unlock(LNET_LOCK_EX); + + /* lnet_peer_del() deletes all the peer NIs owned by this peer */ + lnet_peer_del(lp); + + list_for_each_entry_safe(route, tmp, + &rlist, lr_list) { + /* re-add these routes */ + lnet_add_route(route->lr_net, + route->lr_hops, + route->lr_nid, + route->lr_priority, + sensitivity); + LIBCFS_FREE(route, sizeof(*route)); + } + + mutex_unlock(&the_lnet.ln_api_mutex); + + spin_lock(&lp->lp_lock); + + return 0; +} + /* * Update a peer using the data received. */ @@ -2847,7 +3114,7 @@ __must_hold(&lp->lp_lock) if (pbuf->pb_info.pi_nnis <= 1) goto out; nid = pbuf->pb_info.pi_ni[1].ns_nid; - if (LNET_NETTYP(LNET_NIDNET(lp->lp_primary_nid)) == LOLND) { + if (lp->lp_primary_nid == LNET_NID_LO_0) { rc = lnet_peer_set_primary_nid(lp, nid, flags); if (!rc) rc = lnet_peer_merge_data(lp, pbuf); @@ -2985,7 +3252,7 @@ __must_hold(&lp->lp_lock) nnis = max(lp->lp_data_nnis, LNET_INTERFACES_MIN); rc = lnet_send_ping(pnid, &lp->lp_ping_mdh, nnis, lp, - the_lnet.ln_dc_eq, false); + the_lnet.ln_dc_handler, false); /* * if LNetMDBind in lnet_send_ping fails we need to decrement the @@ -3046,6 +3313,23 @@ __must_hold(&lp->lp_lock) return rc ? rc : LNET_REDISCOVER_PEER; } +/* + * Mark the peer as discovered. + */ +static int lnet_peer_discovered(struct lnet_peer *lp) +__must_hold(&lp->lp_lock) +{ + lp->lp_state |= LNET_PEER_DISCOVERED; + lp->lp_state &= ~(LNET_PEER_DISCOVERING | + LNET_PEER_REDISCOVER); + + lp->lp_dc_error = 0; + + CDEBUG(D_NET, "peer %s\n", libcfs_nid2str(lp->lp_primary_nid)); + + return 0; +} + /* Active side of push. */ static int lnet_peer_send_push(struct lnet_peer *lp) __must_hold(&lp->lp_lock) @@ -3059,6 +3343,12 @@ __must_hold(&lp->lp_lock) /* Don't push to a non-multi-rail peer. */ if (!(lp->lp_state & LNET_PEER_MULTI_RAIL)) { lp->lp_state &= ~LNET_PEER_FORCE_PUSH; + /* if peer's NIDs are uptodate then peer is discovered */ + if (lp->lp_state & LNET_PEER_NIDS_UPTODATE) { + rc = lnet_peer_discovered(lp); + return rc; + } + return 0; } @@ -3076,11 +3366,11 @@ __must_hold(&lp->lp_lock) md.length = LNET_PING_INFO_SIZE(pbuf->pb_nnis); md.threshold = 2; /* Put/Ack */ md.max_size = 0; - md.options = 0; - md.eq_handle = the_lnet.ln_dc_eq; + md.options = LNET_MD_TRACK_RESPONSE; + md.handler = the_lnet.ln_dc_handler; md.user_ptr = lp; - rc = LNetMDBind(md, LNET_UNLINK, &lp->lp_push_mdh); + rc = LNetMDBind(&md, LNET_UNLINK, &lp->lp_push_mdh); if (rc) { lnet_ping_buffer_decref(pbuf); CERROR("Can't bind push source MD: %d\n", rc); @@ -3098,10 +3388,18 @@ __must_hold(&lp->lp_lock) goto fail_unlink; } - rc = LNetPut(LNET_NID_ANY, lp->lp_push_mdh, + rc = LNetPut(lp->lp_disc_src_nid, lp->lp_push_mdh, LNET_ACK_REQ, id, LNET_RESERVED_PORTAL, LNET_PROTO_PING_MATCHBITS, 0, 0); + /* + * reset the discovery nid. There is no need to restrict sending + * from that source, if we call lnet_push_update_to_peers(). It'll + * get set to a specific NID, if we initiate discovery from the + * scratch + */ + lp->lp_disc_src_nid = LNET_NID_ANY; + if (rc) goto fail_unlink; @@ -3144,53 +3442,6 @@ static void lnet_peer_discovery_error(struct lnet_peer *lp, int error) } /* - * Mark the peer as discovered. 
- */ -static int lnet_peer_discovered(struct lnet_peer *lp) -__must_hold(&lp->lp_lock) -{ - lp->lp_state |= LNET_PEER_DISCOVERED; - lp->lp_state &= ~(LNET_PEER_DISCOVERING | - LNET_PEER_REDISCOVER); - - CDEBUG(D_NET, "peer %s\n", libcfs_nid2str(lp->lp_primary_nid)); - - return 0; -} - - -/* - * Discovering this peer is taking too long. Cancel any Ping or Push - * that discovery is waiting on by unlinking the relevant MDs. The - * lnet_discovery_event_handler() will proceed from here and complete - * the cleanup. - */ -static void lnet_peer_cancel_discovery(struct lnet_peer *lp) -{ - struct lnet_handle_md ping_mdh; - struct lnet_handle_md push_mdh; - - LNetInvalidateMDHandle(&ping_mdh); - LNetInvalidateMDHandle(&push_mdh); - - spin_lock(&lp->lp_lock); - if (lp->lp_state & LNET_PEER_PING_SENT) { - ping_mdh = lp->lp_ping_mdh; - LNetInvalidateMDHandle(&lp->lp_ping_mdh); - } - if (lp->lp_state & LNET_PEER_PUSH_SENT) { - push_mdh = lp->lp_push_mdh; - LNetInvalidateMDHandle(&lp->lp_push_mdh); - } - spin_unlock(&lp->lp_lock); - - if (!LNetMDHandleIsInvalid(ping_mdh)) - LNetMDUnlink(ping_mdh); - if (!LNetMDHandleIsInvalid(push_mdh)) - LNetMDUnlink(push_mdh); -} - -/* * Wait for work to be queued or some other change that must be * attended to. Returns non-zero if the discovery thread should shut * down. @@ -3346,7 +3597,10 @@ static int lnet_peer_discovery(void *arg) CDEBUG(D_NET, "peer %s(%p) state %#x\n", libcfs_nid2str(lp->lp_primary_nid), lp, lp->lp_state); - if (lp->lp_state & LNET_PEER_DATA_PRESENT) + if (lp->lp_state & (LNET_PEER_MARK_DELETION | + LNET_PEER_MARK_DELETED)) + rc = lnet_peer_deletion(lp); + else if (lp->lp_state & LNET_PEER_DATA_PRESENT) rc = lnet_peer_data_present(lp); else if (lp->lp_state & LNET_PEER_PING_FAILED) rc = lnet_peer_ping_failed(lp); @@ -3378,6 +3632,7 @@ static int lnet_peer_discovery(void *arg) lnet_peer_discovery_complete(lp); if (the_lnet.ln_dc_state == LNET_DC_STATE_STOPPING) break; + } lnet_net_unlock(LNET_LOCK_EX); @@ -3417,8 +3672,8 @@ static int lnet_peer_discovery(void *arg) } lnet_net_unlock(LNET_LOCK_EX); - LNetEQFree(the_lnet.ln_dc_eq); - the_lnet.ln_dc_eq = NULL; + lnet_assert_handler_unused(the_lnet.ln_dc_handler); + the_lnet.ln_dc_handler = NULL; the_lnet.ln_dc_state = LNET_DC_STATE_SHUTDOWN; wake_up(&the_lnet.ln_dc_waitq); @@ -3437,21 +3692,14 @@ int lnet_peer_discovery_start(void) if (the_lnet.ln_dc_state != LNET_DC_STATE_SHUTDOWN) return -EALREADY; - the_lnet.ln_dc_eq = LNetEQAlloc(lnet_discovery_event_handler); - if (IS_ERR(the_lnet.ln_dc_eq)) { - rc = PTR_ERR(the_lnet.ln_dc_eq); - CERROR("Can't allocate discovery EQ: %d\n", rc); - return rc; - } - + the_lnet.ln_dc_handler = lnet_discovery_event_handler; the_lnet.ln_dc_state = LNET_DC_STATE_RUNNING; task = kthread_run(lnet_peer_discovery, NULL, "lnet_discovery"); if (IS_ERR(task)) { rc = PTR_ERR(task); CERROR("Can't start peer discovery thread: %d\n", rc); - LNetEQFree(the_lnet.ln_dc_eq); - the_lnet.ln_dc_eq = NULL; + the_lnet.ln_dc_handler = NULL; the_lnet.ln_dc_state = LNET_DC_STATE_SHUTDOWN; }