From: Amir Shehata Date: Mon, 4 Jul 2016 21:51:06 +0000 (-0700) Subject: LU-7734 lnet: Routing fixes part 1 X-Git-Tag: 2.9.53~47^2~14 X-Git-Url: https://git.whamcloud.com/?p=fs%2Flustre-release.git;a=commitdiff_plain;h=376633ab5c487a2e9497e118ce351c4b1597bf33 LU-7734 lnet: Routing fixes part 1 This is the first part of a routing fix. - Fix crash in lnet_parse_get() - Resolve deadlock when adding a route. - Fix an issue with dynamically turning on routing - Set the final destination NID properly when routing a msg Signed-off-by: Amir Shehata Change-Id: I68d0e4d52192aa96e37c77952a1ebe75c1b770c5 Reviewed-on: http://review.whamcloud.com/21166 --- diff --git a/lnet/include/lnet/lib-lnet.h b/lnet/include/lnet/lib-lnet.h index 97ca7a4..ba98de0 100644 --- a/lnet/include/lnet/lib-lnet.h +++ b/lnet/include/lnet/lib-lnet.h @@ -796,6 +796,7 @@ struct lnet_peer_ni *lnet_get_next_peer_ni_locked(struct lnet_peer *peer, struct lnet_peer_ni *prev); struct lnet_peer *lnet_find_or_create_peer_locked(lnet_nid_t dst_nid, int cpt); struct lnet_peer_ni *lnet_nid2peerni_locked(lnet_nid_t nid, int cpt); +struct lnet_peer_ni *lnet_nid2peerni_ex(lnet_nid_t nid, int cpt); struct lnet_peer_ni *lnet_find_peer_ni_locked(lnet_nid_t nid); void lnet_peer_net_added(struct lnet_net *net); lnet_nid_t lnet_peer_primary_nid(lnet_nid_t nid); diff --git a/lnet/lnet/lib-move.c b/lnet/lnet/lib-move.c index ec59069..c98ec94 100644 --- a/lnet/lnet/lib-move.c +++ b/lnet/lnet/lib-move.c @@ -1744,16 +1744,9 @@ send: lnet_ni_addref_locked(msg->msg_txni, cpt); /* - * set the destination nid in the message here because it's - * possible that we'd be sending to a different nid than the one - * originaly given. - */ - msg->msg_hdr.dest_nid = cpu_to_le64(msg->msg_txpeer->lpni_nid); - - /* * Always set the target.nid to the best peer picked. Either the * nid will be one of the preconfigured NIDs, or the same NID as - * what was originaly set in the target or it will be the NID of + * what was originally set in the target or it will be the NID of * a router if this message should be routed */ msg->msg_target.nid = msg->msg_txpeer->lpni_nid; @@ -1776,6 +1769,19 @@ send: if (routing) { msg->msg_target_is_router = 1; msg->msg_target.pid = LNET_PID_LUSTRE; + /* + * since we're routing we want to ensure that the + * msg_hdr.dest_nid is set to the final destination. When + * the router receives this message it knows how to route + * it. + */ + msg->msg_hdr.dest_nid = cpu_to_le64(dst_nid); + } else { + /* + * if we're not routing set the dest_nid to the best peer + * ni that we picked earlier in the algorithm. + */ + msg->msg_hdr.dest_nid = cpu_to_le64(msg->msg_txpeer->lpni_nid); } rc = lnet_post_send_locked(msg, 0); @@ -1931,6 +1937,7 @@ lnet_parse_get(lnet_ni_t *ni, lnet_msg_t *msg, int rdma_get) info.mi_rlength = hdr->msg.get.sink_length; info.mi_roffset = hdr->msg.get.src_offset; info.mi_mbits = hdr->msg.get.match_bits; + info.mi_cpt = lnet_cpt_of_nid(msg->msg_rxpeer->lpni_nid, ni); rc = lnet_ptl_match_md(&info, msg); if (rc == LNET_MATCHMD_DROP) { diff --git a/lnet/lnet/peer.c b/lnet/lnet/peer.c index 4713fb9..aa85c5e 100644 --- a/lnet/lnet/peer.c +++ b/lnet/lnet/peer.c @@ -949,35 +949,73 @@ lnet_destroy_peer_ni_locked(struct lnet_peer_ni *lpni) } struct lnet_peer_ni * +lnet_nid2peerni_ex(lnet_nid_t nid, int cpt) +{ + struct lnet_peer_ni *lpni = NULL; + int rc; + + if (the_lnet.ln_shutdown) /* it's shutting down */ + return ERR_PTR(-ESHUTDOWN); + + /* + * find if a peer_ni already exists. + * If so then just return that. + */ + lpni = lnet_find_peer_ni_locked(nid); + if (lpni) + return lpni; + + lnet_net_unlock(cpt); + + rc = lnet_peer_ni_traffic_add(nid); + if (rc) { + lpni = ERR_PTR(rc); + goto out_net_relock; + } + + lpni = lnet_find_peer_ni_locked(nid); + LASSERT(lpni); + +out_net_relock: + lnet_net_lock(cpt); + + return lpni; +} + +struct lnet_peer_ni * lnet_nid2peerni_locked(lnet_nid_t nid, int cpt) { - struct lnet_peer_table *ptable; - struct lnet_peer_ni *lpni = NULL; - int cpt2; - int rc; + struct lnet_peer_ni *lpni = NULL; + int rc; if (the_lnet.ln_shutdown) /* it's shutting down */ return ERR_PTR(-ESHUTDOWN); /* - * calculate cpt2 with the standard hash function - * This cpt2 is the slot where we'll find or create the peer. + * find if a peer_ni already exists. + * If so then just return that. */ - cpt2 = lnet_nid_cpt_hash(nid, LNET_CPT_NUMBER); - ptable = the_lnet.ln_peer_tables[cpt2]; - lpni = lnet_get_peer_ni_locked(ptable, nid); + lpni = lnet_find_peer_ni_locked(nid); if (lpni) return lpni; - /* Slow path: serialized using the ln_api_mutex. */ + /* + * Slow path: + * use the lnet_api_mutex to serialize the creation of the peer_ni + * and the creation/deletion of the local ni/net. When a local ni is + * created, if there exists a set of peer_nis on that network, + * they need to be traversed and updated. When a local NI is + * deleted, which could result in a network being deleted, then + * all peer nis on that network need to be removed as well. + * + * Creation through traffic should also be serialized with + * creation through DLC. + */ lnet_net_unlock(cpt); mutex_lock(&the_lnet.ln_api_mutex); /* * Shutdown is only set under the ln_api_lock, so a single * check here is sufficent. - * - * lnet_add_nid_to_peer() also handles the case where we've - * raced and a different thread added the NID. */ if (the_lnet.ln_shutdown) { lpni = ERR_PTR(-ESHUTDOWN); @@ -990,7 +1028,7 @@ lnet_nid2peerni_locked(lnet_nid_t nid, int cpt) goto out_mutex_unlock; } - lpni = lnet_get_peer_ni_locked(ptable, nid); + lpni = lnet_find_peer_ni_locked(nid); LASSERT(lpni); out_mutex_unlock: diff --git a/lnet/lnet/router.c b/lnet/lnet/router.c index c0dd0a9..502ff84 100644 --- a/lnet/lnet/router.c +++ b/lnet/lnet/router.c @@ -365,7 +365,7 @@ lnet_add_route(__u32 net, __u32 hops, lnet_nid_t gateway, lnet_net_lock(LNET_LOCK_EX); - lpni = lnet_nid2peerni_locked(gateway, LNET_LOCK_EX); + lpni = lnet_nid2peerni_ex(gateway, LNET_LOCK_EX); if (IS_ERR(lpni)) { lnet_net_unlock(LNET_LOCK_EX);