From: Isaac Huang Date: Tue, 22 Oct 2013 06:21:37 +0000 (-0600) Subject: LU-4006 lnet: abort messages whose MD has been unlinked X-Git-Tag: 2.5.52~60 X-Git-Url: https://git.whamcloud.com/?p=fs%2Flustre-release.git;a=commitdiff_plain;h=cee48faec902dbdfe4c547b2ed92e86841ef82e3 LU-4006 lnet: abort messages whose MD has been unlinked If LNetMDUnlink has been called, all outgoing messages on that MD should be aborted before lnet_ni_send() is called. Signed-off-by: Isaac Huang Change-Id: I3ebe6d3f4ecc21cd6f00aebfb98b90109501df5a Reviewed-on: http://review.whamcloud.com/8041 Tested-by: Jenkins Reviewed-by: Liang Zhen Tested-by: Maloo Reviewed-by: Doug Oucharek Reviewed-by: Oleg Drokin --- diff --git a/lnet/include/lnet/lib-types.h b/lnet/include/lnet/lib-types.h index 079ea62..30b1f85 100644 --- a/lnet/include/lnet/lib-types.h +++ b/lnet/include/lnet/lib-types.h @@ -288,8 +288,9 @@ typedef struct lnet_libmd { } md_iov; } lnet_libmd_t; -#define LNET_MD_FLAG_ZOMBIE (1 << 0) -#define LNET_MD_FLAG_AUTO_UNLINK (1 << 1) +#define LNET_MD_FLAG_ZOMBIE (1 << 0) +#define LNET_MD_FLAG_AUTO_UNLINK (1 << 1) +#define LNET_MD_FLAG_ABORTED (1 << 2) #ifdef LNET_USE_LIB_FREELIST typedef struct diff --git a/lnet/lnet/lib-md.c b/lnet/lnet/lib-md.c index 8008c63..c70b4da 100644 --- a/lnet/lnet/lib-md.c +++ b/lnet/lnet/lib-md.c @@ -391,7 +391,8 @@ EXPORT_SYMBOL(LNetMDBind); /** * Unlink the memory descriptor from any ME it may be linked to and release - * the internal resources associated with it. + * the internal resources associated with it. As a result, active messages + * associated with the MD may get aborted. * * This function does not free the memory region associated with the MD; * i.e., the memory the user allocated for this MD. If the ME associated with @@ -437,15 +438,14 @@ LNetMDUnlink (lnet_handle_md_t mdh) return -ENOENT; } - /* If the MD is busy, lnet_md_unlink just marks it for deletion, and - * when the NAL is done, the completion event flags that the MD was - * unlinked. Otherwise, we enqueue an event now... */ - - if (md->md_eq != NULL && - md->md_refcount == 0) { - lnet_build_unlink_event(md, &ev); + md->md_flags |= LNET_MD_FLAG_ABORTED; + /* If the MD is busy, lnet_md_unlink just marks it for deletion, and + * when the LND is done, the completion event flags that the MD was + * unlinked. Otherwise, we enqueue an event now... */ + if (md->md_eq != NULL && md->md_refcount == 0) { + lnet_build_unlink_event(md, &ev); lnet_eq_enqueue_event(md->md_eq, &ev); - } + } lnet_md_unlink(md); diff --git a/lnet/lnet/lib-me.c b/lnet/lnet/lib-me.c index 63d457a..0f61929 100644 --- a/lnet/lnet/lib-me.c +++ b/lnet/lnet/lib-me.c @@ -245,12 +245,13 @@ LNetMEUnlink(lnet_handle_me_t meh) return -ENOENT; } - md = me->me_md; - if (md != NULL && - md->md_eq != NULL && - md->md_refcount == 0) { - lnet_build_unlink_event(md, &ev); - lnet_eq_enqueue_event(md->md_eq, &ev); + md = me->me_md; + if (md != NULL) { + md->md_flags |= LNET_MD_FLAG_ABORTED; + if (md->md_eq != NULL && md->md_refcount == 0) { + lnet_build_unlink_event(md, &ev); + lnet_eq_enqueue_event(md->md_eq, &ev); + } } lnet_me_unlink(me); diff --git a/lnet/lnet/lib-move.c b/lnet/lnet/lib-move.c index 30a70ae..406e2ed 100644 --- a/lnet/lnet/lib-move.c +++ b/lnet/lnet/lib-move.c @@ -816,26 +816,30 @@ lnet_peer_alive_locked (lnet_peer_t *lp) return 0; } -int +/** + * \param msg The message to be sent. + * \param do_send True if lnet_ni_send() should be called in this function. + * lnet_send() is going to lnet_net_unlock immediately after this, so + * it sets do_send FALSE and I don't do the unlock/send/lock bit. + * + * \retval 0 If \a msg sent or OK to send. + * \retval EAGAIN If \a msg blocked for credit. + * \retval EHOSTUNREACH If the next hop of the message appears dead. + * \retval ECANCELED If the MD of the message has been unlinked. + */ +static int lnet_post_send_locked(lnet_msg_t *msg, int do_send) { - /* lnet_send is going to lnet_net_unlock immediately after this, - * so it sets do_send FALSE and I don't do the unlock/send/lock bit. - * I return EAGAIN if msg blocked, EHOSTUNREACH if msg_txpeer - * appears dead, and 0 if sent or OK to send */ - struct lnet_peer *lp = msg->msg_txpeer; - struct lnet_ni *ni = lp->lp_ni; - struct lnet_tx_queue *tq; - int cpt; + lnet_peer_t *lp = msg->msg_txpeer; + lnet_ni_t *ni = lp->lp_ni; + int cpt = msg->msg_tx_cpt; + struct lnet_tx_queue *tq = ni->ni_tx_queues[cpt]; /* non-lnet_send() callers have checked before */ LASSERT(!do_send || msg->msg_tx_delayed); LASSERT(!msg->msg_receiving); LASSERT(msg->msg_tx_committed); - cpt = msg->msg_tx_cpt; - tq = ni->ni_tx_queues[cpt]; - /* NB 'lp' is always the next hop */ if ((msg->msg_target.pid & LNET_PID_USERFLAG) == 0 && lnet_peer_alive_locked(lp) == 0) { @@ -852,6 +856,20 @@ lnet_post_send_locked(lnet_msg_t *msg, int do_send) return EHOSTUNREACH; } + if (msg->msg_md != NULL && + (msg->msg_md->md_flags & LNET_MD_FLAG_ABORTED) != 0) { + lnet_net_unlock(cpt); + + CNETERR("Aborting message for %s: LNetM[DE]Unlink() already " + "called on the MD/ME.\n", + libcfs_id2str(msg->msg_target)); + if (do_send) + lnet_finalize(ni, msg, -ECANCELED); + + lnet_net_lock(cpt); + return ECANCELED; + } + if (!msg->msg_peertxcredit) { LASSERT ((lp->lp_txcredits < 0) == !cfs_list_empty(&lp->lp_txq)); @@ -1388,13 +1406,13 @@ lnet_send(lnet_nid_t src_nid, lnet_msg_t *msg, lnet_nid_t rtr_nid) rc = lnet_post_send_locked(msg, 0); lnet_net_unlock(cpt); - if (rc == EHOSTUNREACH) - return -EHOSTUNREACH; + if (rc == EHOSTUNREACH || rc == ECANCELED) + return -rc; - if (rc == 0) + if (rc == 0) lnet_ni_send(src_ni, msg); - return 0; + return 0; /* rc == 0 or EAGAIN */ } static void @@ -2352,7 +2370,6 @@ LNetGet(lnet_nid_t self, lnet_handle_md_t mdh, lnet_res_unlock(cpt); lnet_msg_free(msg); - return -ENOENT; } @@ -2378,11 +2395,11 @@ LNetGet(lnet_nid_t self, lnet_handle_md_t mdh, lnet_build_msg_event(msg, LNET_EVENT_SEND); rc = lnet_send(self, msg, LNET_NID_ANY); - if (rc < 0) { - CNETERR( "Error sending GET to %s: %d\n", - libcfs_id2str(target), rc); - lnet_finalize (NULL, msg, rc); - } + if (rc < 0) { + CNETERR("Error sending GET to %s: %d\n", + libcfs_id2str(target), rc); + lnet_finalize(NULL, msg, rc); + } /* completion will be signalled by an event */ return 0; diff --git a/lnet/lnet/lib-msg.c b/lnet/lnet/lib-msg.c index 6e1c241..c4c4d18 100644 --- a/lnet/lnet/lib-msg.c +++ b/lnet/lnet/lib-msg.c @@ -319,7 +319,7 @@ lnet_msg_attach_md(lnet_msg_t *msg, lnet_libmd_t *md, LASSERT(!msg->msg_routing); msg->msg_md = md; - if (msg->msg_receiving) { /* commited for receiving */ + if (msg->msg_receiving) { /* committed for receiving */ msg->msg_offset = offset; msg->msg_wanted = mlen; } @@ -395,7 +395,7 @@ lnet_complete_msg_locked(lnet_msg_t *msg, int cpt) * NB: message is committed for sending, we should return * on success because LND will finalize this message later. * - * Also, there is possibility that message is commited for + * Also, there is possibility that message is committed for * sending and also failed before delivering to LND, * i.e: ENOMEM, in that case we can't fall through either * because CPT for sending can be different with CPT for @@ -417,7 +417,7 @@ lnet_complete_msg_locked(lnet_msg_t *msg, int cpt) * NB: message is committed for sending, we should return * on success because LND will finalize this message later. * - * Also, there is possibility that message is commited for + * Also, there is possibility that message is committed for * sending and also failed before delivering to LND, * i.e: ENOMEM, in that case we can't fall through either: * - The rule is message must decommit for sending first if @@ -477,14 +477,14 @@ lnet_finalize (lnet_ni_t *ni, lnet_msg_t *msg, int status) again: rc = 0; if (!msg->msg_tx_committed && !msg->msg_rx_committed) { - /* not commited to network yet */ + /* not committed to network yet */ LASSERT(!msg->msg_onactivelist); lnet_msg_free(msg); return; } /* - * NB: routed message can be commited for both receiving and sending, + * NB: routed message can be committed for both receiving and sending, * we should finalize in LIFO order and keep counters correct. * (finalize sending first then finalize receiving) */