From: Liang Zhen Date: Sun, 1 Jul 2012 14:21:39 +0000 (+0800) Subject: LU-56 lnet: re-finalize failed ACK or routed message X-Git-Tag: 2.2.60~35 X-Git-Url: https://git.whamcloud.com/?p=fs%2Flustre-release.git;a=commitdiff_plain;h=82e02a17c0c645a8d156e51b8d8da5eaa68b8f5b LU-56 lnet: re-finalize failed ACK or routed message lnet_finalize() should restart the finalizing process for a failed ACK or a failed forwarding, because the message could be committed for sending and then fail before being delivered to the LND (e.g. ENOMEM). In that case we can't just continue to call lnet_msg_decommit(): - The rule is that a message must be decommitted for sending first if it is committed for both sending and receiving - The CPT for sending can be different from the CPT for receiving, so we should return to lnet_finalize() to make sure we are locking the correct partition. Signed-off-by: Liang Zhen Change-Id: I0b35434762225fcb0dccad7d23bcd63740484e0a Reviewed-on: http://review.whamcloud.com/3252 Reviewed-by: Bobi Jam Tested-by: Hudson Reviewed-by: Doug Oucharek Tested-by: Maloo Reviewed-by: Oleg Drokin --- diff --git a/lnet/lnet/lib-move.c b/lnet/lnet/lib-move.c index a37c7bb..7fb3592 100644 --- a/lnet/lnet/lib-move.c +++ b/lnet/lnet/lib-move.c @@ -2011,12 +2011,9 @@ lnet_drop_delayed_msg_list(cfs_list_t *head, char *reason) lnet_drop_message(msg->msg_rxpeer->lp_ni, msg->msg_rxpeer->lp_cpt, msg->msg_private, msg->msg_len); - - lnet_net_lock(msg->msg_rxpeer->lp_cpt); - lnet_peer_decref_locked(msg->msg_rxpeer); - lnet_net_unlock(msg->msg_rxpeer->lp_cpt); - - lnet_msg_free(msg); + /* NB: message will not generate event because w/o attached MD, + * so we just use 0 as the third parameter */ + lnet_finalize(msg->msg_rxpeer->lp_ni, msg, 0); } } diff --git a/lnet/lnet/lib-msg.c b/lnet/lnet/lib-msg.c index f2ea470..cc889b3 100644 --- a/lnet/lnet/lib-msg.c +++ b/lnet/lnet/lib-msg.c @@ -356,7 +356,7 @@ lnet_msg_detach_md(lnet_msg_t *msg, int status) msg->msg_md = NULL; } -void +static int lnet_complete_msg_locked(lnet_msg_t 
*msg, int cpt) { lnet_handle_wire_t ack_wmd; @@ -389,9 +389,19 @@ lnet_complete_msg_locked(lnet_msg_t *msg, int cpt) rc = lnet_send(msg->msg_ev.target.nid, msg, LNET_NID_ANY); lnet_net_lock(cpt); + /* + * NB: message is committed for sending, we should return + * on success because LND will finalize this message later. + * + * Also, there is possibility that message is commited for + * sending and also failed before delivering to LND, + * i.e: ENOMEM, in that case we can't fall through either + * because CPT for sending can be different with CPT for + * receiving, so we should return back to lnet_finalize() + * to make sure we are locking the correct partition. + */ + return rc; - if (rc == 0) - return; } else if (status == 0 && /* OK so far */ (msg->msg_routing && !msg->msg_sending)) { /* not forwarded */ @@ -401,13 +411,25 @@ lnet_complete_msg_locked(lnet_msg_t *msg, int cpt) rc = lnet_send(LNET_NID_ANY, msg, LNET_NID_ANY); lnet_net_lock(cpt); - - if (rc == 0) - return; + /* + * NB: message is committed for sending, we should return + * on success because LND will finalize this message later. + * + * Also, there is possibility that message is commited for + * sending and also failed before delivering to LND, + * i.e: ENOMEM, in that case we can't fall through either: + * - The rule is message must decommit for sending first if + * the it's committed for both sending and receiving + * - CPT for sending can be different with CPT for receiving, + * so we should return back to lnet_finalize() to make + * sure we are locking the correct partition. 
+ */ + return rc; } lnet_msg_decommit(msg, cpt, status); lnet_msg_free_locked(msg); + return 0; } void @@ -416,6 +438,7 @@ lnet_finalize (lnet_ni_t *ni, lnet_msg_t *msg, int status) struct lnet_msg_container *container; int my_slot; int cpt; + int rc; int i; LASSERT (!cfs_in_interrupt ()); @@ -449,6 +472,8 @@ lnet_finalize (lnet_ni_t *ni, lnet_msg_t *msg, int status) lnet_res_unlock(cpt); } + again: + rc = 0; if (!msg->msg_tx_committed && !msg->msg_rx_committed) { /* not commited to network yet */ LASSERT(!msg->msg_onactivelist); @@ -474,20 +499,24 @@ lnet_finalize (lnet_ni_t *ni, lnet_msg_t *msg, int status) my_slot = -1; for (i = 0; i < container->msc_nfinalizers; i++) { if (container->msc_finalizers[i] == cfs_current()) - goto out; + break; if (my_slot < 0 && container->msc_finalizers[i] == NULL) my_slot = i; } - if (my_slot < 0) - goto out; + if (i < container->msc_nfinalizers || my_slot < 0) { + lnet_net_unlock(cpt); + return; + } container->msc_finalizers[my_slot] = cfs_current(); #else LASSERT(container->msc_nfinalizers == 1); - if (container->msc_finalizers[0] != NULL) - goto out; + if (container->msc_finalizers[0] != NULL) { + lnet_net_unlock(cpt); + return; + } my_slot = i = 0; container->msc_finalizers[0] = (struct lnet_msg_container *)1; @@ -501,12 +530,16 @@ lnet_finalize (lnet_ni_t *ni, lnet_msg_t *msg, int status) /* NB drops and regains the lnet lock if it actually does * anything, so my finalizing friends can chomp along too */ - lnet_complete_msg_locked(msg, cpt); + rc = lnet_complete_msg_locked(msg, cpt); + if (rc != 0) + break; } container->msc_finalizers[my_slot] = NULL; - out: lnet_net_unlock(cpt); + + if (rc != 0) + goto again; } void