From 5c156b48425aae245537aaf10229734166463347 Mon Sep 17 00:00:00 2001 From: Chris Horn Date: Tue, 5 Oct 2021 14:11:29 -0500 Subject: [PATCH] LU-15068 ptlrpc: Do not unlink difficult reply until sent If a difficult reply is queued in LNet, or the PUT for it is otherwise delayed, then it is possible for the commit callback to unlink the reply MD which will abort the send. This results in client hitting "slow reply" timeout for the associated RPC and an unnecessary reconnect (and possibly resend). This patch replaces the rs_on_net flag with rs_sent and rs_unlinked. These flags indicate whether the send event for the reply MD has been generated, and whether the MD has been unlinked, respectively. If rs_sent is set, but rs_unlinked has not been set, then ptlrpc_hr is free to unlink the reply MD as a result of the commit callback. The reply-ack will simply be dropped by the server. If ptlrpc_hr is processing the reply because of commit callback, and rs_sent has not been set, then ptlrpc_hr will not unlink the reply MD. This means that the reply_out_callback must also be modified to check for this case when the send event occurs. Otherwise, if the ACK never arrives from the client, then the MD would never be unlinked. Thus when the send event occurs, and rs_handled is set, the reply_out_callback will schedule the reply for handling by ptlrpc_hr. HPE-bug-id: LUS-10505 Signed-off-by: Chris Horn Change-Id: Ib8f4853c7ab35d72624fce7ee3fba9e59a746e1f Reviewed-on: https://review.whamcloud.com/45138 Tested-by: jenkins Tested-by: Maloo Reviewed-by: Andriy Skulysh Reviewed-by: Alexey Lyashkov Reviewed-by: Oleg Drokin --- lustre/include/lustre_net.h | 35 ++++++++++++++-------------- lustre/ldlm/ldlm_lib.c | 11 +++++---- lustre/ptlrpc/events.c | 54 ++++++++++++++++++++++++++++---------------- lustre/ptlrpc/pack_generic.c | 2 +- lustre/ptlrpc/service.c | 10 +++++--- 5 files changed, 68 insertions(+), 44 deletions(-) diff --git a/lustre/include/lustre_net.h b/lustre/include/lustre_net.h index ed068b4..6ad7e06 100644 --- a/lustre/include/lustre_net.h +++ b/lustre/include/lustre_net.h @@ -653,15 +653,16 @@ struct ptlrpc_reply_state { /** A spinlock to protect the reply state flags */ spinlock_t rs_lock; /** Reply state flags */ - unsigned long rs_difficult:1; /* ACK/commit stuff */ - unsigned long rs_no_ack:1; /* no ACK, even for + unsigned long rs_difficult:1; /* ACK/commit stuff */ + unsigned long rs_no_ack:1; /* no ACK, even for difficult requests */ - unsigned long rs_scheduled:1; /* being handled? */ - unsigned long rs_scheduled_ever:1;/* any schedule attempts? */ - unsigned long rs_handled:1; /* been handled yet? */ - unsigned long rs_on_net:1; /* reply_out_callback pending? */ - unsigned long rs_prealloc:1; /* rs from prealloc list */ - unsigned long rs_committed:1;/* the transaction was committed + unsigned long rs_scheduled:1; /* being handled? */ + unsigned long rs_scheduled_ever:1;/* any schedule attempts? */ + unsigned long rs_handled:1; /* been handled yet? */ + unsigned long rs_sent:1; /* Got LNET_EVENT_SEND? */ + unsigned long rs_unlinked:1; /* Reply MD unlinked? */ + unsigned long rs_prealloc:1; /* rs from prealloc list */ + unsigned long rs_committed:1;/* the transaction was committed and the rs was dispatched by ptlrpc_commit_replies */ unsigned long rs_convert_lock:1; /* need to convert saved @@ -670,15 +671,15 @@ struct ptlrpc_reply_state { /** Number of locks awaiting client ACK */ int rs_nlocks; - /** Size of the state */ - int rs_size; - /** opcode */ - __u32 rs_opc; - /** Transaction number */ - __u64 rs_transno; - /** xid */ - __u64 rs_xid; - struct obd_export *rs_export; + /** Size of the state */ + int rs_size; + /** opcode */ + __u32 rs_opc; + /** Transaction number */ + __u64 rs_transno; + /** xid */ + __u64 rs_xid; + struct obd_export *rs_export; struct ptlrpc_service_part *rs_svcpt; /** Lnet metadata handle for the reply */ struct lnet_handle_md rs_md_h; diff --git a/lustre/ldlm/ldlm_lib.c b/lustre/ldlm/ldlm_lib.c index 64214e2..5030c14 100644 --- a/lustre/ldlm/ldlm_lib.c +++ b/lustre/ldlm/ldlm_lib.c @@ -3266,7 +3266,8 @@ void target_send_reply(struct ptlrpc_request *req, int rc, int fail_id) LASSERT(!rs->rs_scheduled); LASSERT(!rs->rs_scheduled_ever); LASSERT(!rs->rs_handled); - LASSERT(!rs->rs_on_net); + LASSERT(!rs->rs_sent); + LASSERT(!rs->rs_unlinked); LASSERT(rs->rs_export == NULL); LASSERT(list_empty(&rs->rs_obd_list)); LASSERT(list_empty(&rs->rs_exp_list)); @@ -3275,7 +3276,8 @@ void target_send_reply(struct ptlrpc_request *req, int rc, int fail_id) /* disable reply scheduling while I'm setting up */ rs->rs_scheduled = 1; - rs->rs_on_net = 1; + rs->rs_sent = 0; + rs->rs_unlinked = 0; rs->rs_xid = req->rq_xid; rs->rs_transno = req->rq_transno; rs->rs_export = exp; @@ -3309,13 +3311,14 @@ void target_send_reply(struct ptlrpc_request *req, int rc, int fail_id) * would have been +1 ref for the net, which * reply_out_callback leaves alone) */ - rs->rs_on_net = 0; + rs->rs_sent = 1; + rs->rs_unlinked = 1; ptlrpc_rs_addref(rs); } spin_lock(&rs->rs_lock); if (rs->rs_transno <= exp->exp_last_committed || - (!rs->rs_on_net && !rs->rs_no_ack) || + (rs->rs_unlinked && !rs->rs_no_ack) || list_empty(&rs->rs_exp_list) || /* completed already */ list_empty(&rs->rs_obd_list)) { CDEBUG(D_HA, "Schedule reply immediately\n"); diff --git a/lustre/ptlrpc/events.c b/lustre/ptlrpc/events.c index fe07a49..7d4a157 100644 --- a/lustre/ptlrpc/events.c +++ b/lustre/ptlrpc/events.c @@ -403,30 +403,46 @@ void reply_out_callback(struct lnet_event *ev) struct ptlrpc_cb_id *cbid = ev->md_user_ptr; struct ptlrpc_reply_state *rs = cbid->cbid_arg; struct ptlrpc_service_part *svcpt = rs->rs_svcpt; - ENTRY; - - LASSERT (ev->type == LNET_EVENT_SEND || - ev->type == LNET_EVENT_ACK || - ev->type == LNET_EVENT_UNLINK); - - if (!rs->rs_difficult) { - /* 'Easy' replies have no further processing so I drop the - * net's ref on 'rs' */ - LASSERT (ev->unlinked); - ptlrpc_rs_decref(rs); - EXIT; - return; - } + bool need_schedule = false; + + ENTRY; + + LASSERT(ev->type == LNET_EVENT_SEND || + ev->type == LNET_EVENT_ACK || + ev->type == LNET_EVENT_UNLINK); - LASSERT (rs->rs_on_net); + if (!rs->rs_difficult) { + /* 'Easy' replies have no further processing so I drop the + * net's ref on 'rs' + */ + LASSERT(ev->unlinked); + ptlrpc_rs_decref(rs); + EXIT; + return; + } - if (ev->unlinked) { - /* Last network callback. The net's ref on 'rs' stays put - * until ptlrpc_handle_rs() is done with it */ + if (ev->type == LNET_EVENT_SEND) { + spin_lock(&rs->rs_lock); + rs->rs_sent = 1; + /* If transaction was committed before the SEND, and the ACK + * is lost, then we need to schedule so ptlrpc_hr can unlink + * the MD. + */ + if (rs->rs_handled) + need_schedule = true; + spin_unlock(&rs->rs_lock); + } + + if (ev->unlinked || need_schedule) { + LASSERT(rs->rs_sent); + + /* Last network callback. The net's ref on 'rs' stays put + * until ptlrpc_handle_rs() is done with it + */ spin_lock(&svcpt->scp_rep_lock); spin_lock(&rs->rs_lock); - rs->rs_on_net = 0; + rs->rs_unlinked = ev->unlinked; if (!rs->rs_no_ack || rs->rs_transno <= rs->rs_export->exp_obd->obd_last_committed || diff --git a/lustre/ptlrpc/pack_generic.c b/lustre/ptlrpc/pack_generic.c index eca3952..7a6f112 100644 --- a/lustre/ptlrpc/pack_generic.c +++ b/lustre/ptlrpc/pack_generic.c @@ -542,7 +542,7 @@ void lustre_free_reply_state(struct ptlrpc_reply_state *rs) LASSERT(atomic_read(&rs->rs_refcount) == 0); LASSERT(!rs->rs_difficult || rs->rs_handled); - LASSERT(!rs->rs_on_net); + LASSERT(!rs->rs_difficult || rs->rs_unlinked); LASSERT(!rs->rs_scheduled); LASSERT(rs->rs_export == NULL); LASSERT(rs->rs_nlocks == 0); diff --git a/lustre/ptlrpc/service.c b/lustre/ptlrpc/service.c index 6a68a19..47e653a 100644 --- a/lustre/ptlrpc/service.c +++ b/lustre/ptlrpc/service.c @@ -2496,10 +2496,14 @@ static int ptlrpc_handle_rs(struct ptlrpc_reply_state *rs) libcfs_nid2str(exp->exp_connection->c_peer.nid)); } - if ((!been_handled && rs->rs_on_net) || nlocks > 0) { + if ((rs->rs_sent && !rs->rs_unlinked) || nlocks > 0) { spin_unlock(&rs->rs_lock); - if (!been_handled && rs->rs_on_net) { + /* We can unlink if the LNET_EVENT_SEND has occurred. + * If rs_unlinked is set then MD is already unlinked and no + * need to do so here. + */ + if ((rs->rs_sent && !rs->rs_unlinked)) { LNetMDUnlink(rs->rs_md_h); /* Ignore return code; we're racing with completion */ } @@ -2514,7 +2518,7 @@ static int ptlrpc_handle_rs(struct ptlrpc_reply_state *rs) rs->rs_scheduled = 0; rs->rs_convert_lock = 0; - if (!rs->rs_on_net) { + if (rs->rs_unlinked) { /* Off the net */ spin_unlock(&rs->rs_lock); -- 1.8.3.1