Whamcloud - gitweb
LU-15068 ptlrpc: Do not unlink difficult reply until sent 38/45138/3
author: Chris Horn <chris.horn@hpe.com>
Tue, 5 Oct 2021 19:11:29 +0000 (14:11 -0500)
committer: Oleg Drokin <green@whamcloud.com>
Tue, 30 Nov 2021 03:45:38 +0000 (03:45 +0000)
If a difficult reply is queued in LNet, or the PUT for it is
otherwise delayed, then it is possible for the commit callback
to unlink the reply MD, which will abort the send. This results in
the client hitting the "slow reply" timeout for the associated RPC
and an unnecessary reconnect (and possibly a resend).

This patch replaces the rs_on_net flag with rs_sent and rs_unlinked.
These flags indicate whether the send event for the reply MD has
been generated, and whether the MD has been unlinked, respectively.

If rs_sent is set, but rs_unlinked has not been set, then ptlrpc_hr
is free to unlink the reply MD as a result of the commit callback.
The reply-ack will simply be dropped by the server.

If ptlrpc_hr is processing the reply because of the commit callback,
and rs_sent has not been set, then ptlrpc_hr will not unlink the
reply MD. This means that the reply_out_callback must also be
modified to check for this case when the send event occurs.
Otherwise, if the ACK never arrives from the client, then the MD
would never be unlinked. Thus, when the send event occurs and
rs_handled is set, the reply_out_callback will schedule the reply
for handling by ptlrpc_hr.

HPE-bug-id: LUS-10505
Signed-off-by: Chris Horn <chris.horn@hpe.com>
Change-Id: Ib8f4853c7ab35d72624fce7ee3fba9e59a746e1f
Reviewed-on: https://review.whamcloud.com/45138
Tested-by: jenkins <devops@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
Reviewed-by: Andriy Skulysh <andriy.skulysh@hpe.com>
Reviewed-by: Alexey Lyashkov <alexey.lyashkov@hpe.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
lustre/include/lustre_net.h
lustre/ldlm/ldlm_lib.c
lustre/ptlrpc/events.c
lustre/ptlrpc/pack_generic.c
lustre/ptlrpc/service.c

index ed068b4..6ad7e06 100644 (file)
@@ -653,15 +653,16 @@ struct ptlrpc_reply_state {
        /** A spinlock to protect the reply state flags */
        spinlock_t              rs_lock;
        /** Reply state flags */
-        unsigned long          rs_difficult:1;     /* ACK/commit stuff */
-        unsigned long          rs_no_ack:1;    /* no ACK, even for
+       unsigned long           rs_difficult:1;     /* ACK/commit stuff */
+       unsigned long           rs_no_ack:1;    /* no ACK, even for
                                                   difficult requests */
-        unsigned long          rs_scheduled:1;     /* being handled? */
-        unsigned long          rs_scheduled_ever:1;/* any schedule attempts? */
-        unsigned long          rs_handled:1;  /* been handled yet? */
-        unsigned long          rs_on_net:1;   /* reply_out_callback pending? */
-        unsigned long          rs_prealloc:1; /* rs from prealloc list */
-        unsigned long          rs_committed:1;/* the transaction was committed
+       unsigned long           rs_scheduled:1;     /* being handled? */
+       unsigned long           rs_scheduled_ever:1;/* any schedule attempts? */
+       unsigned long           rs_handled:1;  /* been handled yet? */
+       unsigned long           rs_sent:1;   /* Got LNET_EVENT_SEND? */
+       unsigned long           rs_unlinked:1; /* Reply MD unlinked? */
+       unsigned long           rs_prealloc:1; /* rs from prealloc list */
+       unsigned long           rs_committed:1;/* the transaction was committed
                                                  and the rs was dispatched
                                                  by ptlrpc_commit_replies */
        unsigned long           rs_convert_lock:1; /* need to convert saved
@@ -670,15 +671,15 @@ struct ptlrpc_reply_state {
        /** Number of locks awaiting client ACK */
        int                     rs_nlocks;
 
-        /** Size of the state */
-        int                    rs_size;
-        /** opcode */
-        __u32                  rs_opc;
-        /** Transaction number */
-        __u64                  rs_transno;
-        /** xid */
-        __u64                  rs_xid;
-       struct obd_export     *rs_export;
+       /** Size of the state */
+       int                     rs_size;
+       /** opcode */
+       __u32                   rs_opc;
+       /** Transaction number */
+       __u64                   rs_transno;
+       /** xid */
+       __u64                   rs_xid;
+       struct obd_export       *rs_export;
        struct ptlrpc_service_part *rs_svcpt;
        /** Lnet metadata handle for the reply */
        struct lnet_handle_md   rs_md_h;
index 64214e2..5030c14 100644 (file)
@@ -3266,7 +3266,8 @@ void target_send_reply(struct ptlrpc_request *req, int rc, int fail_id)
        LASSERT(!rs->rs_scheduled);
        LASSERT(!rs->rs_scheduled_ever);
        LASSERT(!rs->rs_handled);
-       LASSERT(!rs->rs_on_net);
+       LASSERT(!rs->rs_sent);
+       LASSERT(!rs->rs_unlinked);
        LASSERT(rs->rs_export == NULL);
        LASSERT(list_empty(&rs->rs_obd_list));
        LASSERT(list_empty(&rs->rs_exp_list));
@@ -3275,7 +3276,8 @@ void target_send_reply(struct ptlrpc_request *req, int rc, int fail_id)
 
        /* disable reply scheduling while I'm setting up */
        rs->rs_scheduled = 1;
-       rs->rs_on_net    = 1;
+       rs->rs_sent      = 0;
+       rs->rs_unlinked  = 0;
        rs->rs_xid       = req->rq_xid;
        rs->rs_transno   = req->rq_transno;
        rs->rs_export    = exp;
@@ -3309,13 +3311,14 @@ void target_send_reply(struct ptlrpc_request *req, int rc, int fail_id)
                 * would have been +1 ref for the net, which
                 * reply_out_callback leaves alone)
                 */
-               rs->rs_on_net = 0;
+               rs->rs_sent = 1;
+               rs->rs_unlinked = 1;
                ptlrpc_rs_addref(rs);
        }
 
        spin_lock(&rs->rs_lock);
        if (rs->rs_transno <= exp->exp_last_committed ||
-           (!rs->rs_on_net && !rs->rs_no_ack) ||
+           (rs->rs_unlinked && !rs->rs_no_ack) ||
            list_empty(&rs->rs_exp_list) ||     /* completed already */
            list_empty(&rs->rs_obd_list)) {
                CDEBUG(D_HA, "Schedule reply immediately\n");
index fe07a49..7d4a157 100644 (file)
@@ -403,30 +403,46 @@ void reply_out_callback(struct lnet_event *ev)
        struct ptlrpc_cb_id       *cbid = ev->md_user_ptr;
        struct ptlrpc_reply_state *rs = cbid->cbid_arg;
        struct ptlrpc_service_part *svcpt = rs->rs_svcpt;
-        ENTRY;
-
-        LASSERT (ev->type == LNET_EVENT_SEND ||
-                 ev->type == LNET_EVENT_ACK ||
-                 ev->type == LNET_EVENT_UNLINK);
-
-        if (!rs->rs_difficult) {
-                /* 'Easy' replies have no further processing so I drop the
-                 * net's ref on 'rs' */
-                LASSERT (ev->unlinked);
-                ptlrpc_rs_decref(rs);
-                EXIT;
-                return;
-        }
+       bool need_schedule = false;
+
+       ENTRY;
+
+       LASSERT(ev->type == LNET_EVENT_SEND ||
+               ev->type == LNET_EVENT_ACK ||
+               ev->type == LNET_EVENT_UNLINK);
 
-        LASSERT (rs->rs_on_net);
+       if (!rs->rs_difficult) {
+               /* 'Easy' replies have no further processing so I drop the
+                * net's ref on 'rs'
+                */
+               LASSERT(ev->unlinked);
+               ptlrpc_rs_decref(rs);
+               EXIT;
+               return;
+       }
 
-        if (ev->unlinked) {
-                /* Last network callback. The net's ref on 'rs' stays put
-                 * until ptlrpc_handle_rs() is done with it */
+       if (ev->type == LNET_EVENT_SEND) {
+               spin_lock(&rs->rs_lock);
+               rs->rs_sent = 1;
+               /* If transaction was committed before the SEND, and the ACK
+                * is lost, then we need to schedule so ptlrpc_hr can unlink
+                * the MD.
+                */
+               if (rs->rs_handled)
+                       need_schedule = true;
+               spin_unlock(&rs->rs_lock);
+       }
+
+       if (ev->unlinked || need_schedule) {
+               LASSERT(rs->rs_sent);
+
+               /* Last network callback. The net's ref on 'rs' stays put
+                * until ptlrpc_handle_rs() is done with it
+                */
                spin_lock(&svcpt->scp_rep_lock);
                spin_lock(&rs->rs_lock);
 
-               rs->rs_on_net = 0;
+               rs->rs_unlinked = ev->unlinked;
                if (!rs->rs_no_ack ||
                    rs->rs_transno <=
                    rs->rs_export->exp_obd->obd_last_committed ||
index eca3952..7a6f112 100644 (file)
@@ -542,7 +542,7 @@ void lustre_free_reply_state(struct ptlrpc_reply_state *rs)
 
        LASSERT(atomic_read(&rs->rs_refcount) == 0);
        LASSERT(!rs->rs_difficult || rs->rs_handled);
-       LASSERT(!rs->rs_on_net);
+       LASSERT(!rs->rs_difficult || rs->rs_unlinked);
        LASSERT(!rs->rs_scheduled);
        LASSERT(rs->rs_export == NULL);
        LASSERT(rs->rs_nlocks == 0);
index 6a68a19..47e653a 100644 (file)
@@ -2496,10 +2496,14 @@ static int ptlrpc_handle_rs(struct ptlrpc_reply_state *rs)
                       libcfs_nid2str(exp->exp_connection->c_peer.nid));
        }
 
-       if ((!been_handled && rs->rs_on_net) || nlocks > 0) {
+       if ((rs->rs_sent && !rs->rs_unlinked) || nlocks > 0) {
                spin_unlock(&rs->rs_lock);
 
-               if (!been_handled && rs->rs_on_net) {
+               /* We can unlink if the LNET_EVENT_SEND has occurred.
+                * If rs_unlinked is set then MD is already unlinked and no
+                * need to do so here.
+                */
+               if ((rs->rs_sent && !rs->rs_unlinked)) {
                        LNetMDUnlink(rs->rs_md_h);
                        /* Ignore return code; we're racing with completion */
                }
@@ -2514,7 +2518,7 @@ static int ptlrpc_handle_rs(struct ptlrpc_reply_state *rs)
        rs->rs_scheduled = 0;
        rs->rs_convert_lock = 0;
 
-       if (!rs->rs_on_net) {
+       if (rs->rs_unlinked) {
                /* Off the net */
                spin_unlock(&rs->rs_lock);