Whamcloud - gitweb
LU-13468 fld: repeat rpc in fld_client_rpc after EAGAIN 02/38302/15
author Vladimir Saveliev <vlaidimir.saveliev@hpe.com>
Sun, 31 Oct 2021 06:42:35 +0000 (09:42 +0300)
committer Oleg Drokin <green@whamcloud.com>
Thu, 20 Jan 2022 18:25:00 +0000 (18:25 +0000)
A timed-out RPC sent by fld_client_rpc() may lead to a client operation
failure.

Make fld_client_rpc() repeat the RPC after a while in case of EAGAIN.

A test illustrating the issue is added.

A typo in the failure simulation in fld_client_rpc() is fixed.
recovery-small.sh:test_110k() is changed so that fld_client_rpc()
fails only once; otherwise it would fall into an endless loop.

HPE-bug-id: LUS-8652
Fixes: e3f6111dfd1c ("LU-11761 fld: lets caller to retry FLD_QUERY")
Signed-off-by: Vladimir Saveliev <vlaidimir.saveliev@hpe.com>
Change-Id: I145e719ec2fb5f5dbf9b5aa4b2a5b7e62f98c19f
Reviewed-on: https://review.whamcloud.com/38302
Tested-by: jenkins <devops@whamcloud.com>
Reviewed-by: Andreas Dilger <adilger@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
Reviewed-by: Andriy Skulysh <andriy.skulysh@hpe.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
lustre/fld/fld_handler.c
lustre/fld/fld_request.c
lustre/tests/recovery-small.sh
lustre/tests/sanity.sh

index 0fbd690..6f01007 100644 (file)
@@ -159,7 +159,7 @@ int fld_update_from_controller(const struct lu_env *env,
        do {
                rc = fld_client_rpc(fld->lsf_control_exp, range, FLD_READ,
                                    &req);
-               if (rc != 0 && rc != -EAGAIN)
+               if (rc != 0)
                        GOTO(out, rc);
 
                LASSERT(req != NULL);
index 08387ee..3be93c2 100644 (file)
@@ -40,6 +40,7 @@
 #include <libcfs/libcfs.h>
 #include <linux/module.h>
 #include <linux/math64.h>
+#include <linux/delay.h>
 
 #include <obd.h>
 #include <obd_class.h>
@@ -333,6 +334,7 @@ int fld_client_rpc(struct obd_export *exp,
        LASSERT(exp != NULL);
 
        imp = class_exp2cliimp(exp);
+again:
        switch (fld_op) {
        case FLD_QUERY:
                req = ptlrpc_request_alloc_pack(imp, &RQF_FLD_QUERY,
@@ -382,7 +384,7 @@ int fld_client_rpc(struct obd_export *exp,
        req->rq_reply_portal = MDC_REPLY_PORTAL;
        ptlrpc_at_set_req_timeout(req);
 
-       if (OBD_FAIL_CHECK(OBD_FAIL_FLD_QUERY_REQ && req->rq_no_delay)) {
+       if (OBD_FAIL_CHECK(OBD_FAIL_FLD_QUERY_REQ) && req->rq_no_delay) {
                /* the same error returned by ptlrpc_import_delay_req */
                rc = -EAGAIN;
                req->rq_status = rc;
@@ -403,12 +405,16 @@ int fld_client_rpc(struct obd_export *exp,
                    imp->imp_connect_flags_orig & OBD_CONNECT_MDS_MDS &&
                    OCD_HAS_FLAG(&imp->imp_connect_data, LIGHTWEIGHT) &&
                    rc != -ENOTSUPP) {
-                       /*
-                        * Since LWP is not replayable, so notify the caller
-                        * to retry if needed after a while.
-                        */
+                       /* LWP is not replayable, retry after a while */
                        rc = -EAGAIN;
                }
+               if (rc == -EAGAIN) {
+                       ptlrpc_req_finished(req);
+                       if (msleep_interruptible(2 * MSEC_PER_SEC))
+                               GOTO(out_req, rc = -EINTR);
+                       rc = 0;
+                       goto again;
+               }
                GOTO(out_req, rc);
        }
 
index d8a3964..08e83aa 100755 (executable)
@@ -2273,7 +2273,7 @@ test_110k() {
        stop mds2 || error "stop mds2 failed"
 
 #define OBD_FAIL_FLD_QUERY_REQ 0x1103
-       do_facet mds2 lctl set_param fail_loc=0x1103
+       do_facet mds2 lctl set_param fail_loc=0x80001103
        local OPTS="$MDS_MOUNT_OPTS -o abort_recovery"
        start mds2 $(mdsdevname 2) $OPTS ||
                error "start MDS with abort_recovery should succeed"
index 58f311e..0878c8c 100755 (executable)
@@ -1500,6 +1500,18 @@ test_24G () {
 }
 run_test 24G "migrate symlink in rename"
 
+test_24H() {
+       [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs"
+       [[ $(hostname) != $(facet_active_host mds2) ]] ||
+               skip "MDT1 should be on another node"
+
+       test_mkdir -i 1 -c 1 $DIR/$tdir
+#define OBD_FAIL_FLD_QUERY_REQ           0x1103
+       do_facet mds2 $LCTL set_param fail_loc=0x80001103
+       touch $DIR/$tdir/$tfile || error "touch failed"
+}
+run_test 24H "repeat FLD_QUERY rpc"
+
 test_25a() {
        echo '== symlink sanity ============================================='