Whamcloud - gitweb
LU-11761 fld: let the caller retry FLD_QUERY 62/34962/10
author Hongchao Zhang <hongchao@whamcloud.com>
Thu, 4 Jul 2019 13:39:24 +0000 (09:39 -0400)
committer Oleg Drokin <green@whamcloud.com>
Wed, 17 Jul 2019 06:21:25 +0000 (06:21 +0000)
In fld_client_rpc(), if the FLD_QUERY request between MDTs fails
with -EWOULDBLOCK because the connection is lost, return -EAGAIN
to notify the caller to retry.

It also reverts the patch https://review.whamcloud.com/12586/, which
was landed on b2_6_90_0-5-g6db07f0 to avoid returning -EAGAIN from
lod_object_init(), because that would confuse lu_object_find_at()
(which assumed the object was dying when it encountered -EAGAIN). In
the current Lustre version, lu_object_find_at() just returns the found
object and lets the caller check whether it is dying.

Fixes: 6db07f095fba ("LU-5871 lod: Do not return EAGAIN in lod_object_init")
Change-Id: Ie83ebfdae2bd50c96a59a065f7f3c3dcfad04e42
Signed-off-by: Hongchao Zhang <hongchao@whamcloud.com>
Reviewed-on: https://review.whamcloud.com/34962
Reviewed-by: Andreas Dilger <adilger@whamcloud.com>
Reviewed-by: Lai Siyao <lai.siyao@whamcloud.com>
Tested-by: jenkins <devops@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
lustre/fld/fld_request.c
lustre/include/obd_support.h
lustre/lod/lod_object.c
lustre/tests/recovery-small.sh

index 123bc97..3dd616e 100644 (file)
@@ -346,7 +346,6 @@ int fld_client_rpc(struct obd_export *exp,
 
        LASSERT(exp != NULL);
 
-again:
        imp = class_exp2cliimp(exp);
        switch (fld_op) {
        case FLD_QUERY:
@@ -397,9 +396,15 @@ again:
        req->rq_reply_portal = MDC_REPLY_PORTAL;
        ptlrpc_at_set_req_timeout(req);
 
-       obd_get_request_slot(&exp->exp_obd->u.cli);
-       rc = ptlrpc_queue_wait(req);
-       obd_put_request_slot(&exp->exp_obd->u.cli);
+       if (OBD_FAIL_CHECK(OBD_FAIL_FLD_QUERY_REQ && req->rq_no_delay)) {
+               /* the same error returned by ptlrpc_import_delay_req */
+               rc = -EWOULDBLOCK;
+               req->rq_status = rc;
+       } else {
+               obd_get_request_slot(&exp->exp_obd->u.cli);
+               rc = ptlrpc_queue_wait(req);
+               obd_put_request_slot(&exp->exp_obd->u.cli);
+       }
 
        if (rc == -ENOENT) {
                /* Don't loop forever on non-existing FID sequences. */
@@ -413,15 +418,10 @@ again:
                    OCD_HAS_FLAG(&imp->imp_connect_data, LIGHTWEIGHT) &&
                    rc != -ENOTSUPP) {
                        /*
-                        * Since LWP is not replayable, so it will keep
-                        * trying unless umount happens or the remote
-                        * target does not support the operation, otherwise
-                        * it would cause unecessary failure of the
-                        * application.
+                        * Since LWP is not replayable, so notify the caller
+                        * to retry if needed after a while.
                         */
-                       ptlrpc_req_finished(req);
-                       rc = 0;
-                       goto again;
+                       rc = -EAGAIN;
                }
                GOTO(out_req, rc);
        }
index 916eb30..964bf35 100644 (file)
@@ -526,6 +526,7 @@ extern char obd_jobid_var[];
 #define OBD_FAIL_FLD                     0x1100
 #define OBD_FAIL_FLD_QUERY_NET           0x1101
 #define OBD_FAIL_FLD_READ_NET           0x1102
+#define OBD_FAIL_FLD_QUERY_REQ          0x1103
 
 #define OBD_FAIL_SEC_CTX                 0x1200
 #define OBD_FAIL_SEC_CTX_INIT_NET        0x1201
index 10bfe4d..dd5e8e1 100644 (file)
@@ -7813,16 +7813,8 @@ static int lod_object_init(const struct lu_env *env, struct lu_object *lo,
        ENTRY;
 
        rc = lod_fld_lookup(env, lod, lu_object_fid(lo), &idx, &type);
-       if (rc != 0) {
-               /* Note: Sometimes, it will Return EAGAIN here, see
-                * ptrlpc_import_delay_req(), which might confuse
-                * lu_object_find_at() and make it wait there incorrectly.
-                * so we convert it to EIO here.*/
-               if (rc == -EAGAIN)
-                       rc = -EIO;
-
+       if (rc != 0)
                RETURN(rc);
-       }
 
        if (type == LU_SEQ_RANGE_MDT &&
            idx == lu_site2seq(lo->lo_dev->ld_site)->ss_node_id) {
index 365be60..dcf8c84 100755 (executable)
@@ -2260,6 +2260,28 @@ test_110j () {
 }
 run_test 110j "drop update reply during cross-MDT ln"
 
+test_110k() {
+       [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTS"
+       [[ $MDS1_VERSION -ge $(version_code 2.12.55) ]] ||
+               { skip "Need MDS version at least 2.12.55"; }
+
+       stop mds2 || error "stop mds2 failed"
+       umount $MOUNT
+
+#define OBD_FAIL_FLD_QUERY_REQ 0x1103
+       do_facet mds2 lctl set_param fail_loc=0x1103
+       start mds2 $(mdsdevname 2) -o abort_recovery ||
+               error "start MDS with abort_recovery should succeed"
+       do_facet mds2 lctl set_param fail_loc=0
+
+       # cleanup
+       stop mds2 || error "cleanup: stop mds2 failed"
+       start mds2 $(mdsdevname 2) || error "cleanup: start mds2 failed"
+       zconf_mount $(hostname) $MOUNT || error "cleanup: mount failed"
+       client_up || error "post-failover df failed"
+}
+run_test 110k "FID_QUERY failed during recovery"
+
 # LU-2844 mdt prepare fail should not cause umount oops
 test_111 ()
 {