LU-11761 fld: let the caller retry FLD_QUERY

author Hongchao Zhang <hongchao@whamcloud.com>

Thu, 4 Jul 2019 13:39:24 +0000 (09:39 -0400)

committer Oleg Drokin <green@whamcloud.com>

Wed, 17 Jul 2019 06:21:25 +0000 (06:21 +0000)
author Hongchao Zhang <hongchao@whamcloud.com>
Thu, 4 Jul 2019 13:39:24 +0000 (09:39 -0400)
committer Oleg Drokin <green@whamcloud.com>
Wed, 17 Jul 2019 06:21:25 +0000 (06:21 +0000)
diff --git a/lustre/fld/fld_request.c b/lustre/fld/fld_request.c

index 123bc97..3dd616e 100644 (file)
--- a/lustre/fld/fld_request.c
+++ b/lustre/fld/fld_request.c
@@ -346,7 +346,6 @@ int fld_client_rpc(struct obd_export *exp,
  
         LASSERT(exp != NULL);
  
-again:
         imp = class_exp2cliimp(exp);
         switch (fld_op) {
         case FLD_QUERY:
@@ -397,9 +396,15 @@ again:
         req->rq_reply_portal = MDC_REPLY_PORTAL;
         ptlrpc_at_set_req_timeout(req);
  
-       obd_get_request_slot(&exp->exp_obd->u.cli);
-       rc = ptlrpc_queue_wait(req);
-       obd_put_request_slot(&exp->exp_obd->u.cli);
+       if (OBD_FAIL_CHECK(OBD_FAIL_FLD_QUERY_REQ && req->rq_no_delay)) {
+               /* the same error returned by ptlrpc_import_delay_req */
+               rc = -EWOULDBLOCK;
+               req->rq_status = rc;
+       } else {
+               obd_get_request_slot(&exp->exp_obd->u.cli);
+               rc = ptlrpc_queue_wait(req);
+               obd_put_request_slot(&exp->exp_obd->u.cli);
+       }
  
         if (rc == -ENOENT) {
                 /* Don't loop forever on non-existing FID sequences. */
@@ -413,15 +418,10 @@ again:
                     OCD_HAS_FLAG(&imp->imp_connect_data, LIGHTWEIGHT) &&
                     rc != -ENOTSUPP) {
                         /*
-                        * Since LWP is not replayable, so it will keep
-                        * trying unless umount happens or the remote
-                        * target does not support the operation, otherwise
-                        * it would cause unecessary failure of the
-                        * application.
+                        * Since LWP is not replayable, so notify the caller
+                        * to retry if needed after a while.
                          */
-                       ptlrpc_req_finished(req);
-                       rc = 0;
-                       goto again;
+                       rc = -EAGAIN;
                 }
                 GOTO(out_req, rc);
         }
diff --git a/lustre/include/obd_support.h b/lustre/include/obd_support.h

index 916eb30..964bf35 100644 (file)
--- a/lustre/include/obd_support.h
+++ b/lustre/include/obd_support.h
@@ -526,6 +526,7 @@ extern char obd_jobid_var[];
  #define OBD_FAIL_FLD                     0x1100
  #define OBD_FAIL_FLD_QUERY_NET           0x1101
  #define OBD_FAIL_FLD_READ_NET           0x1102
+#define OBD_FAIL_FLD_QUERY_REQ          0x1103
  
  #define OBD_FAIL_SEC_CTX                 0x1200
  #define OBD_FAIL_SEC_CTX_INIT_NET        0x1201
diff --git a/lustre/lod/lod_object.c b/lustre/lod/lod_object.c

index 10bfe4d..dd5e8e1 100644 (file)
--- a/lustre/lod/lod_object.c
+++ b/lustre/lod/lod_object.c
@@ -7813,16 +7813,8 @@ static int lod_object_init(const struct lu_env *env, struct lu_object *lo,
         ENTRY;
  
         rc = lod_fld_lookup(env, lod, lu_object_fid(lo), &idx, &type);
-       if (rc != 0) {
-               /* Note: Sometimes, it will Return EAGAIN here, see
-                * ptrlpc_import_delay_req(), which might confuse
-                * lu_object_find_at() and make it wait there incorrectly.
-                * so we convert it to EIO here.*/
-               if (rc == -EAGAIN)
-                       rc = -EIO;
-
+       if (rc != 0)
                 RETURN(rc);
-       }
  
         if (type == LU_SEQ_RANGE_MDT &&
             idx == lu_site2seq(lo->lo_dev->ld_site)->ss_node_id) {
diff --git a/lustre/tests/recovery-small.sh b/lustre/tests/recovery-small.sh

index 365be60..dcf8c84 100755 (executable)
--- a/lustre/tests/recovery-small.sh
+++ b/lustre/tests/recovery-small.sh
@@ -2260,6 +2260,28 @@ test_110j () {
  }
  run_test 110j "drop update reply during cross-MDT ln"
  
+test_110k() {
+       [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTS"
+       [[ $MDS1_VERSION -ge $(version_code 2.12.55) ]] ||
+               { skip "Need MDS version at least 2.12.55"; }
+
+       stop mds2 || error "stop mds2 failed"
+       umount $MOUNT
+
+#define OBD_FAIL_FLD_QUERY_REQ 0x1103
+       do_facet mds2 lctl set_param fail_loc=0x1103
+       start mds2 $(mdsdevname 2) -o abort_recovery ||
+               error "start MDS with abort_recovery should succeed"
+       do_facet mds2 lctl set_param fail_loc=0
+
+       # cleanup: restart mds2 with fail_loc cleared, then remount $MOUNT
+       stop mds2 || error "cleanup: stop mds2 failed"
+       start mds2 $(mdsdevname 2) || error "cleanup: start mds2 failed"
+       zconf_mount $(hostname) $MOUNT || error "cleanup: mount failed"
+       client_up || error "post-failover df failed"
+}
+run_test 110k "FLD_QUERY failed during recovery"
+
  # LU-2844 mdt prepare fail should not cause umount oops
  test_111 ()
  {
author	Hongchao Zhang <hongchao@whamcloud.com>
	Thu, 4 Jul 2019 13:39:24 +0000 (09:39 -0400)
committer	Oleg Drokin <green@whamcloud.com>
	Wed, 17 Jul 2019 06:21:25 +0000 (06:21 +0000)
lustre/fld/fld_request.c		patch \| blob \| history
lustre/include/obd_support.h		patch \| blob \| history
lustre/lod/lod_object.c		patch \| blob \| history
lustre/tests/recovery-small.sh		patch \| blob \| history