From: Hongchao Zhang Date: Thu, 4 Jul 2019 13:39:24 +0000 (-0400) Subject: LU-11761 fld: let's caller to retry FLD_QUERY X-Git-Tag: 2.12.57~149 X-Git-Url: https://git.whamcloud.com/?p=fs%2Flustre-release.git;a=commitdiff_plain;h=e3f6111dfd1c6f2266d0beef67e5a7514a6965d0 LU-11761 fld: let's caller to retry FLD_QUERY In fld_client_rpc(), if the FLD_QUERY request between MDTs fails with -EWOULDBLOCK because the connection is lost, return -EAGAIN to notify the caller to retry. It also reverts the patch https://review.whamcloud.com/12586/, which landed in b2_6_90_0-5-g6db07f0 to avoid returning -EAGAIN from lod_object_init(), because that confused lu_object_find_at() (which assumed the object was dying when it encountered -EAGAIN). In the current Lustre version, lu_object_find_at() just returns the found object and lets the caller check whether it is dying. Fixes: 6db07f095fba ("LU-5871 lod: Do not return EAGAIN in lod_object_init") Change-Id: Ie83ebfdae2bd50c96a59a065f7f3c3dcfad04e42 Signed-off-by: Hongchao Zhang Reviewed-on: https://review.whamcloud.com/34962 Reviewed-by: Andreas Dilger Reviewed-by: Lai Siyao Tested-by: jenkins Tested-by: Maloo Reviewed-by: Oleg Drokin --- diff --git a/lustre/fld/fld_request.c b/lustre/fld/fld_request.c index 123bc97..3dd616e 100644 --- a/lustre/fld/fld_request.c +++ b/lustre/fld/fld_request.c @@ -346,7 +346,6 @@ int fld_client_rpc(struct obd_export *exp, LASSERT(exp != NULL); -again: imp = class_exp2cliimp(exp); switch (fld_op) { case FLD_QUERY: @@ -397,9 +396,15 @@ again: req->rq_reply_portal = MDC_REPLY_PORTAL; ptlrpc_at_set_req_timeout(req); - obd_get_request_slot(&exp->exp_obd->u.cli); - rc = ptlrpc_queue_wait(req); - obd_put_request_slot(&exp->exp_obd->u.cli); + if (OBD_FAIL_CHECK(OBD_FAIL_FLD_QUERY_REQ && req->rq_no_delay)) { + /* the same error returned by ptlrpc_import_delay_req */ + rc = -EWOULDBLOCK; + req->rq_status = rc; + } else { + obd_get_request_slot(&exp->exp_obd->u.cli); + rc = ptlrpc_queue_wait(req); + 
obd_put_request_slot(&exp->exp_obd->u.cli); + } if (rc == -ENOENT) { /* Don't loop forever on non-existing FID sequences. */ @@ -413,15 +418,10 @@ again: OCD_HAS_FLAG(&imp->imp_connect_data, LIGHTWEIGHT) && rc != -ENOTSUPP) { /* - * Since LWP is not replayable, so it will keep - * trying unless umount happens or the remote - * target does not support the operation, otherwise - * it would cause unecessary failure of the - * application. + * Since LWP is not replayable, so notify the caller + * to retry if needed after a while. */ - ptlrpc_req_finished(req); - rc = 0; - goto again; + rc = -EAGAIN; } GOTO(out_req, rc); } diff --git a/lustre/include/obd_support.h b/lustre/include/obd_support.h index 916eb30..964bf35 100644 --- a/lustre/include/obd_support.h +++ b/lustre/include/obd_support.h @@ -526,6 +526,7 @@ extern char obd_jobid_var[]; #define OBD_FAIL_FLD 0x1100 #define OBD_FAIL_FLD_QUERY_NET 0x1101 #define OBD_FAIL_FLD_READ_NET 0x1102 +#define OBD_FAIL_FLD_QUERY_REQ 0x1103 #define OBD_FAIL_SEC_CTX 0x1200 #define OBD_FAIL_SEC_CTX_INIT_NET 0x1201 diff --git a/lustre/lod/lod_object.c b/lustre/lod/lod_object.c index 10bfe4d..dd5e8e1 100644 --- a/lustre/lod/lod_object.c +++ b/lustre/lod/lod_object.c @@ -7813,16 +7813,8 @@ static int lod_object_init(const struct lu_env *env, struct lu_object *lo, ENTRY; rc = lod_fld_lookup(env, lod, lu_object_fid(lo), &idx, &type); - if (rc != 0) { - /* Note: Sometimes, it will Return EAGAIN here, see - * ptrlpc_import_delay_req(), which might confuse - * lu_object_find_at() and make it wait there incorrectly. 
- * so we convert it to EIO here.*/ - if (rc == -EAGAIN) - rc = -EIO; - + if (rc != 0) RETURN(rc); - } if (type == LU_SEQ_RANGE_MDT && idx == lu_site2seq(lo->lo_dev->ld_site)->ss_node_id) { diff --git a/lustre/tests/recovery-small.sh b/lustre/tests/recovery-small.sh index 365be60..dcf8c84 100755 --- a/lustre/tests/recovery-small.sh +++ b/lustre/tests/recovery-small.sh @@ -2260,6 +2260,28 @@ test_110j () { } run_test 110j "drop update reply during cross-MDT ln" +test_110k() { + [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTS" + [[ $MDS1_VERSION -ge $(version_code 2.12.55) ]] || + { skip "Need MDS version at least 2.12.55"; } + + stop mds2 || error "stop mds2 failed" + umount $MOUNT + +#define OBD_FAIL_FLD_QUERY_REQ 0x1103 + do_facet mds2 lctl set_param fail_loc=0x1103 + start mds2 $(mdsdevname 2) -o abort_recovery || + error "start MDS with abort_recovery should succeed" + do_facet mds2 lctl set_param fail_loc=0 + + # cleanup + stop mds2 || error "cleanup: stop mds2 failed" + start mds2 $(mdsdevname 2) || error "cleanup: start mds2 failed" + zconf_mount $(hostname) $MOUNT || error "cleanup: mount failed" + client_up || error "post-failover df failed" +} +run_test 110k "FID_QUERY failed during recovery" + # LU-2844 mdt prepare fail should not cause umount oops test_111 () {