From b1acf734f31c13d291c5e1534d7a01f0fbd7e972 Mon Sep 17 00:00:00 2001 From: Vladimir Saveliev Date: Sun, 31 Oct 2021 09:42:35 +0300 Subject: [PATCH] LU-13468 fld: repeat rpc in fld_client_rpc after EAGAIN A timed-out RPC sent by fld_client_rpc() may lead to client operation failure. Have fld_client_rpc() repeat the RPC after a while in case of EAGAIN. A test to illustrate the issue is added. A typo in the failure simulation in fld_client_rpc() is fixed. recovery-small.sh:test_110k() is changed so that fld_client_rpc() fails only once; otherwise it would fall into an endless loop. HPE-bug-id: LUS-8652 Fixes: e3f6111dfd1c ("LU-11761 fld: lets caller to retry FLD_QUERY") Signed-off-by: Vladimir Saveliev Change-Id: I145e719ec2fb5f5dbf9b5aa4b2a5b7e62f98c19f Reviewed-on: https://review.whamcloud.com/38302 Tested-by: jenkins Reviewed-by: Andreas Dilger Tested-by: Maloo Reviewed-by: Andriy Skulysh Reviewed-by: Oleg Drokin --- lustre/fld/fld_handler.c | 2 +- lustre/fld/fld_request.c | 16 +++++++++++----- lustre/tests/recovery-small.sh | 2 +- lustre/tests/sanity.sh | 12 ++++++++++++ 4 files changed, 25 insertions(+), 7 deletions(-) diff --git a/lustre/fld/fld_handler.c b/lustre/fld/fld_handler.c index 0fbd690..6f01007 100644 --- a/lustre/fld/fld_handler.c +++ b/lustre/fld/fld_handler.c @@ -159,7 +159,7 @@ int fld_update_from_controller(const struct lu_env *env, do { rc = fld_client_rpc(fld->lsf_control_exp, range, FLD_READ, &req); - if (rc != 0 && rc != -EAGAIN) + if (rc != 0) GOTO(out, rc); LASSERT(req != NULL); diff --git a/lustre/fld/fld_request.c b/lustre/fld/fld_request.c index 08387ee..3be93c2 100644 --- a/lustre/fld/fld_request.c +++ b/lustre/fld/fld_request.c @@ -40,6 +40,7 @@ #include #include #include +#include #include #include @@ -333,6 +334,7 @@ int fld_client_rpc(struct obd_export *exp, LASSERT(exp != NULL); imp = class_exp2cliimp(exp); +again: switch (fld_op) { case FLD_QUERY: req = ptlrpc_request_alloc_pack(imp, &RQF_FLD_QUERY, @@ -382,7 +384,7 @@ int 
fld_client_rpc(struct obd_export *exp, req->rq_reply_portal = MDC_REPLY_PORTAL; ptlrpc_at_set_req_timeout(req); - if (OBD_FAIL_CHECK(OBD_FAIL_FLD_QUERY_REQ && req->rq_no_delay)) { + if (OBD_FAIL_CHECK(OBD_FAIL_FLD_QUERY_REQ) && req->rq_no_delay) { /* the same error returned by ptlrpc_import_delay_req */ rc = -EAGAIN; req->rq_status = rc; @@ -403,12 +405,16 @@ int fld_client_rpc(struct obd_export *exp, imp->imp_connect_flags_orig & OBD_CONNECT_MDS_MDS && OCD_HAS_FLAG(&imp->imp_connect_data, LIGHTWEIGHT) && rc != -ENOTSUPP) { - /* - * Since LWP is not replayable, so notify the caller - * to retry if needed after a while. - */ + /* LWP is not replayable, retry after a while */ rc = -EAGAIN; } + if (rc == -EAGAIN) { + ptlrpc_req_finished(req); + if (msleep_interruptible(2 * MSEC_PER_SEC)) + GOTO(out_req, rc = -EINTR); + rc = 0; + goto again; + } GOTO(out_req, rc); } diff --git a/lustre/tests/recovery-small.sh b/lustre/tests/recovery-small.sh index d8a3964..08e83aa 100755 --- a/lustre/tests/recovery-small.sh +++ b/lustre/tests/recovery-small.sh @@ -2273,7 +2273,7 @@ test_110k() { stop mds2 || error "stop mds2 failed" #define OBD_FAIL_FLD_QUERY_REQ 0x1103 - do_facet mds2 lctl set_param fail_loc=0x1103 + do_facet mds2 lctl set_param fail_loc=0x80001103 local OPTS="$MDS_MOUNT_OPTS -o abort_recovery" start mds2 $(mdsdevname 2) $OPTS || error "start MDS with abort_recovery should succeed" diff --git a/lustre/tests/sanity.sh b/lustre/tests/sanity.sh index 58f311e..0878c8c 100755 --- a/lustre/tests/sanity.sh +++ b/lustre/tests/sanity.sh @@ -1500,6 +1500,18 @@ test_24G () { } run_test 24G "migrate symlink in rename" +test_24H() { + [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" + [[ $(hostname) != $(facet_active_host mds2) ]] || + skip "MDT1 should be on another node" + + test_mkdir -i 1 -c 1 $DIR/$tdir +#define OBD_FAIL_FLD_QUERY_REQ 0x1103 + do_facet mds2 $LCTL set_param fail_loc=0x80001103 + touch $DIR/$tdir/$tfile || error "touch failed" +} +run_test 24H "repeat FLD_QUERY 
rpc" + test_25a() { echo '== symlink sanity =============================================' -- 1.8.3.1