From: Johann Lombardi Date: Tue, 3 Jul 2012 10:54:54 +0000 (+0200) Subject: LU-1329 ptlrpc: resend request on -EINPROGRESS X-Git-Tag: 2.2.92~15 X-Git-Url: https://git.whamcloud.com/?a=commitdiff_plain;h=7223b4746c71bf450c178ed21ddf99a0e3e26a1a;p=fs%2Flustre-release.git LU-1329 ptlrpc: resend request on -EINPROGRESS It seems the EINPROGRESS is going to be used for many different purposes (e.g. on statfs see review 3198). As a result, it sounds like ptlrpc is the right place to resend requests on EINPROGRESS. Upper layers can still decide to handle EINPROGRESS by themselves by setting rq_no_retry_einprogress to 1. Signed-off-by: Johann Lombardi Change-Id: Iae2a5976666e66be4f6e71f82c5653e5636ba07d Reviewed-on: http://review.whamcloud.com/3262 Tested-by: Hudson Reviewed-by: Andreas Dilger Tested-by: Maloo Reviewed-by: Niu Yawei Reviewed-by: Oleg Drokin --- diff --git a/lustre/include/lustre_net.h b/lustre/include/lustre_net.h index e09b50e..b5d8985 100644 --- a/lustre/include/lustre_net.h +++ b/lustre/include/lustre_net.h @@ -718,7 +718,11 @@ struct ptlrpc_request { rq_committed:1, /* whether the "rq_set" is a valid one */ rq_invalid_rqset:1, - rq_generation_set:1; + rq_generation_set:1, + /* do not resend request on -EINPROGRESS */ + rq_no_retry_einprogress:1; + + unsigned int rq_nr_resend; enum rq_phase rq_phase; /* one of RQ_PHASE_* */ enum rq_phase rq_next_phase; /* one of RQ_PHASE_* to be used next */ diff --git a/lustre/include/obd_support.h b/lustre/include/obd_support.h index 665dc89..bcaa60d 100644 --- a/lustre/include/obd_support.h +++ b/lustre/include/obd_support.h @@ -289,6 +289,7 @@ int obd_alloc_fail(const void *ptr, const char *name, const char *type, #define OBD_FAIL_OST_MAPBLK_ENOSPC 0x228 #define OBD_FAIL_OST_ENOINO 0x229 #define OBD_FAIL_OST_DQACQ_NET 0x230 +#define OBD_FAIL_OST_STATFS_EINPROGRESS 0x231 #define OBD_FAIL_LDLM 0x300 #define OBD_FAIL_LDLM_NAMESPACE_NEW 0x301 diff --git a/lustre/mdc/mdc_locks.c b/lustre/mdc/mdc_locks.c index 746012e..855d2ee 100644 --- a/lustre/mdc/mdc_locks.c +++ b/lustre/mdc/mdc_locks.c @@ -733,6 +733,11 @@ resend: if (IS_ERR(req)) RETURN(PTR_ERR(req)); + if (req != NULL && it && it->it_op & IT_CREAT) + /* ask ptlrpc not to resend on EINPROGRESS since we have our own + * retry logic */ + req->rq_no_retry_einprogress = 1; + if (resends) { req->rq_generation_set = 1; req->rq_import_generation = generation; @@ -791,7 +796,7 @@ resend: if (generation == obddev->u.cli.cl_import->imp_generation) { goto resend; } else { - CDEBUG(D_HA, "resned cross eviction\n"); + CDEBUG(D_HA, "resend cross eviction\n"); RETURN(-EIO); } } diff --git a/lustre/osc/osc_request.c b/lustre/osc/osc_request.c index 879b30c..ed0ac4b 100644 --- a/lustre/osc/osc_request.c +++ b/lustre/osc/osc_request.c @@ -1251,6 +1251,9 @@ static int osc_brw_prep_request(int cmd, struct client_obd *cli,struct obdo *oa, } req->rq_request_portal = OST_IO_PORTAL; /* bug 7198 */ ptlrpc_at_set_req_timeout(req); + /* ask ptlrpc not to resend on EINPROGRESS since BRWs have their own + * retry logic */ + req->rq_no_retry_einprogress = 1; if (opc == OST_WRITE) desc = ptlrpc_prep_bulk_imp(req, page_count, diff --git a/lustre/ost/ost_handler.c b/lustre/ost/ost_handler.c index b9405cb..17a0fb2 100644 --- a/lustre/ost/ost_handler.c +++ b/lustre/ost/ost_handler.c @@ -322,6 +322,9 @@ static int ost_statfs(struct ptlrpc_request *req) if (req->rq_status != 0) CERROR("ost: statfs failed: rc %d\n", req->rq_status); + if (OBD_FAIL_CHECK(OBD_FAIL_OST_STATFS_EINPROGRESS)) + req->rq_status = -EINPROGRESS; + RETURN(0); } diff --git a/lustre/ptlrpc/client.c b/lustre/ptlrpc/client.c index 3ad9a87..aefce46 100644 --- a/lustre/ptlrpc/client.c +++ b/lustre/ptlrpc/client.c @@ -1275,6 +1275,27 @@ static int after_reply(struct ptlrpc_request *req) RETURN(rc); } + /* retry indefinitely on EINPROGRESS */ + if (lustre_msg_get_status(req->rq_repmsg) == -EINPROGRESS && + ptlrpc_no_resend(req) == 0 && !req->rq_no_retry_einprogress) { + time_t now = cfs_time_current_sec(); + + DEBUG_REQ(D_RPCTRACE, req, "Resending request on EINPROGRESS"); + req->rq_resend = 1; + req->rq_nr_resend++; + + /* Readjust the timeout for current conditions */ + ptlrpc_at_set_req_timeout(req); + /* delay resend to give a chance to the server to get ready. + * The delay is increased by 1s on every resend and is capped to + * the current request timeout (i.e. obd_timeout if AT is off, + * or AT service time x 125% + 5s, see at_est2timeout) */ + if (req->rq_nr_resend > req->rq_timeout) + req->rq_sent = now + req->rq_timeout; + else + req->rq_sent = now + req->rq_nr_resend; + } + /* * Security layer unwrap might ask resend this request. */ @@ -1511,7 +1532,12 @@ int ptlrpc_check_set(const struct lu_env *env, struct ptlrpc_request_set *set) /* delayed send - skip */ if (req->rq_phase == RQ_PHASE_NEW && req->rq_sent) - continue; + continue; + + /* delayed resend - skip */ + if (req->rq_phase == RQ_PHASE_RPC && req->rq_resend && + req->rq_sent > cfs_time_current_sec()) + continue; if (!(req->rq_phase == RQ_PHASE_RPC || req->rq_phase == RQ_PHASE_BULK || @@ -2042,6 +2068,8 @@ int ptlrpc_set_next_timeout(struct ptlrpc_request_set *set) if (req->rq_phase == RQ_PHASE_NEW) deadline = req->rq_sent; + else if (req->rq_phase == RQ_PHASE_RPC && req->rq_resend) + deadline = req->rq_sent; else deadline = req->rq_sent + req->rq_timeout; diff --git a/lustre/tests/replay-ost-single.sh b/lustre/tests/replay-ost-single.sh index 0f4194b..fa62200 100755 --- a/lustre/tests/replay-ost-single.sh +++ b/lustre/tests/replay-ost-single.sh @@ -306,8 +306,7 @@ test_8c() { } run_test 8c "Verify redo io: redo io should fail after eviction" - -test_9d() { +test_8d() { #define OBD_FAIL_MDS_DQACQ_NET 0x187 do_facet $SINGLEMDS "lctl set_param fail_loc=0x187" # test the non-intent create path @@ -338,7 +337,23 @@ test_9d() { wait $cpid || return 4 stat $TDIR/$tfile || error "open failed" } -run_test 9d "Verify redo creation on -EINPROGRESS" +run_test 8d "Verify redo creation on -EINPROGRESS" + +test_8e() { + sleep 1 # ensure we have a fresh statfs +#define OBD_FAIL_OST_STATFS_EINPROGRESS 0x231 + do_facet ost1 "lctl set_param fail_loc=0x231" + df $MOUNT & + dfpid=$! + sleep $TIMEOUT + if ! ps -p $dfpid > /dev/null 2>&1; then + do_facet ost1 "lctl set_param fail_loc=0" + error "df shouldn't have completed!" + return 1 + fi + do_facet ost1 "lctl set_param fail_loc=0" +} +run_test 8e "Verify that ptlrpc resends request on -EINPROGRESS" complete $(basename $0) $SECONDS check_and_cleanup_lustre