From cf7f66d87e52293535cde6e8cc7386e6c1bdfa46 Mon Sep 17 00:00:00 2001 From: wang di Date: Mon, 3 Feb 2014 13:19:21 -0800 Subject: [PATCH] LU-4571 fld: resend seq lookup RPC if it is on LWP Because a Light Weight connection (LWP) might be evicted after restart, causing in-flight RPCs to fail, we need to resend the seq lookup RPC. Remove "-f" from "stop mdt" in sanity 17m so that umount can keep the connection; otherwise the OSP might be evicted. Signed-off-by: wang di Change-Id: I032dfb95e65da56b198129c6d6d6039bad08ab9c Reviewed-on: http://review.whamcloud.com/9106 Tested-by: Jenkins Tested-by: Maloo Reviewed-by: Andreas Dilger Reviewed-by: Jinshan Xiong Reviewed-by: Niu Yawei --- lustre/fld/fld_request.c | 21 +++++++++++++++++++-- lustre/tests/sanity.sh | 2 +- 2 files changed, 20 insertions(+), 3 deletions(-) diff --git a/lustre/fld/fld_request.c b/lustre/fld/fld_request.c index 8d7539d..3c2d48f 100644 --- a/lustre/fld/fld_request.c +++ b/lustre/fld/fld_request.c @@ -430,6 +430,7 @@ int fld_client_rpc(struct obd_export *exp, LASSERT(exp != NULL); +again: imp = class_exp2cliimp(exp); switch (fld_op) { case FLD_QUERY: @@ -443,8 +444,14 @@ int fld_client_rpc(struct obd_export *exp, op = req_capsule_client_get(&req->rq_pill, &RMF_FLD_OPC); *op = FLD_LOOKUP; - if (imp->imp_connect_flags_orig & OBD_CONNECT_MDS_MDS) + /* For MDS_MDS seq lookup, it will always use LWP connection, + * but LWP will be evicted after restart, so cause the error. + * so we will set no_delay for seq lookup request, once the + * request fails because of the eviction. 
always retry here */ + if (imp->imp_connect_flags_orig & OBD_CONNECT_MDS_MDS) { req->rq_allow_replay = 1; + req->rq_no_delay = 1; + } break; case FLD_READ: req = ptlrpc_request_alloc_pack(imp, &RQF_FLD_READ, @@ -473,8 +480,18 @@ int fld_client_rpc(struct obd_export *exp, fld_enter_request(&exp->exp_obd->u.cli); rc = ptlrpc_queue_wait(req); fld_exit_request(&exp->exp_obd->u.cli); - if (rc) + if (rc != 0) { + if (rc == -EWOULDBLOCK) { + /* For no_delay req(see above), EWOULDBLOCK means the + * connection is being evicted, but this seq lookup + * should not return error, since it would cause + * unecessary failure of the application, instead + * it should retry here */ + ptlrpc_req_finished(req); + goto again; + } GOTO(out_req, rc); + } if (fld_op == FLD_QUERY) { prange = req_capsule_server_get(&req->rq_pill, diff --git a/lustre/tests/sanity.sh b/lustre/tests/sanity.sh index 5b959cf..e25c2f6 100644 --- a/lustre/tests/sanity.sh +++ b/lustre/tests/sanity.sh @@ -571,7 +571,7 @@ test_17m() { echo "stop and checking mds${mds_index}: $cmd" # e2fsck should not return error - stop mds${mds_index} -f + stop mds${mds_index} do_facet mds${mds_index} $cmd || rc=$? start mds${mds_index} $devname $MDS_MOUNT_OPTS -- 1.8.3.1