From: Mikhail Pershin Date: Thu, 21 Apr 2011 19:17:43 +0000 (+0400) Subject: LU-178 prevent replays after recovery on server X-Git-Tag: 2.0.60.0~6 X-Git-Url: https://git.whamcloud.com/?p=fs%2Flustre-release.git;a=commitdiff_plain;h=5d4ae6c905b8e8a38a6e4d3195f550a1e2b37ba8 LU-178 prevent replays after recovery on server - Drop requests with wrong flags during normal processing. - remove race in target_handle_connect which can cause replays over new export - test case for wrong replays handling Change-Id: Icbfc0ae549ab63a510e0d62fcac1ed661b324fb9 Signed-off-by: Mikhail Pershin Reviewed-on: http://review.whamcloud.com/391 Tested-by: Hudson Reviewed-by: Oleg Drokin Reviewed-by: Niu Yawei --- diff --git a/lustre/include/obd_support.h b/lustre/include/obd_support.h index 4a8cd08..dd1d373 100644 --- a/lustre/include/obd_support.h +++ b/lustre/include/obd_support.h @@ -379,6 +379,7 @@ int obd_alloc_fail(const void *ptr, const char *name, const char *type, #define OBD_FAIL_TGT_REPLAY_DELAY 0x709 #define OBD_FAIL_TGT_LAST_REPLAY 0x710 #define OBD_FAIL_TGT_CLIENT_ADD 0x711 +#define OBD_FAIL_TGT_RCVG_FLAG 0x712 #define OBD_FAIL_MDC_REVALIDATE_PAUSE 0x800 #define OBD_FAIL_MDC_ENQUEUE_PAUSE 0x801 diff --git a/lustre/ldlm/ldlm_lib.c b/lustre/ldlm/ldlm_lib.c index a3da4c0..e2b0410 100644 --- a/lustre/ldlm/ldlm_lib.c +++ b/lustre/ldlm/ldlm_lib.c @@ -870,15 +870,10 @@ no_export: export, (long)cfs_time_current_sec(), export ? (long)export->exp_last_request_time : 0); - /* Tell the client if we're in recovery. */ - if (target->obd_recovering) { - lustre_msg_add_op_flags(req->rq_repmsg, MSG_CONNECT_RECOVERING); - /* If this is the first time a client connects, - reset the recovery timer */ - if (rc == 0) - target_start_and_reset_recovery_timer(target, req, - !export); - } + /* If this is the first time a client connects, + * reset the recovery timer */ + if (rc == 0 && target->obd_recovering) + target_start_and_reset_recovery_timer(target, req, !export); /* We want to handle EALREADY but *not* -EALREADY from * target_handle_reconnect(), return reconnection state in a flag */ @@ -1019,6 +1014,11 @@ dont_check_exports: cfs_waitq_signal(&target->obd_next_transno_waitq); } cfs_spin_unlock(&target->obd_recovery_task_lock); + + /* Tell the client we're in recovery, when client is involved in it. */ + if (target->obd_recovering) + lustre_msg_add_op_flags(req->rq_repmsg, MSG_CONNECT_RECOVERING); + tmp = req_capsule_client_get(&req->rq_pill, &RMF_CONN); conn = *tmp; diff --git a/lustre/mdt/mdt_handler.c b/lustre/mdt/mdt_handler.c index da0fc09..dc5e254 100644 --- a/lustre/mdt/mdt_handler.c +++ b/lustre/mdt/mdt_handler.c @@ -5028,6 +5028,9 @@ static int mdt_obd_connect(const struct lu_env *env, if (rc) GOTO(out, rc); + if (OBD_FAIL_CHECK(OBD_FAIL_TGT_RCVG_FLAG)) + lustre_msg_add_op_flags(req->rq_repmsg, MSG_CONNECT_RECOVERING); + rc = mdt_connect_internal(lexp, mdt, data); if (rc == 0) { struct mdt_thread_info *mti; diff --git a/lustre/ptlrpc/client.c b/lustre/ptlrpc/client.c index 39b7824..ae3565f 100644 --- a/lustre/ptlrpc/client.c +++ b/lustre/ptlrpc/client.c @@ -2529,6 +2529,7 @@ static int ptlrpc_replay_interpret(const struct lu_env *env, imp->imp_vbr_failed = 1; imp->imp_no_lock_replay = 1; cfs_spin_unlock(&imp->imp_lock); + lustre_msg_set_status(req->rq_repmsg, aa->praa_old_status); } else { /** The transno had better not change over replay. */ LASSERTF(lustre_msg_get_transno(req->rq_reqmsg) == @@ -2547,6 +2548,15 @@ static int ptlrpc_replay_interpret(const struct lu_env *env, cfs_spin_unlock(&imp->imp_lock); LASSERT(imp->imp_last_replay_transno); + /* transaction number shouldn't be bigger than the latest replayed */ + if (req->rq_transno > lustre_msg_get_transno(req->rq_reqmsg)) { + DEBUG_REQ(D_ERROR, req, + "Reported transno "LPU64" is bigger than the " + "replayed one: "LPU64, req->rq_transno, + lustre_msg_get_transno(req->rq_reqmsg)); + GOTO(out, rc = -EINVAL); + } + DEBUG_REQ(D_HA, req, "got rep"); /* let the callback do fixups, possibly including in the request */ @@ -2567,13 +2577,9 @@ static int ptlrpc_replay_interpret(const struct lu_env *env, * Errors while replay can set transno to 0, but * imp_last_replay_transno shouldn't be set to 0 anyway */ - if (req->rq_transno > 0) { - cfs_spin_lock(&imp->imp_lock); - LASSERT(req->rq_transno <= imp->imp_last_replay_transno); - imp->imp_last_replay_transno = req->rq_transno; - cfs_spin_unlock(&imp->imp_lock); - } else + if (req->rq_transno == 0) CERROR("Transno is 0 during replay!\n"); + /* continue with recovery */ rc = ptlrpc_import_recovery_state_machine(imp); out: diff --git a/lustre/ptlrpc/service.c b/lustre/ptlrpc/service.c index d53232e..1e3c9d9 100644 --- a/lustre/ptlrpc/service.c +++ b/lustre/ptlrpc/service.c @@ -831,6 +831,8 @@ static void ptlrpc_update_export_timer(struct obd_export *exp, long extra_delay) */ static int ptlrpc_check_req(struct ptlrpc_request *req) { + int rc = 0; + if (unlikely(lustre_msg_get_conn_cnt(req->rq_reqmsg) < req->rq_export->exp_conn_cnt)) { DEBUG_REQ(D_ERROR, req, @@ -845,12 +847,28 @@ static int ptlrpc_check_req(struct ptlrpc_request *req) error response instead. */ CDEBUG(D_RPCTRACE, "Dropping req %p for failed obd %s\n", req, req->rq_export->exp_obd->obd_name); - req->rq_status = -ENODEV; + rc = -ENODEV; + } else if (lustre_msg_get_flags(req->rq_reqmsg) & + (MSG_REPLAY | MSG_REQ_REPLAY_DONE) && + !(req->rq_export->exp_obd->obd_recovering)) { + DEBUG_REQ(D_ERROR, req, + "Invalid replay without recovery"); + class_fail_export(req->rq_export); + rc = -ENODEV; + } else if (lustre_msg_get_transno(req->rq_reqmsg) != 0 && + !(req->rq_export->exp_obd->obd_recovering)) { + DEBUG_REQ(D_ERROR, req, "Invalid req with transno " + LPU64" without recovery", + lustre_msg_get_transno(req->rq_reqmsg)); + class_fail_export(req->rq_export); + rc = -ENODEV; + } + + if (unlikely(rc < 0)) { + req->rq_status = rc; ptlrpc_error(req); - return -ENODEV; } - - return 0; + return rc; } static void ptlrpc_at_set_timer(struct ptlrpc_service *svc) diff --git a/lustre/tests/replay-single.sh b/lustre/tests/replay-single.sh index 16e297d..e98a284 100755 --- a/lustre/tests/replay-single.sh +++ b/lustre/tests/replay-single.sh @@ -919,6 +919,19 @@ test_44b() { } run_test 44b "race in target handle connect" +test_44c() { + replay_barrier $SINGLEMDS + createmany -m $DIR/$tfile-%d 100 +#define OBD_FAIL_TGT_RCVG_FLAG 0x712 + do_facet $SINGLEMDS "lctl set_param fail_loc=0x80000712" + fail_abort $SINGLEMDS + unlinkmany $DIR/$tfile-%d 100 && return 1 + fail $SINGLEMDS + unlinkmany $DIR/$tfile-%d 100 && return 1 + return 0 +} +run_test 44c "race in target handle connect" + # Handle failed close test_45() { mdcdev=`lctl get_param -n devices | awk '/MDT0000-mdc-/ {print $1}'`