From bc871f8ff53068bfe69ad7653479b42e6a6d2d93 Mon Sep 17 00:00:00 2001 From: Alexander Boyko Date: Thu, 7 Nov 2019 06:13:50 -0500 Subject: [PATCH] LU-12949 obdclass: don't extend timer if obd stops During umount all clients became stale, so the first check at check_for_recovery_ready() is passed, but there is no guarantee that the recovery timer was started. So, we need to check obd_stopping. Test 138 is added to recovery-small.sh. It reproduces the issue when MDT is waiting for clients during recovery and MDT umount happens. extend_recovery_timer()) ASSERTION( obd->obd_recovery_start != 0 ) failed Cray-bug-id: LUS-7917 Signed-off-by: Alexander Boyko Change-Id: I1906fdfcc10606912a1f81560bb60b9d424db149 Reviewed-on: https://review.whamcloud.com/36703 Tested-by: jenkins Tested-by: Maloo Reviewed-by: Alex Zhuravlev Reviewed-by: Andriy Skulysh Reviewed-by: Sergey Cheremencev Reviewed-by: Oleg Drokin --- lustre/include/obd_support.h | 1 + lustre/ldlm/ldlm_lib.c | 9 +++++++-- lustre/lod/lod_dev.c | 10 +++++++++- lustre/tests/recovery-small.sh | 26 ++++++++++++++++++++++++++ 4 files changed, 43 insertions(+), 3 deletions(-) diff --git a/lustre/include/obd_support.h b/lustre/include/obd_support.h index 347be7a..6e11bd8 100644 --- a/lustre/include/obd_support.h +++ b/lustre/include/obd_support.h @@ -489,6 +489,7 @@ extern char obd_jobid_var[]; #define OBD_FAIL_TGT_RCVD_EIO 0x720 #define OBD_FAIL_TGT_RECOVERY_REQ_RACE 0x721 #define OBD_FAIL_TGT_REPLY_DATA_RACE 0x722 +#define OBD_FAIL_TGT_RECOVERY_CONNECT 0x724 #define OBD_FAIL_MDC_REVALIDATE_PAUSE 0x800 #define OBD_FAIL_MDC_ENQUEUE_PAUSE 0x801 diff --git a/lustre/ldlm/ldlm_lib.c b/lustre/ldlm/ldlm_lib.c index 3ffe771..e94d72a 100644 --- a/lustre/ldlm/ldlm_lib.c +++ b/lustre/ldlm/ldlm_lib.c @@ -1244,6 +1244,10 @@ int target_handle_connect(struct ptlrpc_request *req) rc = -EALREADY; class_export_put(export); export = NULL; + } else if (OBD_FAIL_PRECHECK(OBD_FAIL_TGT_RECOVERY_CONNECT) && + !lw_client) { +
spin_unlock(&export->exp_lock); + rc = -EAGAIN; } else { export->exp_connecting = 1; spin_unlock(&export->exp_lock); @@ -1830,7 +1834,8 @@ static void extend_recovery_timer(struct obd_device *obd, time_t dr_timeout, time_t left; spin_lock(&obd->obd_dev_lock); - if (!obd->obd_recovering || obd->obd_abort_recovery) { + if (!obd->obd_recovering || obd->obd_abort_recovery || + obd->obd_stopping) { spin_unlock(&obd->obd_dev_lock); return; } @@ -2328,7 +2333,7 @@ static int check_for_recovery_ready(struct lu_target *lut) if (lut->lut_tdtd != NULL) { if (!lut->lut_tdtd->tdtd_replay_ready && - !obd->obd_abort_recovery) { + !obd->obd_abort_recovery && !obd->obd_stopping) { /* * Let's extend recovery timer, in case the recovery * timer expired, and some clients got evicted diff --git a/lustre/lod/lod_dev.c b/lustre/lod/lod_dev.c index 927c764..5b2b38f 100644 --- a/lustre/lod/lod_dev.c +++ b/lustre/lod/lod_dev.c @@ -386,7 +386,14 @@ static int lod_sub_recovery_thread(void *arg) start = ktime_get_real_seconds(); again: - rc = lod_sub_prep_llog(&env, lod, dt, lrd->lrd_idx); + + if (unlikely(OBD_FAIL_PRECHECK(OBD_FAIL_TGT_RECOVERY_CONNECT)) && + lrd->lrd_ltd) { + OBD_FAIL_TIMEOUT(OBD_FAIL_TGT_RECOVERY_CONNECT, cfs_fail_val); + rc = -EIO; + } else { + rc = lod_sub_prep_llog(&env, lod, dt, lrd->lrd_idx); + } if (!rc && !lod->lod_child->dd_rdonly) { /* Process the recovery record */ ctxt = llog_get_context(dt->dd_lu_dev.ld_obd, @@ -1039,6 +1046,7 @@ static int lod_process_config(const struct lu_env *env, case LCFG_PRE_CLEANUP: { lod_sub_process_config(env, lod, &lod->lod_mdt_descs, lcfg); lod_sub_process_config(env, lod, &lod->lod_ost_descs, lcfg); + OBD_FAIL_TIMEOUT(OBD_FAIL_TGT_RECOVERY_CONNECT, cfs_fail_val * 2); next = &lod->lod_child->dd_lu_dev; rc = next->ld_ops->ldo_process_config(env, next, lcfg); if (rc != 0) diff --git a/lustre/tests/recovery-small.sh b/lustre/tests/recovery-small.sh index 48de1b9..389d530 100755 --- a/lustre/tests/recovery-small.sh +++ 
b/lustre/tests/recovery-small.sh @@ -2878,6 +2878,32 @@ test_137() { } run_test 137 "late resend must be skipped if already applied" +test_138() { + remote_mds_nodsh && skip "remote MDS with nodsh" + [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0 + [[ "$MDS1_VERSION" -ge $(version_code 2.12.59) ]] || + skip "Need server version newer than 2.12.59" + + zconf_umount_clients $CLIENTS $MOUNT + +#define OBD_FAIL_TGT_RECOVERY_CONNECT 0x724 + #delay the first step of recovery while the MDS is waiting for clients + #and failing to get osp logs + do_facet $SINGLEMDS $LCTL set_param fail_loc=0x724 fail_val=5 + + facet_failover $SINGLEMDS + + #wait for failover and the recovery timer + #the value is based on target_recovery_overseer() wait_event timeout + sleep 55 + stop $SINGLEMDS || error "stop MDS failed" + do_facet $SINGLEMDS $LCTL set_param fail_loc=0 + start $SINGLEMDS $(mdsdevname ${SINGLEMDS//mds/}) || + error "start MDS failed" + zconf_mount_clients $CLIENTS $MOUNT +} +run_test 138 "Umount MDT during recovery" + complete $SECONDS check_and_cleanup_lustre exit_status -- 1.8.3.1